trismik 0.9.1__py3-none-any.whl → 0.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trismik/__init__.py +10 -21
- trismik/_mapper.py +327 -47
- trismik/_utils.py +92 -15
- trismik/adaptive_test.py +671 -0
- trismik/client_async.py +250 -184
- trismik/exceptions.py +57 -6
- trismik/settings.py +15 -0
- trismik/types.py +246 -88
- trismik-0.9.5.dist-info/METADATA +174 -0
- trismik-0.9.5.dist-info/RECORD +12 -0
- {trismik-0.9.1.dist-info → trismik-0.9.5.dist-info}/WHEEL +1 -1
- trismik/client.py +0 -330
- trismik/runner.py +0 -119
- trismik/runner_async.py +0 -121
- trismik-0.9.1.dist-info/METADATA +0 -54
- trismik-0.9.1.dist-info/RECORD +0 -13
- {trismik-0.9.1.dist-info → trismik-0.9.5.dist-info/licenses}/LICENSE +0 -0
trismik/settings.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Default settings for the Trismik client."""
|
|
2
|
+
|
|
3
|
+
evaluation_settings = {
|
|
4
|
+
"max_iterations": 150,
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
client_settings = {"endpoint": "https://dashboard.trismik.com/api"}
|
|
8
|
+
|
|
9
|
+
# Environment variable names used by the Trismik client
|
|
10
|
+
environment_settings = {
|
|
11
|
+
# URL of the Trismik service
|
|
12
|
+
"trismik_service_url": "TRISMIK_SERVICE_URL",
|
|
13
|
+
# API key for authentication
|
|
14
|
+
"trismik_api_key": "TRISMIK_API_KEY",
|
|
15
|
+
}
|
trismik/types.py
CHANGED
|
@@ -1,105 +1,110 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
from typing import Dict, List, Any, Optional
|
|
4
|
-
|
|
1
|
+
"""
|
|
2
|
+
Type definitions for the Trismik client.
|
|
5
3
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
Authentication token.
|
|
4
|
+
This module defines the data structures used throughout the Trismik client
|
|
5
|
+
library.
|
|
6
|
+
"""
|
|
10
7
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
"""
|
|
15
|
-
token: str
|
|
16
|
-
expires: datetime
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from typing import Any, Dict, List, Optional, Union
|
|
17
11
|
|
|
18
12
|
|
|
19
13
|
@dataclass
|
|
20
|
-
class
|
|
21
|
-
"""
|
|
22
|
-
Available test.
|
|
14
|
+
class TrismikDataset:
|
|
15
|
+
"""Test metadata including ID and name."""
|
|
23
16
|
|
|
24
|
-
Attributes:
|
|
25
|
-
id (str): Test ID.
|
|
26
|
-
name (str): Test name.
|
|
27
|
-
"""
|
|
28
17
|
id: str
|
|
29
18
|
name: str
|
|
30
19
|
|
|
31
20
|
|
|
32
21
|
@dataclass
|
|
33
|
-
class
|
|
34
|
-
"""
|
|
35
|
-
Test session.
|
|
22
|
+
class TrismikRun:
|
|
23
|
+
"""Run metadata including ID, URL, and status."""
|
|
36
24
|
|
|
37
|
-
Attributes:
|
|
38
|
-
id (str): Session ID.
|
|
39
|
-
url (str): Session URL.
|
|
40
|
-
status (str): Session status
|
|
41
|
-
"""
|
|
42
25
|
id: str
|
|
43
26
|
url: str
|
|
44
27
|
status: str
|
|
45
28
|
|
|
46
29
|
|
|
30
|
+
@dataclass
|
|
31
|
+
class TrismikRunInfo:
|
|
32
|
+
"""Run info from new API endpoints."""
|
|
33
|
+
|
|
34
|
+
id: str
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
|
|
38
|
+
class TrismikRunState:
|
|
39
|
+
"""Run state including responses, thetas, and other metrics."""
|
|
40
|
+
|
|
41
|
+
responses: List[str]
|
|
42
|
+
thetas: List[float]
|
|
43
|
+
std_error_history: List[float]
|
|
44
|
+
kl_info_history: List[float]
|
|
45
|
+
effective_difficulties: List[float]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
|
|
49
|
+
class TrismikRunResponse:
|
|
50
|
+
"""Response from run endpoints (start and continue)."""
|
|
51
|
+
|
|
52
|
+
run_info: TrismikRunInfo
|
|
53
|
+
state: TrismikRunState
|
|
54
|
+
next_item: Optional["TrismikItem"]
|
|
55
|
+
completed: bool
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
|
|
59
|
+
class TrismikAdaptiveTestState:
|
|
60
|
+
"""State tracking for adaptive tests."""
|
|
61
|
+
|
|
62
|
+
run_id: str
|
|
63
|
+
state: TrismikRunState
|
|
64
|
+
completed: bool
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
|
|
68
|
+
class AdaptiveTestScore:
|
|
69
|
+
"""Final scores of an adaptive test run."""
|
|
70
|
+
|
|
71
|
+
theta: float
|
|
72
|
+
std_error: float
|
|
73
|
+
|
|
74
|
+
|
|
47
75
|
@dataclass
|
|
48
76
|
class TrismikItem:
|
|
49
|
-
"""
|
|
50
|
-
Base class for test items.
|
|
77
|
+
"""Base class for test items."""
|
|
51
78
|
|
|
52
|
-
Attributes:
|
|
53
|
-
id (str): Item ID.
|
|
54
|
-
"""
|
|
55
79
|
id: str
|
|
56
80
|
|
|
57
81
|
|
|
58
82
|
@dataclass
|
|
59
83
|
class TrismikChoice:
|
|
60
|
-
"""
|
|
61
|
-
Base class for choices in items that use them.
|
|
84
|
+
"""Base class for choices in items that use them."""
|
|
62
85
|
|
|
63
|
-
Attributes:
|
|
64
|
-
id (str): Choice ID.
|
|
65
|
-
"""
|
|
66
86
|
id: str
|
|
67
87
|
|
|
68
88
|
|
|
69
89
|
@dataclass
|
|
70
90
|
class TrismikTextChoice(TrismikChoice):
|
|
71
|
-
"""
|
|
72
|
-
Text choice.
|
|
91
|
+
"""Text choice for multiple choice questions."""
|
|
73
92
|
|
|
74
|
-
Attributes:
|
|
75
|
-
text (str): Choice text.
|
|
76
|
-
"""
|
|
77
93
|
text: str
|
|
78
94
|
|
|
79
95
|
|
|
80
96
|
@dataclass
|
|
81
97
|
class TrismikMultipleChoiceTextItem(TrismikItem):
|
|
82
|
-
"""
|
|
83
|
-
Multiple choice text item.
|
|
98
|
+
"""Multiple choice text question."""
|
|
84
99
|
|
|
85
|
-
Attributes:
|
|
86
|
-
question (str): Question text.
|
|
87
|
-
choices (List[TrismikTextChoice]): List of choices.
|
|
88
|
-
"""
|
|
89
100
|
question: str
|
|
90
101
|
choices: List[TrismikTextChoice]
|
|
91
102
|
|
|
92
103
|
|
|
93
104
|
@dataclass
|
|
94
105
|
class TrismikResult:
|
|
95
|
-
"""
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
Attributes:
|
|
99
|
-
trait (str): Trait name.
|
|
100
|
-
name (str): Result name/type.
|
|
101
|
-
value (Any): Result value.
|
|
102
|
-
"""
|
|
106
|
+
"""Test result for a specific trait."""
|
|
107
|
+
|
|
103
108
|
trait: str
|
|
104
109
|
name: str
|
|
105
110
|
value: Any
|
|
@@ -107,57 +112,210 @@ class TrismikResult:
|
|
|
107
112
|
|
|
108
113
|
@dataclass
|
|
109
114
|
class TrismikResponse:
|
|
110
|
-
"""
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
Attributes:
|
|
114
|
-
item_id (str): Item ID.
|
|
115
|
-
value (Any): Result value.
|
|
116
|
-
score (float): Score.
|
|
117
|
-
"""
|
|
118
|
-
item_id: str
|
|
115
|
+
"""Response to a test item."""
|
|
116
|
+
|
|
117
|
+
dataset_item_id: str
|
|
119
118
|
value: Any
|
|
120
|
-
|
|
119
|
+
correct: bool
|
|
121
120
|
|
|
122
121
|
|
|
123
122
|
@dataclass
|
|
124
123
|
class TrismikRunResults:
|
|
125
|
-
"""
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
results (List[TrismikResult]): Results.
|
|
130
|
-
responses (List[TrismikResponse]): Responses.
|
|
131
|
-
"""
|
|
132
|
-
session_id: str
|
|
133
|
-
results: List[TrismikResult]
|
|
124
|
+
"""Test results and responses."""
|
|
125
|
+
|
|
126
|
+
run_id: str
|
|
127
|
+
score: Optional[AdaptiveTestScore] = None
|
|
134
128
|
responses: Optional[List[TrismikResponse]] = None
|
|
135
129
|
|
|
130
|
+
|
|
136
131
|
@dataclass
|
|
137
|
-
class
|
|
138
|
-
"""
|
|
139
|
-
|
|
132
|
+
class TrismikRunSummary:
|
|
133
|
+
"""Complete run summary."""
|
|
134
|
+
|
|
135
|
+
id: str
|
|
136
|
+
dataset_id: str
|
|
137
|
+
state: TrismikRunState
|
|
138
|
+
dataset: List[TrismikItem]
|
|
139
|
+
responses: List[TrismikResponse]
|
|
140
|
+
metadata: dict
|
|
141
|
+
|
|
142
|
+
@property
|
|
143
|
+
def theta(self) -> float:
|
|
144
|
+
"""Get the theta of the run."""
|
|
145
|
+
return self.state.thetas[-1]
|
|
146
|
+
|
|
147
|
+
@property
|
|
148
|
+
def std_error(self) -> float:
|
|
149
|
+
"""Get the standard error of the run."""
|
|
150
|
+
return self.state.std_error_history[-1]
|
|
151
|
+
|
|
152
|
+
@property
|
|
153
|
+
def total_responses(self) -> int:
|
|
154
|
+
"""Get the total number of responses in the run."""
|
|
155
|
+
return len(self.responses)
|
|
156
|
+
|
|
157
|
+
@property
|
|
158
|
+
def correct_responses(self) -> int:
|
|
159
|
+
"""Get the number of correct responses in the run."""
|
|
160
|
+
return sum(response.correct for response in self.responses)
|
|
161
|
+
|
|
162
|
+
@property
|
|
163
|
+
def wrong_responses(self) -> int:
|
|
164
|
+
"""Get the number of wrong responses in the run."""
|
|
165
|
+
return self.total_responses - self.correct_responses
|
|
140
166
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
"""
|
|
167
|
+
|
|
168
|
+
@dataclass
|
|
169
|
+
class TrismikRunMetadata:
|
|
170
|
+
"""Metadata for a test run."""
|
|
146
171
|
|
|
147
172
|
class ModelMetadata:
|
|
173
|
+
"""Model metadata for a test run."""
|
|
174
|
+
|
|
148
175
|
def __init__(self, name: str, **kwargs: Any):
|
|
176
|
+
"""Initialize ModelMetadata with a name and optional attributes."""
|
|
149
177
|
self.name = name
|
|
150
178
|
for key, value in kwargs.items():
|
|
151
|
-
setattr(self, key, value)
|
|
179
|
+
setattr(self, key, value)
|
|
152
180
|
|
|
153
181
|
model_metadata: ModelMetadata
|
|
154
|
-
test_configuration:
|
|
155
|
-
inference_setup:
|
|
182
|
+
test_configuration: Dict[str, Any]
|
|
183
|
+
inference_setup: Dict[str, Any]
|
|
156
184
|
|
|
157
185
|
def toDict(self) -> Dict[str, Any]:
|
|
186
|
+
"""Convert run metadata to a dictionary."""
|
|
158
187
|
return {
|
|
159
188
|
"model_metadata": vars(self.model_metadata),
|
|
160
189
|
"test_configuration": self.test_configuration,
|
|
161
190
|
"inference_setup": self.inference_setup,
|
|
162
191
|
}
|
|
163
192
|
|
|
193
|
+
|
|
194
|
+
@dataclass
|
|
195
|
+
class TrismikReplayRequestItem:
|
|
196
|
+
"""Item in a replay request."""
|
|
197
|
+
|
|
198
|
+
itemId: str
|
|
199
|
+
itemChoiceId: str
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@dataclass
|
|
203
|
+
class TrismikReplayRequest:
|
|
204
|
+
"""Request to replay a run with specific responses."""
|
|
205
|
+
|
|
206
|
+
responses: List[TrismikReplayRequestItem]
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@dataclass
|
|
210
|
+
class TrismikReplayResponse:
|
|
211
|
+
"""Response from a replay run request."""
|
|
212
|
+
|
|
213
|
+
id: str
|
|
214
|
+
datasetId: str
|
|
215
|
+
state: TrismikRunState
|
|
216
|
+
replay_of_run: str
|
|
217
|
+
completedAt: Optional[datetime] = None
|
|
218
|
+
createdAt: Optional[datetime] = None
|
|
219
|
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
220
|
+
dataset: List[TrismikItem] = field(default_factory=list)
|
|
221
|
+
responses: List[TrismikResponse] = field(default_factory=list)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
@dataclass
|
|
225
|
+
class TrismikOrganization:
|
|
226
|
+
"""Organization information."""
|
|
227
|
+
|
|
228
|
+
id: str
|
|
229
|
+
name: str
|
|
230
|
+
type: str
|
|
231
|
+
role: str
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
@dataclass
|
|
235
|
+
class TrismikUserInfo:
|
|
236
|
+
"""User information."""
|
|
237
|
+
|
|
238
|
+
id: str
|
|
239
|
+
email: str
|
|
240
|
+
firstname: str
|
|
241
|
+
lastname: str
|
|
242
|
+
createdAt: Optional[str] = None
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@dataclass
|
|
246
|
+
class TrismikMeResponse:
|
|
247
|
+
"""Response from the /admin/api-keys/me endpoint."""
|
|
248
|
+
|
|
249
|
+
user: TrismikUserInfo
|
|
250
|
+
organizations: List[TrismikOrganization]
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@dataclass
|
|
254
|
+
class TrismikClassicEvalItem:
|
|
255
|
+
"""Item in a classic evaluation request."""
|
|
256
|
+
|
|
257
|
+
datasetItemId: str
|
|
258
|
+
modelInput: str
|
|
259
|
+
modelOutput: str
|
|
260
|
+
goldOutput: str
|
|
261
|
+
metrics: Dict[str, Any]
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
@dataclass
|
|
265
|
+
class TrismikClassicEvalMetric:
|
|
266
|
+
"""Metric in a classic evaluation request."""
|
|
267
|
+
|
|
268
|
+
metricId: str
|
|
269
|
+
value: Union[str, float, int, bool]
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
@dataclass
|
|
273
|
+
class TrismikClassicEvalRequest:
|
|
274
|
+
"""Request to submit a classic evaluation."""
|
|
275
|
+
|
|
276
|
+
projectId: str
|
|
277
|
+
experimentName: str
|
|
278
|
+
datasetId: str
|
|
279
|
+
modelName: str
|
|
280
|
+
hyperparameters: Dict[str, Any]
|
|
281
|
+
items: List[TrismikClassicEvalItem]
|
|
282
|
+
metrics: List[TrismikClassicEvalMetric]
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
@dataclass
|
|
286
|
+
class TrismikClassicEvalResponse:
|
|
287
|
+
"""Response from a classic evaluation submission."""
|
|
288
|
+
|
|
289
|
+
id: str
|
|
290
|
+
organizationId: str
|
|
291
|
+
projectId: str
|
|
292
|
+
experimentId: str
|
|
293
|
+
experimentName: str
|
|
294
|
+
datasetId: str
|
|
295
|
+
userId: str
|
|
296
|
+
type: str
|
|
297
|
+
modelName: str
|
|
298
|
+
hyperparameters: Dict[str, Any]
|
|
299
|
+
createdAt: str
|
|
300
|
+
user: TrismikUserInfo
|
|
301
|
+
responseCount: int
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@dataclass
|
|
305
|
+
class TrismikProjectRequest:
|
|
306
|
+
"""Request to create a new project."""
|
|
307
|
+
|
|
308
|
+
name: str
|
|
309
|
+
description: Optional[str] = None
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
@dataclass
|
|
313
|
+
class TrismikProject:
|
|
314
|
+
"""Project information."""
|
|
315
|
+
|
|
316
|
+
id: str
|
|
317
|
+
name: str
|
|
318
|
+
description: Optional[str]
|
|
319
|
+
organizationId: str
|
|
320
|
+
createdAt: str
|
|
321
|
+
updatedAt: str
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trismik
|
|
3
|
+
Version: 0.9.5
|
|
4
|
+
Summary:
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Author: Bartosz Kielczewski
|
|
7
|
+
Author-email: bk352@cam.ac.uk
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
16
|
+
Provides-Extra: examples
|
|
17
|
+
Requires-Dist: accelerate (>=1.7.0,<2.0.0) ; extra == "examples"
|
|
18
|
+
Requires-Dist: notebook (>=7.4.4,<8.0.0) ; extra == "examples"
|
|
19
|
+
Requires-Dist: openai (>=1.81.0,<2.0.0) ; extra == "examples"
|
|
20
|
+
Requires-Dist: torch (>=2.7.0,<3.0.0) ; extra == "examples"
|
|
21
|
+
Requires-Dist: torchaudio (>=2.7.0,<3.0.0) ; extra == "examples"
|
|
22
|
+
Requires-Dist: torchvision (>=0.22.0,<1.0.0) ; extra == "examples"
|
|
23
|
+
Requires-Dist: transformers (>=4.51.3,<5.0.0) ; extra == "examples"
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
<h1 align="center"> Trismik SDK</h1>
|
|
27
|
+
|
|
28
|
+
<p align="center">
|
|
29
|
+
<img alt="PyPI - Version" src="https://img.shields.io/pypi/v/trismik">
|
|
30
|
+
<img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
|
|
31
|
+
<img alt="License" src="https://img.shields.io/badge/license-MIT-green">
|
|
32
|
+
</p>
|
|
33
|
+
|
|
34
|
+
## Table of Contents
|
|
35
|
+
- [Overview](#overview)
|
|
36
|
+
- [Quick Start](#quick-start)
|
|
37
|
+
- [Installation](#installation)
|
|
38
|
+
- [API Key Setup](#api-key-setup)
|
|
39
|
+
- [Basic Usage](#basic-usage)
|
|
40
|
+
- [Interpreting Results](#interpreting-results)
|
|
41
|
+
- [Theta (θ)](#theta-θ)
|
|
42
|
+
- [Other Metrics](#other-metrics)
|
|
43
|
+
- [Contributing](#contributing)
|
|
44
|
+
- [License](#license)
|
|
45
|
+
|
|
46
|
+
## Overview
|
|
47
|
+
|
|
48
|
+
[**Trismik**](https://trismik.com) is a Cambridge, UK-based startup offering adversarial testing for LLMs. The APIs we provide through this library allow you to call our adaptive test engine and evaluate LLMs up to 95% faster (and cheaper!) than traditional evaluation techniques.
|
|
49
|
+
|
|
50
|
+
Our **adaptive testing** algorithm allows you to estimate the precision of the model by looking only at a small portion of a dataset. Through this library, we provide access to a number of open source datasets over several dimensions (reasoning, toxicity, tool use...) to speed up model testing in several scenarios, like foundation model training, supervised fine-tuning, prompt engineering, and so on.
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
### Installation
|
|
55
|
+
|
|
56
|
+
To use our API, you need to get an API key first. Please register on [dashboard.trismik.com](https://dashboard.trismik.com) and obtain an API key.
|
|
57
|
+
|
|
58
|
+
Trismik is available via [PyPI](https://pypi.org/project/trismik/). To install Trismik, run the following in your terminal (in a virtualenv, if you use one):
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install trismik
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### API Key Setup
|
|
65
|
+
|
|
66
|
+
You can provide your API key in one of the following ways:
|
|
67
|
+
|
|
68
|
+
1. **Environment Variable**:
|
|
69
|
+
```bash
|
|
70
|
+
export TRISMIK_API_KEY="your-api-key"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
2. **`.env` File**:
|
|
74
|
+
```bash
|
|
75
|
+
# .env
|
|
76
|
+
TRISMIK_API_KEY=your-api-key
|
|
77
|
+
```
|
|
78
|
+
Then load it with `python-dotenv`:
|
|
79
|
+
```python
|
|
80
|
+
from dotenv import load_dotenv
|
|
81
|
+
load_dotenv()
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
3. **Direct Initialization**:
|
|
85
|
+
```python
|
|
86
|
+
client = TrismikAsyncClient(api_key="YOUR_API_KEY")
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Basic Usage
|
|
90
|
+
|
|
91
|
+
Running a test is straightforward:
|
|
92
|
+
|
|
93
|
+
1. Implement a method that wraps model inference over a dataset item
|
|
94
|
+
2. Create an `AdaptiveTest` instance
|
|
95
|
+
3. Run the test!
|
|
96
|
+
|
|
97
|
+
Here's a basic example:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
def model_inference(item: TrismikItem) -> Any:
|
|
101
|
+
model_output = ... # call your model here
|
|
102
|
+
return model_output
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Initialize the test runner
|
|
106
|
+
runner = AdaptiveTest(model_inference)
|
|
107
|
+
|
|
108
|
+
# Run the test
|
|
109
|
+
results = await runner.run_async(
|
|
110
|
+
"MMLUPro2025", # or any dataset we support
|
|
111
|
+
with_responses=True,
|
|
112
|
+
run_metadata=sample_metadata,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
# Print the test output
|
|
116
|
+
for result in results:
|
|
117
|
+
print(f"{result.trait} ({result.name}): {result.value}")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Examples
|
|
121
|
+
|
|
122
|
+
You can find more examples in the `examples` folder:
|
|
123
|
+
- [`example_transformers.py`](examples/example_transformers.py) - Example using Hugging Face Transformers models
|
|
124
|
+
- [`example_openai.py`](examples/example_openai.py) - Example using OpenAI models
|
|
125
|
+
- [`example_adaptive_test.py`](examples/example_adaptive_test.py) - Example of adaptive testing configuration
|
|
126
|
+
|
|
127
|
+
To run the examples, you will need to clone this repo, navigate to the
|
|
128
|
+
source folder, and then run:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
poetry install --with examples
|
|
132
|
+
poetry run python examples/example_adaptive_test.py # or any other example
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Interpreting Results
|
|
136
|
+
|
|
137
|
+
### Theta (θ)
|
|
138
|
+
|
|
139
|
+
Our adversarial test returns several values; however, you will be interested mainly in `theta`. Theta ($\theta$) is our metric; it measures the ability of the model on a certain dataset, and it can be used as a proxy to approximate the original metric used on that dataset. For example, on an accuracy-based dataset, a high theta correlates with a high accuracy, and low theta correlates with low accuracy.
|
|
140
|
+
|
|
141
|
+
To interpret a theta score, consider that $\theta=0$ corresponds to a 50% chance for a model to get an answer right - in other words, to an accuracy of 50%.
|
|
142
|
+
A negative theta means that the model will give more bad answers than good ones, while a positive theta means that the model will give more good answers than bad answers.
|
|
143
|
+
While theta is unbounded in our implementation (i.e. $-\infty < \theta < \infty$), in practice $\theta$ will take values between -3 and 3 in most cases.
|
|
144
|
+
|
|
145
|
+
Compared to classical benchmark testing, the estimated accuracy from adaptive testing uses fewer but more informative items while avoiding noise from overly easy or difficult questions. This makes it a more efficient and stable measure, especially on very large datasets.
|
|
146
|
+
|
|
147
|
+
### Other Metrics
|
|
148
|
+
|
|
149
|
+
- **Standard Deviation (`std`)**:
|
|
150
|
+
- A measure of the uncertainty or error in the theta estimate
|
|
151
|
+
- A smaller `std` indicates a more precise estimate
|
|
152
|
+
- You should see a `std` around or below 0.25
|
|
153
|
+
|
|
154
|
+
- **Correct Responses (`responsesCorrect`)**:
|
|
155
|
+
- The number of correct answers delivered by the model
|
|
156
|
+
|
|
157
|
+
- **Important note**: A higher number of correct answers does not necessarily
|
|
158
|
+
correlate with a high theta. Our algorithm navigates the dataset to find a
|
|
159
|
+
balance of “hard” and “easy” items for your model, so by the end of the test,
|
|
160
|
+
it encounters a representative mix of inputs it can and cannot handle. In
|
|
161
|
+
practice, expect `responsesCorrect` to be roughly half of `responsesTotal`.
|
|
162
|
+
|
|
163
|
+
- **Total Responses (`responsesTotal`)**:
|
|
164
|
+
- The number of items processed before reaching a stable theta.
|
|
165
|
+
- Expected range: 60 ≤ `responsesTotal` ≤ 80
|
|
166
|
+
|
|
167
|
+
## Contributing
|
|
168
|
+
|
|
169
|
+
See `CONTRIBUTING.md`.
|
|
170
|
+
|
|
171
|
+
## License
|
|
172
|
+
|
|
173
|
+
This library is licensed under the MIT license. See `LICENSE` file.
|
|
174
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
trismik/__init__.py,sha256=20SwXrda9YsgykaoPohwz6foj2FkraniPA-GTQS9m00,197
|
|
2
|
+
trismik/_mapper.py,sha256=9LtEtekXas6tLP_BETl5geDeLKmTGrjdGwkU3bkgmHk,10995
|
|
3
|
+
trismik/_utils.py,sha256=WZ7x0EaG7PdXdFZMIs1wzgwqiPQm59QDDmQzBKYGCEg,3753
|
|
4
|
+
trismik/adaptive_test.py,sha256=Fmc6PJZg_SKQjV_xQF29ZJQwPxpGwfKuvi2VznAmOB0,21646
|
|
5
|
+
trismik/client_async.py,sha256=Z0tmePA86nO3Ul_55eZdT_52LSgRESANYqqrX4hH9Us,13883
|
|
6
|
+
trismik/exceptions.py,sha256=2wb4_K7GdDf00s3xUaiSfw6718ZV3Eaa4M2lYbiEZl4,1945
|
|
7
|
+
trismik/settings.py,sha256=FCP-d8ZEiYWUTWoa9nOVzSwTOLvg8y0pU08dAseiOwY,412
|
|
8
|
+
trismik/types.py,sha256=DQ88f_PcJmvmSAQPFDxEmtJSrQQDTSvhxbdrzzRHd-4,6571
|
|
9
|
+
trismik-0.9.5.dist-info/METADATA,sha256=rMF2N0d9MaoA2uIA9CL9LuoURTXs2ZCPSb4ZNZ2yZHc,6692
|
|
10
|
+
trismik-0.9.5.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
|
|
11
|
+
trismik-0.9.5.dist-info/licenses/LICENSE,sha256=tgetRhapGLh7ZxfknW6Mm-WobfziPd64nAK52X5XKaw,1077
|
|
12
|
+
trismik-0.9.5.dist-info/RECORD,,
|