trismik 0.9.1__py3-none-any.whl → 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
trismik/settings.py ADDED
@@ -0,0 +1,15 @@
1
+ """Default settings for the Trismik client."""
2
+
3
+ evaluation_settings = {
4
+ "max_iterations": 150,
5
+ }
6
+
7
+ client_settings = {"endpoint": "https://dashboard.trismik.com/api"}
8
+
9
+ # Environment variable names used by the Trismik client
10
+ environment_settings = {
11
+ # URL of the Trismik service
12
+ "trismik_service_url": "TRISMIK_SERVICE_URL",
13
+ # API key for authentication
14
+ "trismik_api_key": "TRISMIK_API_KEY",
15
+ }
trismik/types.py CHANGED
@@ -1,105 +1,110 @@
1
- from dataclasses import dataclass
2
- from datetime import datetime
3
- from typing import Dict, List, Any, Optional
4
-
1
+ """
2
+ Type definitions for the Trismik client.
5
3
 
6
- @dataclass
7
- class TrismikAuth:
8
- """
9
- Authentication token.
4
+ This module defines the data structures used throughout the Trismik client
5
+ library.
6
+ """
10
7
 
11
- Attributes:
12
- token (str): Authentication token value.
13
- expires (datetime): Expiration date.
14
- """
15
- token: str
16
- expires: datetime
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from typing import Any, Dict, List, Optional, Union
17
11
 
18
12
 
19
13
  @dataclass
20
- class TrismikTest:
21
- """
22
- Available test.
14
+ class TrismikDataset:
15
+ """Test metadata including ID and name."""
23
16
 
24
- Attributes:
25
- id (str): Test ID.
26
- name (str): Test name.
27
- """
28
17
  id: str
29
18
  name: str
30
19
 
31
20
 
32
21
  @dataclass
33
- class TrismikSession:
34
- """
35
- Test session.
22
+ class TrismikRun:
23
+ """Run metadata including ID, URL, and status."""
36
24
 
37
- Attributes:
38
- id (str): Session ID.
39
- url (str): Session URL.
40
- status (str): Session status
41
- """
42
25
  id: str
43
26
  url: str
44
27
  status: str
45
28
 
46
29
 
30
+ @dataclass
31
+ class TrismikRunInfo:
32
+ """Run info from new API endpoints."""
33
+
34
+ id: str
35
+
36
+
37
+ @dataclass
38
+ class TrismikRunState:
39
+ """Run state including responses, thetas, and other metrics."""
40
+
41
+ responses: List[str]
42
+ thetas: List[float]
43
+ std_error_history: List[float]
44
+ kl_info_history: List[float]
45
+ effective_difficulties: List[float]
46
+
47
+
48
+ @dataclass
49
+ class TrismikRunResponse:
50
+ """Response from run endpoints (start and continue)."""
51
+
52
+ run_info: TrismikRunInfo
53
+ state: TrismikRunState
54
+ next_item: Optional["TrismikItem"]
55
+ completed: bool
56
+
57
+
58
+ @dataclass
59
+ class TrismikAdaptiveTestState:
60
+ """State tracking for adaptive tests."""
61
+
62
+ run_id: str
63
+ state: TrismikRunState
64
+ completed: bool
65
+
66
+
67
+ @dataclass
68
+ class AdaptiveTestScore:
69
+ """Final scores of an adaptive test run."""
70
+
71
+ theta: float
72
+ std_error: float
73
+
74
+
47
75
  @dataclass
48
76
  class TrismikItem:
49
- """
50
- Base class for test items.
77
+ """Base class for test items."""
51
78
 
52
- Attributes:
53
- id (str): Item ID.
54
- """
55
79
  id: str
56
80
 
57
81
 
58
82
  @dataclass
59
83
  class TrismikChoice:
60
- """
61
- Base class for choices in items that use them.
84
+ """Base class for choices in items that use them."""
62
85
 
63
- Attributes:
64
- id (str): Choice ID.
65
- """
66
86
  id: str
67
87
 
68
88
 
69
89
  @dataclass
70
90
  class TrismikTextChoice(TrismikChoice):
71
- """
72
- Text choice.
91
+ """Text choice for multiple choice questions."""
73
92
 
74
- Attributes:
75
- text (str): Choice text.
76
- """
77
93
  text: str
78
94
 
79
95
 
80
96
  @dataclass
81
97
  class TrismikMultipleChoiceTextItem(TrismikItem):
82
- """
83
- Multiple choice text item.
98
+ """Multiple choice text question."""
84
99
 
85
- Attributes:
86
- question (str): Question text.
87
- choices (List[TrismikTextChoice]): List of choices.
88
- """
89
100
  question: str
90
101
  choices: List[TrismikTextChoice]
91
102
 
92
103
 
93
104
  @dataclass
94
105
  class TrismikResult:
95
- """
96
- Test result.
97
-
98
- Attributes:
99
- trait (str): Trait name.
100
- name (str): Result name/type.
101
- value (Any): Result value.
102
- """
106
+ """Test result for a specific trait."""
107
+
103
108
  trait: str
104
109
  name: str
105
110
  value: Any
@@ -107,57 +112,210 @@ class TrismikResult:
107
112
 
108
113
  @dataclass
109
114
  class TrismikResponse:
110
- """
111
- Test result.
112
-
113
- Attributes:
114
- item_id (str): Item ID.
115
- value (Any): Result value.
116
- score (float): Score.
117
- """
118
- item_id: str
115
+ """Response to a test item."""
116
+
117
+ dataset_item_id: str
119
118
  value: Any
120
- score: float
119
+ correct: bool
121
120
 
122
121
 
123
122
  @dataclass
124
123
  class TrismikRunResults:
125
- """
126
- Test results and responses.
127
-
128
- Attributes:
129
- results (List[TrismikResult]): Results.
130
- responses (List[TrismikResponse]): Responses.
131
- """
132
- session_id: str
133
- results: List[TrismikResult]
124
+ """Test results and responses."""
125
+
126
+ run_id: str
127
+ score: Optional[AdaptiveTestScore] = None
134
128
  responses: Optional[List[TrismikResponse]] = None
135
129
 
130
+
136
131
  @dataclass
137
- class TrismikSessionMetadata:
138
- """
139
- Metadata associated to a session
132
+ class TrismikRunSummary:
133
+ """Complete run summary."""
134
+
135
+ id: str
136
+ dataset_id: str
137
+ state: TrismikRunState
138
+ dataset: List[TrismikItem]
139
+ responses: List[TrismikResponse]
140
+ metadata: dict
141
+
142
+ @property
143
+ def theta(self) -> float:
144
+ """Get the theta of the run."""
145
+ return self.state.thetas[-1]
146
+
147
+ @property
148
+ def std_error(self) -> float:
149
+ """Get the standard error of the run."""
150
+ return self.state.std_error_history[-1]
151
+
152
+ @property
153
+ def total_responses(self) -> int:
154
+ """Get the total number of responses in the run."""
155
+ return len(self.responses)
156
+
157
+ @property
158
+ def correct_responses(self) -> int:
159
+ """Get the number of correct responses in the run."""
160
+ return sum(response.correct for response in self.responses)
161
+
162
+ @property
163
+ def wrong_responses(self) -> int:
164
+ """Get the number of wrong responses in the run."""
165
+ return self.total_responses - self.correct_responses
140
166
 
141
- Attributes:
142
- model_metadata (dict[str, Any]): Metadata about the model.
143
- test_configuration (dict[str, Any]): Metadata about the test.
144
- inference_setup (dict[str, Any]): Metadata about the inference setup.
145
- """
167
+
168
+ @dataclass
169
+ class TrismikRunMetadata:
170
+ """Metadata for a test run."""
146
171
 
147
172
  class ModelMetadata:
173
+ """Model metadata for a test run."""
174
+
148
175
  def __init__(self, name: str, **kwargs: Any):
176
+ """Initialize ModelMetadata with a name and optional attributes."""
149
177
  self.name = name
150
178
  for key, value in kwargs.items():
151
- setattr(self, key, value)
179
+ setattr(self, key, value)
152
180
 
153
181
  model_metadata: ModelMetadata
154
- test_configuration: dict[str, Any]
155
- inference_setup: dict[str, Any]
182
+ test_configuration: Dict[str, Any]
183
+ inference_setup: Dict[str, Any]
156
184
 
157
185
  def toDict(self) -> Dict[str, Any]:
186
+ """Convert run metadata to a dictionary."""
158
187
  return {
159
188
  "model_metadata": vars(self.model_metadata),
160
189
  "test_configuration": self.test_configuration,
161
190
  "inference_setup": self.inference_setup,
162
191
  }
163
192
 
193
+
194
+ @dataclass
195
+ class TrismikReplayRequestItem:
196
+ """Item in a replay request."""
197
+
198
+ itemId: str
199
+ itemChoiceId: str
200
+
201
+
202
+ @dataclass
203
+ class TrismikReplayRequest:
204
+ """Request to replay a run with specific responses."""
205
+
206
+ responses: List[TrismikReplayRequestItem]
207
+
208
+
209
+ @dataclass
210
+ class TrismikReplayResponse:
211
+ """Response from a replay run request."""
212
+
213
+ id: str
214
+ datasetId: str
215
+ state: TrismikRunState
216
+ replay_of_run: str
217
+ completedAt: Optional[datetime] = None
218
+ createdAt: Optional[datetime] = None
219
+ metadata: Dict[str, Any] = field(default_factory=dict)
220
+ dataset: List[TrismikItem] = field(default_factory=list)
221
+ responses: List[TrismikResponse] = field(default_factory=list)
222
+
223
+
224
+ @dataclass
225
+ class TrismikOrganization:
226
+ """Organization information."""
227
+
228
+ id: str
229
+ name: str
230
+ type: str
231
+ role: str
232
+
233
+
234
+ @dataclass
235
+ class TrismikUserInfo:
236
+ """User information."""
237
+
238
+ id: str
239
+ email: str
240
+ firstname: str
241
+ lastname: str
242
+ createdAt: Optional[str] = None
243
+
244
+
245
+ @dataclass
246
+ class TrismikMeResponse:
247
+ """Response from the /admin/api-keys/me endpoint."""
248
+
249
+ user: TrismikUserInfo
250
+ organizations: List[TrismikOrganization]
251
+
252
+
253
+ @dataclass
254
+ class TrismikClassicEvalItem:
255
+ """Item in a classic evaluation request."""
256
+
257
+ datasetItemId: str
258
+ modelInput: str
259
+ modelOutput: str
260
+ goldOutput: str
261
+ metrics: Dict[str, Any]
262
+
263
+
264
+ @dataclass
265
+ class TrismikClassicEvalMetric:
266
+ """Metric in a classic evaluation request."""
267
+
268
+ metricId: str
269
+ value: Union[str, float, int, bool]
270
+
271
+
272
+ @dataclass
273
+ class TrismikClassicEvalRequest:
274
+ """Request to submit a classic evaluation."""
275
+
276
+ projectId: str
277
+ experimentName: str
278
+ datasetId: str
279
+ modelName: str
280
+ hyperparameters: Dict[str, Any]
281
+ items: List[TrismikClassicEvalItem]
282
+ metrics: List[TrismikClassicEvalMetric]
283
+
284
+
285
+ @dataclass
286
+ class TrismikClassicEvalResponse:
287
+ """Response from a classic evaluation submission."""
288
+
289
+ id: str
290
+ organizationId: str
291
+ projectId: str
292
+ experimentId: str
293
+ experimentName: str
294
+ datasetId: str
295
+ userId: str
296
+ type: str
297
+ modelName: str
298
+ hyperparameters: Dict[str, Any]
299
+ createdAt: str
300
+ user: TrismikUserInfo
301
+ responseCount: int
302
+
303
+
304
+ @dataclass
305
+ class TrismikProjectRequest:
306
+ """Request to create a new project."""
307
+
308
+ name: str
309
+ description: Optional[str] = None
310
+
311
+
312
+ @dataclass
313
+ class TrismikProject:
314
+ """Project information."""
315
+
316
+ id: str
317
+ name: str
318
+ description: Optional[str]
319
+ organizationId: str
320
+ createdAt: str
321
+ updatedAt: str
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: trismik
3
+ Version: 0.9.5
4
+ Summary:
5
+ License-File: LICENSE
6
+ Author: Bartosz Kielczewski
7
+ Author-email: bk352@cam.ac.uk
8
+ Requires-Python: >=3.9
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Provides-Extra: examples
17
+ Requires-Dist: accelerate (>=1.7.0,<2.0.0) ; extra == "examples"
18
+ Requires-Dist: notebook (>=7.4.4,<8.0.0) ; extra == "examples"
19
+ Requires-Dist: openai (>=1.81.0,<2.0.0) ; extra == "examples"
20
+ Requires-Dist: torch (>=2.7.0,<3.0.0) ; extra == "examples"
21
+ Requires-Dist: torchaudio (>=2.7.0,<3.0.0) ; extra == "examples"
22
+ Requires-Dist: torchvision (>=0.22.0,<1.0.0) ; extra == "examples"
23
+ Requires-Dist: transformers (>=4.51.3,<5.0.0) ; extra == "examples"
24
+ Description-Content-Type: text/markdown
25
+
26
+ <h1 align="center"> Trismik SDK</h1>
27
+
28
+ <p align="center">
29
+ <img alt="PyPI - Version" src="https://img.shields.io/pypi/v/trismik">
30
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
31
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
32
+ </p>
33
+
34
+ ## Table of Contents
35
+ - [Overview](#overview)
36
+ - [Quick Start](#quick-start)
37
+ - [Installation](#installation)
38
+ - [API Key Setup](#api-key-setup)
39
+ - [Basic Usage](#basic-usage)
40
+ - [Interpreting Results](#interpreting-results)
41
+ - [Theta (θ)](#theta-θ)
42
+ - [Other Metrics](#other-metrics)
43
+ - [Contributing](#contributing)
44
+ - [License](#license)
45
+
46
+ ## Overview
47
+
48
+ [**Trismik**](https://trismik.com) is a Cambridge, UK based startup offering adversarial testing for LLMs. The APIs we provide through this library allow you to call our adaptive test engine and evaluate LLMs up to 95% faster (and cheaper!) than traditional evaluation techniques.
49
+
50
+ Our **adaptive testing** algorithm allows you to estimate the precision of the model by looking only at a small portion of a dataset. Through this library, we provide access to a number of open-source datasets over several dimensions (reasoning, toxicity, tool use...) to speed up model testing in several scenarios, like foundation model training, supervised fine-tuning, prompt engineering, and so on.
51
+
52
+ ## Quick Start
53
+
54
+ ### Installation
55
+
56
+ To use our API, you need to get an API key first. Please register on [dashboard.trismik.com](https://dashboard.trismik.com) and obtain an API key.
57
+
58
+ Trismik is available via [PyPI](https://pypi.org/project/trismik/). To install Trismik, run the following in your terminal (in a virtualenv, if you use one):
59
+
60
+ ```bash
61
+ pip install trismik
62
+ ```
63
+
64
+ ### API Key Setup
65
+
66
+ You can provide your API key in one of the following ways:
67
+
68
+ 1. **Environment Variable**:
69
+ ```bash
70
+ export TRISMIK_API_KEY="your-api-key"
71
+ ```
72
+
73
+ 2. **`.env` File**:
74
+ ```bash
75
+ # .env
76
+ TRISMIK_API_KEY=your-api-key
77
+ ```
78
+ Then load it with `python-dotenv`:
79
+ ```python
80
+ from dotenv import load_dotenv
81
+ load_dotenv()
82
+ ```
83
+
84
+ 3. **Direct Initialization**:
85
+ ```python
86
+ client = TrismikAsyncClient(api_key="YOUR_API_KEY")
87
+ ```
88
+
89
+ ### Basic Usage
90
+
91
+ Running a test is straightforward:
92
+
93
+ 1. Implement a method that wraps model inference over a dataset item
94
+ 2. Create an `AdaptiveTest` instance
95
+ 3. Run the test!
96
+
97
+ Here's a basic example:
98
+
99
+ ```python
100
+ def model_inference(item: TrismikItem) -> Any:
101
+ model_output = ... # call your model here
102
+ return model_output
103
+
104
+
105
+ # Initialize the test runner
106
+ runner = AdaptiveTest(model_inference)
107
+
108
+ # Run the test
109
+ results = await runner.run_async(
110
+ "MMLUPro2025", # or any dataset we support
111
+ with_responses=True,
112
+ run_metadata=sample_metadata,
113
+ )
114
+
115
+ # Print the test output
116
+ for result in results:
117
+ print(f"{result.trait} ({result.name}): {result.value}")
118
+ ```
119
+
120
+ ### Examples
121
+
122
+ You can find more examples in the `examples` folder:
123
+ - [`example_transformers.py`](examples/example_transformers.py) - Example using Hugging Face Transformers models
124
+ - [`example_openai.py`](examples/example_openai.py) - Example using OpenAI models
125
+ - [`example_adaptive_test.py`](examples/example_adaptive_test.py) - Example of adaptive testing configuration
126
+
127
+ To run the examples, you will need to clone this repo, navigate to the
128
+ source folder, and then run:
129
+
130
+ ```bash
131
+ poetry install --with examples
132
+ poetry run python examples/example_adaptive_test.py # or any other example
133
+ ```
134
+
135
+ ## Interpreting Results
136
+
137
+ ### Theta (θ)
138
+
139
+ Our adversarial test returns several values; however, you will be interested mainly in `theta`. Theta ($\theta$) is our metric; it measures the ability of the model on a certain dataset, and it can be used as a proxy to approximate the original metric used on that dataset. For example, on an accuracy-based dataset, a high theta correlates with a high accuracy, and low theta correlates with low accuracy.
140
+
141
+ To interpret a theta score, consider that $\theta=0$ corresponds to a 50% chance for a model to get an answer right - in other words, to an accuracy of 50%.
142
+ A negative theta means that the model will give more bad answers than good ones, while a positive theta means that the model will give more good answers than bad ones.
143
+ While theta is unbounded in our implementation (i.e. $-\infty < \theta < \infty$), in practice $\theta$ takes values between -3 and 3 in most cases.
144
+
145
+ Compared to classical benchmark testing, the estimated accuracy from adaptive testing uses fewer but more informative items while avoiding noise from overly easy or difficult questions. This makes it a more efficient and stable measure, especially on very large datasets.
146
+
147
+ ### Other Metrics
148
+
149
+ - **Standard Deviation (`std`)**:
150
+ - A measure of the uncertainty or error in the theta estimate
151
+ - A smaller `std` indicates a more precise estimate
152
+ - You should see a `std` around or below 0.25
153
+
154
+ - **Correct Responses (`responsesCorrect`)**:
155
+ - The number of correct answers delivered by the model
156
+
157
+ - **Important note**: A higher number of correct answers does not necessarily
158
+ correlate with a high theta. Our algorithm navigates the dataset to find a
159
+ balance of “hard” and “easy” items for your model, so by the end of the test,
160
+ it encounters a representative mix of inputs it can and cannot handle. In
161
+ practice, expect `responsesCorrect` to be roughly half of `responsesTotal`.
162
+
163
+ - **Total Responses (`responsesTotal`)**:
164
+ - The number of items processed before reaching a stable theta.
165
+ - Expected range: 60 ≤ `responsesTotal` ≤ 80
166
+
167
+ ## Contributing
168
+
169
+ See `CONTRIBUTING.md`.
170
+
171
+ ## License
172
+
173
+ This library is licensed under the MIT license. See `LICENSE` file.
174
+
@@ -0,0 +1,12 @@
1
+ trismik/__init__.py,sha256=20SwXrda9YsgykaoPohwz6foj2FkraniPA-GTQS9m00,197
2
+ trismik/_mapper.py,sha256=9LtEtekXas6tLP_BETl5geDeLKmTGrjdGwkU3bkgmHk,10995
3
+ trismik/_utils.py,sha256=WZ7x0EaG7PdXdFZMIs1wzgwqiPQm59QDDmQzBKYGCEg,3753
4
+ trismik/adaptive_test.py,sha256=Fmc6PJZg_SKQjV_xQF29ZJQwPxpGwfKuvi2VznAmOB0,21646
5
+ trismik/client_async.py,sha256=Z0tmePA86nO3Ul_55eZdT_52LSgRESANYqqrX4hH9Us,13883
6
+ trismik/exceptions.py,sha256=2wb4_K7GdDf00s3xUaiSfw6718ZV3Eaa4M2lYbiEZl4,1945
7
+ trismik/settings.py,sha256=FCP-d8ZEiYWUTWoa9nOVzSwTOLvg8y0pU08dAseiOwY,412
8
+ trismik/types.py,sha256=DQ88f_PcJmvmSAQPFDxEmtJSrQQDTSvhxbdrzzRHd-4,6571
9
+ trismik-0.9.5.dist-info/METADATA,sha256=rMF2N0d9MaoA2uIA9CL9LuoURTXs2ZCPSb4ZNZ2yZHc,6692
10
+ trismik-0.9.5.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
11
+ trismik-0.9.5.dist-info/licenses/LICENSE,sha256=tgetRhapGLh7ZxfknW6Mm-WobfziPd64nAK52X5XKaw,1077
12
+ trismik-0.9.5.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.2
2
+ Generator: poetry-core 2.2.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any