trismik 0.9.11__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trismik-1.0.0/PKG-INFO +258 -0
- trismik-1.0.0/README.md +229 -0
- {trismik-0.9.11 → trismik-1.0.0}/pyproject.toml +26 -10
- trismik-1.0.0/src/trismik/__init__.py +38 -0
- trismik-1.0.0/src/trismik/_async/__init__.py +1 -0
- trismik-1.0.0/src/trismik/_async/_test_transform.py +58 -0
- trismik-1.0.0/src/trismik/_async/client.py +731 -0
- trismik-1.0.0/src/trismik/_async/helpers.py +23 -0
- {trismik-0.9.11 → trismik-1.0.0}/src/trismik/_mapper.py +10 -30
- trismik-1.0.0/src/trismik/_sync/__init__.py +1 -0
- trismik-1.0.0/src/trismik/_sync/_test_transform.py +58 -0
- trismik-1.0.0/src/trismik/_sync/client.py +731 -0
- trismik-1.0.0/src/trismik/_sync/helpers.py +27 -0
- {trismik-0.9.11 → trismik-1.0.0}/src/trismik/_utils.py +1 -3
- {trismik-0.9.11 → trismik-1.0.0}/src/trismik/settings.py +1 -1
- {trismik-0.9.11 → trismik-1.0.0}/src/trismik/types.py +1 -1
- trismik-0.9.11/PKG-INFO +0 -177
- trismik-0.9.11/README.md +0 -148
- trismik-0.9.11/src/trismik/__init__.py +0 -10
- trismik-0.9.11/src/trismik/adaptive_test.py +0 -671
- trismik-0.9.11/src/trismik/client_async.py +0 -404
- {trismik-0.9.11 → trismik-1.0.0}/LICENSE +0 -0
- {trismik-0.9.11 → trismik-1.0.0}/src/trismik/exceptions.py +0 -0
trismik-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,258 @@
Metadata-Version: 2.4
Name: trismik
Version: 1.0.0
Summary:
License-File: LICENSE
Author: Bartosz Kielczewski
Author-email: bk352@cam.ac.uk
Requires-Python: >=3.9, <3.14
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Provides-Extra: examples
Requires-Dist: accelerate (>=1.7.0,<2.0.0) ; extra == "examples"
Requires-Dist: httpx (>=0.27.2,<1.0.0)
Requires-Dist: jupyterlab (>=4.4.8) ; extra == "examples"
Requires-Dist: nest-asyncio (>=1.6.0,<2.0.0)
Requires-Dist: notebook (>=7.4.4,<8.0.0) ; extra == "examples"
Requires-Dist: openai (>=1.81.0,<2.0.0) ; extra == "examples"
Requires-Dist: torch (>=2.8.0,<3.0.0) ; extra == "examples"
Requires-Dist: torchaudio (>=2.7.0,<3.0.0) ; extra == "examples"
Requires-Dist: torchvision (>=0.22.0,<1.0.0) ; extra == "examples"
Requires-Dist: tqdm (>=4.67.1,<5.0.0) ; extra == "examples"
Requires-Dist: transformers (>=4.51.3,<5.0.0) ; extra == "examples"
Description-Content-Type: text/markdown

<h1 align="center">Trismik SDK</h1>

<p align="center">
<img alt="PyPI - Version" src="https://img.shields.io/pypi/v/trismik">
<img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
<img alt="License" src="https://img.shields.io/badge/license-MIT-green">
</p>

## Table of Contents
- [Overview](#overview)
- [Quick Start](#quick-start)
  - [Installation](#installation)
  - [API Key Setup](#api-key-setup)
  - [Basic Usage](#basic-usage)
- [Features](#features)
  - [Progress Reporting](#progress-reporting)
  - [Replay Functionality](#replay-functionality)
- [Examples](#examples)
- [Interpreting Results](#interpreting-results)
  - [Theta (θ)](#theta-θ)
  - [Other Metrics](#other-metrics)
- [Contributing](#contributing)
- [License](#license)

## Overview

[**Trismik**](https://trismik.com) is a Cambridge, UK-based startup offering adversarial testing for LLMs. The APIs we provide through this library let you call our adaptive test engine and evaluate LLMs up to 95% faster (and cheaper!) than traditional evaluation techniques.

Our **adaptive testing** algorithm estimates a model's performance by looking at only a small portion of a dataset. Through this library, we provide access to a number of open-source datasets across several dimensions (reasoning, toxicity, tool use...) to speed up model testing in scenarios such as foundation model training, supervised fine-tuning, and prompt engineering.

## Quick Start

### Installation

To use our API, you first need an API key: register on [dashboard.trismik.com](https://dashboard.trismik.com) to obtain one.

Trismik is available via [PyPI](https://pypi.org/project/trismik/). To install it, run the following in your terminal (inside a virtualenv, if you use one):

```bash
pip install trismik
```

### API Key Setup

You can provide your API key in one of the following ways:

1. **Environment Variable**:
   ```bash
   export TRISMIK_API_KEY="your-api-key"
   ```

2. **`.env` File**:
   ```bash
   # .env
   TRISMIK_API_KEY=your-api-key
   ```
   Then load it with `python-dotenv`:
   ```python
   from dotenv import load_dotenv
   load_dotenv()
   ```

3. **Direct Initialization**:
   ```python
   client = TrismikClient(api_key="YOUR_API_KEY")
   ```

### Basic Usage

Here's the simplest way to run an adaptive test:

```python
from trismik import TrismikClient, TrismikRunMetadata
from trismik.types import TrismikItem

# Define your item processor
def model_inference(item: TrismikItem) -> str:
    # Your model inference logic here
    # See examples/ folder for real-world implementations
    return item.choices[0].id  # Example: pick first choice

# Run the test
with TrismikClient() as client:
    results = client.run(
        test_id="MMLUPro2024",
        project_id="your-project-id",  # Get from dashboard or create with client.create_project()
        experiment="my-experiment",
        run_metadata=TrismikRunMetadata(
            model_metadata={"name": "my-model", "provider": "local"},
            test_configuration={"task_name": "MMLUPro2024"},
            inference_setup={},
        ),
        item_processor=model_inference,
    )

    print(f"Theta: {results.score.theta}")
    print(f"Standard Error: {results.score.std_error}")
```

**For async usage:**

```python
from trismik import TrismikAsyncClient

async with TrismikAsyncClient() as client:
    results = await client.run(
        test_id="MMLUPro2024",
        project_id="your-project-id",
        experiment="my-experiment",
        run_metadata=TrismikRunMetadata(...),
        item_processor=model_inference,  # Can be sync or async
    )
```

## Features

### Progress Reporting

Add optional progress tracking with a callback:

```python
from tqdm.auto import tqdm
from trismik.settings import evaluation_settings

def create_progress_callback():
    pbar = tqdm(total=evaluation_settings["max_iterations"], desc="Running test")

    def callback(current: int, total: int):
        pbar.total = total
        pbar.n = current
        pbar.refresh()
        if current >= total:
            pbar.close()

    return callback

# Use it in your run
with TrismikClient() as client:
    results = client.run(
        # ... other parameters ...
        on_progress=create_progress_callback(),
    )
```

The library is silent by default - progress reporting is entirely optional.

### Replay Functionality

Replay the exact sequence of questions from a previous run to test model stability:

```python
with TrismikClient() as client:
    # Run initial test
    results = client.run(
        test_id="MMLUPro2024",
        project_id="your-project-id",
        experiment="experiment-1",
        run_metadata=metadata,
        item_processor=model_inference,
    )

    # Replay with same questions
    replay_results = client.run_replay(
        previous_run_id=results.run_id,
        run_metadata=new_metadata,
        item_processor=model_inference,
        with_responses=True,  # Include individual responses
    )
```

## Examples

Complete working examples are available in the `examples/` folder:

- **[`example_adaptive_test.py`](examples/example_adaptive_test.py)** - Basic adaptive testing with both sync and async patterns, including replay functionality
- **[`example_openai.py`](examples/example_openai.py)** - Integration with OpenAI API models
- **[`example_transformers.py`](examples/example_transformers.py)** - Integration with Hugging Face Transformers models

To run the examples:

```bash
# Clone the repository and install with examples dependencies
git clone https://github.com/trismik/trismik-python
cd trismik-python
poetry install --with examples

# Run an example
poetry run python examples/example_adaptive_test.py --dataset-name MMLUPro2024
```

## Interpreting Results

### Theta (θ)

Our adaptive test returns several values; the one you will mainly care about is `theta`. Theta ($\theta$) is our metric: it measures the ability of a model on a given dataset and can be used as a proxy for the original metric used on that dataset. For example, on an accuracy-based dataset, a high theta correlates with high accuracy and a low theta correlates with low accuracy.

$\theta$ is intrinsically linked to the difficulty of the items a model can answer correctly. On datasets where item difficulties are balanced and evenly distributed, $\theta = 0$ corresponds to a 50% chance for the model to get an answer right - in other words, to an accuracy of 50%.
A negative theta means the model gives more wrong answers than right ones, while a positive theta means it gives more right answers than wrong ones.
While theta is unbounded in our implementation (i.e. $-\infty < \theta < \infty$), in practice it takes values between -3 and 3 in most cases.

Compared to classical benchmark testing, $\theta$ from adaptive testing uses fewer but more informative items while avoiding noise from overly easy or difficult questions. This makes it a more efficient and stable measure, especially on very large datasets.
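To make the link between theta and accuracy concrete, here is a small illustrative sketch. It assumes a simple logistic (Rasch-style) item response model, a standard choice in item response theory; the exact model behind the Trismik engine is not described here, so treat this purely as intuition.

```python
import math

def p_correct(theta: float, difficulty: float) -> float:
    # Illustrative Rasch-style model (an assumption, not necessarily Trismik's engine):
    # probability that a model of ability `theta` answers an item of `difficulty` correctly.
    return 1.0 / (1.0 + math.exp(-(theta - difficulty)))

print(p_correct(0.0, 0.0))   # 0.5   -> theta 0 on an average item: a coin flip
print(p_correct(1.5, 0.0))   # ~0.82 -> positive theta: more right answers than wrong
print(p_correct(-1.5, 0.0))  # ~0.18 -> negative theta: more wrong answers than right
```

Under this assumption, a model at $\theta = 0$ answers items of average difficulty correctly about half the time, which is why a theta near zero corresponds to roughly 50% accuracy on a balanced dataset.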
### Other Metrics

- **Standard Deviation (`std`)**:
  - A measure of the uncertainty or error in the theta estimate
  - A smaller `std` indicates a more precise estimate
  - You should see a `std` around or below 0.25

- **Correct Responses (`responsesCorrect`)**:
  - The number of correct answers delivered by the model
  - **Important note**: A higher number of correct answers does not necessarily correlate with a high theta. Our algorithm navigates the dataset to find a balance of "hard" and "easy" items for your model, so by the end of the test, it encounters a representative mix of inputs it can and cannot handle. In practice, expect `responsesCorrect` to be roughly half of `responsesTotal`.

- **Total Responses (`responsesTotal`)**:
  - The number of items processed before reaching a stable theta
  - Expected range: 60 ≤ `responsesTotal` ≤ 150

## Contributing

See `CONTRIBUTING.md`.

## License

This library is licensed under the MIT license. See the `LICENSE` file.
trismik-1.0.0/README.md ADDED
@@ -0,0 +1,229 @@
(Identical to the README content embedded in trismik-1.0.0/PKG-INFO above.)
{trismik-0.9.11 → trismik-1.0.0}/pyproject.toml
@@ -9,23 +9,24 @@ authors = [
     { name = "Marco Basaldella", email = "marco@trismik.com" }
 ]
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.9, <3.14"
 dependencies = [
     "httpx (>=0.27.2,<1.0.0)",
     "nest-asyncio (>=1.6.0,<2.0.0)",
-    "tqdm (>=4.67.1, <5.0.0)",
 ]
-version = "0.
+version = "1.0.0"
 
 [project.optional-dependencies]
 examples = [
     "transformers>=4.51.3,<5.0.0",
-    "torch>=2.
+    "torch>=2.8.0,<3.0.0",
     "torchvision>=0.22.0,<1.0.0",
     "torchaudio>=2.7.0,<3.0.0",
     "accelerate>=1.7.0,<2.0.0",
     "openai>=1.81.0,<2.0.0",
-    "notebook>=7.4.4,<8.0.0"
+    "notebook>=7.4.4,<8.0.0",
+    "jupyterlab>=4.4.8",
+    "tqdm>=4.67.1,<5.0.0"
 ]
 
 
@@ -35,9 +36,6 @@ packages = [{ include = "trismik", from = "src" }]
 [tool.poetry.requires-plugins]
 poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] }
 
-
-
-
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.2"
 python-dotenv = "^1.0.1"
@@ -49,6 +47,8 @@ Flake8-pyproject = "^1.2.3"
 flake8 = "^7.0.0"
 mypy = "^1.15.0"
 autoflake = "^2.3.1"
+unasync = "^0.6.0"
+tomlkit = "^0.13.2"
 
 [tool.poetry-dynamic-versioning]
 # Expected Git tags: vX.Y.Z, vX.Y.Z-alphaN, vX.Y.Z-betaN, vX.Y.Z-rcN
@@ -60,16 +60,17 @@ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
 build-backend = "poetry_dynamic_versioning.backend"
 
 [tool.pytest.ini_options]
+testpaths = ["tests"]
 asyncio_default_fixture_loop_scope = "class"
 
 [tool.black]
-line-length =
+line-length = 100
 target-version = ['py39']
 include = '\.pyi?$'
 
 [tool.isort]
 profile = "black"
-line_length =
+line_length = 100
 multi_line_output = 3
 
 [tool.mypy]
@@ -89,3 +90,18 @@ install_types = true
 
 [tool.flake8] # note that this depends on Flake8-pyproject
 ignore = ["D202", "W503", "W504"]
+
+[tool.unasync]
+[[tool.unasync.rules]]
+fromdir = "src/trismik/_async/"
+todir = "src/trismik/_sync/"
+exclude = [
+    "**/helpers.py" # Manually maintained in both versions
+]
+
+# Only custom replacements (defaults handle async/await/async with/async for automatically)
+[tool.unasync.rules.replacements]
+"TrismikAsyncClient" = "TrismikClient"
+"AsyncClient" = "Client"
+"_async" = "_sync"
+"aclose" = "close"
trismik-1.0.0/src/trismik/__init__.py ADDED
@@ -0,0 +1,38 @@
"""
Trismik Python Client.

A Python client for the Trismik API.
"""

import importlib.metadata

from trismik._async.client import TrismikAsyncClient
from trismik._sync.client import TrismikClient
from trismik.types import (
    AdaptiveTestScore,
    TrismikDataset,
    TrismikItem,
    TrismikMeResponse,
    TrismikProject,
    TrismikRunMetadata,
    TrismikRunResults,
)

# get version from pyproject.toml
__version__ = importlib.metadata.version(__package__ or __name__)

__all__ = [
    # Clients
    "TrismikAsyncClient",
    "TrismikClient",
    # Common types
    "AdaptiveTestScore",
    "TrismikDataset",
    "TrismikItem",
    "TrismikMeResponse",
    "TrismikProject",
    "TrismikRunMetadata",
    "TrismikRunResults",
    # Version
    "__version__",
]
trismik-1.0.0/src/trismik/_async/__init__.py ADDED
@@ -0,0 +1 @@
"""Async implementation of Trismik client (source of truth)."""
trismik-1.0.0/src/trismik/_async/_test_transform.py ADDED
@@ -0,0 +1,58 @@
"""
Minimal test file to validate unasync transformation pipeline.

This file tests that unasync correctly transforms:
- async def -> def
- await -> (removed)
- httpx.AsyncClient -> httpx.Client
- TrismikAsyncClient -> TrismikClient
- __aenter__/__aexit__ -> __enter__/__exit__
- _async -> _sync in imports
"""

from typing import Optional

import httpx


class TrismikAsyncClient:
    """Test async client to validate transformation."""

    def __init__(
        self,
        api_key: str,
        http_client: Optional[httpx.AsyncClient] = None,
    ) -> None:
        """Initialize test client."""
        self._api_key = api_key
        self._owns_client = http_client is None
        self._http_client = http_client or httpx.AsyncClient(headers={"x-api-key": api_key})

    async def __aenter__(self) -> "TrismikAsyncClient":
        """Enter async context manager."""
        return self

    async def __aexit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Exit async context manager."""
        if self._owns_client:
            await self._http_client.aclose()

    async def aclose(self) -> None:
        """Close the HTTP client."""
        if self._owns_client:
            await self._http_client.aclose()

    async def get_data(self) -> str:
        """Test async method with await."""
        response = await self._http_client.get("/test")
        response.raise_for_status()
        return str(response.text)

    async def process_items(self) -> int:
        """Test async method with multiple awaits."""
        count = 0
        async with self._http_client as client:
            response = await client.get("/items")
            data = response.json()
            count = len(data)
        return count