trismik 0.9.1__tar.gz → 0.9.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
trismik-0.9.5/PKG-INFO ADDED
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: trismik
3
+ Version: 0.9.5
4
+ Summary:
5
+ License-File: LICENSE
6
+ Author: Bartosz Kielczewski
7
+ Author-email: bk352@cam.ac.uk
8
+ Requires-Python: >=3.9
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Provides-Extra: examples
17
+ Requires-Dist: accelerate (>=1.7.0,<2.0.0) ; extra == "examples"
18
+ Requires-Dist: notebook (>=7.4.4,<8.0.0) ; extra == "examples"
19
+ Requires-Dist: openai (>=1.81.0,<2.0.0) ; extra == "examples"
20
+ Requires-Dist: torch (>=2.7.0,<3.0.0) ; extra == "examples"
21
+ Requires-Dist: torchaudio (>=2.7.0,<3.0.0) ; extra == "examples"
22
+ Requires-Dist: torchvision (>=0.22.0,<1.0.0) ; extra == "examples"
23
+ Requires-Dist: transformers (>=4.51.3,<5.0.0) ; extra == "examples"
24
+ Description-Content-Type: text/markdown
25
+
26
+ <h1 align="center"> Trismik SDK</h1>
27
+
28
+ <p align="center">
29
+ <img alt="PyPI - Version" src="https://img.shields.io/pypi/v/trismik">
30
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
31
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
32
+ </p>
33
+
34
+ ## Table of Contents
35
+ - [Overview](#overview)
36
+ - [Quick Start](#quick-start)
37
+ - [Installation](#installation)
38
+ - [API Key Setup](#api-key-setup)
39
+ - [Basic Usage](#basic-usage)
40
+ - [Interpreting Results](#interpreting-results)
41
+ - [Theta (θ)](#theta-θ)
42
+ - [Other Metrics](#other-metrics)
43
+ - [Contributing](#contributing)
44
+ - [License](#license)
45
+
46
+ ## Overview
47
+
48
+ [**Trismik**](https://trismik.com) is a Cambridge, UK based startup offering adversarial testing for LLMs. The APIs we provide through this library allow you to call our adaptive test engine and evaluate LLMs up to 95% faster (and cheaper!) than traditional evaluation techniques.
49
+
50
+ Our **adaptive testing** algorithm allows you to estimate the precision of the model by looking at only a small portion of a dataset. Through this library, we provide access to a number of open source datasets over several dimensions (reasoning, toxicity, tool use...) to speed up model testing in several scenarios, like foundation model training, supervised fine tuning, prompt engineering, and so on.
51
+
52
+ ## Quick Start
53
+
54
+ ### Installation
55
+
56
+ To use our API, you need to get an API key first. Please register on [dashboard.trismik.com](https://dashboard.trismik.com) and obtain an API key.
57
+
58
+ Trismik is available via [pypi](https://pypi.org/project/trismik/). To install Trismik, run the following in your terminal (in a virtualenv, if you use one):
59
+
60
+ ```bash
61
+ pip install trismik
62
+ ```
63
+
64
+ ### API Key Setup
65
+
66
+ You can provide your API key in one of the following ways:
67
+
68
+ 1. **Environment Variable**:
69
+ ```bash
70
+ export TRISMIK_API_KEY="your-api-key"
71
+ ```
72
+
73
+ 2. **`.env` File**:
74
+ ```bash
75
+ # .env
76
+ TRISMIK_API_KEY=your-api-key
77
+ ```
78
+ Then load it with `python-dotenv`:
79
+ ```python
80
+ from dotenv import load_dotenv
81
+ load_dotenv()
82
+ ```
83
+
84
+ 3. **Direct Initialization**:
85
+ ```python
86
+ client = TrismikAsyncClient(api_key="YOUR_API_KEY")
87
+ ```
88
+
89
+ ### Basic Usage
90
+
91
+ Running a test is straightforward:
92
+
93
+ 1. Implement a method that wraps model inference over a dataset item
94
+ 2. Create an `AdaptiveTest` instance
95
+ 3. Run the test!
96
+
97
+ Here's a basic example:
98
+
99
+ ```python
100
+ def model_inference(item: TrismikItem) -> Any:
101
+ model_output = ... # call your model here
102
+ return model_output
103
+
104
+
105
+ # Initialize the test runner
106
+ runner = AdaptiveTest(model_inference)
107
+
108
+ # Run the test
109
+ results = await runner.run_async(
110
+ "MMLUPro2025", # or any dataset we support
111
+ with_responses=True,
112
+ run_metadata=sample_metadata,
113
+ )
114
+
115
+ # Print the test output
116
+ for result in results:
117
+ print(f"{result.trait} ({result.name}): {result.value}")
118
+ ```
119
+
120
+ ### Examples
121
+
122
+ You can find more examples in the `examples` folder:
123
+ - [`example_transformers.py`](examples/example_transformers.py) - Example using Hugging Face Transformers models
124
+ - [`example_openai.py`](examples/example_openai.py) - Example using OpenAI models
125
+ - [`example_adaptive_test.py`](examples/example_adaptive_test.py) - Example of adaptive testing configuration
126
+
127
+ To run the examples, you will need to clone this repo, navigate to the
128
+ source folder, and then run:
129
+
130
+ ```bash
131
+ poetry install --with examples
132
+ poetry run python examples/example_adaptive_test.py # or any other example
133
+ ```
134
+
135
+ ## Interpreting Results
136
+
137
+ ### Theta (θ)
138
+
139
+ Our adversarial test returns several values; however, you will be interested mainly in `theta`. Theta ($\theta$) is our metric; it measures the ability of the model on a certain dataset, and it can be used as a proxy to approximate the original metric used on that dataset. For example, on an accuracy-based dataset, a high theta correlates with a high accuracy, and low theta correlates with low accuracy.
140
+
141
+ To interpret a theta score, consider that $\theta=0$ corresponds to a 50% chance for a model to get an answer right - in other words, to an accuracy of 50%.
142
+ A negative theta means that the model will give more bad answers than good ones, while a positive theta means that the model will give more good answers than bad answers.
143
+ While theta is unbounded in our implementation (i.e. $-\infty < \theta < \infty$), in practice we have that for most cases $\theta$ will take values between -3 and 3.
144
+
145
+ Compared to classical benchmark testing, the estimated accuracy from adaptive testing uses fewer but more informative items while avoiding noise from overly easy or difficult questions. This makes it a more efficient and stable measure, especially on very large datasets.
146
+
147
+ ### Other Metrics
148
+
149
+ - **Standard Deviation (`std`)**:
150
+ - A measure of the uncertainty or error in the theta estimate
151
+ - A smaller `std` indicates a more precise estimate
152
+ - You should see a `std` around or below 0.25
153
+
154
+ - **Correct Responses (`responsesCorrect`)**:
155
+ - The number of correct answers delivered by the model
156
+
157
+ - **Important note**: A higher number of correct answers does not necessarily
158
+ correlate with a high theta. Our algorithm navigates the dataset to find a
159
+ balance of “hard” and “easy” items for your model, so by the end of the test,
160
+ it encounters a representative mix of inputs it can and cannot handle. In
161
+ practice, expect responsesCorrect to be roughly half of responsesTotal.
162
+
163
+ - **Total Responses (`responsesTotal`)**:
164
+ - The number of items processed before reaching a stable theta.
165
+ - Expected range: 60 ≤ `responsesTotal` ≤ 80
166
+
167
+ ## Contributing
168
+
169
+ See `CONTRIBUTING.md`.
170
+
171
+ ## License
172
+
173
+ This library is licensed under the MIT license. See `LICENSE` file.
174
+
@@ -0,0 +1,148 @@
1
+ <h1 align="center"> Trismik SDK</h1>
2
+
3
+ <p align="center">
4
+ <img alt="PyPI - Version" src="https://img.shields.io/pypi/v/trismik">
5
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
6
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
7
+ </p>
8
+
9
+ ## Table of Contents
10
+ - [Overview](#overview)
11
+ - [Quick Start](#quick-start)
12
+ - [Installation](#installation)
13
+ - [API Key Setup](#api-key-setup)
14
+ - [Basic Usage](#basic-usage)
15
+ - [Interpreting Results](#interpreting-results)
16
+ - [Theta (θ)](#theta-θ)
17
+ - [Other Metrics](#other-metrics)
18
+ - [Contributing](#contributing)
19
+ - [License](#license)
20
+
21
+ ## Overview
22
+
23
+ [**Trismik**](https://trismik.com) is a Cambridge, UK based startup offering adversarial testing for LLMs. The APIs we provide through this library allow you to call our adaptive test engine and evaluate LLMs up to 95% faster (and cheaper!) than traditional evaluation techniques.
24
+
25
+ Our **adaptive testing** algorithm allows you to estimate the precision of the model by looking at only a small portion of a dataset. Through this library, we provide access to a number of open source datasets over several dimensions (reasoning, toxicity, tool use...) to speed up model testing in several scenarios, like foundation model training, supervised fine tuning, prompt engineering, and so on.
26
+
27
+ ## Quick Start
28
+
29
+ ### Installation
30
+
31
+ To use our API, you need to get an API key first. Please register on [dashboard.trismik.com](https://dashboard.trismik.com) and obtain an API key.
32
+
33
+ Trismik is available via [pypi](https://pypi.org/project/trismik/). To install Trismik, run the following in your terminal (in a virtualenv, if you use one):
34
+
35
+ ```bash
36
+ pip install trismik
37
+ ```
38
+
39
+ ### API Key Setup
40
+
41
+ You can provide your API key in one of the following ways:
42
+
43
+ 1. **Environment Variable**:
44
+ ```bash
45
+ export TRISMIK_API_KEY="your-api-key"
46
+ ```
47
+
48
+ 2. **`.env` File**:
49
+ ```bash
50
+ # .env
51
+ TRISMIK_API_KEY=your-api-key
52
+ ```
53
+ Then load it with `python-dotenv`:
54
+ ```python
55
+ from dotenv import load_dotenv
56
+ load_dotenv()
57
+ ```
58
+
59
+ 3. **Direct Initialization**:
60
+ ```python
61
+ client = TrismikAsyncClient(api_key="YOUR_API_KEY")
62
+ ```
63
+
64
+ ### Basic Usage
65
+
66
+ Running a test is straightforward:
67
+
68
+ 1. Implement a method that wraps model inference over a dataset item
69
+ 2. Create an `AdaptiveTest` instance
70
+ 3. Run the test!
71
+
72
+ Here's a basic example:
73
+
74
+ ```python
75
+ def model_inference(item: TrismikItem) -> Any:
76
+ model_output = ... # call your model here
77
+ return model_output
78
+
79
+
80
+ # Initialize the test runner
81
+ runner = AdaptiveTest(model_inference)
82
+
83
+ # Run the test
84
+ results = await runner.run_async(
85
+ "MMLUPro2025", # or any dataset we support
86
+ with_responses=True,
87
+ run_metadata=sample_metadata,
88
+ )
89
+
90
+ # Print the test output
91
+ for result in results:
92
+ print(f"{result.trait} ({result.name}): {result.value}")
93
+ ```
94
+
95
+ ### Examples
96
+
97
+ You can find more examples in the `examples` folder:
98
+ - [`example_transformers.py`](examples/example_transformers.py) - Example using Hugging Face Transformers models
99
+ - [`example_openai.py`](examples/example_openai.py) - Example using OpenAI models
100
+ - [`example_adaptive_test.py`](examples/example_adaptive_test.py) - Example of adaptive testing configuration
101
+
102
+ To run the examples, you will need to clone this repo, navigate to the
103
+ source folder, and then run:
104
+
105
+ ```bash
106
+ poetry install --with examples
107
+ poetry run python examples/example_adaptive_test.py # or any other example
108
+ ```
109
+
110
+ ## Interpreting Results
111
+
112
+ ### Theta (θ)
113
+
114
+ Our adversarial test returns several values; however, you will be interested mainly in `theta`. Theta ($\theta$) is our metric; it measures the ability of the model on a certain dataset, and it can be used as a proxy to approximate the original metric used on that dataset. For example, on an accuracy-based dataset, a high theta correlates with a high accuracy, and low theta correlates with low accuracy.
115
+
116
+ To interpret a theta score, consider that $\theta=0$ corresponds to a 50% chance for a model to get an answer right - in other words, to an accuracy of 50%.
117
+ A negative theta means that the model will give more bad answers than good ones, while a positive theta means that the model will give more good answers than bad answers.
118
+ While theta is unbounded in our implementation (i.e. $-\infty < \theta < \infty$), in practice we have that for most cases $\theta$ will take values between -3 and 3.
119
+
120
+ Compared to classical benchmark testing, the estimated accuracy from adaptive testing uses fewer but more informative items while avoiding noise from overly easy or difficult questions. This makes it a more efficient and stable measure, especially on very large datasets.
121
+
122
+ ### Other Metrics
123
+
124
+ - **Standard Deviation (`std`)**:
125
+ - A measure of the uncertainty or error in the theta estimate
126
+ - A smaller `std` indicates a more precise estimate
127
+ - You should see a `std` around or below 0.25
128
+
129
+ - **Correct Responses (`responsesCorrect`)**:
130
+ - The number of correct answers delivered by the model
131
+
132
+ - **Important note**: A higher number of correct answers does not necessarily
133
+ correlate with a high theta. Our algorithm navigates the dataset to find a
134
+ balance of “hard” and “easy” items for your model, so by the end of the test,
135
+ it encounters a representative mix of inputs it can and cannot handle. In
136
+ practice, expect responsesCorrect to be roughly half of responsesTotal.
137
+
138
+ - **Total Responses (`responsesTotal`)**:
139
+ - The number of items processed before reaching a stable theta.
140
+ - Expected range: 60 ≤ `responsesTotal` ≤ 80
141
+
142
+ ## Contributing
143
+
144
+ See `CONTRIBUTING.md`.
145
+
146
+ ## License
147
+
148
+ This library is licensed under the MIT license. See `LICENSE` file.
@@ -0,0 +1,90 @@
1
+ [project]
2
+ name = "trismik"
3
+ dynamic = []
4
+ description = ""
5
+ authors = [
6
+ { name = "Bartosz Kielczewski", email = "bk352@cam.ac.uk" },
7
+ { name = "Peter Monks", email = "peter.monks@systemc.com" },
8
+ { name = "Greg Holdridge", email = "greg.holdridge@systemc.com" },
9
+ { name = "Marco Basaldella", email = "marco@trismik.com" }
10
+ ]
11
+ readme = "README.md"
12
+ requires-python = ">=3.9"
13
+ version = "0.9.5"
14
+
15
+ [project.optional-dependencies]
16
+ examples = [
17
+ "transformers>=4.51.3,<5.0.0",
18
+ "torch>=2.7.0,<3.0.0",
19
+ "torchvision>=0.22.0,<1.0.0",
20
+ "torchaudio>=2.7.0,<3.0.0",
21
+ "accelerate>=1.7.0,<2.0.0",
22
+ "openai>=1.81.0,<2.0.0",
23
+ "notebook>=7.4.4,<8.0.0"
24
+ ]
25
+
26
+
27
+ [tool.poetry]
28
+ packages = [{ include = "trismik", from = "src" }]
29
+
30
+ [tool.poetry.requires-plugins]
31
+ poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] }
32
+
33
+ [tool.poetry.group.main.dependencies]
34
+ httpx = "^0.27.2"
35
+ nest-asyncio = "^1.6.0"
36
+ tqdm = "^4.67.1"
37
+
38
+
39
+
40
+ [tool.poetry.group.dev.dependencies]
41
+ pytest = "^8.3.2"
42
+ python-dotenv = "^1.0.1"
43
+ pytest-asyncio = "^0.24.0"
44
+ pre-commit = "^3.6.2"
45
+ black = "^24.3.0"
46
+ isort = "^5.13.2"
47
+ Flake8-pyproject = "^1.2.3"
48
+ flake8 = "^7.0.0"
49
+ mypy = "^1.15.0"
50
+ autoflake = "^2.3.1"
51
+
52
+ [tool.poetry-dynamic-versioning]
53
+ # Expected Git tags: vX.Y.Z, vX.Y.Z-alphaN, vX.Y.Z-betaN, vX.Y.Z-rcN
54
+ enable = false
55
+ vcs = "git"
56
+
57
+ [build-system]
58
+ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
59
+ build-backend = "poetry_dynamic_versioning.backend"
60
+
61
+ [tool.pytest.ini_options]
62
+ asyncio_default_fixture_loop_scope = "class"
63
+
64
+ [tool.black]
65
+ line-length = 80
66
+ target-version = ['py39']
67
+ include = '\.pyi?$'
68
+
69
+ [tool.isort]
70
+ profile = "black"
71
+ line_length = 80
72
+ multi_line_output = 3
73
+
74
+ [tool.mypy]
75
+ python_version = "3.9"
76
+ warn_return_any = true
77
+ warn_unused_configs = true
78
+ disallow_untyped_defs = true
79
+ disallow_incomplete_defs = true
80
+ check_untyped_defs = true
81
+ disallow_untyped_decorators = true
82
+ no_implicit_optional = true
83
+ warn_redundant_casts = true
84
+ warn_unused_ignores = true
85
+ warn_no_return = true
86
+ warn_unreachable = true
87
+ install_types = true
88
+
89
+ [tool.flake8] # note that this depends on Flake8-pyproject
90
+ ignore = ["D202", "W503", "W504"]
@@ -0,0 +1,10 @@
1
+ """
2
+ Trismik Python Client.
3
+
4
+ A Python client for the Trismik API.
5
+ """
6
+
7
+ import importlib.metadata
8
+
9
+ # get version from the installed package metadata
10
+ __version__ = importlib.metadata.version(__package__ or __name__)