scorebook 0.0.13__tar.gz → 0.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook-0.0.14/PKG-INFO +292 -0
- scorebook-0.0.14/README.md +249 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/pyproject.toml +1 -27
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/__init__.py +10 -5
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/auth.py +1 -1
- scorebook-0.0.14/src/scorebook/dashboard/__init__.py +1 -0
- scorebook-0.0.14/src/scorebook/dashboard/create_project.py +91 -0
- {scorebook-0.0.13/src/scorebook/trismik → scorebook-0.0.14/src/scorebook/dashboard}/credentials.py +24 -9
- {scorebook-0.0.13/src/scorebook/trismik → scorebook-0.0.14/src/scorebook/dashboard}/upload_results.py +1 -1
- scorebook-0.0.14/src/scorebook/eval_datasets/__init__.py +1 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook-0.0.14/src/scorebook/evaluate/__init__.py +1 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_async/evaluate_async.py +9 -8
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_sync/evaluate.py +9 -8
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook-0.0.14/src/scorebook/inference/__init__.py +1 -0
- scorebook-0.0.14/src/scorebook/inference/clients/__init__.py +1 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/inference_pipeline.py +1 -1
- scorebook-0.0.14/src/scorebook/metrics/__init__.py +1 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/metric_registry.py +2 -0
- scorebook-0.0.14/src/scorebook/score/__init__.py +1 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_async/score_async.py +3 -2
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_sync/score.py +3 -2
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/score_helpers.py +1 -1
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/types.py +1 -1
- scorebook-0.0.14/src/scorebook/utils/__init__.py +1 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/common_helpers.py +1 -1
- scorebook-0.0.14/src/scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook-0.0.14/src/scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook-0.0.13/PKG-INFO +0 -389
- scorebook-0.0.13/README.md +0 -346
- scorebook-0.0.13/src/scorebook/eval_datasets/__init__.py +0 -5
- scorebook-0.0.13/src/scorebook/evaluate/__init__.py +0 -15
- scorebook-0.0.13/src/scorebook/inference/__init__.py +0 -11
- scorebook-0.0.13/src/scorebook/inference/clients/__init__.py +0 -8
- scorebook-0.0.13/src/scorebook/metrics/__init__.py +0 -18
- scorebook-0.0.13/src/scorebook/score/__init__.py +0 -6
- scorebook-0.0.13/src/scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13/src/scorebook/utils/__init__.py +0 -23
- {scorebook-0.0.13 → scorebook-0.0.14}/LICENSE +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/__init__.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/main.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_async/__init__.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_sync/__init__.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/exceptions.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/bedrock.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/openai.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/portkey.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/vertex.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/accuracy.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/metric_base.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/precision.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_async/__init__.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_sync/__init__.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/settings.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/async_utils.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/io_helpers.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/jinja_helpers.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/mappers.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/progress_bars.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/render_template.py +0 -0
- {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/transform_helpers.py +0 -0

scorebook-0.0.14/PKG-INFO
@@ -0,0 +1,292 @@
+Metadata-Version: 2.4
+Name: scorebook
+Version: 0.0.14
+Summary: A Python project for LLM evaluation.
+License-File: LICENSE
+Author: Euan Campbell
+Author-email: euan@trismik.com
+Requires-Python: >=3.9, <3.14
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: bedrock
+Provides-Extra: examples
+Provides-Extra: openai
+Provides-Extra: portkey
+Provides-Extra: vertex
+Requires-Dist: accelerate ; extra == "examples"
+Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
+Requires-Dist: datasets (>=3.6.0)
+Requires-Dist: fsspec[gcs] ; extra == "vertex"
+Requires-Dist: google-cloud-storage ; extra == "vertex"
+Requires-Dist: google-genai ; extra == "vertex"
+Requires-Dist: ipywidgets (>=8.0.0)
+Requires-Dist: notebook (>=7.4.5,<8.0.0)
+Requires-Dist: notebook ; extra == "examples"
+Requires-Dist: openai ; extra == "openai"
+Requires-Dist: pandas ; extra == "vertex"
+Requires-Dist: portkey-ai ; extra == "portkey"
+Requires-Dist: python-dotenv ; extra == "bedrock"
+Requires-Dist: python-dotenv ; extra == "openai"
+Requires-Dist: python-dotenv ; extra == "portkey"
+Requires-Dist: python-dotenv ; extra == "vertex"
+Requires-Dist: torch ; extra == "examples"
+Requires-Dist: torchaudio ; extra == "examples"
+Requires-Dist: torchvision ; extra == "examples"
+Requires-Dist: transformers ; extra == "examples"
+Requires-Dist: trismik (==1.0.2)
+Description-Content-Type: text/markdown
+
+<h1 align="center">Scorebook</h1>
+
+<p align="center"><strong>A Python library for model evaluation</strong></p>
+
+<p align="center">
+  <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
+  <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
+  <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
+    <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
+  </a>
+  <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+</p>
+
+Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom-defined metrics, including LLM as a judge (LLMaJ).
+
+## Use Cases
+
+Scorebook's evaluations can be used for:
+
+- **Model Benchmarking**: Compare different models on standard datasets.
+- **Model Optimization**: Find optimal model configurations.
+- **Iterative Experimentation**: Reproducible evaluation workflows.
+
+## Key Features
+
+- **Model Agnostic**: Evaluate any model, running locally or deployed on the cloud.
+- **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
+- **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
+- **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
+- **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
+- **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
+
+## Installation
+
+```bash
+pip install scorebook
+```
+
+## Scoring Model Outputs
+
+Scorebook's `score` function can be used to evaluate pre-generated model outputs.
+
+### Score Example
+```python
+from scorebook import score
+from scorebook.metrics import Accuracy
+
+# 1. Prepare a list of generated model outputs and labels
+model_predictions = [
+    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
+    {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
+    {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
+    {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
+]
+
+# 2. Score the model's predictions against labels using metrics
+results = score(
+    items = model_predictions,
+    metrics = Accuracy,
+)
+```
+
+### Score Results:
+```json
+{
+  "aggregate_results": [
+    {
+      "dataset": "scored_items",
+      "accuracy": 0.75
+    }
+  ],
+  "item_results": [
+    {
+      "id": 0,
+      "dataset": "scored_items",
+      "input": "What is 2 + 2?",
+      "output": "4",
+      "label": "4",
+      "accuracy": true
+    }
+    // ... additional items
+  ]
+}
+```
+
+## _Classical_ Evaluations
+
+Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
+
+### Classical Evaluation Example
+```python
+from scorebook import evaluate, EvalDataset
+from scorebook.metrics import Accuracy
+
+# 1. Create an evaluation dataset
+evaluation_items = [
+    {"question": "What is 2 + 2?", "answer": "4"},
+    {"question": "What is the capital of France?", "answer": "Paris"},
+    {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
+]
+
+evaluation_dataset = EvalDataset.from_list(
+    name = "basic_questions",
+    items = evaluation_items,
+    input = "question",
+    label = "answer",
+    metrics = Accuracy,
+)
+
+# 2. Define an inference function - This is a pseudocode example
+def inference_function(inputs: List[Any], **hyperparameters):
+
+    # Create or call a model
+    model = Model()
+    model.temperature = hyperparameters.get("temperature")
+
+    # Call model inference
+    model_outputs = model(inputs)
+
+    # Return outputs
+    return model_outputs
+
+# 3. Run evaluation
+evaluation_results = evaluate(
+    inference_function,
+    evaluation_dataset,
+    hyperparameters = {"temperature": 0.7}
+)
+```
+
+### Evaluation Results:
+```json
+{
+  "aggregate_results": [
+    {
+      "dataset": "basic_questions",
+      "temperature": 0.7,
+      "accuracy": 1.0,
+      "run_completed": true
+    }
+  ],
+  "item_results": [
+    {
+      "id": 0,
+      "dataset": "basic_questions",
+      "input": "What is 2 + 2?",
+      "output": "4",
+      "label": "4",
+      "temperature": 0.7,
+      "accuracy": true
+    }
+    // ... additional items
+  ]
+}
+```
+
+### _Adaptive_ Evaluations with `evaluate`
+
+To run an adaptive evaluation, use a Trismik adaptive dataset.
+The CAT algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error and the fewest questions.
+
+### Adaptive Evaluation Example
+```python
+from scorebook import evaluate, login
+
+# 1. Log in with your Trismik API key
+login("TRISMIK_API_KEY")
+
+# 2. Define an inference function
+def inference_function(inputs: List[Any], **hyperparameters):
+
+    # Create or call a model
+    model = Model()
+
+    # Call model inference
+    outputs = model(inputs)
+
+    # Return outputs
+    return outputs
+
+# 3. Run an adaptive evaluation
+results = evaluate(
+    inference_function,
+    datasets = "trismik/headQA:adaptive",     # Adaptive datasets have the ":adaptive" suffix
+    project_id = "TRISMIK_PROJECT_ID",        # Required: Create a project on your Trismik dashboard
+    experiment_id = "TRISMIK_EXPERIMENT_ID",  # Optional: An identifier to upload this run under
+)
+```
+
+### Adaptive Evaluation Results
+```json
+{
+  "aggregate_results": [
+    {
+      "dataset": "trismik/headQA:adaptive",
+      "experiment_id": "TRISMIK_EXPERIMENT_ID",
+      "project_id": "TRISMIK_PROJECT_ID",
+      "run_id": "RUN_ID",
+      "score": {
+        "theta": 1.2,
+        "std_error": 0.20
+      },
+      "responses": null
+    }
+  ],
+  "item_results": []
+}
+```
+
+## Metrics
+
+| Metric     | Sync/Async | Aggregate Scores                       | Item Scores                                     |
+|------------|------------|----------------------------------------|-------------------------------------------------|
+| `Accuracy` | Sync       | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+
+
+## Tutorials
+
+For more detailed, locally runnable examples:
+```bash
+pip install scorebook[examples]
+```
+
+The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
+
+- **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
+- **`tutorials/examples`**: Runnable Python examples incrementally implementing Scorebook's features.
+
+**Run a notebook:**
+```bash
+jupyter notebook tutorials/notebooks
+```
+
+**Run an example:**
+```bash
+python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
+```
+
+## Contributing
+
+We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## About
+
+Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
+
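
The classical-evaluation snippet in the README above is explicitly pseudocode: `Model()` is undefined and `List`/`Any` are never imported. A self-contained sketch of the same flow is shown below; the canned-answer "model" is a stand-in added purely so the example runs, and is not part of Scorebook.

```python
from typing import Any, List

from scorebook import EvalDataset, evaluate
from scorebook.metrics import Accuracy

evaluation_items = [
    {"question": "What is 2 + 2?", "answer": "4"},
    {"question": "What is the capital of France?", "answer": "Paris"},
]

evaluation_dataset = EvalDataset.from_list(
    name="basic_questions",
    items=evaluation_items,
    input="question",
    label="answer",
    metrics=Accuracy,
)

def inference_function(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    # Stand-in for a real model call: answer the two known questions directly.
    # A real implementation would invoke a local or hosted model here and apply
    # hyperparameters such as temperature.
    canned = {"What is 2 + 2?": "4", "What is the capital of France?": "Paris"}
    return [canned.get(question, "") for question in inputs]

evaluation_results = evaluate(
    inference_function,
    evaluation_dataset,
    hyperparameters={"temperature": 0.7},
)
```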

scorebook-0.0.14/README.md
@@ -0,0 +1,249 @@
+<h1 align="center">Scorebook</h1>
+
+<p align="center"><strong>A Python library for model evaluation</strong></p>
+
+<p align="center">
+  <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
+  <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
+  <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
+    <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
+  </a>
+  <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+</p>
+
+Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom-defined metrics, including LLM as a judge (LLMaJ).
+
+## Use Cases
+
+Scorebook's evaluations can be used for:
+
+- **Model Benchmarking**: Compare different models on standard datasets.
+- **Model Optimization**: Find optimal model configurations.
+- **Iterative Experimentation**: Reproducible evaluation workflows.
+
+## Key Features
+
+- **Model Agnostic**: Evaluate any model, running locally or deployed on the cloud.
+- **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
+- **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
+- **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
+- **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
+- **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
+
+## Installation
+
+```bash
+pip install scorebook
+```
+
+## Scoring Model Outputs
+
+Scorebook's `score` function can be used to evaluate pre-generated model outputs.
+
+### Score Example
+```python
+from scorebook import score
+from scorebook.metrics import Accuracy
+
+# 1. Prepare a list of generated model outputs and labels
+model_predictions = [
+    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
+    {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
+    {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
+    {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
+]
+
+# 2. Score the model's predictions against labels using metrics
+results = score(
+    items = model_predictions,
+    metrics = Accuracy,
+)
+```
+
+### Score Results:
+```json
+{
+  "aggregate_results": [
+    {
+      "dataset": "scored_items",
+      "accuracy": 0.75
+    }
+  ],
+  "item_results": [
+    {
+      "id": 0,
+      "dataset": "scored_items",
+      "input": "What is 2 + 2?",
+      "output": "4",
+      "label": "4",
+      "accuracy": true
+    }
+    // ... additional items
+  ]
+}
+```
+
+## _Classical_ Evaluations
+
+Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
+
+### Classical Evaluation Example
+```python
+from scorebook import evaluate, EvalDataset
+from scorebook.metrics import Accuracy
+
+# 1. Create an evaluation dataset
+evaluation_items = [
+    {"question": "What is 2 + 2?", "answer": "4"},
+    {"question": "What is the capital of France?", "answer": "Paris"},
+    {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
+]
+
+evaluation_dataset = EvalDataset.from_list(
+    name = "basic_questions",
+    items = evaluation_items,
+    input = "question",
+    label = "answer",
+    metrics = Accuracy,
+)
+
+# 2. Define an inference function - This is a pseudocode example
+def inference_function(inputs: List[Any], **hyperparameters):
+
+    # Create or call a model
+    model = Model()
+    model.temperature = hyperparameters.get("temperature")
+
+    # Call model inference
+    model_outputs = model(inputs)
+
+    # Return outputs
+    return model_outputs
+
+# 3. Run evaluation
+evaluation_results = evaluate(
+    inference_function,
+    evaluation_dataset,
+    hyperparameters = {"temperature": 0.7}
+)
+```
+
+### Evaluation Results:
+```json
+{
+  "aggregate_results": [
+    {
+      "dataset": "basic_questions",
+      "temperature": 0.7,
+      "accuracy": 1.0,
+      "run_completed": true
+    }
+  ],
+  "item_results": [
+    {
+      "id": 0,
+      "dataset": "basic_questions",
+      "input": "What is 2 + 2?",
+      "output": "4",
+      "label": "4",
+      "temperature": 0.7,
+      "accuracy": true
+    }
+    // ... additional items
+  ]
+}
+```
+
+### _Adaptive_ Evaluations with `evaluate`
+
+To run an adaptive evaluation, use a Trismik adaptive dataset.
+The CAT algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error and the fewest questions.
+
+### Adaptive Evaluation Example
+```python
+from scorebook import evaluate, login
+
+# 1. Log in with your Trismik API key
+login("TRISMIK_API_KEY")
+
+# 2. Define an inference function
+def inference_function(inputs: List[Any], **hyperparameters):
+
+    # Create or call a model
+    model = Model()
+
+    # Call model inference
+    outputs = model(inputs)
+
+    # Return outputs
+    return outputs
+
+# 3. Run an adaptive evaluation
+results = evaluate(
+    inference_function,
+    datasets = "trismik/headQA:adaptive",     # Adaptive datasets have the ":adaptive" suffix
+    project_id = "TRISMIK_PROJECT_ID",        # Required: Create a project on your Trismik dashboard
+    experiment_id = "TRISMIK_EXPERIMENT_ID",  # Optional: An identifier to upload this run under
+)
+```
+
+### Adaptive Evaluation Results
+```json
+{
+  "aggregate_results": [
+    {
+      "dataset": "trismik/headQA:adaptive",
+      "experiment_id": "TRISMIK_EXPERIMENT_ID",
+      "project_id": "TRISMIK_PROJECT_ID",
+      "run_id": "RUN_ID",
+      "score": {
+        "theta": 1.2,
+        "std_error": 0.20
+      },
+      "responses": null
+    }
+  ],
+  "item_results": []
+}
+```
+
+## Metrics
+
+| Metric     | Sync/Async | Aggregate Scores                       | Item Scores                                     |
+|------------|------------|----------------------------------------|-------------------------------------------------|
+| `Accuracy` | Sync       | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
+
+
+## Tutorials
+
+For more detailed, locally runnable examples:
+```bash
+pip install scorebook[examples]
+```
+
+The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
+
+- **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
+- **`tutorials/examples`**: Runnable Python examples incrementally implementing Scorebook's features.
+
+**Run a notebook:**
+```bash
+jupyter notebook tutorials/notebooks
+```
+
+**Run an example:**
+```bash
+python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
+```
+
+## Contributing
+
+We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## About
+
+Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
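
The score and evaluate examples above show their result payloads as JSON with `aggregate_results` and `item_results` keys. A minimal sketch of consuming those results follows; it assumes the returned object can be indexed like the documented JSON (the concrete return type is not visible in this diff).

```python
from scorebook import score
from scorebook.metrics import Accuracy

model_predictions = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
]

results = score(items=model_predictions, metrics=Accuracy)

# Assumption: `results` mirrors the documented JSON layout, i.e. it exposes
# "aggregate_results" and "item_results" collections of dict-like records.
for aggregate in results["aggregate_results"]:
    print(f'{aggregate["dataset"]}: accuracy={aggregate["accuracy"]}')

for item in results["item_results"]:
    print(item["id"], item["input"], "->", item["output"], "correct:", item["accuracy"])
```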

{scorebook-0.0.13 → scorebook-0.0.14}/pyproject.toml
@@ -19,37 +19,11 @@ dependencies = [
 scorebook = "scorebook.cli.main:main"
 
 [tool.poetry]
-version = "0.0.13"
+version = "0.0.14" # base version
 packages = [{ include = "scorebook", from = "src" }]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
-datasets = ">=3.6.0"
-notebook = ">=7.4.5,<8.0.0"
-trismik = "1.0.2"
-ipywidgets = ">=8.0.0"
-
-# Optional dependencies
-openai = {version = "*", optional = true}
-python-dotenv = {version = "*", optional = true}
-portkey-ai = {version = "*", optional = true}
-boto3 = {version = "1.40.0", optional = true}
-google-genai = {version = "*", optional = true}
-pandas = {version = "*", optional = true}
-google-cloud-storage = {version = "*", optional = true}
-fsspec = {version = "*", extras = ["gcs"], optional = true}
-transformers = {version = "*", optional = true}
-torch = {version = "*", optional = true}
-torchvision = {version = "*", optional = true}
-torchaudio = {version = "*", optional = true}
-accelerate = {version = "*", optional = true}
-
-[tool.poetry.extras]
-openai = ["openai", "python-dotenv"]
-portkey = ["portkey-ai", "python-dotenv"]
-bedrock = ["boto3", "python-dotenv"]
-vertex = ["google-genai", "pandas", "google-cloud-storage", "fsspec", "python-dotenv"]
-examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook"]
 
 [[tool.poetry.source]]
 name = "testpypi"
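
The pyproject.toml hunk above drops the Poetry-managed dependency and extras tables; the published extras now surface only in the PKG-INFO metadata shown earlier. If you need to check them at runtime, the standard library can read that metadata directly. A small sketch, independent of Scorebook's own API:

```python
from importlib.metadata import metadata, version

# Reads the installed distribution's metadata. For scorebook 0.0.14, the
# PKG-INFO above lists the version and the extras bedrock, examples, openai,
# portkey, and vertex.
print(version("scorebook"))
print(metadata("scorebook").get_all("Provides-Extra"))
```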

{scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/__init__.py
@@ -9,12 +9,15 @@ import importlib.metadata
 # get version from pyproject.toml
 __version__ = importlib.metadata.version(__package__ or __name__)
 
-from scorebook.
-from scorebook.
+from scorebook.dashboard.create_project import create_project, create_project_async
+from scorebook.dashboard.credentials import login, logout, whoami
+from scorebook.dashboard.upload_results import upload_result, upload_result_async
+from scorebook.eval_datasets.eval_dataset import EvalDataset
+from scorebook.evaluate._async.evaluate_async import evaluate_async
+from scorebook.evaluate._sync.evaluate import evaluate
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.score import
-from scorebook.
-from scorebook.trismik.upload_results import upload_result, upload_result_async
+from scorebook.score._async.score_async import score_async
+from scorebook.score._sync.score import score
 from scorebook.utils.render_template import render_template
 
 __all__ = [

@@ -28,6 +31,8 @@ __all__ = [
     "logout",
     "whoami",
     "InferencePipeline",
+    "create_project",
+    "create_project_async",
     "upload_result",
     "upload_result_async",
 ]
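
The __init__.py hunks above move the dashboard helpers from `scorebook.trismik` to `scorebook.dashboard` and add `create_project` / `create_project_async` to the public API. A sketch of the resulting top-level import surface in 0.0.14 follows; the names are taken from the imports and `__all__` entries shown above, and since `create_project`'s signature is not part of this diff it is only imported, not called.

```python
from scorebook import (
    EvalDataset,
    InferencePipeline,
    create_project,        # new in 0.0.14, from scorebook.dashboard.create_project
    create_project_async,  # new in 0.0.14
    evaluate,
    evaluate_async,
    login,                 # now provided by scorebook.dashboard.credentials
    logout,
    score,
    score_async,
    upload_result,         # now provided by scorebook.dashboard.upload_results
    upload_result_async,
    whoami,
)

# Documented usage from the README: authenticate before adaptive runs or uploads.
login("TRISMIK_API_KEY")
```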

{scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/auth.py
@@ -4,7 +4,7 @@ import argparse
 import getpass
 import sys
 
-from scorebook.
+from scorebook.dashboard.credentials import get_stored_token, get_token_path, login, logout, whoami
 
 
 def auth_command(args: argparse.Namespace) -> int:

scorebook-0.0.14/src/scorebook/dashboard/__init__.py
@@ -0,0 +1 @@
+"""Trismik authentication and API integration."""