scorebook-0.0.12-py3-none-any.whl → scorebook-0.0.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. scorebook/__init__.py +10 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +24 -9
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +22 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +25 -9
  11. scorebook/evaluate/_sync/evaluate.py +25 -9
  12. scorebook/evaluate/evaluate_helpers.py +79 -5
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/__init__.py +1 -18
  17. scorebook/metrics/metric_registry.py +2 -0
  18. scorebook/score/__init__.py +0 -5
  19. scorebook/score/_async/score_async.py +3 -2
  20. scorebook/score/_sync/score.py +3 -2
  21. scorebook/score/score_helpers.py +1 -1
  22. scorebook/types.py +3 -1
  23. scorebook/utils/__init__.py +0 -22
  24. scorebook/utils/common_helpers.py +1 -1
  25. scorebook/utils/mock_llm/__init__.py +41 -0
  26. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  27. scorebook-0.0.14.dist-info/METADATA +292 -0
  28. scorebook-0.0.14.dist-info/RECORD +53 -0
  29. scorebook/trismik/__init__.py +0 -10
  30. scorebook-0.0.12.dist-info/METADATA +0 -389
  31. scorebook-0.0.12.dist-info/RECORD +0 -50
  32. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/WHEEL +0 -0
  33. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/entry_points.txt +0 -0
  34. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ scorebook-0.0.14.dist-info/METADATA
@@ -0,0 +1,292 @@
+ Metadata-Version: 2.4
+ Name: scorebook
+ Version: 0.0.14
+ Summary: A Python project for LLM evaluation.
+ License-File: LICENSE
+ Author: Euan Campbell
+ Author-email: euan@trismik.com
+ Requires-Python: >=3.9, <3.14
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Provides-Extra: bedrock
+ Provides-Extra: examples
+ Provides-Extra: openai
+ Provides-Extra: portkey
+ Provides-Extra: vertex
+ Requires-Dist: accelerate ; extra == "examples"
+ Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
+ Requires-Dist: datasets (>=3.6.0)
+ Requires-Dist: fsspec[gcs] ; extra == "vertex"
+ Requires-Dist: google-cloud-storage ; extra == "vertex"
+ Requires-Dist: google-genai ; extra == "vertex"
+ Requires-Dist: ipywidgets (>=8.0.0)
+ Requires-Dist: notebook (>=7.4.5,<8.0.0)
+ Requires-Dist: notebook ; extra == "examples"
+ Requires-Dist: openai ; extra == "openai"
+ Requires-Dist: pandas ; extra == "vertex"
+ Requires-Dist: portkey-ai ; extra == "portkey"
+ Requires-Dist: python-dotenv ; extra == "bedrock"
+ Requires-Dist: python-dotenv ; extra == "openai"
+ Requires-Dist: python-dotenv ; extra == "portkey"
+ Requires-Dist: python-dotenv ; extra == "vertex"
+ Requires-Dist: torch ; extra == "examples"
+ Requires-Dist: torchaudio ; extra == "examples"
+ Requires-Dist: torchvision ; extra == "examples"
+ Requires-Dist: transformers ; extra == "examples"
+ Requires-Dist: trismik (==1.0.2)
+ Description-Content-Type: text/markdown
+
+ <h1 align="center">Scorebook</h1>
+
+ <p align="center"><strong>A Python library for model evaluation</strong></p>
+
+ <p align="center">
+ <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
+ <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
+ <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
+ </a>
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+ </p>
+
+ Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face, such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics, such as accuracy, precision, and recall, as well as any custom-defined metrics, including LLM-as-a-judge (LLMaJ).
+
+ ## Use Cases
+
+ Scorebook's evaluations can be used for:
+
+ - **Model Benchmarking**: Compare different models on standard datasets.
+ - **Model Optimization**: Find optimal model configurations.
+ - **Iterative Experimentation**: Build reproducible evaluation workflows.
+
+ ## Key Features
+
+ - **Model Agnostic**: Evaluate any model, running locally or deployed in the cloud.
+ - **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
+ - **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own (an illustrative sketch follows the Metrics table below).
+ - **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations (a minimal loop-based sketch follows the classical evaluation example below).
+ - **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
+ - **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
+
+ ## Installation
+
+ ```bash
+ pip install scorebook
+ ```
+
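+ Optional extras are declared in the package metadata above (`openai`, `bedrock`, `portkey`, `vertex`, and `examples`); they can be installed with pip's extras syntax, for example `pip install "scorebook[openai]"`.
+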
+ ## Scoring Model Outputs
+
+ Scorebook's `score` function can be used to evaluate pre-generated model outputs.
+
+ ### Score Example
+ ```python
+ from scorebook import score
+ from scorebook.metrics import Accuracy
+
+ # 1. Prepare a list of generated model outputs and labels
+ model_predictions = [
+     {"input": "What is 2 + 2?", "output": "4", "label": "4"},
+     {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
+     {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
+     {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
+ ]
+
+ # 2. Score the model's predictions against labels using metrics
+ results = score(
+     items = model_predictions,
+     metrics = Accuracy,
+ )
+ ```
+
+ ### Score Results
+ ```json
+ {
+   "aggregate_results": [
+     {
+       "dataset": "scored_items",
+       "accuracy": 0.75
+     }
+   ],
+   "item_results": [
+     {
+       "id": 0,
+       "dataset": "scored_items",
+       "input": "What is 2 + 2?",
+       "output": "4",
+       "label": "4",
+       "accuracy": true
+     }
+     // ... additional items
+   ]
+ }
+ ```
+
+ ## _Classical_ Evaluations
+
+ Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
+
+ ### Classical Evaluation Example
+ ```python
+ from typing import Any, List
+
+ from scorebook import evaluate, EvalDataset
+ from scorebook.metrics import Accuracy
+
+ # 1. Create an evaluation dataset
+ evaluation_items = [
+     {"question": "What is 2 + 2?", "answer": "4"},
+     {"question": "What is the capital of France?", "answer": "Paris"},
+     {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
+ ]
+
+ evaluation_dataset = EvalDataset.from_list(
+     name = "basic_questions",
+     items = evaluation_items,
+     input = "question",
+     label = "answer",
+     metrics = Accuracy,
+ )
+
+ # 2. Define an inference function (pseudocode: Model is a placeholder for your own model or client)
+ def inference_function(inputs: List[Any], **hyperparameters):
+
+     # Create or call a model
+     model = Model()
+     model.temperature = hyperparameters.get("temperature")
+
+     # Call model inference
+     model_outputs = model(inputs)
+
+     # Return outputs
+     return model_outputs
+
+ # 3. Run evaluation
+ evaluation_results = evaluate(
+     inference_function,
+     evaluation_dataset,
+     hyperparameters = {"temperature": 0.7}
+ )
+ ```
+
+ ### Evaluation Results
+ ```json
+ {
+   "aggregate_results": [
+     {
+       "dataset": "basic_questions",
+       "temperature": 0.7,
+       "accuracy": 1.0,
+       "run_completed": true
+     }
+   ],
+   "item_results": [
+     {
+       "id": 0,
+       "dataset": "basic_questions",
+       "input": "What is 2 + 2?",
+       "output": "4",
+       "label": "4",
+       "temperature": 0.7,
+       "accuracy": true
+     }
+     // ... additional items
+   ]
+ }
+ ```
+
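+ The Key Features list mentions hyperparameter sweeping. Whether `evaluate` accepts several configurations in a single call is not shown in this README, so the sketch below (an editor's illustration, not Scorebook's documented sweep API) simply loops over configurations using the same `evaluate` call as above; the temperature values are arbitrary examples.
+
+ ```python
+ # Minimal sweep sketch: call evaluate once per configuration, reusing
+ # inference_function and evaluation_dataset from the classical example above.
+ # The temperature values below are illustrative only.
+ sweep_results = []
+ for temperature in (0.0, 0.3, 0.7, 1.0):
+     run = evaluate(
+         inference_function,
+         evaluation_dataset,
+         hyperparameters = {"temperature": temperature},
+     )
+     sweep_results.append((temperature, run))
+ ```
+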
+ ## _Adaptive_ Evaluations with `evaluate`
+
+ To run an adaptive evaluation, use a Trismik adaptive dataset. The CAT (computerized adaptive testing) algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error and the fewest questions.
+
+ ### Adaptive Evaluation Example
+ ```python
+ from typing import Any, List
+
+ from scorebook import evaluate, login
+
+ # 1. Log in with your Trismik API key
+ login("TRISMIK_API_KEY")
+
+ # 2. Define an inference function (Model is a placeholder, as in the classical example)
+ def inference_function(inputs: List[Any], **hyperparameters):
+
+     # Create or call a model
+     model = Model()
+
+     # Call model inference
+     outputs = model(inputs)
+
+     # Return outputs
+     return outputs
+
+ # 3. Run an adaptive evaluation
+ results = evaluate(
+     inference_function,
+     datasets = "trismik/headQA:adaptive",    # Adaptive datasets have the ":adaptive" suffix
+     project_id = "TRISMIK_PROJECT_ID",       # Required: create a project on your Trismik dashboard
+     experiment_id = "TRISMIK_EXPERIMENT_ID", # Optional: an identifier to upload this run under
+ )
+ ```
+
+ ### Adaptive Evaluation Results
+ ```json
+ {
+   "aggregate_results": [
+     {
+       "dataset": "trismik/headQA:adaptive",
+       "experiment_id": "TRISMIK_EXPERIMENT_ID",
+       "project_id": "TRISMIK_PROJECT_ID",
+       "run_id": "RUN_ID",
+       "score": {
+         "theta": 1.2,
+         "std_error": 0.20
+       },
+       "responses": null
+     }
+   ],
+   "item_results": []
+ }
+ ```
+
+ ## Metrics
+
+ | Metric     | Sync/Async | Aggregate Scores                        | Item Scores                                      |
+ |------------|------------|-----------------------------------------|--------------------------------------------------|
+ | `Accuracy` | Sync       | `Float`: Proportion of correct outputs  | `Boolean`: Exact match between output and label  |
+
+
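+ The Key Features list also mentions implementing custom metrics. Scorebook's actual metric interface (see `scorebook/metrics/metric_base.py`) is not documented in this README, so the following is only a self-contained, illustrative sketch; the class shape and method names are the editor's assumptions, not Scorebook's API.
+
+ ```python
+ # Illustrative custom metric: exact match after normalizing case and whitespace.
+ # Standalone sketch only - it does not implement Scorebook's real metric
+ # interface, whose base class and method names may differ.
+ from typing import Any, List
+
+ class NormalizedAccuracy:
+     name = "normalized_accuracy"
+
+     def item_scores(self, outputs: List[Any], labels: List[Any]) -> List[bool]:
+         # One boolean per item, analogous to Accuracy's item scores above.
+         return [
+             str(output).strip().lower() == str(label).strip().lower()
+             for output, label in zip(outputs, labels)
+         ]
+
+     def aggregate(self, item_scores: List[bool]) -> float:
+         # Proportion of correct items, analogous to Accuracy's aggregate score.
+         return sum(item_scores) / len(item_scores) if item_scores else 0.0
+
+ metric = NormalizedAccuracy()
+ per_item = metric.item_scores(["Paris ", "london"], ["paris", "Paris"])
+ print(per_item, metric.aggregate(per_item))  # [True, False] 0.5
+ ```
+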
+ ## Tutorials
+
+ For more detailed, locally runnable examples, install the examples extra:
+ ```bash
+ pip install scorebook[examples]
+ ```
+
+ The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
+
+ - **`tutorials/notebooks`**: Interactive Jupyter notebooks showcasing Scorebook's capabilities.
+ - **`tutorials/examples`**: Runnable Python examples that introduce Scorebook's features incrementally.
+
+ **Run a notebook:**
+ ```bash
+ jupyter notebook tutorials/notebooks
+ ```
+
+ **Run an example:**
+ ```bash
+ python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
+ ```
+
+ ## Contributing
+
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## About
+
+ Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
+
--- /dev/null
+++ scorebook-0.0.14.dist-info/RECORD
@@ -0,0 +1,53 @@
+ scorebook/__init__.py,sha256=S2JaZZsx76p0EjXtKz4UPdSzuO60jAjOvooYP-idBu8,1144
+ scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+ scorebook/cli/auth.py,sha256=VGS5T0CSeS0n_7bntNggrYx-vDwxJJHdYxbKedFAq74,2939
+ scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+ scorebook/dashboard/__init__.py,sha256=36DxO3oXVcZ2I6kizLFCcJkLBpXOU8UIXFT_ZjeFTB4,50
+ scorebook/dashboard/create_project.py,sha256=RK90aMN0_XVM-DnawTY_b59yPJaRnpb_GoidCqXB5Vw,2845
+ scorebook/dashboard/credentials.py,sha256=Q_khY5AX3fnyWshHe6LaesBHcCmNBse6a_XFGT8OOaw,3474
+ scorebook/dashboard/upload_results.py,sha256=sdgOEf0C7QLt7t2QiXvSoceQpAiiPmlG_4SFEEzVPlc,9738
+ scorebook/eval_datasets/__init__.py,sha256=wsmFNyuZJdBxjokcKG4NRfuUzPZKuzsKX3aG21zfFV4,39
+ scorebook/eval_datasets/eval_dataset.py,sha256=xnG7VaceWUmg8Wrk2IGnVFZs9umzmZrW8F7THvtWMqs,28041
+ scorebook/evaluate/__init__.py,sha256=Qqe-l4y3Nu81Fdx83RbtCQESoXC0XukBgOC3DPSWZZA,39
+ scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scorebook/evaluate/_async/evaluate_async.py,sha256=G0RB_A1f5mQ42D82DnxkzAZhyV5kgbxi9Lr7qKaKUyY,16590
+ scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scorebook/evaluate/_sync/evaluate.py,sha256=OIUsW2U1IrdwYIIPsfpTCOfJDAYJ6BYl-6pQQiafSNE,16364
+ scorebook/evaluate/evaluate_helpers.py,sha256=NnanxLEeHwoZNztGXQJc6u_WqKfDkn1vYmck2BrKF-c,17028
+ scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
+ scorebook/inference/__init__.py,sha256=gGuZG1rdpxKYC54q0eAS6oTHQbRYhgxlBeAqonqHvRU,60
+ scorebook/inference/clients/__init__.py,sha256=VaLW7mi4tywJtR3Q9wr2pPci8NlEQ3bJanZyM5S81Z4,51
+ scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
+ scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
+ scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
+ scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
+ scorebook/inference/inference_pipeline.py,sha256=1qSmfI4fBJFS3EcAhRlA-f4-8aI6wDiupSJu-vNXoYI,5571
+ scorebook/metrics/__init__.py,sha256=bsEq15LpFt3h0AQQFbnvL4CU7KpIpifVdJAsfduPGXk,48
+ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+ scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+ scorebook/metrics/metric_registry.py,sha256=YcbKGf2kPMQqyqJ9NYVq_-J19rARXSo22HjTW5WU-QU,3404
+ scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+ scorebook/score/__init__.py,sha256=CqkslUvOw8QfCCbSfwZgGrbmXeSLpZqIVo4ntrctYuY,66
+ scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scorebook/score/_async/score_async.py,sha256=SatV9hEUT8MAru2ACSyM03weKX6VTFx7crW59_uX0L8,6155
+ scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scorebook/score/_sync/score.py,sha256=nANQbuyYyIaWnoTQzyGMwPZRMFP6MmyIyHb1GO1mktQ,6101
+ scorebook/score/score_helpers.py,sha256=Gjx2Lgd94ISvunb5CHj-tDWYVEOVj9ySjjVYnnhpk_Q,7086
+ scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
+ scorebook/types.py,sha256=2lv1YUky7aDGIEPjgj18aKTpBMdmqD01TKLbwli19pQ,4904
+ scorebook/utils/__init__.py,sha256=oBTybVHI5EdHIgzb0TeoAnSLMQdUh20Ww6vcL9542Pk,72
+ scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+ scorebook/utils/common_helpers.py,sha256=lJIqO9XGf1T3S3rdGBTjZJ1BzVPvaU_XTONEfPApnEM,1218
+ scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
+ scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+ scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+ scorebook/utils/mock_llm/__init__.py,sha256=dK70wNVBKk4hv1o3fceDTBG1_maFbkMvoOtTriPCe78,1293
+ scorebook/utils/mock_llm/data/mock_llm_data.json,sha256=b28j7OCR0igpP0rkXDJAR2NWIiuVkOaAkzB-Miv665Y,381567
+ scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
+ scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
+ scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+ scorebook-0.0.14.dist-info/METADATA,sha256=jPqVszfpCiAKf3yt45XD6lXfIJL1-TFvSMDVGrIoCPs,9491
+ scorebook-0.0.14.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ scorebook-0.0.14.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+ scorebook-0.0.14.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+ scorebook-0.0.14.dist-info/RECORD,,
--- scorebook/trismik/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
- """Trismik authentication and API integration.
-
- Note: Trismik evaluation functionality has been moved to scorebook.evaluate module.
- This module now only provides authentication functions.
- """
-
- # Import shared credential functions
- from .credentials import get_stored_token, get_token, login, logout, whoami
-
- __all__ = ["login", "logout", "whoami", "get_stored_token", "get_token"]