scorebook 0.0.13__tar.gz → 0.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. scorebook-0.0.14/PKG-INFO +292 -0
  2. scorebook-0.0.14/README.md +249 -0
  3. {scorebook-0.0.13 → scorebook-0.0.14}/pyproject.toml +1 -27
  4. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/__init__.py +10 -5
  5. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/auth.py +1 -1
  6. scorebook-0.0.14/src/scorebook/dashboard/__init__.py +1 -0
  7. scorebook-0.0.14/src/scorebook/dashboard/create_project.py +91 -0
  8. {scorebook-0.0.13/src/scorebook/trismik → scorebook-0.0.14/src/scorebook/dashboard}/credentials.py +24 -9
  9. {scorebook-0.0.13/src/scorebook/trismik → scorebook-0.0.14/src/scorebook/dashboard}/upload_results.py +1 -1
  10. scorebook-0.0.14/src/scorebook/eval_datasets/__init__.py +1 -0
  11. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/eval_datasets/eval_dataset.py +4 -2
  12. scorebook-0.0.14/src/scorebook/evaluate/__init__.py +1 -0
  13. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_async/evaluate_async.py +9 -8
  14. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_sync/evaluate.py +9 -8
  15. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/evaluate_helpers.py +4 -3
  16. scorebook-0.0.14/src/scorebook/inference/__init__.py +1 -0
  17. scorebook-0.0.14/src/scorebook/inference/clients/__init__.py +1 -0
  18. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/inference_pipeline.py +1 -1
  19. scorebook-0.0.14/src/scorebook/metrics/__init__.py +1 -0
  20. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/metric_registry.py +2 -0
  21. scorebook-0.0.14/src/scorebook/score/__init__.py +1 -0
  22. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_async/score_async.py +3 -2
  23. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_sync/score.py +3 -2
  24. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/score_helpers.py +1 -1
  25. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/types.py +1 -1
  26. scorebook-0.0.14/src/scorebook/utils/__init__.py +1 -0
  27. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/common_helpers.py +1 -1
  28. scorebook-0.0.14/src/scorebook/utils/mock_llm/__init__.py +41 -0
  29. scorebook-0.0.14/src/scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  30. scorebook-0.0.13/PKG-INFO +0 -389
  31. scorebook-0.0.13/README.md +0 -346
  32. scorebook-0.0.13/src/scorebook/eval_datasets/__init__.py +0 -5
  33. scorebook-0.0.13/src/scorebook/evaluate/__init__.py +0 -15
  34. scorebook-0.0.13/src/scorebook/inference/__init__.py +0 -11
  35. scorebook-0.0.13/src/scorebook/inference/clients/__init__.py +0 -8
  36. scorebook-0.0.13/src/scorebook/metrics/__init__.py +0 -18
  37. scorebook-0.0.13/src/scorebook/score/__init__.py +0 -6
  38. scorebook-0.0.13/src/scorebook/trismik/__init__.py +0 -10
  39. scorebook-0.0.13/src/scorebook/utils/__init__.py +0 -23
  40. {scorebook-0.0.13 → scorebook-0.0.14}/LICENSE +0 -0
  41. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/__init__.py +0 -0
  42. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/cli/main.py +0 -0
  43. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_async/__init__.py +0 -0
  44. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  45. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/exceptions.py +0 -0
  46. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/bedrock.py +0 -0
  47. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/openai.py +0 -0
  48. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/portkey.py +0 -0
  49. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/inference/clients/vertex.py +0 -0
  50. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/accuracy.py +0 -0
  51. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/metric_base.py +0 -0
  52. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/metrics/precision.py +0 -0
  53. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_async/__init__.py +0 -0
  54. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/score/_sync/__init__.py +0 -0
  55. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/settings.py +0 -0
  56. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/async_utils.py +0 -0
  57. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/io_helpers.py +0 -0
  58. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/jinja_helpers.py +0 -0
  59. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/mappers.py +0 -0
  60. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/progress_bars.py +0 -0
  61. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/render_template.py +0 -0
  62. {scorebook-0.0.13 → scorebook-0.0.14}/src/scorebook/utils/transform_helpers.py +0 -0
@@ -0,0 +1,292 @@
1
+ Metadata-Version: 2.4
2
+ Name: scorebook
3
+ Version: 0.0.14
4
+ Summary: A Python project for LLM evaluation.
5
+ License-File: LICENSE
6
+ Author: Euan Campbell
7
+ Author-email: euan@trismik.com
8
+ Requires-Python: >=3.9, <3.14
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Provides-Extra: bedrock
16
+ Provides-Extra: examples
17
+ Provides-Extra: openai
18
+ Provides-Extra: portkey
19
+ Provides-Extra: vertex
20
+ Requires-Dist: accelerate ; extra == "examples"
21
+ Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
22
+ Requires-Dist: datasets (>=3.6.0)
23
+ Requires-Dist: fsspec[gcs] ; extra == "vertex"
24
+ Requires-Dist: google-cloud-storage ; extra == "vertex"
25
+ Requires-Dist: google-genai ; extra == "vertex"
26
+ Requires-Dist: ipywidgets (>=8.0.0)
27
+ Requires-Dist: notebook (>=7.4.5,<8.0.0)
28
+ Requires-Dist: notebook ; extra == "examples"
29
+ Requires-Dist: openai ; extra == "openai"
30
+ Requires-Dist: pandas ; extra == "vertex"
31
+ Requires-Dist: portkey-ai ; extra == "portkey"
32
+ Requires-Dist: python-dotenv ; extra == "bedrock"
33
+ Requires-Dist: python-dotenv ; extra == "openai"
34
+ Requires-Dist: python-dotenv ; extra == "portkey"
35
+ Requires-Dist: python-dotenv ; extra == "vertex"
36
+ Requires-Dist: torch ; extra == "examples"
37
+ Requires-Dist: torchaudio ; extra == "examples"
38
+ Requires-Dist: torchvision ; extra == "examples"
39
+ Requires-Dist: transformers ; extra == "examples"
40
+ Requires-Dist: trismik (==1.0.2)
41
+ Description-Content-Type: text/markdown
42
+
43
+ <h1 align="center">Scorebook</h1>
44
+
45
+ <p align="center"><strong>A Python library for model evaluation</strong></p>
46
+
47
+ <p align="center">
48
+ <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
49
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
50
+ <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
51
+ <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
52
+ </a>
53
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
54
+ </p>
55
+
56
+ Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face, such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics, such as accuracy, precision, and recall, as well as custom-defined metrics, including LLM-as-a-judge (LLMaJ).
57
+
58
+ ## Use Cases
59
+
60
+ Scorebook's evaluations can be used for:
61
+
62
+ - **Model Benchmarking**: Compare different models on standard datasets.
63
+ - **Model Optimization**: Find optimal model configurations.
64
+ - **Iterative Experimentation**: Reproducible evaluation workflows.
65
+
66
+ ## Key Features
67
+
68
+ - **Model Agnostic**: Evaluate any model, running locally or deployed on the cloud.
69
+ - **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
70
+ - **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
71
+ - **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
72
+ - **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
73
+ - **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
74
+
75
+ ## Installation
76
+
77
+ ```bash
78
+ pip install scorebook
79
+ ```
80
+
81
+ ## Scoring Model Outputs
82
+
83
+ Scorebook's `score` function can be used to evaluate pre-generated model outputs.
84
+
85
+ ### Score Example
86
+ ```python
87
+ from scorebook import score
88
+ from scorebook.metrics import Accuracy
89
+
90
+ # 1. Prepare a list of generated model outputs and labels
91
+ model_predictions = [
92
+ {"input": "What is 2 + 2?", "output": "4", "label": "4"},
93
+ {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
94
+ {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
95
+ {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
96
+ ]
97
+
98
+ # 2. Score the model's predictions against labels using metrics
99
+ results = score(
100
+ items = model_predictions,
101
+ metrics = Accuracy,
102
+ )
103
+ ```
104
+
105
+ ### Score Results:
106
+ ```json
107
+ {
108
+ "aggregate_results": [
109
+ {
110
+ "dataset": "scored_items",
111
+ "accuracy": 0.75
112
+ }
113
+ ],
114
+ "item_results": [
115
+ {
116
+ "id": 0,
117
+ "dataset": "scored_items",
118
+ "input": "What is 2 + 2?",
119
+ "output": "4",
120
+ "label": "4",
121
+ "accuracy": true
122
+ }
123
+ // ... additional items
124
+ ]
125
+ }
126
+ ```
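+
+ The returned results can be read directly in Python. A minimal sketch, assuming `results` is a dictionary shaped like the JSON above (the exact return type of `score` may differ):
+
+ ```python
+ # Assumption: `results` mirrors the JSON structure shown above.
+ aggregate = results["aggregate_results"][0]
+ print(f"{aggregate['dataset']} accuracy: {aggregate['accuracy']:.2f}")
+
+ # Collect the items the model got wrong for closer inspection.
+ misses = [item for item in results["item_results"] if not item["accuracy"]]
+ for item in misses:
+     print(f"{item['input']!r}: got {item['output']!r}, expected {item['label']!r}")
+ ```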
127
+
128
+ ## _Classical_ Evaluations
129
+
130
+ Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
131
+
132
+ ### Classical Evaluation Example
133
+ ```python
134
+ from typing import Any, List
+
+ from scorebook import evaluate, EvalDataset
135
+ from scorebook.metrics import Accuracy
136
+
137
+ # 1. Create an evaluation dataset
138
+ evaluation_items = [
139
+ {"question": "What is 2 + 2?", "answer": "4"},
140
+ {"question": "What is the capital of France?", "answer": "Paris"},
141
+ {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
142
+ ]
143
+
144
+ evaluation_dataset = EvalDataset.from_list(
145
+ name = "basic_questions",
146
+ items = evaluation_items,
147
+ input = "question",
148
+ label = "answer",
149
+ metrics = Accuracy,
150
+ )
151
+
152
+ # 2. Define an inference function - This is a pseudocode example
153
+ def inference_function(inputs: List[Any], **hyperparameters):
154
+
155
+ # Create or call a model
156
+ model = Model()
157
+ model.temperature = hyperparameters.get("temperature")
158
+
159
+ # Call model inference
160
+ model_outputs = model(inputs)
161
+
162
+ # Return outputs
163
+ return model_outputs
164
+
165
+ # 3. Run evaluation
166
+ evaluation_results = evaluate(
167
+ inference_function,
168
+ evaluation_dataset,
169
+ hyperparameters = {"temperature": 0.7}
170
+ )
171
+ ```
172
+
173
+ ### Evaluation Results:
174
+ ```json
175
+ {
176
+ "aggregate_results": [
177
+ {
178
+ "dataset": "basic_questions",
179
+ "temperature": 0.7,
180
+ "accuracy": 1.0,
181
+ "run_completed": true
182
+ }
183
+ ],
184
+ "item_results": [
185
+ {
186
+ "id": 0,
187
+ "dataset": "basic_questions",
188
+ "input": "What is 2 + 2?",
189
+ "output": "4",
190
+ "label": "4",
191
+ "temperature": 0.7,
192
+ "accuracy": true
193
+ }
194
+ // ... additional items
195
+ ]
196
+ }
197
+ ```
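+
+ The inference function in the example above is pseudocode. As a concrete illustration only, here is a sketch of one way it might call the OpenAI Python client; this is not part of Scorebook's API, it assumes `pip install scorebook[openai]` and an `OPENAI_API_KEY` in the environment, and the default model name is illustrative:
+
+ ```python
+ from typing import Any, List
+
+ from openai import OpenAI
+
+ client = OpenAI()  # reads OPENAI_API_KEY from the environment
+
+ def inference_function(inputs: List[Any], **hyperparameters) -> List[str]:
+     """Return one generated output string per input item."""
+     outputs = []
+     for item in inputs:
+         response = client.chat.completions.create(
+             model=hyperparameters.get("model", "gpt-4o-mini"),  # illustrative default
+             temperature=hyperparameters.get("temperature", 0.0),
+             messages=[{"role": "user", "content": str(item)}],
+         )
+         outputs.append(response.choices[0].message.content)
+     return outputs
+ ```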
198
+
199
+ ## _Adaptive_ Evaluations
200
+
201
+ To run an adaptive evaluation, use a Trismik adaptive dataset.
202
+ The computerized adaptive testing (CAT) algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error in as few questions as possible.
203
+
204
+ ### Adaptive Evaluation Example
205
+ ```python
206
+ from typing import Any, List
+
+ from scorebook import evaluate, login
207
+
208
+ # 1. Log in with your Trismik API key
209
+ login("TRISMIK_API_KEY")
210
+
211
+ # 2. Define an inference function - This is a pseudocode example
212
+ def inference_function(inputs: List[Any], **hyperparameters):
213
+
214
+ # Create or call a model
215
+ model = Model()
216
+
217
+ # Call model inference
218
+ outputs = model(inputs)
219
+
220
+ # Return outputs
221
+ return outputs
222
+
223
+ # 3. Run an adaptive evaluation
224
+ results = evaluate(
225
+ inference_function,
226
+ datasets = "trismik/headQA:adaptive", # Adaptive datasets have the ":adaptive" suffix
227
+ project_id = "TRISMIK_PROJECT_ID", # Required: Create a project on your Trismik dashboard
228
+ experiment_id = "TRISMIK_EXPERIMENT_ID", # Optional: An identifier to upload this run under
229
+ )
230
+ ```
231
+
232
+ ### Adaptive Evaluation Results
233
+ ```json
234
+ {
235
+ "aggregate_results": [
236
+ {
237
+ "dataset": "trismik/headQA:adaptive",
238
+ "experiment_id": "TRISMIK_EXPERIMENT_ID",
239
+ "project_id": "TRISMIK_PROJECT_ID",
240
+ "run_id": "RUN_ID",
241
+ "score": {
242
+ "theta": 1.2,
243
+ "std_error": 0.20
244
+ },
245
+ "responses": null
246
+ }
247
+ ],
248
+ "item_results": []
249
+ }
250
+ ```
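+
+ The ability estimate can be read the same way. A minimal sketch, again assuming the return value mirrors the JSON above:
+
+ ```python
+ # Assumption: `results` mirrors the adaptive-result JSON shown above.
+ run = results["aggregate_results"][0]
+ print(f"theta = {run['score']['theta']:.2f} ± {run['score']['std_error']:.2f} (run {run['run_id']})")
+ ```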
251
+
252
+ ## Metrics
253
+
254
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
255
+ |------------|------------|--------------------------------------------------|-----------------------------------------|
256
+ | `Accuracy` | Sync | `Float`: Proportion of correct outputs | `Boolean`: Exact match between output and label |
257
+
258
+
259
+ ## Tutorials
260
+
261
+ For more detailed, locally runnable examples, install the examples extras:
262
+ ```bash
263
+ pip install scorebook[examples]
264
+ ```
265
+
266
+ The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
267
+
268
+ - **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
269
+ - **`tutorials/examples`**: Runnable Python examples incrementally implementing Scorebook's features.
270
+
271
+ **Run a notebook:**
272
+ ```bash
273
+ jupyter notebook tutorials/notebooks
274
+ ```
275
+
276
+ **Run an example:**
277
+ ```bash
278
+ python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
279
+ ```
280
+
281
+ ## Contributing
282
+
283
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
284
+
285
+ ## License
286
+
287
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
288
+
289
+ ## About
290
+
291
+ Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
292
+
@@ -0,0 +1,249 @@
1
+ <h1 align="center">Scorebook</h1>
2
+
3
+ <p align="center"><strong>A Python library for model evaluation</strong></p>
4
+
5
+ <p align="center">
6
+ <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
7
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
8
+ <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
9
+ <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
10
+ </a>
11
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
12
+ </p>
13
+
14
+ Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face, such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics, such as accuracy, precision, and recall, as well as custom-defined metrics, including LLM-as-a-judge (LLMaJ).
15
+
16
+ ## Use Cases
17
+
18
+ Scorebook's evaluations can be used for:
19
+
20
+ - **Model Benchmarking**: Compare different models on standard datasets.
21
+ - **Model Optimization**: Find optimal model configurations.
22
+ - **Iterative Experimentation**: Reproducible evaluation workflows.
23
+
24
+ ## Key Features
25
+
26
+ - **Model Agnostic**: Evaluate any model, running locally or deployed on the cloud.
27
+ - **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
28
+ - **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
29
+ - **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
30
+ - **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
31
+ - **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install scorebook
37
+ ```
38
+
39
+ ## Scoring Model Outputs
40
+
41
+ Scorebook's `score` function can be used to evaluate pre-generated model outputs.
42
+
43
+ ### Score Example
44
+ ```python
45
+ from scorebook import score
46
+ from scorebook.metrics import Accuracy
47
+
48
+ # 1. Prepare a list of generated model outputs and labels
49
+ model_predictions = [
50
+ {"input": "What is 2 + 2?", "output": "4", "label": "4"},
51
+ {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
52
+ {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
53
+ {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
54
+ ]
55
+
56
+ # 2. Score the model's predictions against labels using metrics
57
+ results = score(
58
+ items = model_predictions,
59
+ metrics = Accuracy,
60
+ )
61
+ ```
62
+
63
+ ### Score Results:
64
+ ```json
65
+ {
66
+ "aggregate_results": [
67
+ {
68
+ "dataset": "scored_items",
69
+ "accuracy": 0.75
70
+ }
71
+ ],
72
+ "item_results": [
73
+ {
74
+ "id": 0,
75
+ "dataset": "scored_items",
76
+ "input": "What is 2 + 2?",
77
+ "output": "4",
78
+ "label": "4",
79
+ "accuracy": true
80
+ }
81
+ // ... additional items
82
+ ]
83
+ }
84
+ ```
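+
+ The returned results can be read directly in Python. A minimal sketch, assuming `results` is a dictionary shaped like the JSON above (the exact return type of `score` may differ):
+
+ ```python
+ # Assumption: `results` mirrors the JSON structure shown above.
+ aggregate = results["aggregate_results"][0]
+ print(f"{aggregate['dataset']} accuracy: {aggregate['accuracy']:.2f}")
+
+ # Collect the items the model got wrong for closer inspection.
+ misses = [item for item in results["item_results"] if not item["accuracy"]]
+ for item in misses:
+     print(f"{item['input']!r}: got {item['output']!r}, expected {item['label']!r}")
+ ```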
85
+
86
+ ## _Classical_ Evaluations
87
+
88
+ Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
89
+
90
+ ### Classical Evaluation Example
91
+ ```python
92
+ from typing import Any, List
+
+ from scorebook import evaluate, EvalDataset
93
+ from scorebook.metrics import Accuracy
94
+
95
+ # 1. Create an evaluation dataset
96
+ evaluation_items = [
97
+ {"question": "What is 2 + 2?", "answer": "4"},
98
+ {"question": "What is the capital of France?", "answer": "Paris"},
99
+ {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
100
+ ]
101
+
102
+ evaluation_dataset = EvalDataset.from_list(
103
+ name = "basic_questions",
104
+ items = evaluation_items,
105
+ input = "question",
106
+ label = "answer",
107
+ metrics = Accuracy,
108
+ )
109
+
110
+ # 2. Define an inference function - This is a pseudocode example
111
+ def inference_function(inputs: List[Any], **hyperparameters):
112
+
113
+ # Create or call a model
114
+ model = Model()
115
+ model.temperature = hyperparameters.get("temperature")
116
+
117
+ # Call model inference
118
+ model_outputs = model(inputs)
119
+
120
+ # Return outputs
121
+ return model_outputs
122
+
123
+ # 3. Run evaluation
124
+ evaluation_results = evaluate(
125
+ inference_function,
126
+ evaluation_dataset,
127
+ hyperparameters = {"temperature": 0.7}
128
+ )
129
+ ```
130
+
131
+ ### Evaluation Results:
132
+ ```json
133
+ {
134
+ "aggregate_results": [
135
+ {
136
+ "dataset": "basic_questions",
137
+ "temperature": 0.7,
138
+ "accuracy": 1.0,
139
+ "run_completed": true
140
+ }
141
+ ],
142
+ "item_results": [
143
+ {
144
+ "id": 0,
145
+ "dataset": "basic_questions",
146
+ "input": "What is 2 + 2?",
147
+ "output": "4",
148
+ "label": "4",
149
+ "temperature": 0.7,
150
+ "accuracy": true
151
+ }
152
+ // ... additional items
153
+ ]
154
+ }
155
+ ```
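+
+ The inference function in the example above is pseudocode. As a concrete illustration only, here is a sketch of one way it might call the OpenAI Python client; this is not part of Scorebook's API, it assumes `pip install scorebook[openai]` and an `OPENAI_API_KEY` in the environment, and the default model name is illustrative:
+
+ ```python
+ from typing import Any, List
+
+ from openai import OpenAI
+
+ client = OpenAI()  # reads OPENAI_API_KEY from the environment
+
+ def inference_function(inputs: List[Any], **hyperparameters) -> List[str]:
+     """Return one generated output string per input item."""
+     outputs = []
+     for item in inputs:
+         response = client.chat.completions.create(
+             model=hyperparameters.get("model", "gpt-4o-mini"),  # illustrative default
+             temperature=hyperparameters.get("temperature", 0.0),
+             messages=[{"role": "user", "content": str(item)}],
+         )
+         outputs.append(response.choices[0].message.content)
+     return outputs
+ ```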
156
+
157
+ ## _Adaptive_ Evaluations
158
+
159
+ To run an adaptive evaluation, use a Trismik adaptive dataset.
160
+ The computerized adaptive testing (CAT) algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error in as few questions as possible.
161
+
162
+ ### Adaptive Evaluation Example
163
+ ```python
164
+ from typing import Any, List
+
+ from scorebook import evaluate, login
165
+
166
+ # 1. Log in with your Trismik API key
167
+ login("TRISMIK_API_KEY")
168
+
169
+ # 2. Define an inference function - This is a pseudocode example
170
+ def inference_function(inputs: List[Any], **hyperparameters):
171
+
172
+ # Create or call a model
173
+ model = Model()
174
+
175
+ # Call model inference
176
+ outputs = model(inputs)
177
+
178
+ # Return outputs
179
+ return outputs
180
+
181
+ # 3. Run an adaptive evaluation
182
+ results = evaluate(
183
+ inference_function,
184
+ datasets = "trismik/headQA:adaptive", # Adaptive datasets have the ":adaptive" suffix
185
+ project_id = "TRISMIK_PROJECT_ID", # Required: Create a project on your Trismik dashboard
186
+ experiment_id = "TRISMIK_EXPERIMENT_ID", # Optional: An identifier to upload this run under
187
+ )
188
+ ```
189
+
190
+ ### Adaptive Evaluation Results
191
+ ```json
192
+ {
193
+ "aggregate_results": [
194
+ {
195
+ "dataset": "trismik/headQA:adaptive",
196
+ "experiment_id": "TRISMIK_EXPERIMENT_ID",
197
+ "project_id": "TRISMIK_PROJECT_ID",
198
+ "run_id": "RUN_ID",
199
+ "score": {
200
+ "theta": 1.2,
201
+ "std_error": 0.20
202
+ },
203
+ "responses": null
204
+ }
205
+ ],
206
+ "item_results": []
207
+ }
208
+ ```
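+
+ The ability estimate can be read the same way. A minimal sketch, again assuming the return value mirrors the JSON above:
+
+ ```python
+ # Assumption: `results` mirrors the adaptive-result JSON shown above.
+ run = results["aggregate_results"][0]
+ print(f"theta = {run['score']['theta']:.2f} ± {run['score']['std_error']:.2f} (run {run['run_id']})")
+ ```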
209
+
210
+ ## Metrics
211
+
212
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
213
+ |------------|------------|--------------------------------------------------|-----------------------------------------|
214
+ | `Accuracy` | Sync | `Float`: Proportion of correct outputs | `Boolean`: Exact match between output and label |
215
+
216
+
217
+ ## Tutorials
218
+
219
+ For more detailed, locally runnable examples, install the examples extras:
220
+ ```bash
221
+ pip install scorebook[examples]
222
+ ```
223
+
224
+ The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
225
+
226
+ - **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
227
+ - **`tutorials/examples`**: Runnable Python examples incrementally implementing Scorebook's features.
228
+
229
+ **Run a notebook:**
230
+ ```bash
231
+ jupyter notebook tutorials/notebooks
232
+ ```
233
+
234
+ **Run an example:**
235
+ ```bash
236
+ python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
237
+ ```
238
+
239
+ ## Contributing
240
+
241
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
242
+
243
+ ## License
244
+
245
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
246
+
247
+ ## About
248
+
249
+ Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
@@ -19,37 +19,11 @@ dependencies = [
19
19
  scorebook = "scorebook.cli.main:main"
20
20
 
21
21
  [tool.poetry]
22
- version = "0.0.13" # base version
22
+ version = "0.0.14" # base version
23
23
  packages = [{ include = "scorebook", from = "src" }]
24
24
 
25
25
  [tool.poetry.dependencies]
26
26
  python = ">=3.9,<3.14"
27
- datasets = ">=3.6.0"
28
- notebook = ">=7.4.5,<8.0.0"
29
- trismik = "1.0.2"
30
- ipywidgets = ">=8.0.0"
31
-
32
- # Optional dependencies
33
- openai = {version = "*", optional = true}
34
- python-dotenv = {version = "*", optional = true}
35
- portkey-ai = {version = "*", optional = true}
36
- boto3 = {version = "1.40.0", optional = true}
37
- google-genai = {version = "*", optional = true}
38
- pandas = {version = "*", optional = true}
39
- google-cloud-storage = {version = "*", optional = true}
40
- fsspec = {version = "*", extras = ["gcs"], optional = true}
41
- transformers = {version = "*", optional = true}
42
- torch = {version = "*", optional = true}
43
- torchvision = {version = "*", optional = true}
44
- torchaudio = {version = "*", optional = true}
45
- accelerate = {version = "*", optional = true}
46
-
47
- [tool.poetry.extras]
48
- openai = ["openai", "python-dotenv"]
49
- portkey = ["portkey-ai", "python-dotenv"]
50
- bedrock = ["boto3", "python-dotenv"]
51
- vertex = ["google-genai", "pandas", "google-cloud-storage", "fsspec", "python-dotenv"]
52
- examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook"]
53
27
 
54
28
  [[tool.poetry.source]]
55
29
  name = "testpypi"
@@ -9,12 +9,15 @@ import importlib.metadata
9
9
  # get version from pyproject.toml
10
10
  __version__ = importlib.metadata.version(__package__ or __name__)
11
11
 
12
- from scorebook.eval_datasets import EvalDataset
13
- from scorebook.evaluate import evaluate, evaluate_async
12
+ from scorebook.dashboard.create_project import create_project, create_project_async
13
+ from scorebook.dashboard.credentials import login, logout, whoami
14
+ from scorebook.dashboard.upload_results import upload_result, upload_result_async
15
+ from scorebook.eval_datasets.eval_dataset import EvalDataset
16
+ from scorebook.evaluate._async.evaluate_async import evaluate_async
17
+ from scorebook.evaluate._sync.evaluate import evaluate
14
18
  from scorebook.inference.inference_pipeline import InferencePipeline
15
- from scorebook.score import score, score_async
16
- from scorebook.trismik.credentials import login, logout, whoami
17
- from scorebook.trismik.upload_results import upload_result, upload_result_async
19
+ from scorebook.score._async.score_async import score_async
20
+ from scorebook.score._sync.score import score
18
21
  from scorebook.utils.render_template import render_template
19
22
 
20
23
  __all__ = [
@@ -28,6 +31,8 @@ __all__ = [
28
31
  "logout",
29
32
  "whoami",
30
33
  "InferencePipeline",
34
+ "create_project",
35
+ "create_project_async",
31
36
  "upload_result",
32
37
  "upload_result_async",
33
38
  ]
@@ -4,7 +4,7 @@ import argparse
4
4
  import getpass
5
5
  import sys
6
6
 
7
- from scorebook.trismik.credentials import get_stored_token, get_token_path, login, logout, whoami
7
+ from scorebook.dashboard.credentials import get_stored_token, get_token_path, login, logout, whoami
8
8
 
9
9
 
10
10
  def auth_command(args: argparse.Namespace) -> int:
@@ -0,0 +1 @@
1
+ """Trismik authentication and API integration."""