scorebook 0.0.13__tar.gz → 0.0.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. scorebook-0.0.15/PKG-INFO +300 -0
  2. scorebook-0.0.15/README.md +259 -0
  3. {scorebook-0.0.13 → scorebook-0.0.15}/pyproject.toml +20 -43
  4. scorebook-0.0.15/src/scorebook/__init__.py +40 -0
  5. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/cli/auth.py +1 -1
  6. scorebook-0.0.15/src/scorebook/dashboard/__init__.py +1 -0
  7. scorebook-0.0.15/src/scorebook/dashboard/create_project.py +91 -0
  8. {scorebook-0.0.13/src/scorebook/trismik → scorebook-0.0.15/src/scorebook/dashboard}/credentials.py +57 -12
  9. {scorebook-0.0.13/src/scorebook/trismik → scorebook-0.0.15/src/scorebook/dashboard}/upload_results.py +1 -1
  10. scorebook-0.0.15/src/scorebook/eval_datasets/__init__.py +1 -0
  11. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/eval_datasets/eval_dataset.py +4 -2
  12. scorebook-0.0.15/src/scorebook/evaluate/__init__.py +1 -0
  13. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/evaluate/_async/evaluate_async.py +36 -19
  14. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/evaluate/_sync/evaluate.py +36 -19
  15. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/evaluate/evaluate_helpers.py +4 -3
  16. scorebook-0.0.15/src/scorebook/inference/__init__.py +1 -0
  17. scorebook-0.0.15/src/scorebook/inference/clients/__init__.py +1 -0
  18. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/inference/inference_pipeline.py +1 -1
  19. scorebook-0.0.15/src/scorebook/metrics/README.md +121 -0
  20. scorebook-0.0.15/src/scorebook/metrics/__init__.py +9 -0
  21. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/metrics/accuracy.py +2 -6
  22. scorebook-0.0.15/src/scorebook/metrics/bertscore.py +50 -0
  23. scorebook-0.0.15/src/scorebook/metrics/bleu.py +82 -0
  24. scorebook-0.0.15/src/scorebook/metrics/core/__init__.py +1 -0
  25. {scorebook-0.0.13/src/scorebook/metrics → scorebook-0.0.15/src/scorebook/metrics/core}/metric_base.py +1 -2
  26. scorebook-0.0.15/src/scorebook/metrics/core/metric_registry.py +195 -0
  27. scorebook-0.0.15/src/scorebook/metrics/exactmatch.py +95 -0
  28. scorebook-0.0.15/src/scorebook/metrics/f1.py +96 -0
  29. scorebook-0.0.15/src/scorebook/metrics/precision.py +94 -0
  30. scorebook-0.0.15/src/scorebook/metrics/recall.py +94 -0
  31. scorebook-0.0.15/src/scorebook/metrics/rouge.py +85 -0
  32. scorebook-0.0.15/src/scorebook/score/__init__.py +1 -0
  33. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/score/_async/score_async.py +3 -2
  34. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/score/_sync/score.py +3 -2
  35. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/score/score_helpers.py +29 -12
  36. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/types.py +3 -3
  37. scorebook-0.0.15/src/scorebook/utils/__init__.py +1 -0
  38. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/common_helpers.py +1 -1
  39. scorebook-0.0.15/src/scorebook/utils/mock_llm/__init__.py +41 -0
  40. scorebook-0.0.15/src/scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  41. scorebook-0.0.15/src/scorebook/utils/progress_bars.py +128 -0
  42. scorebook-0.0.15/tutorials/README.md +147 -0
  43. scorebook-0.0.15/tutorials/__init__.py +5 -0
  44. scorebook-0.0.15/tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. scorebook-0.0.15/tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. scorebook-0.0.15/tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. scorebook-0.0.15/tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. scorebook-0.0.15/tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. scorebook-0.0.15/tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. scorebook-0.0.15/tutorials/examples/1-score/__init__.py +0 -0
  51. scorebook-0.0.15/tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. scorebook-0.0.15/tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. scorebook-0.0.15/tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. scorebook-0.0.15/tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. scorebook-0.0.15/tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. scorebook-0.0.15/tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. scorebook-0.0.15/tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. scorebook-0.0.15/tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. scorebook-0.0.15/tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. scorebook-0.0.15/tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. scorebook-0.0.15/tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. scorebook-0.0.15/tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. scorebook-0.0.15/tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. scorebook-0.0.15/tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. scorebook-0.0.15/tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. scorebook-0.0.15/tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. scorebook-0.0.15/tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. scorebook-0.0.15/tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. scorebook-0.0.15/tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. scorebook-0.0.15/tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. scorebook-0.0.15/tutorials/examples/__init__.py +0 -0
  78. scorebook-0.0.15/tutorials/notebooks/1-scoring.ipynb +162 -0
  79. scorebook-0.0.15/tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. scorebook-0.0.15/tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. scorebook-0.0.15/tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. scorebook-0.0.15/tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. scorebook-0.0.15/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. scorebook-0.0.15/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. scorebook-0.0.15/tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. scorebook-0.0.15/tutorials/quickstarts/getting_started.ipynb +197 -0
  87. scorebook-0.0.15/tutorials/utils/__init__.py +35 -0
  88. scorebook-0.0.15/tutorials/utils/args_parser.py +132 -0
  89. scorebook-0.0.15/tutorials/utils/output.py +23 -0
  90. scorebook-0.0.15/tutorials/utils/setup.py +98 -0
  91. scorebook-0.0.13/PKG-INFO +0 -389
  92. scorebook-0.0.13/README.md +0 -346
  93. scorebook-0.0.13/src/scorebook/__init__.py +0 -33
  94. scorebook-0.0.13/src/scorebook/eval_datasets/__init__.py +0 -5
  95. scorebook-0.0.13/src/scorebook/evaluate/__init__.py +0 -15
  96. scorebook-0.0.13/src/scorebook/inference/__init__.py +0 -11
  97. scorebook-0.0.13/src/scorebook/inference/clients/__init__.py +0 -8
  98. scorebook-0.0.13/src/scorebook/metrics/__init__.py +0 -18
  99. scorebook-0.0.13/src/scorebook/metrics/metric_registry.py +0 -105
  100. scorebook-0.0.13/src/scorebook/metrics/precision.py +0 -19
  101. scorebook-0.0.13/src/scorebook/score/__init__.py +0 -6
  102. scorebook-0.0.13/src/scorebook/trismik/__init__.py +0 -10
  103. scorebook-0.0.13/src/scorebook/utils/__init__.py +0 -23
  104. scorebook-0.0.13/src/scorebook/utils/progress_bars.py +0 -856
  105. {scorebook-0.0.13 → scorebook-0.0.15}/LICENSE +0 -0
  106. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/cli/__init__.py +0 -0
  107. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/cli/main.py +0 -0
  108. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/evaluate/_async/__init__.py +0 -0
  109. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  110. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/exceptions.py +0 -0
  111. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/inference/clients/bedrock.py +0 -0
  112. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/inference/clients/openai.py +0 -0
  113. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/inference/clients/portkey.py +0 -0
  114. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/inference/clients/vertex.py +0 -0
  115. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/score/_async/__init__.py +0 -0
  116. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/score/_sync/__init__.py +0 -0
  117. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/settings.py +0 -0
  118. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/async_utils.py +0 -0
  119. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/io_helpers.py +0 -0
  120. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/jinja_helpers.py +0 -0
  121. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/mappers.py +0 -0
  122. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/render_template.py +0 -0
  123. {scorebook-0.0.13 → scorebook-0.0.15}/src/scorebook/utils/transform_helpers.py +0 -0
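The file list above shows the metrics package being reorganised in 0.0.15: `metric_base.py` and a rewritten `metric_registry.py` move under `src/scorebook/metrics/core/`, and new built-in metric modules are added (`bertscore.py`, `bleu.py`, `exactmatch.py`, `f1.py`, `precision.py`, `recall.py`, `rouge.py`). As a rough illustration of what the new scorers enable, here is a hedged sketch of scoring pre-generated outputs with two of them via the `score` function documented in the README below; the `BLEU` and `ROUGE` import names are taken from the README's metrics table, and passing a list to `metrics` is an assumption (the README only shows a single metric).

```python
# Sketch only: assumes BLEU and ROUGE are importable from scorebook.metrics
# (names taken from the README's metrics table) and that `metrics` accepts a
# list. The sacrebleu / rouge-score backends ship in the new "metrics" extra:
#   pip install "scorebook[metrics]"
from scorebook import score
from scorebook.metrics import BLEU, ROUGE

generations = [
    {
        "input": "Summarise: the cat sat on the mat.",
        "output": "A cat sat on a mat.",
        "label": "The cat sat on the mat.",
    },
]

# Score pre-generated outputs against reference labels with both metrics.
results = score(items=generations, metrics=[BLEU, ROUGE])
print(results["aggregate_results"])
```

If those assumptions hold, `aggregate_results` would contain a corpus-level BLEU score and per-type ROUGE F1 averages, matching the metrics table in the README.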
@@ -0,0 +1,300 @@
1
+ Metadata-Version: 2.4
2
+ Name: scorebook
3
+ Version: 0.0.15
4
+ Summary: A Python project for LLM evaluation.
5
+ License-File: LICENSE
6
+ Author: Euan Campbell
7
+ Author-email: euan@trismik.com
8
+ Requires-Python: >=3.10, <3.14
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Provides-Extra: examples
15
+ Provides-Extra: metrics
16
+ Provides-Extra: providers
17
+ Requires-Dist: accelerate ; extra == "examples"
18
+ Requires-Dist: bert-score ; extra == "metrics"
19
+ Requires-Dist: boto3 (==1.40.0) ; extra == "providers"
20
+ Requires-Dist: datasets (>=3.6.0)
21
+ Requires-Dist: fsspec[gcs] ; extra == "providers"
22
+ Requires-Dist: google-cloud-storage ; extra == "providers"
23
+ Requires-Dist: google-genai ; extra == "providers"
24
+ Requires-Dist: ipywidgets ; extra == "examples"
25
+ Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
26
+ Requires-Dist: notebook ; extra == "examples"
27
+ Requires-Dist: openai ; extra == "providers"
28
+ Requires-Dist: pandas ; extra == "providers"
29
+ Requires-Dist: portkey-ai ; extra == "providers"
30
+ Requires-Dist: python-dotenv (>=1.0.0)
31
+ Requires-Dist: rouge-score ; extra == "metrics"
32
+ Requires-Dist: sacrebleu ; extra == "metrics"
33
+ Requires-Dist: scikit-learn (>=1.0.0) ; extra == "metrics"
34
+ Requires-Dist: torch ; extra == "examples"
35
+ Requires-Dist: torchaudio ; extra == "examples"
36
+ Requires-Dist: torchvision ; extra == "examples"
37
+ Requires-Dist: transformers ; extra == "examples"
38
+ Requires-Dist: trismik (>=1.0.3)
39
+ Description-Content-Type: text/markdown
40
+
41
+ <h1 align="center">Scorebook</h1>
42
+
43
+ <p align="center"><strong>A Python library for Model evaluation</strong></p>
44
+
45
+ <p align="center">
46
+ <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
47
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
48
+ <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
49
+ <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
50
+ </a>
51
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
52
+ <a target="_blank" href="https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb">
53
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
54
+ </a>
55
+ </p>
56
+
57
+ Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom-defined metrics, including LLM-as-a-judge (LLMaJ).
58
+
59
+ ## Use Cases
60
+
61
+ Scorebook's evaluations can be used for:
62
+
63
+ - **Model Benchmarking**: Compare different models on standard datasets.
64
+ - **Model Optimization**: Find optimal model configurations.
65
+ - **Iterative Experimentation**: Reproducible evaluation workflows.
66
+
67
+ ## Key Features
68
+
69
+ - **Model Agnostic**: Evaluate any model, running locally or deployed on the cloud.
70
+ - **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
71
+ - **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
72
+ - **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
73
+ - **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
74
+ - **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ pip install scorebook
80
+ ```
81
+
82
+ ## Scoring Model Outputs
83
+
84
+ Scorebook's `score` function can be used to evaluate pre-generated model outputs.
85
+
86
+ ### Score Example
87
+ ```python
88
+ from scorebook import score
89
+ from scorebook.metrics import Accuracy
90
+
91
+ # 1. Prepare a list of generated model outputs and labels
92
+ model_predictions = [
93
+ {"input": "What is 2 + 2?", "output": "4", "label": "4"},
94
+ {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
95
+ {"input": "Who wrote Romeo and Juliette?", "output": "William Shakespeare", "label": "William Shakespeare"},
96
+ {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
97
+ ]
98
+
99
+ # 2. Score the model's predictions against labels using metrics
100
+ results = score(
101
+ items = model_predictions,
102
+ metrics = Accuracy,
103
+ )
104
+ ```
105
+
106
+ ### Score Results
107
+ ```json
108
+ {
109
+ "aggregate_results": [
110
+ {
111
+ "dataset": "scored_items",
112
+ "accuracy": 0.75
113
+ }
114
+ ],
115
+ "item_results": [
116
+ {
117
+ "id": 0,
118
+ "dataset": "scored_items",
119
+ "input": "What is 2 + 2?",
120
+ "output": "4",
121
+ "label": "4",
122
+ "accuracy": true
123
+ }
124
+ // ... additional items
125
+ ]
126
+ }
127
+ ```
128
+
129
+ ## _Classical_ Evaluations
130
+
131
+ Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
132
+
133
+ ### Classical Evaluation Example
134
+ ```python
135
+ from scorebook import evaluate, EvalDataset
136
+ from scorebook.metrics import Accuracy
137
+
138
+ # 1. Create an evaluation dataset
139
+ evaluation_items = [
140
+ {"question": "What is 2 + 2?", "answer": "4"},
141
+ {"question": "What is the capital of France?", "answer": "Paris"},
142
+ {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
143
+ ]
144
+
145
+ evaluation_dataset = EvalDataset.from_list(
146
+ name = "basic_questions",
147
+ items = evaluation_items,
148
+ input = "question",
149
+ label = "answer",
150
+ metrics = Accuracy,
151
+ )
152
+
153
+ # 2. Define an inference function - This is a pseudocode example
154
+ def inference_function(inputs: List[Any], **hyperparameters):
155
+
156
+ # Create or call a model
157
+ model = Model()
158
+ model.temperature = hyperparameters.get("temperature")
159
+
160
+ # Call model inference
161
+ model_outputs = model(inputs)
162
+
163
+ # Return outputs
164
+ return model_outputs
165
+
166
+ # 3. Run evaluation
167
+ evaluation_results = evaluate(
168
+ inference_function,
169
+ evaluation_dataset,
170
+ hyperparameters = {"temperature": 0.7}
171
+ )
172
+ ```
173
+
174
+ ### Evaluation Results
175
+ ```json
176
+ {
177
+ "aggregate_results": [
178
+ {
179
+ "dataset": "basic_questions",
180
+ "temperature": 0.7,
181
+ "accuracy": 1.0,
182
+ "run_completed": true
183
+ }
184
+ ],
185
+ "item_results": [
186
+ {
187
+ "id": 0,
188
+ "dataset": "basic_questions",
189
+ "input": "What is 2 + 2?",
190
+ "output": "4",
191
+ "label": "4",
192
+ "temperature": 0.7,
193
+ "accuracy": true
194
+ }
195
+ // ... additional items
196
+ ]
197
+ }
198
+ ```
199
+
200
+ ## _Adaptive_ Evaluations with `evaluate`
201
+
202
+ To run an adaptive evaluation, use a Trismik adaptive dataset.
203
+ The CAT algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error and the fewest questions.
204
+
205
+ ### Adaptive Evaluation Example
206
+ ```python
207
+ from scorebook import evaluate, login
208
+
209
+ # 1. Log in with your Trismik API key
210
+ login("TRISMIK_API_KEY")
211
+
212
+ # 2. Define an inference function
213
+ def inference_function(inputs: List[Any], **hyperparameters):
214
+
215
+ # Create or call a model
216
+ model = Model()
217
+
218
+ # Call model inference
219
+ outputs = model(inputs)
220
+
221
+ # Return outputs
222
+ return outputs
223
+
224
+ # 3. Run an adaptive evaluation
225
+ results = evaluate(
226
+ inference_function,
227
+ datasets = "trismik/headQA:adaptive", # Adaptive datasets have the ":adaptive" suffix
228
+ project_id = "TRISMIK_PROJECT_ID", # Required: Create a project on your Trismik dashboard
229
+ experiment_id = "TRISMIK_EXPERIMENT_ID", # Optional: An identifier to upload this run under
230
+ )
231
+ ```
232
+
233
+ ### Adaptive Evaluation Results
234
+ ```json
235
+ {
236
+ "aggregate_results": [
237
+ {
238
+ "dataset": "trismik/headQA:adaptive",
239
+ "experiment_id": "TRISMIK_EXPERIMENT_ID",
240
+ "project_id": "TRISMIK_PROJECT_ID",
241
+ "run_id": "RUN_ID",
242
+ "score": {
243
+ "theta": 1.2,
244
+ "std_error": 0.20
245
+ },
246
+ "responses": null
247
+ }
248
+ ],
249
+ "item_results": []
250
+ }
251
+ ```
252
+
253
+ ## Metrics
254
+
255
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
256
+ |--------------|------------|--------------------------------------------------|-----------------------------------------|
257
+ | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
258
+ | `ExactMatch` | Sync | `Float`: Percentage of exact string matches | `Boolean`: Exact match with optional case/whitespace normalization |
259
+ | `F1` | Sync | `Dict[str, Float]`: F1 scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
260
+ | `Precision` | Sync | `Dict[str, Float]`: Precision scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
261
+ | `Recall` | Sync | `Dict[str, Float]`: Recall scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
262
+ | `BLEU` | Sync | `Float`: Corpus-level BLEU score | `Float`: Sentence-level BLEU score |
263
+ | `ROUGE` | Sync | `Dict[str, Float]`: Average F1 scores per ROUGE type | `Dict[str, Float]`: F1 scores per ROUGE type |
264
+ | `BertScore` | Sync | `Dict[str, Float]`: Average precision, recall, and F1 scores | `Dict[str, Float]`: Precision, recall, and F1 scores per item |
265
+
266
+
267
+ ## Tutorials
268
+
269
+ For more detailed, locally runnable examples, install the examples extra:
270
+ ```bash
271
+ pip install scorebook[examples]
272
+ ```
273
+
274
+ The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
275
+
276
+ - **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
277
+ - **`tutorials/examples`**: Runnable Python examples incrementally implementing Scorebook's features.
278
+
279
+ **Run a notebook:**
280
+ ```bash
281
+ jupyter notebook tutorials/notebooks
282
+ ```
283
+
284
+ **Run an example:**
285
+ ```bash
286
+ python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
287
+ ```
288
+
289
+ ## Contributing
290
+
291
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
292
+
293
+ ## License
294
+
295
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
296
+
297
+ ## About
298
+
299
+ Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
300
+
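The README above lists hyperparameter sweeping as a key feature and ships a dedicated tutorial (`tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py`), but only demonstrates a single configuration (`hyperparameters = {"temperature": 0.7}`). The sketch below shows what a sweep might look like, assuming `evaluate` expands list-valued hyperparameters into a grid of runs; that list form is an assumption, not something shown in this diff.

```python
# Minimal sweep sketch. Assumption: evaluate() expands list-valued
# hyperparameters into one run per combination; only the single-value form
# appears in the README, so see 5-hyperparameter_sweeps.py for real usage.
from typing import Any, List

from scorebook import evaluate, EvalDataset
from scorebook.metrics import Accuracy

dataset = EvalDataset.from_list(
    name="basic_questions",
    items=[{"question": "What is 2 + 2?", "answer": "4"}],
    input="question",
    label="answer",
    metrics=Accuracy,
)

def inference_function(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    # Placeholder model call; a real model would use
    # hyperparameters["temperature"] and hyperparameters["top_p"].
    return ["4" for _ in inputs]

results = evaluate(
    inference_function,
    dataset,
    hyperparameters={"temperature": [0.3, 0.7], "top_p": [0.9, 1.0]},  # assumed sweep syntax
)
```

If the grid assumption holds, `aggregate_results` would contain one entry per temperature/top_p combination, mirroring the `temperature` field shown in the classical evaluation output above.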
@@ -0,0 +1,259 @@
1
+ <h1 align="center">Scorebook</h1>
2
+
3
+ <p align="center"><strong>A Python library for Model evaluation</strong></p>
4
+
5
+ <p align="center">
6
+ <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
7
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
8
+ <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
9
+ <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
10
+ </a>
11
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
12
+ <a target="_blank" href="https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb">
13
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
14
+ </a>
15
+ </p>
16
+
17
+ Scorebook provides a flexible and extensible framework for evaluating models such as large language models (LLMs). Easily evaluate any model using evaluation datasets from Hugging Face such as MMLU-Pro, HellaSwag, and CommonSenseQA, or with data from any other source. Evaluations calculate scores for any number of specified metrics such as accuracy, precision, and recall, as well as any custom-defined metrics, including LLM-as-a-judge (LLMaJ).
18
+
19
+ ## Use Cases
20
+
21
+ Scorebook's evaluations can be used for:
22
+
23
+ - **Model Benchmarking**: Compare different models on standard datasets.
24
+ - **Model Optimization**: Find optimal model configurations.
25
+ - **Iterative Experimentation**: Reproducible evaluation workflows.
26
+
27
+ ## Key Features
28
+
29
+ - **Model Agnostic**: Evaluate any model, running locally or deployed on the cloud.
30
+ - **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
31
+ - **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
32
+ - **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
33
+ - **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
34
+ - **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install scorebook
40
+ ```
41
+
42
+ ## Scoring Model Outputs
43
+
44
+ Scorebook's `score` function can be used to evaluate pre-generated model outputs.
45
+
46
+ ### Score Example
47
+ ```python
48
+ from scorebook import score
49
+ from scorebook.metrics import Accuracy
50
+
51
+ # 1. Prepare a list of generated model outputs and labels
52
+ model_predictions = [
53
+ {"input": "What is 2 + 2?", "output": "4", "label": "4"},
54
+ {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
55
+ {"input": "Who wrote Romeo and Juliette?", "output": "William Shakespeare", "label": "William Shakespeare"},
56
+ {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
57
+ ]
58
+
59
+ # 2. Score the model's predictions against labels using metrics
60
+ results = score(
61
+ items = model_predictions,
62
+ metrics = Accuracy,
63
+ )
64
+ ```
65
+
66
+ ### Score Results
67
+ ```json
68
+ {
69
+ "aggregate_results": [
70
+ {
71
+ "dataset": "scored_items",
72
+ "accuracy": 0.75
73
+ }
74
+ ],
75
+ "item_results": [
76
+ {
77
+ "id": 0,
78
+ "dataset": "scored_items",
79
+ "input": "What is 2 + 2?",
80
+ "output": "4",
81
+ "label": "4",
82
+ "accuracy": true
83
+ }
84
+ // ... additional items
85
+ ]
86
+ }
87
+ ```
88
+
89
+ ## _Classical_ Evaluations
90
+
91
+ Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
92
+
93
+ ### Classical Evaluation Example
94
+ ```python
95
+ from scorebook import evaluate, EvalDataset
96
+ from scorebook.metrics import Accuracy
97
+
98
+ # 1. Create an evaluation dataset
99
+ evaluation_items = [
100
+ {"question": "What is 2 + 2?", "answer": "4"},
101
+ {"question": "What is the capital of France?", "answer": "Paris"},
102
+ {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
103
+ ]
104
+
105
+ evaluation_dataset = EvalDataset.from_list(
106
+ name = "basic_questions",
107
+ items = evaluation_items,
108
+ input = "question",
109
+ label = "answer",
110
+ metrics = Accuracy,
111
+ )
112
+
113
+ # 2. Define an inference function - This is a pseudocode example
114
+ def inference_function(inputs: List[Any], **hyperparameters):
115
+
116
+ # Create or call a model
117
+ model = Model()
118
+ model.temperature = hyperparameters.get("temperature")
119
+
120
+ # Call model inference
121
+ model_outputs = model(inputs)
122
+
123
+ # Return outputs
124
+ return model_outputs
125
+
126
+ # 3. Run evaluation
127
+ evaluation_results = evaluate(
128
+ inference_function,
129
+ evaluation_dataset,
130
+ hyperparameters = {"temperature": 0.7}
131
+ )
132
+ ```
133
+
134
+ ### Evaluation Results
135
+ ```json
136
+ {
137
+ "aggregate_results": [
138
+ {
139
+ "dataset": "basic_questions",
140
+ "temperature": 0.7,
141
+ "accuracy": 1.0,
142
+ "run_completed": true
143
+ }
144
+ ],
145
+ "item_results": [
146
+ {
147
+ "id": 0,
148
+ "dataset": "basic_questions",
149
+ "input": "What is 2 + 2?",
150
+ "output": "4",
151
+ "label": "4",
152
+ "temperature": 0.7,
153
+ "accuracy": true
154
+ }
155
+ // ... additional items
156
+ ]
157
+ }
158
+ ```
159
+
160
+ ## _Adaptive_ Evaluations with `evaluate`
161
+
162
+ To run an adaptive evaluation, use a Trismik adaptive dataset.
163
+ The CAT algorithm dynamically selects items to estimate the model’s ability (θ) with minimal standard error and the fewest questions.
164
+
165
+ ### Adaptive Evaluation Example
166
+ ```python
167
+ from scorebook import evaluate, login
168
+
169
+ # 1. Log in with your Trismik API key
170
+ login("TRISMIK_API_KEY")
171
+
172
+ # 2. Define an inference function
173
+ def inference_function(inputs: List[Any], **hyperparameters):
174
+
175
+ # Create or call a model
176
+ model = Model()
177
+
178
+ # Call model inference
179
+ outputs = model(inputs)
180
+
181
+ # Return outputs
182
+ return outputs
183
+
184
+ # 3. Run an adaptive evaluation
185
+ results = evaluate(
186
+ inference_function,
187
+ datasets = "trismik/headQA:adaptive", # Adaptive datasets have the ":adaptive" suffix
188
+ project_id = "TRISMIK_PROJECT_ID", # Required: Create a project on your Trismik dashboard
189
+ experiment_id = "TRISMIK_EXPERIMENT_ID", # Optional: An identifier to upload this run under
190
+ )
191
+ ```
192
+
193
+ ### Adaptive Evaluation Results
194
+ ```json
195
+ {
196
+ "aggregate_results": [
197
+ {
198
+ "dataset": "trismik/headQA:adaptive",
199
+ "experiment_id": "TRISMIK_EXPERIMENT_ID",
200
+ "project_id": "TRISMIK_PROJECT_ID",
201
+ "run_id": "RUN_ID",
202
+ "score": {
203
+ "theta": 1.2,
204
+ "std_error": 0.20
205
+ },
206
+ "responses": null
207
+ }
208
+ ],
209
+ "item_results": []
210
+ }
211
+ ```
212
+
213
+ ## Metrics
214
+
215
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
216
+ |--------------|------------|--------------------------------------------------|-----------------------------------------|
217
+ | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
218
+ | `ExactMatch` | Sync | `Float`: Percentage of exact string matches | `Boolean`: Exact match with optional case/whitespace normalization |
219
+ | `F1` | Sync | `Dict[str, Float]`: F1 scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
220
+ | `Precision` | Sync | `Dict[str, Float]`: Precision scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
221
+ | `Recall` | Sync | `Dict[str, Float]`: Recall scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
222
+ | `BLEU` | Sync | `Float`: Corpus-level BLEU score | `Float`: Sentence-level BLEU score |
223
+ | `ROUGE` | Sync | `Dict[str, Float]`: Average F1 scores per ROUGE type | `Dict[str, Float]`: F1 scores per ROUGE type |
224
+ | `BertScore` | Sync | `Dict[str, Float]`: Average precision, recall, and F1 scores | `Dict[str, Float]`: Precision, recall, and F1 scores per item |
225
+
226
+
227
+ ## Tutorials
228
+
229
+ For more detailed, locally runnable examples, install the examples extra:
230
+ ```bash
231
+ pip install scorebook[examples]
232
+ ```
233
+
234
+ The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
235
+
236
+ - **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
237
+ - **`tutorials/examples`**: Runnable Python examples incrementally implementing Scorebook's features.
238
+
239
+ **Run a notebook:**
240
+ ```bash
241
+ jupyter notebook tutorials/notebooks
242
+ ```
243
+
244
+ **Run an example:**
245
+ ```bash
246
+ python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
247
+ ```
248
+
249
+ ## Contributing
250
+
251
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
252
+
253
+ ## License
254
+
255
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
256
+
257
+ ## About
258
+
259
+ Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
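The README's "Extensible Metric Engine" bullet, together with the new `metrics/core/metric_base.py` and `metrics/core/metric_registry.py` modules in the file list, points at user-defined metrics, but this diff does not show the base-class interface. The sketch below is therefore purely illustrative of the shape a custom metric tends to take (a per-item score plus an aggregate); the class and method names are hypothetical, and wiring it into Scorebook would go through whatever interface `metric_base.py` actually defines.

```python
# Hypothetical custom metric: the method names and signatures below are
# placeholders, not Scorebook's actual metric_base.py interface, which this
# diff does not show.
from typing import Any, Dict, List


class ContainsLabel:
    """Item score: True if the reference label appears in the model output."""

    name = "contains_label"

    def item_score(self, output: str, label: str) -> bool:  # hypothetical signature
        return label.strip().lower() in output.strip().lower()

    def aggregate(self, item_scores: List[bool]) -> Dict[str, Any]:  # hypothetical signature
        # Aggregate score: fraction of items whose output contained the label.
        return {self.name: sum(item_scores) / len(item_scores) if item_scores else 0.0}
```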
@@ -7,49 +7,26 @@ authors = [
7
7
  { name = "Marco Basaldella", email = "marco@trismik.com" }
8
8
  ]
9
9
  readme = "README.md"
10
- requires-python = ">=3.9, <3.14"
10
+ requires-python = ">=3.10, <3.14"
11
11
  dependencies = [
12
12
  "datasets>=3.6.0",
13
- "notebook (>=7.4.5,<8.0.0)",
14
- "trismik==1.0.2",
15
- "ipywidgets>=8.0.0",
13
+ "trismik>=1.0.3",
14
+ "python-dotenv>=1.0.0",
15
+ "jinja2 (>=3.1.6,<4.0.0)",
16
16
  ]
17
17
 
18
18
  [project.scripts]
19
19
  scorebook = "scorebook.cli.main:main"
20
20
 
21
21
  [tool.poetry]
22
- version = "0.0.13" # base version
23
- packages = [{ include = "scorebook", from = "src" }]
22
+ version = "0.0.15" # base version
23
+ packages = [
24
+ { include = "scorebook", from = "src" },
25
+ { include = "tutorials" }
26
+ ]
24
27
 
25
28
  [tool.poetry.dependencies]
26
- python = ">=3.9,<3.14"
27
- datasets = ">=3.6.0"
28
- notebook = ">=7.4.5,<8.0.0"
29
- trismik = "1.0.2"
30
- ipywidgets = ">=8.0.0"
31
-
32
- # Optional dependencies
33
- openai = {version = "*", optional = true}
34
- python-dotenv = {version = "*", optional = true}
35
- portkey-ai = {version = "*", optional = true}
36
- boto3 = {version = "1.40.0", optional = true}
37
- google-genai = {version = "*", optional = true}
38
- pandas = {version = "*", optional = true}
39
- google-cloud-storage = {version = "*", optional = true}
40
- fsspec = {version = "*", extras = ["gcs"], optional = true}
41
- transformers = {version = "*", optional = true}
42
- torch = {version = "*", optional = true}
43
- torchvision = {version = "*", optional = true}
44
- torchaudio = {version = "*", optional = true}
45
- accelerate = {version = "*", optional = true}
46
-
47
- [tool.poetry.extras]
48
- openai = ["openai", "python-dotenv"]
49
- portkey = ["portkey-ai", "python-dotenv"]
50
- bedrock = ["boto3", "python-dotenv"]
51
- vertex = ["google-genai", "pandas", "google-cloud-storage", "fsspec", "python-dotenv"]
52
- examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook"]
29
+ python = ">=3.10,<3.14"
53
30
 
54
31
  [[tool.poetry.source]]
55
32
  name = "testpypi"
@@ -68,17 +45,15 @@ mypy = "^1.15.0"
68
45
  autoflake = "^2.3.1"
69
46
  toml = "^0.10.2"
70
47
  types-pyyaml = "^6.0.12.20250822"
71
- unasync = {version = "^0.5.0", python = ">=3.9,<4"}
48
+ unasync = {version = "^0.5.0", python = ">=3.10,<4"}
72
49
  tomlkit = "^0.13.2"
73
50
  detect-secrets = "^1.5.0"
51
+ setuptools = "^75.0.0"
74
52
 
75
53
  [project.optional-dependencies]
76
- openai = ["openai", "python-dotenv"]
77
- portkey = ["portkey-ai", "python-dotenv"]
78
- bedrock = ["boto3==1.40.0", "python-dotenv"]
79
- vertex = ["google-genai", "pandas", "google-cloud-storage", "fsspec[gcs]", "python-dotenv"]
80
- examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook"]
81
-
54
+ providers = ["openai", "portkey-ai", "boto3==1.40.0", "google-genai", "pandas", "google-cloud-storage", "fsspec[gcs]"]
55
+ examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook", "ipywidgets"]
56
+ metrics = ["sacrebleu", "rouge-score", "scikit-learn>=1.0.0", "bert-score"]
82
57
 
83
58
  [build-system]
84
59
  requires = ["poetry-core"]
@@ -86,14 +61,16 @@ build-backend = "poetry.core.masonry.api"
86
61
 
87
62
  [tool.pytest.ini_options]
88
63
  asyncio_default_fixture_loop_scope = "class"
64
+ testpaths = ["tests/unit"]
89
65
  markers = [
90
- "unit: Unit tests that use mocks and don't require external dependencies",
66
+ "unit: Unit tests using only core dependencies (no optional packages)",
67
+ "metrics: Tests requiring metrics extras (sklearn, sacrebleu, rouge-score, bert-score)",
91
68
  "integration: Integration tests that may require network access or external services",
92
69
  ]
93
70
 
94
71
  [tool.black]
95
72
  line-length = 100
96
- target-version = ['py39']
73
+ target-version = ['py310']
97
74
  include = '\.pyi?$'
98
75
 
99
76
  [tool.isort]
@@ -102,7 +79,7 @@ line_length = 100
102
79
  multi_line_output = 3
103
80
 
104
81
  [tool.mypy]
105
- python_version = "3.9"
82
+ python_version = "3.10"
106
83
  warn_return_any = true
107
84
  warn_unused_configs = true
108
85
  disallow_untyped_defs = true
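The pyproject changes above consolidate the optional dependencies into three extras (`providers`, `examples`, `metrics`) and add a `metrics` pytest marker, so scorers backed by `sacrebleu`, `rouge-score`, `scikit-learn`, or `bert-score` presumably require `pip install "scorebook[metrics]"`. Below is a hedged sketch of degrading gracefully when that extra is missing; the `ROUGE` import name is assumed from the README's metrics table, and the exact error raised without the backend is not shown in this diff.

```python
# Sketch: fall back to core metrics when the "metrics" extra is absent.
# Assumptions: ROUGE is importable from scorebook.metrics when rouge-score is
# installed, and the import fails with ImportError otherwise.
from scorebook import score
from scorebook.metrics import Accuracy

try:
    from scorebook.metrics import ROUGE
except ImportError:
    ROUGE = None
    print('ROUGE backend unavailable; install it with: pip install "scorebook[metrics]"')

metrics = [Accuracy] if ROUGE is None else [Accuracy, ROUGE]  # list form assumed, as above
results = score(
    items=[{"input": "What is 2 + 2?", "output": "4", "label": "4"}],
    metrics=metrics,
)
```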