scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook-0.0.15.dist-info/METADATA ADDED
@@ -0,0 +1,300 @@
1
+ Metadata-Version: 2.4
2
+ Name: scorebook
3
+ Version: 0.0.15
4
+ Summary: A Python project for LLM evaluation.
5
+ License-File: LICENSE
6
+ Author: Euan Campbell
7
+ Author-email: euan@trismik.com
8
+ Requires-Python: >=3.10, <3.14
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Provides-Extra: examples
15
+ Provides-Extra: metrics
16
+ Provides-Extra: providers
17
+ Requires-Dist: accelerate ; extra == "examples"
18
+ Requires-Dist: bert-score ; extra == "metrics"
19
+ Requires-Dist: boto3 (==1.40.0) ; extra == "providers"
20
+ Requires-Dist: datasets (>=3.6.0)
21
+ Requires-Dist: fsspec[gcs] ; extra == "providers"
22
+ Requires-Dist: google-cloud-storage ; extra == "providers"
23
+ Requires-Dist: google-genai ; extra == "providers"
24
+ Requires-Dist: ipywidgets ; extra == "examples"
25
+ Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
26
+ Requires-Dist: notebook ; extra == "examples"
27
+ Requires-Dist: openai ; extra == "providers"
28
+ Requires-Dist: pandas ; extra == "providers"
29
+ Requires-Dist: portkey-ai ; extra == "providers"
30
+ Requires-Dist: python-dotenv (>=1.0.0)
31
+ Requires-Dist: rouge-score ; extra == "metrics"
32
+ Requires-Dist: sacrebleu ; extra == "metrics"
33
+ Requires-Dist: scikit-learn (>=1.0.0) ; extra == "metrics"
34
+ Requires-Dist: torch ; extra == "examples"
35
+ Requires-Dist: torchaudio ; extra == "examples"
36
+ Requires-Dist: torchvision ; extra == "examples"
37
+ Requires-Dist: transformers ; extra == "examples"
38
+ Requires-Dist: trismik (>=1.0.3)
39
+ Description-Content-Type: text/markdown
40
+
41
+ <h1 align="center">Scorebook</h1>
42
+
43
+ <p align="center"><strong>A Python library for model evaluation</strong></p>
44
+
45
+ <p align="center">
46
+ <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
47
+ <img alt="Python Version" src="https://img.shields.io/badge/python-3.10%2B-blue">
48
+ <a href="https://docs.trismik.com/scorebook/introduction-to-scorebook/" target="_blank" rel="noopener">
49
+ <img alt="Documentation" src="https://img.shields.io/badge/docs-Scorebook-blue?style=flat">
50
+ </a>
51
+ <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
52
+ <a target="_blank" href="https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb">
53
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
54
+ </a>
55
+ </p>
56
+
57
+ Scorebook provides a flexible, extensible framework for evaluating models such as large language models (LLMs). Evaluate any model using evaluation datasets from Hugging Face, such as MMLU-Pro, HellaSwag, and CommonSenseQA, or using data from any other source. Evaluations calculate scores for any number of specified metrics, such as accuracy, precision, and recall, as well as custom-defined metrics, including LLM-as-a-judge (LLMaJ).
58
+
59
+ ## Use Cases
60
+
61
+ Scorebook's evaluations can be used for:
62
+
63
+ - **Model Benchmarking**: Compare different models on standard datasets.
64
+ - **Model Optimization**: Find optimal model configurations.
65
+ - **Iterative Experimentation**: Reproducible evaluation workflows.
66
+
67
+ ## Key Features
68
+
69
+ - **Model Agnostic**: Evaluate any model, whether it runs locally or is deployed in the cloud.
70
+ - **Dataset Agnostic**: Create evaluation datasets from Hugging Face datasets or any other source.
71
+ - **Extensible Metric Engine**: Use Scorebook's built-in metrics or implement your own.
72
+ - **Hyperparameter Sweeping**: Evaluate over multiple model hyperparameter configurations.
73
+ - **Adaptive Evaluations**: Run Trismik's ultra-fast [adaptive evaluations](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/).
74
+ - **Trismik Integration**: Upload evaluations to [Trismik's platform](https://www.trismik.com/).
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ pip install scorebook
80
+ ```
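
The package metadata above also declares optional extras (`metrics`, `providers`, and `examples`). As a rough guide based on those `Provides-Extra`/`Requires-Dist` entries, pulling them in looks like this:

```bash
# Extra metric backends (sacrebleu, rouge-score, bert-score, scikit-learn)
pip install "scorebook[metrics]"

# Cloud provider clients (openai, boto3, google-genai, portkey-ai, ...)
pip install "scorebook[providers]"

# Dependencies used by the bundled tutorials (torch, transformers, notebook, ...)
pip install "scorebook[examples]"
```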
81
+
82
+ ## Scoring Model Outputs
83
+
84
+ Scorebook's `score` function can be used to evaluate pre-generated model outputs.
85
+
86
+ ### Score Example
87
+ ```python
88
+ from scorebook import score
89
+ from scorebook.metrics import Accuracy
90
+
91
+ # 1. Prepare a list of generated model outputs and labels
92
+ model_predictions = [
93
+ {"input": "What is 2 + 2?", "output": "4", "label": "4"},
94
+ {"input": "What is the capital of France?", "output": "London", "label": "Paris"},
95
+ {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
96
+ {"input": "What is the chemical symbol for gold?", "output": "Au", "label": "Au"},
97
+ ]
98
+
99
+ # 2. Score the model's predictions against labels using metrics
100
+ results = score(
101
+ items = model_predictions,
102
+ metrics = Accuracy,
103
+ )
104
+ ```
105
+
106
+ ### Score Results:
107
+ ```json
108
+ {
109
+ "aggregate_results": [
110
+ {
111
+ "dataset": "scored_items",
112
+ "accuracy": 0.75
113
+ }
114
+ ],
115
+ "item_results": [
116
+ {
117
+ "id": 0,
118
+ "dataset": "scored_items",
119
+ "input": "What is 2 + 2?",
120
+ "output": "4",
121
+ "label": "4",
122
+ "accuracy": true
123
+ }
124
+ // ... additional items
125
+ ]
126
+ }
127
+ ```
128
+
129
+ ## _Classical_ Evaluations
130
+
131
+ Running a classical evaluation in Scorebook executes model inference on every item in the dataset, then scores the generated outputs using the dataset’s specified metrics to quantify model performance.
132
+
133
+ ### Classical Evaluation Example
134
+ ```python
135
+ from scorebook import evaluate, EvalDataset
136
+ from scorebook.metrics import Accuracy
137
+
138
+ # 1. Create an evaluation dataset
139
+ evaluation_items = [
140
+ {"question": "What is 2 + 2?", "answer": "4"},
141
+ {"question": "What is the capital of France?", "answer": "Paris"},
142
+ {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
143
+ ]
144
+
145
+ evaluation_dataset = EvalDataset.from_list(
146
+ name = "basic_questions",
147
+ items = evaluation_items,
148
+ input = "question",
149
+ label = "answer",
150
+ metrics = Accuracy,
151
+ )
152
+
153
+ # 2. Define an inference function - This is a pseudocode example
154
+ def inference_function(inputs: List[Any], **hyperparameters):
155
+
156
+ # Create or call a model
157
+ model = Model()
158
+ model.temperature = hyperparameters.get("temperature")
159
+
160
+ # Call model inference
161
+ model_outputs = model(inputs)
162
+
163
+ # Return outputs
164
+ return model_outputs
165
+
166
+ # 3. Run evaluation
167
+ evaluation_results = evaluate(
168
+ inference_function,
169
+ evaluation_dataset,
170
+ hyperparameters = {"temperature": 0.7}
171
+ )
172
+ ```
173
+
174
+ ### Evaluation Results:
175
+ ```json
176
+ {
177
+ "aggregate_results": [
178
+ {
179
+ "dataset": "basic_questions",
180
+ "temperature": 0.7,
181
+ "accuracy": 1.0,
182
+ "run_completed": true
183
+ }
184
+ ],
185
+ "item_results": [
186
+ {
187
+ "id": 0,
188
+ "dataset": "basic_questions",
189
+ "input": "What is 2 + 2?",
190
+ "output": "4",
191
+ "label": "4",
192
+ "temperature": 0.7,
193
+ "accuracy": true
194
+ }
195
+ // ... additional items
196
+ ]
197
+ }
198
+ ```
199
+
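
The hyperparameter sweeping mentioned under Key Features can be approximated, at its simplest, by looping the `evaluate` call above over several configurations. The sketch below is an illustrative manual sweep rather than Scorebook's native sweep API (see `tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py` for that); it also assumes the return value is a dict shaped like the "Evaluation Results" JSON above, and reuses the `inference_function` and `evaluation_dataset` defined in the classical example.

```python
from scorebook import evaluate

# Manual sweep over temperature values; inference_function and
# evaluation_dataset come from the classical evaluation example above.
sweep_results = []
for temperature in (0.0, 0.3, 0.7, 1.0):
    results = evaluate(
        inference_function,
        evaluation_dataset,
        hyperparameters={"temperature": temperature},
    )
    # Assumption: results mirrors the "Evaluation Results" JSON shown above.
    sweep_results.append(results["aggregate_results"][0])

# Keep the configuration with the highest accuracy.
best = max(sweep_results, key=lambda run: run["accuracy"])
print(best["temperature"], best["accuracy"])
```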
200
+ ## _Adaptive_ Evaluations with `evaluate`
201
+
202
+ To run an adaptive evaluation, use a Trismik adaptive dataset.
203
+ The CAT (computerized adaptive testing) algorithm dynamically selects items to estimate the model's ability (θ) with minimal standard error, using as few questions as possible.
204
+
205
+ ### Adaptive Evaluation Example
206
+ ```python
207
+ from scorebook import evaluate, login
208
+
209
+ # 1. Log in with your Trismik API key
210
+ login("TRISMIK_API_KEY")
211
+
212
+ # 2. Define an inference function
213
+ def inference_function(inputs: List[Any], **hyperparameters):
214
+
215
+ # Create or call a model
216
+ model = Model()
217
+
218
+ # Call model inference
219
+ outputs = model(inputs)
220
+
221
+ # Return outputs
222
+ return outputs
223
+
224
+ # 3. Run an adaptive evaluation
225
+ results = evaluate(
226
+ inference_function,
227
+ datasets = "trismik/headQA:adaptive", # Adaptive datasets have the ":adaptive" suffix
228
+ project_id = "TRISMIK_PROJECT_ID", # Required: Create a project on your Trismik dashboard
229
+ experiment_id = "TRISMIK_EXPERIMENT_ID", # Optional: An identifier to upload this run under
230
+ )
231
+ ```
232
+
233
+ ### Adaptive Evaluation Results
234
+ ```json
235
+ {
236
+ "aggregate_results": [
237
+ {
238
+ "dataset": "trismik/headQA:adaptive",
239
+ "experiment_id": "TRISMIK_EXPERIMENT_ID",
240
+ "project_id": "TRISMIK_PROJECT_ID",
241
+ "run_id": "RUN_ID",
242
+ "score": {
243
+ "theta": 1.2,
244
+ "std_error": 0.20
245
+ },
246
+ "responses": null
247
+ }
248
+ ],
249
+ "item_results": []
250
+ }
251
+ ```
252
+
253
+ ## Metrics
254
+
255
+ | Metric | Sync/Async | Aggregate Scores | Item Scores |
256
+ |--------------|------------|--------------------------------------------------|-----------------------------------------|
257
+ | `Accuracy` | Sync | `Float`: Percentage of correct outputs | `Boolean`: Exact match between output and label |
258
+ | `ExactMatch` | Sync | `Float`: Percentage of exact string matches | `Boolean`: Exact match with optional case/whitespace normalization |
259
+ | `F1` | Sync | `Dict[str, Float]`: F1 scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
260
+ | `Precision` | Sync | `Dict[str, Float]`: Precision scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
261
+ | `Recall` | Sync | `Dict[str, Float]`: Recall scores per averaging method (macro, micro, weighted) | `Boolean`: Exact match between output and label |
262
+ | `BLEU` | Sync | `Float`: Corpus-level BLEU score | `Float`: Sentence-level BLEU score |
263
+ | `ROUGE` | Sync | `Dict[str, Float]`: Average F1 scores per ROUGE type | `Dict[str, Float]`: F1 scores per ROUGE type |
264
+ | `BertScore` | Sync | `Dict[str, Float]`: Average precision, recall, and F1 scores | `Dict[str, Float]`: Precision, recall, and F1 scores per item |
265
+
266
+
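
To make the aggregate and item shapes above concrete, here is a hedged sketch of scoring the same items with several metrics at once. It assumes `metrics` also accepts a list (as implied by "any number of specified metrics" earlier), that `ExactMatch` and `F1` are importable from `scorebook.metrics` in the same way as `Accuracy`, and that results follow the `aggregate_results`/`item_results` layout shown in the score example.

```python
from scorebook import score
from scorebook.metrics import Accuracy, ExactMatch, F1  # import paths assumed

items = [
    {"output": "Paris", "label": "Paris"},
    {"output": "Lyon", "label": "Paris"},
]

results = score(items=items, metrics=[Accuracy, ExactMatch, F1])

# Per the table: Accuracy and ExactMatch aggregate to floats, while F1
# aggregates to a dict of macro/micro/weighted scores.
print(results["aggregate_results"])
```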
267
+ ## Tutorials
268
+
269
+ For more detailed, runnable local examples, install the tutorial extras:
270
+ ```bash
271
+ pip install scorebook[examples]
272
+ ```
273
+
274
+ The `tutorials/` directory contains comprehensive tutorials as notebooks and code examples:
275
+
276
+ - **`tutorials/notebooks`**: Interactive Jupyter Notebooks showcasing Scorebook's capabilities.
277
+ - **`tutorials/examples`**: Runnable Python examples that introduce Scorebook's features incrementally.
278
+
279
+ **Run a notebook:**
280
+ ```bash
281
+ jupyter notebook tutorials/notebooks
282
+ ```
283
+
284
+ **Run an example:**
285
+ ```bash
286
+ python3 tutorials/examples/1-score/1-scoring_model_accuracy.py
287
+ ```
288
+
289
+ ## Contributing
290
+
291
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
292
+
293
+ ## License
294
+
295
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
296
+
297
+ ## About
298
+
299
+ Scorebook is developed by [Trismik](https://trismik.com) to simplify and speed up your LLM evaluations.
300
+
scorebook-0.0.15.dist-info/RECORD ADDED
@@ -0,0 +1,110 @@
1
+ scorebook/__init__.py,sha256=YB3PyPB0-sRicIwiQZ8aRhviu04_FZY8Ne5o-FuNWtA,1236
2
+ scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
3
+ scorebook/cli/auth.py,sha256=VGS5T0CSeS0n_7bntNggrYx-vDwxJJHdYxbKedFAq74,2939
4
+ scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
5
+ scorebook/dashboard/__init__.py,sha256=36DxO3oXVcZ2I6kizLFCcJkLBpXOU8UIXFT_ZjeFTB4,50
6
+ scorebook/dashboard/create_project.py,sha256=RK90aMN0_XVM-DnawTY_b59yPJaRnpb_GoidCqXB5Vw,2845
7
+ scorebook/dashboard/credentials.py,sha256=CCxafElx_pMLD-c69JvYAcC-9SzZf3tjAnJQUf8q5Us,4796
8
+ scorebook/dashboard/upload_results.py,sha256=sdgOEf0C7QLt7t2QiXvSoceQpAiiPmlG_4SFEEzVPlc,9738
9
+ scorebook/eval_datasets/__init__.py,sha256=wsmFNyuZJdBxjokcKG4NRfuUzPZKuzsKX3aG21zfFV4,39
10
+ scorebook/eval_datasets/eval_dataset.py,sha256=R7upmIhvDTBwyfuFErfc2iICI6M6AkUTMeO0Oi9NFk0,28051
11
+ scorebook/evaluate/__init__.py,sha256=Qqe-l4y3Nu81Fdx83RbtCQESoXC0XukBgOC3DPSWZZA,39
12
+ scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ scorebook/evaluate/_async/evaluate_async.py,sha256=Y0bH4nSdwU13A1oUO3gM6oEyM4bCQCpr7fhUWDZZJpY,17198
14
+ scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ scorebook/evaluate/_sync/evaluate.py,sha256=MQqKByraT22EUPfqoq1H4VIyRcPIo0ahv6KZpHgQX1A,16972
16
+ scorebook/evaluate/evaluate_helpers.py,sha256=NnanxLEeHwoZNztGXQJc6u_WqKfDkn1vYmck2BrKF-c,17028
17
+ scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
18
+ scorebook/inference/__init__.py,sha256=gGuZG1rdpxKYC54q0eAS6oTHQbRYhgxlBeAqonqHvRU,60
19
+ scorebook/inference/clients/__init__.py,sha256=VaLW7mi4tywJtR3Q9wr2pPci8NlEQ3bJanZyM5S81Z4,51
20
+ scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
21
+ scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
22
+ scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
23
+ scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
24
+ scorebook/inference/inference_pipeline.py,sha256=1qSmfI4fBJFS3EcAhRlA-f4-8aI6wDiupSJu-vNXoYI,5571
25
+ scorebook/metrics/README.md,sha256=DTMgT-aYLqlx32vAPEOCvbfFNrwhYXd3z9g1oTjCPPc,4065
26
+ scorebook/metrics/__init__.py,sha256=5m688iVI8k_NUzZJdOL_-IxvwOWU8j4fa6b1yOGPx7w,232
27
+ scorebook/metrics/accuracy.py,sha256=0Qxu0nbzHhQbIdd5b3iOlbTnr4Nwi_wMh7hu4EStw4I,1284
28
+ scorebook/metrics/bertscore.py,sha256=DDjdLZ8sBTROMF1UpGN5gyUYIkEYv7kQZkEpoMMYyuY,1853
29
+ scorebook/metrics/bleu.py,sha256=xUAHIxWisvyq2f0k0zzrf3eEfCdHZ-VSrcmt73z4Xto,3075
30
+ scorebook/metrics/core/__init__.py,sha256=dWfy6rXNvOxxdHSEvB_eJcbbHZ3xDxhGKEXYHAfrXKI,40
31
+ scorebook/metrics/core/metric_base.py,sha256=XDHn-U8oUzhoPukKyYkg5xJLN05fSvnciAYCrCB28Mg,845
32
+ scorebook/metrics/core/metric_registry.py,sha256=jWt8pB4q96po724TzOivjrabuOj6zBP9_uEdYqHxQlU,7242
33
+ scorebook/metrics/exactmatch.py,sha256=xjd9iSlmdJ8Vk-LjfxAGJZyQ75FLjmeT4polG-P3JkY,3128
34
+ scorebook/metrics/f1.py,sha256=R8yQ5mkb4bfPZM6a1X3IL1Q4OnUFD_--qwQdBSlBqaE,3710
35
+ scorebook/metrics/precision.py,sha256=AFBDemPcs03txaihw3BqSoJfPQUFykZ_QR-IFx3du-s,3705
36
+ scorebook/metrics/recall.py,sha256=ek5NV55p_Ux-2mT2xAkvQdVsg3xCJtJ3zT9O-vPEQyA,3658
37
+ scorebook/metrics/rouge.py,sha256=ZDld4mrOmERRuEESgeZITZ-NYO3QzBTYRidULwuPWqA,3228
38
+ scorebook/score/__init__.py,sha256=CqkslUvOw8QfCCbSfwZgGrbmXeSLpZqIVo4ntrctYuY,66
39
+ scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
+ scorebook/score/_async/score_async.py,sha256=SatV9hEUT8MAru2ACSyM03weKX6VTFx7crW59_uX0L8,6155
41
+ scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
+ scorebook/score/_sync/score.py,sha256=nANQbuyYyIaWnoTQzyGMwPZRMFP6MmyIyHb1GO1mktQ,6101
43
+ scorebook/score/score_helpers.py,sha256=NQt5K-hI4EZErenMyAoR8iQa76KBVTAc-nmUMCbqh8U,8179
44
+ scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
45
+ scorebook/types.py,sha256=NJZCV7hB-hKLN-Mmijtm0DoQVMUJVTahPo_n4nfQ5mE,4901
46
+ scorebook/utils/__init__.py,sha256=oBTybVHI5EdHIgzb0TeoAnSLMQdUh20Ww6vcL9542Pk,72
47
+ scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
48
+ scorebook/utils/common_helpers.py,sha256=lJIqO9XGf1T3S3rdGBTjZJ1BzVPvaU_XTONEfPApnEM,1218
49
+ scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
50
+ scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
51
+ scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
52
+ scorebook/utils/mock_llm/__init__.py,sha256=dK70wNVBKk4hv1o3fceDTBG1_maFbkMvoOtTriPCe78,1293
53
+ scorebook/utils/mock_llm/data/mock_llm_data.json,sha256=b28j7OCR0igpP0rkXDJAR2NWIiuVkOaAkzB-Miv665Y,381567
54
+ scorebook/utils/progress_bars.py,sha256=4k52Y40oXaUo40ZPYGcMM3rFbWF7nsbzLo3DDpkYv9o,3531
55
+ scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
56
+ scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
57
+ tutorials/README.md,sha256=DETPgdCTTZLYMoVZ9DSfad1y7Jq4EMzi6UA9Ijm0GQc,5588
58
+ tutorials/__init__.py,sha256=I1ki4-8iZsO9pbK4_xmRpjzSDYoH0JdYbtwzvgPgeo8,196
59
+ tutorials/examples/1-score/1-scoring_model_accuracy.py,sha256=Uc9DPa31TOR_1egsUhLptUIGXb67FYLtLKpDPtQvcGA,1367
60
+ tutorials/examples/1-score/2-scoring_model_bleu.py,sha256=pE4SMKR32Noa1eU8QUYU1a533WvXzyPi440jsgYs9gk,1991
61
+ tutorials/examples/1-score/3-scoring_model_f1.py,sha256=g_uH_IgVNWtTaQ47MEBztUmGqgUn0y2464xNIoAN_dI,2078
62
+ tutorials/examples/1-score/4-scoring_model_rouge.py,sha256=8HaPzv-Y8oTy88FtxiFR4i0SEd7BndxL_YgzpVSLEP8,2512
63
+ tutorials/examples/1-score/5-scoring_model_exact_match.py,sha256=PcnrF7OfULRQd4veSKwlp5lQ1lMo749cDSdRu88sOVE,3235
64
+ tutorials/examples/1-score/6-scoring_with_bertscore.py,sha256=_LRDrC5K_VmfxH37L7RbvLrQB4DpHbf-8K0Vr0R4cxc,2148
65
+ tutorials/examples/1-score/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
+ tutorials/examples/2-evaluate/1-evaluating_local_models.py,sha256=FSFWQUism2HWHELX3Q-F1jkRJ1feeyx1TZqttSRruPY,3630
67
+ tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py,sha256=EKdyTP7svdtxeOdM0E569RZckwZ_GAH04ZMFi5kaVS8,3840
68
+ tutorials/examples/2-evaluate/3-evaluating_cloud_models.py,sha256=C8QFmfkEd3APhh5k9LLgaxa3rlj1iQfQdV5k6jRAgx8,3566
69
+ tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py,sha256=ckd8NZe9Qyv1FvCbuhAqj3qL1ZKBlSE1fdEy3_jxvXw,6338
70
+ tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py,sha256=p0A_dwoVaGnAgpdRDu1MfC-HUp8UoBanSHvyJBpM97I,3954
71
+ tutorials/examples/2-evaluate/6-inference_pipelines.py,sha256=fy0dmVXV1D7jfLq6Xhhqe3C3UYDBRRtPybvkGctV2E8,4991
72
+ tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py,sha256=VihoUKmdaHfvJftyqFTSP0hkK1N1g1tAQZmw5yH438E,3458
73
+ tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py,sha256=e9PvbrAoXzIVXtqlg9dZP6Xd8b81nKqzEuTuRXLhutk,3225
74
+ tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py,sha256=1odW_2qFNyGES9L0bYWLvP8Cplc8ciIx42SZ_JZ6yis,3911
75
+ tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv,sha256=MvTl1wWx-TX0LgR43BvPCrk6tKUuGItp7_jA7RcWfMw,374
76
+ tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json,sha256=au86RNV2JRvw8IHNZK-768lWJdJQnCrod-6N-F-Chmk,921
77
+ tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml,sha256=F_Aiuw1BJBUNFy_ENiZDiQvcT2FqNprlUb1Y9kouzhE,320
78
+ tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml,sha256=-D9T78LJ1ihlobHgzrEeJIplimItj6pkzms3lWz50bI,362
79
+ tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py,sha256=2k5U61z3g1kYEwrCW4W27Yt6dYYS-PGL-bVXmwWs5YE,4042
80
+ tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py,sha256=KRhzAg6JxhF0ie3YeE2UXY34Lv6zfDuL_536L9i7Qdg,3598
81
+ tutorials/examples/5-upload_results/1-uploading_score_results.py,sha256=VUq9WzymPzm663OEAGpWHh7Hze-tpo_S6pFbxlZsqjw,2914
82
+ tutorials/examples/5-upload_results/2-uploading_evaluate_results.py,sha256=SkDMtw_I4SZO1ooQpvwmq_ptOMzbYSSZlYuDEU2uJb4,3808
83
+ tutorials/examples/5-upload_results/3-uploading_your_results.py,sha256=XdTNTChV78oFAL9fa58YHHsGRNRFzcQ8qJdzCfDBFdI,5562
84
+ tutorials/examples/6-providers/aws/__init__.py,sha256=hgW4VxBUWhsxOLGErdoA5fo9JnXoj1Hz3E1tNLulHSY,63
85
+ tutorials/examples/6-providers/aws/batch_example.py,sha256=3SSx85E36NV2UTpB6N-FimXQRZL-jvVw9-wee0xodyk,7279
86
+ tutorials/examples/6-providers/portkey/__init__.py,sha256=YoLlgr7ULGjAL6BwSJ6hMmLWhI_rlEHpN-sn9QS0fa4,67
87
+ tutorials/examples/6-providers/portkey/batch_example.py,sha256=ea4BQhiiOKwSJSmW6uoxHLMQVk1Ji3k_k_CZ-y9ixJM,4011
88
+ tutorials/examples/6-providers/portkey/messages_example.py,sha256=9wKGZlqksrSuoxgDY2w9IWts4fBzmZmijRdHZzSRAQs,4397
89
+ tutorials/examples/6-providers/vertex/__init__.py,sha256=zVlz0-hpUZDgYFjM-OmxcPsKT1lTE-inZyRAf20aHD8,77
90
+ tutorials/examples/6-providers/vertex/batch_example.py,sha256=rOQKQ4BO8VFkiLYXcCmZByc6zfqM-Iu3LUMvzP_oioU,5433
91
+ tutorials/examples/6-providers/vertex/messages_example.py,sha256=MeH9AiNtpi0BYL_TRGEY-B-d7eS6qGdc3d0usXSWMQY,4989
92
+ tutorials/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
+ tutorials/notebooks/1-scoring.ipynb,sha256=0mWJlJuq2MY0OdS9Ts9BX8MH7G53Tv8rGJT4yEuCAv0,4663
94
+ tutorials/notebooks/2-evaluating.ipynb,sha256=XjCtnypK0CJXDxI18RZocdvr28XWsJykGxs47Ig9SUc,9007
95
+ tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb,sha256=r7_tAw-pswVxvVt8xA7-ewPHepMfkL1YA39p0u-UJyM,12269
96
+ tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb,sha256=6dhU2JFL7Mebu2VslY21WFy8C9lrzmo0gE9DQk8kj9w,9160
97
+ tutorials/notebooks/4-uploading_results.ipynb,sha256=a7DzNglzkBoCkN-dyncixzKNkEFalGaHWsLvhu3Kyd4,10361
98
+ tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb,sha256=MZoHegB31jnI6wbXVtC0B2kCeNn_hyUf53m0B8G1LJw,7701
99
+ tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb,sha256=1CIbdr_Y5XcL0iS9GcNs_Al62Mv2hSgdxvoj3lP1zU8,8428
100
+ tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb,sha256=f7Kjy5AVK_nNxTd3RFtFPmFCEkhr3a17aUvVDvlGyKQ,9295
101
+ tutorials/quickstarts/getting_started.ipynb,sha256=5083CRiTlH-LfGdwjKwBDtn5i2c4FQUGxn0bRgE_DGw,7117
102
+ tutorials/utils/__init__.py,sha256=bI1-auvdIRV53jpT3bWRuKHUhZ92qHVJA_4S6xBB4P0,807
103
+ tutorials/utils/args_parser.py,sha256=SZeqhNaSosw7iT5u0Wkb9m0XqXRx_DAitGXb7QK2nEM,4011
104
+ tutorials/utils/output.py,sha256=4kVdySdxOl8cjSy-as4O71AhJOj7OlGXiOmlww4cZ-E,714
105
+ tutorials/utils/setup.py,sha256=6obslk7PIgUlD4RyAi8SaeoB9ioxM0Vhda65ptovEKQ,3470
106
+ scorebook-0.0.15.dist-info/METADATA,sha256=IOkRQaLVGMQIyfJbkIOqOZxZk90PCANAI3VP3w32PJg,10759
107
+ scorebook-0.0.15.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
108
+ scorebook-0.0.15.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
109
+ scorebook-0.0.15.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
110
+ scorebook-0.0.15.dist-info/RECORD,,
{scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL RENAMED
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.2.1
2
+ Generator: poetry-core 2.3.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
tutorials/README.md ADDED
@@ -0,0 +1,147 @@
1
+ # Scorebook Tutorials
2
+
3
+ This directory contains tutorials, examples, and quickstarts for learning Scorebook - a Python framework for evaluating large language models.
4
+
5
+ ## Directory Structure
6
+
7
+ ```
8
+ tutorials/
9
+ ├── quickstarts/ # Quick start notebooks for getting up and running
10
+ ├── notebooks/ # Interactive Jupyter notebooks
11
+ ├── examples/ # Python script examples organized by topic
12
+ │ ├── 1-score/ # Scoring pre-computed outputs
13
+ │ ├── 2-evaluate/ # Running evaluations
14
+ │ ├── 3-evaluation_datasets/ # Loading datasets
15
+ │ ├── 4-adaptive_evaluations/ # Trismik adaptive testing
16
+ │ ├── 5-upload_results/ # Uploading to Trismik
17
+ │ └── 6-providers/ # Cloud provider integrations
18
+ └── utils/ # Helper utilities for examples
19
+ ```
20
+
21
+ ## Getting Started
22
+
23
+ ### Prerequisites
24
+
25
+ - Python 3.10+
26
+ - Install Scorebook: `pip install scorebook`
27
+ - For local model examples: `pip install scorebook[examples]` (includes transformers, torch)
28
+ - For cloud examples: `pip install scorebook[providers]` and set `OPENAI_API_KEY`
29
+ - For Trismik features: Set `TRISMIK_API_KEY` environment variable
30
+
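
The example scripts load environment variables via `load_dotenv()`, so you can either export these variables or keep them in a `.env` file at the project root. The values below are placeholders:

```bash
# Placeholders only; substitute your own keys.
# Equivalent KEY=value lines can go in a .env file instead.
export OPENAI_API_KEY="sk-..."
export TRISMIK_API_KEY="your-trismik-api-key"
```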
31
+ ### Quickstarts
32
+
33
+ Start here for a rapid introduction:
34
+
35
+ ```bash
36
+ jupyter notebook tutorials/quickstarts/getting_started.ipynb
37
+ ```
38
+
39
+ Available quickstarts:
40
+ - `getting_started.ipynb` - Introduction to Scorebook basics
41
+ - `classical_evaluations/` - Standard evaluation workflows
42
+ - `adaptive_evaluations/` - Trismik's adaptive testing feature
43
+
44
+ ### Notebooks
45
+
46
+ Interactive tutorials covering core concepts:
47
+
48
+ ```bash
49
+ jupyter notebook tutorials/notebooks/
50
+ ```
51
+
52
+ | Notebook | Description |
53
+ |----------|-------------|
54
+ | `1-scoring.ipynb` | Score pre-computed model outputs |
55
+ | `2-evaluating.ipynb` | Run full evaluation pipelines |
56
+ | `3.1-adaptive_evaluation_phi.ipynb` | Adaptive evaluation with local models |
57
+ | `3.2-adaptive_evaluation_gpt.ipynb` | Adaptive evaluation with OpenAI |
58
+ | `4-uploading_results.ipynb` | Upload results to Trismik dashboard |
59
+
60
+ ## Examples
61
+
62
+ Python scripts demonstrating specific features. Run examples from the project root:
63
+
64
+ ```bash
65
+ python tutorials/examples/1-score/1-scoring_model_accuracy.py
66
+ ```
67
+ ### 1-score: Scoring Pre-computed Outputs
68
+
69
+ Score model predictions that have already been generated.
70
+
71
+ | Example | Description |
72
+ |---------|-------------|
73
+ | `1-scoring_model_accuracy.py` | Score outputs using accuracy metric |
74
+ | `2-scoring_model_bleu.py` | Score using BLEU metric |
75
+ | `3-scoring_model_f1.py` | Score using F1 metric |
76
+ | `4-scoring_model_rouge.py` | Score using ROUGE metric |
+ | `5-scoring_model_exact_match.py` | Score using exact match metric |
+ | `6-scoring_with_bertscore.py` | Score using BERTScore metric |
77
+
78
+ ### 2-evaluate: Running Evaluations
79
+
80
+ End-to-end evaluation workflows with inference.
81
+
82
+ | Example | Description | Requirements |
83
+ |---------|-------------|--------------|
84
+ | `1-evaluating_local_models.py` | Basic evaluation with local HuggingFace model | - |
85
+ | `2-evaluating_local_models_with_batching.py` | Batch processing for improved throughput | - |
86
+ | `3-evaluating_cloud_models.py` | Evaluate using OpenAI API | OpenAI API key |
87
+ | `4-evaluating_cloud_models_with_batching.py` | OpenAI Batch API for cost savings | OpenAI API key |
88
+ | `5-hyperparameter_sweeps.py` | Test multiple hyperparameter configurations | - |
89
+ | `6-inference_pipelines.py` | Modular preprocessing/inference/postprocessing | - |
90
+
91
+ ### 3-evaluation_datasets: Loading Datasets
92
+
93
+ Different ways to load evaluation data.
94
+
95
+ | Example | Description | Requirements |
96
+ |---------|-------------|--------------|
97
+ | `1-evaluation_datasets_from_files.py` | Load from JSON/CSV files | - |
98
+ | `2-evaluation_datasets_from_huggingface.py` | Load from HuggingFace Hub | OpenAI API key |
99
+ | `3-evaluation_datasets_from_huggingface_with_yaml_configs.py` | Use YAML configs for HuggingFace datasets | OpenAI API key |
100
+
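
For the file-based workflow, here is a minimal hedged sketch using the `EvalDataset.from_list` API documented in the top-level README (the `question`/`answer` field names are an illustrative assumption; check `example_datasets/basic_questions.json` for the actual keys):

```python
import json

from scorebook import EvalDataset
from scorebook.metrics import Accuracy

# Load raw items from the bundled example file.
path = "tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json"
with open(path) as f:
    items = json.load(f)

# `input` and `label` name the item fields; adjust them to match the file.
dataset = EvalDataset.from_list(
    name="basic_questions",
    items=items,
    input="question",
    label="answer",
    metrics=Accuracy,
)
```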
101
+ ### 4-adaptive_evaluations: Trismik Adaptive Testing
102
+
103
+ Efficient evaluation using Item Response Theory (IRT).
104
+
105
+ | Example | Description | Requirements |
106
+ |---------|-------------|--------------|
107
+ | `1-adaptive_evaluation.py` | Basic adaptive evaluation | Trismik + OpenAI |
108
+ | `2-adaptive_dataset_splits.py` | Adaptive evaluation with dataset splits | Trismik + OpenAI |
109
+
110
+ ### 5-upload_results: Uploading to Trismik
111
+
112
+ Persist and share results on the Trismik dashboard.
113
+
114
+ | Example | Description | Requirements |
115
+ |---------|-------------|--------------|
116
+ | `1-uploading_score_results.py` | Upload `score()` results | Trismik API key |
117
+ | `2-uploading_evaluate_results.py` | Upload `evaluate()` results | Trismik API key |
118
+ | `3-uploading_your_results.py` | Upload custom results | Trismik API key |
119
+
120
+ ### 6-providers: Cloud Provider Integrations
121
+
122
+ Batch processing with different cloud providers.
123
+
124
+ #### AWS Bedrock (`6-providers/aws/`)
125
+ - `batch_example.py` - Batch inference with Claude models via AWS Bedrock
126
+
127
+ **Requirements:** AWS CLI configured, S3 bucket, IAM role for Bedrock
128
+
129
+ #### Google Cloud Vertex AI (`6-providers/vertex/`)
130
+ - `batch_example.py` - Batch inference with Gemini models
131
+ - `messages_example.py` - Real-time inference with Gemini
132
+
133
+ **Requirements:** Google Cloud SDK, Vertex AI enabled project
134
+
135
+ #### Portkey (`6-providers/portkey/`)
136
+ - `batch_example.py` - Batch inference via Portkey gateway
137
+ - `messages_example.py` - Real-time inference via Portkey
138
+
139
+ **Requirements:** Portkey API key, linked provider account
140
+
141
+ ## Additional Resources
142
+
143
+ - [Scorebook Documentation](https://docs.trismik.com/)
144
+ - [Trismik Platform](https://trismik.com)
145
+ - [API Reference](https://docs.trismik.com/category/reference/)
146
+ - [GitHub Issues](https://github.com/trismik/scorebook/issues)
147
+ - Contact support at support@trismik.com
tutorials/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Scorebook tutorials package.
2
+
3
+ This package contains tutorial examples and utilities for learning Scorebook.
4
+ The tutorials are optional and not required for using the core Scorebook package.
5
+ """
tutorials/examples/1-score/1-scoring_model_accuracy.py ADDED
@@ -0,0 +1,47 @@
1
+ """Tutorials - Score - Example 1 - Scoring Models."""
2
+
3
+ from pathlib import Path
4
+ from pprint import pprint
5
+ from typing import Any
6
+
7
+ from dotenv import load_dotenv
8
+
9
+ from tutorials.utils import save_results_to_json, setup_logging
10
+
11
+ from scorebook import score
12
+ from scorebook.metrics.accuracy import Accuracy
13
+
14
+
15
+ def main() -> Any:
16
+ """Score pre-computed model predictions using Scorebook.
17
+
18
+ This example demonstrates how to score generated model predictions.
19
+ """
20
+
21
+ # Prepare a list of items with generated outputs and labels
22
+ model_predictions = [
23
+ {"output": "4", "label": "4"},
24
+ {"output": "Paris", "label": "Paris"},
25
+ {"output": "George R. R. Martin", "label": "William Shakespeare"},
26
+ ]
27
+
28
+ # Score the predictions against labels using the accuracy metric
29
+ results = score(
30
+ items=model_predictions,
31
+ metrics=Accuracy,
32
+ upload_results=False, # Disable uploading for this example
33
+ )
34
+
35
+ print("\nResults:")
36
+ pprint(results)
37
+
38
+ return results
39
+
40
+
41
+ if __name__ == "__main__":
42
+ load_dotenv()
43
+ log_file = setup_logging(experiment_id="1-scoring_model_accuracy", base_dir=Path(__file__).parent)
44
+ output_dir = Path(__file__).parent / "results"
45
+ output_dir.mkdir(exist_ok=True)
46
+ results_dict = main()
47
+ save_results_to_json(results_dict, output_dir, "1-scoring_model_accuracy_output.json")
tutorials/examples/1-score/2-scoring_model_bleu.py ADDED
@@ -0,0 +1,46 @@
1
+ """Tutorials - Score - Example 2 - Scoring Models with BLEU."""
2
+
3
+ from pathlib import Path
4
+ from pprint import pprint
5
+ from typing import Any
6
+
7
+ from dotenv import load_dotenv
8
+
9
+ from tutorials.utils import save_results_to_json, setup_logging
10
+ from scorebook import score
11
+ from scorebook.metrics.bleu import BLEU
12
+
13
+
14
+ def main() -> Any:
15
+ """Score pre-computed model predictions using Scorebook.
16
+
17
+ This example demonstrates how to score generated model predictions.
18
+ """
19
+
20
+ # Prepare a list of items with generated outputs and labels
21
+ model_predictions = [
22
+ {"output": "28-jähriger Koch wurde in San Francisco Mall entdeckt.", "label": "28-jähriger Koch in San Francisco Mall tot aufgefunden"},
23
+ {"output": "Ein 28-jähriger Koch, der kürzlich nach San Francisco gezogen war, wurde in der Treppe eines lokalen Einkaufszentrums dieser Woche ermordet.", "label": "Ein 28-jähriger Koch, der vor kurzem nach San Francisco gezogen ist, wurde im Treppenhaus eines örtlichen Einkaufzentrums tot aufgefunden."},
24
+ {"output": 'Der Bruder des Opfers sagt, er könne sich nicht vorstellen, wer ihm schaden wolle, und sagt: "Die Dinge waren endlich gut für ihn."', "label": 'Der Bruder des Opfers sagte aus, dass er sich niemanden vorstellen kann, der ihm schaden wollen würde, "Endlich ging es bei ihm wieder bergauf."'},
25
+ ]
26
+
27
+ # Score the predictions against labels using the BLEU metric
28
+ results = score(
29
+ items=model_predictions,
30
+ metrics=BLEU(compact=False),
31
+ upload_results=False, # Disable uploading for this example
32
+ )
33
+
34
+ print("\nResults:")
35
+ pprint(results)
36
+
37
+ return results
38
+
39
+
40
+ if __name__ == "__main__":
41
+ load_dotenv()
42
+ log_file = setup_logging(experiment_id="2-scoring_model_bleu", base_dir=Path(__file__).parent)
43
+ output_dir = Path(__file__).parent / "results"
44
+ output_dir.mkdir(exist_ok=True)
45
+ results_dict = main()
46
+ save_results_to_json(results_dict, output_dir, "2-scoring_model_bleu_output.json")