scorebook-0.0.14-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
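The bulk of this release is the expanded metrics package (BLEU, ROUGE, F1, exact match, recall, and BERTScore, plus a metric registry relocated under `core/`) and the new `tutorials/` tree. Only `Accuracy` is exercised in the notebook diffs below, so the following sketch only shows where the added metrics would presumably slot in; it reuses the `EvalDataset.from_list` pattern from the bundled classical-evaluation notebook, and swapping in a class from one of the new metric modules is an assumption, since their class names are not visible in this diff:

from scorebook import EvalDataset
from scorebook.metrics.accuracy import Accuracy  # shown in the notebook below

# Hypothetical: a metric class exported by the new scorebook/metrics/bleu.py
# (or rouge.py, f1.py, ...) would replace Accuracy here; the exact class names
# are not confirmed by this diff.
dataset = EvalDataset.from_list(
    name="sample_questions",
    metrics=Accuracy,
    items=[{"question": "What is the capital of France?", "answer": "Paris"}],
    input="question",
    label="answer",
)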
tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b1ef68792a1c82f2",
+   "metadata": {},
+   "source": [
+    "# _Classical_ Evaluations with Scorebook\n",
+    "\n",
+    "Scorebook, developed by Trismik, is an open-source Python library for model evaluation. It supports both Trismik’s adaptive testing and traditional classical evaluations. In a classical evaluation, a model runs inference on every item in a dataset, and the results are scored using Scorebook’s built-in metrics, such as accuracy, to produce evaluation results. Evaluation results can be automatically uploaded to the Scorebook dashboard, organized by project, for storing, managing, and visualizing model evaluation experiments.\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
+    "- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
+    "\n",
+    "### Install Scorebook\n"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "!pip install scorebook",
+   "id": "c2fecbb3d7b699b3",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "\n",
+    "\n",
+    "### Setup Credentials\n",
+    "\n",
+    "Enter your Trismik API key and project id below."
+   ],
+   "id": "8e18d0e628fdb944"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# Set your credentials here\n",
+    "TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
+    "TRISMIK_PROJECT_ID = \"your-trismik-project-id-key-here\""
+   ],
+   "id": "4cb833e7b092ae4",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### Login with Trismik API Key",
+   "id": "ed4e8281ecb99685"
+  },
+  {
+   "cell_type": "code",
+   "id": "f59fb493a575d361",
+   "metadata": {},
+   "source": [
+    "from scorebook import login\n",
+    "\n",
+    "login(TRISMIK_API_KEY)\n",
+    "print(\"✓ Logged in to Trismik\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb58f57296229115",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "## Evaluation Datasets\n",
+    "\n",
+    "A scorebook evaluation requires an evaluation dataset, represented by the `EvalDataset` class. Evaluation datasets can be constructed via a number of factory methods. In this example we will create a basic evaluation dataset from a list of evaluation items."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "fa7c936b75c83cad",
+   "metadata": {},
+   "source": [
+    "from scorebook import EvalDataset\n",
+    "from scorebook.metrics.accuracy import Accuracy\n",
+    "\n",
+    "# Create a sample dataset from a list of multiple-choice questions\n",
+    "evaluation_items = [\n",
+    "    {\"question\": \"What is 2 + 2?\", \"answer\": \"4\"},\n",
+    "    {\"question\": \"What is the capital of France?\", \"answer\": \"Paris\"},\n",
+    "    {\"question\": \"Who wrote Romeo and Juliet?\", \"answer\": \"William Shakespeare\"},\n",
+    "    {\"question\": \"What is the chemical symbol for gold?\", \"answer\": \"Au\"}\n",
+    "]\n",
+    "\n",
+    "# Create an EvalDataset from the list\n",
+    "dataset = EvalDataset.from_list(\n",
+    "    name = \"sample_multiple_choice\",\n",
+    "    metrics = Accuracy,\n",
+    "    items = evaluation_items,\n",
+    "    input = \"question\",\n",
+    "    label = \"answer\",\n",
+    ")\n",
+    "\n",
+    "print(f\"✓ Created dataset with {len(dataset.items)} items\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "609a95a43d8cfc2c",
+   "metadata": {},
+   "source": [
+    "## Preparing Models for Evaluation\n",
+    "\n",
+    "To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
+    "\n",
+    "### Instantiate a Local Qwen Model\n",
+    "\n",
+    "For this quick-start guide, we will use the lightweight Qwen2.5 0.5B instruct model, via Hugging Face's transformers package."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "d1da8af72ef8de6f",
+   "metadata": {},
+   "source": [
+    "import transformers\n",
+    "\n",
+    "# Instantiate a model\n",
+    "pipeline = transformers.pipeline(\n",
+    "    \"text-generation\",\n",
+    "    model=\"Qwen/Qwen2.5-0.5B-Instruct\",\n",
+    "    model_kwargs={\"torch_dtype\": \"auto\"},\n",
+    "    device_map=\"auto\",\n",
+    ")\n",
+    "\n",
+    "print(\"✓ Transformers pipeline instantiated\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b56a72374920220",
+   "metadata": {},
+   "source": [
+    "### Define an Inference Function\n",
+    "\n",
+    "An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
+    "\n",
+    "Accept:\n",
+    "\n",
+    "- A list of model inputs.\n",
+    "- Hyperparameters which can be optionally accessed via kwargs.\n",
+    "\n",
+    "Return\n",
+    "\n",
+    "- A list of parsed model outputs for scoring.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "55f6d1eee2fa886e",
+   "metadata": {},
+   "source": [
+    "from typing import Any, List\n",
+    "\n",
+    "# Define an inference function for the Qwen model.\n",
+    "def qwen(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+    "    \"\"\"Run inference on a list of inputs using the 0.5B Qwen model.\"\"\"\n",
+    "    inference_outputs = []\n",
+    "\n",
+    "    for model_input in inputs:\n",
+    "        messages = [\n",
+    "            {\"role\": \"system\", \"content\": hyperparameters.get(\"system_message\", \"You are a helpful assistant.\")},\n",
+    "            {\"role\": \"user\", \"content\": str(model_input)},\n",
+    "        ]\n",
+    "\n",
+    "        output = pipeline(\n",
+    "            messages,\n",
+    "            temperature = hyperparameters.get(\"temperature\", 0.7),\n",
+    "            top_p = hyperparameters.get(\"top_p\", 0.9),\n",
+    "            top_k = hyperparameters.get(\"top_k\", 50),\n",
+    "            max_new_tokens = 512,\n",
+    "            do_sample = hyperparameters.get(\"temperature\", 0.7) > 0,\n",
+    "        )\n",
+    "\n",
+    "        inference_outputs.append(output[0][\"generated_text\"][-1][\"content\"])\n",
+    "\n",
+    "    return inference_outputs\n",
+    "\n",
+    "print(\"✓ Inference function for Qwen2.5 0.5B defined\")\n",
+    "print(qwen([\"Hello!\"]))"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1230b9ad762482af",
+   "metadata": {},
+   "source": [
+    "## Running an Evaluation\n",
+    "\n",
+    "Running a scorebook evaluation with `evaluate` only requires an inference function and a dataset. When uploading results to Trismik's dashboard, an experiment and project id are also required. We can also specify in hyperparameters, which are passed to the inference function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "7031045c655d1062",
+   "metadata": {},
+   "source": [
+    "from scorebook import evaluate\n",
+    "\n",
+    "# Run evaluation\n",
+    "results = evaluate(\n",
+    "    inference=qwen,\n",
+    "    datasets = dataset,\n",
+    "    hyperparameters = {\n",
+    "        'temperature': 0.9,\n",
+    "        'top_p': 0.8,\n",
+    "        'top_k': 40,\n",
+    "        'system_message': \"Answer the question directly, provide no additional context.\"\n",
+    "    },\n",
+    "    experiment_id = \"Qwen-Classical-Evaluation\",\n",
+    "    project_id = TRISMIK_PROJECT_ID,\n",
+    ")\n",
+    "\n",
+    "print(\"Qwen2.5 0.5B Evaluation Results:\")\n",
+    "print(f\"accuracy: {results[0]['accuracy']}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "The results are encapsulated within a list of dictionaries, with a dict for each evaluation run. the above example only executes a single run as only 1 dataset, and hyperparameter configuration is evaluated.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Next Steps\n",
+    "\n",
+    "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+    "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
+   ],
+   "id": "61d9ff67d63624d6"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
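The notebook's closing cell notes that `evaluate` returns a list of dictionaries, one per evaluation run. A minimal sketch of consuming that shape, assuming only the `accuracy` key shown above (other per-run keys are not visible in this diff):

# One dict per (dataset, hyperparameter configuration) run; the notebook above
# produces a single run, so this loop prints one line.
for run in results:
    print(f"accuracy: {run['accuracy']}")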
tutorials/quickstarts/getting_started.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "bda7b5add96e4d97",
+   "metadata": {},
+   "source": [
+    "# Getting Started with Trismik's Adaptive Testing\n",
+    "\n",
+    "This notebook demonstrates how to run Trismik's adaptive evaluations using Scorebook.\n",
+    "\n",
+    "## What is Adaptive Testing?\n",
+    "\n",
+    "Trismik’s adaptive testing service leverages item response theory (IRT), a psychometric framework, to evaluate large language models. Using computerized adaptive testing (CAT) it dynamically selects the most informative items, enabling faster, more cost-efficient model evaluations with fewer items required.\n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "### Install Scorebook\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "286eca2349c6ddc6",
+   "metadata": {},
+   "source": "!pip install scorebook",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2fa56528e14c46d",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "### Generate a Trismik API Key\n",
+    "\n",
+    "To run an adaptive evaluation, a Trismik API key is required. You can [sign up](https://dashboard.trismik.com/signup) for a free Trismik account and generate an API key.\n",
+    "\n",
+    "**How to generate an API key from the Trismik dashboard**:\n",
+    "1. click on your initials in the top-right corner of the screen.\n",
+    "2. click on \"API Keys\" in the drop-down menu.\n",
+    "3. click \"Create API Key\" to create a new API key.\n",
+    "\n",
+    "### Set Trismik API Key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "5ed8a62ac56560e9",
+   "metadata": {},
+   "source": [
+    "# Set your API key here and run this cell to login\n",
+    "TRISMIK_API_KEY = \"your-trismik-api-key-here\""
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10d842e5b99e95bd",
+   "metadata": {},
+   "source": [
+    "### Login with Trismik API Key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {},
+   "source": [
+    "import scorebook\n",
+    "\n",
+    "scorebook.login(TRISMIK_API_KEY)\n",
+    "print(\"✓ Logged in to Trismik\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eedbc41adba92248",
+   "metadata": {},
+   "source": [
+    "### Create a Project\n",
+    "\n",
+    "When running an adaptive evaluation, your evaluation results are stored under a project on the Trismik dashboard. Projects can be created from the dashboard's interface, or programmatically via Scorebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "bd7271800bf91478",
+   "metadata": {},
+   "source": [
+    "from scorebook import create_project\n",
+    "\n",
+    "# Create a project\n",
+    "project = create_project(\n",
+    "    name = \"Quick-Start Guides\",\n",
+    "    description = \"A project created for Trismik's quick-start guides.\"\n",
+    ")\n",
+    "\n",
+    "print(\"✓ Project created\")\n",
+    "print(f\"Project ID: {project.id}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e30ea2a2674e6005",
+   "metadata": {},
+   "source": [
+    "## Run an Adaptive Evaluation\n",
+    "\n",
+    "For this quick-start guide, we will use a mock model, that replicates the responses generated by an Amazon LLM (Nova-Pro). `mock_llm` is an inference function which accepts a list of model inputs and returns a list of model outputs for scoring. In this example, we use model responses that we pre-computed to showcase how adaptive evaluations work."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "d9f96d063e08c4fd",
+   "metadata": {},
+   "source": [
+    "from scorebook.utils.mock_llm import mock_llm\n",
+    "\n",
+    "# Run adaptive evaluation\n",
+    "results = scorebook.evaluate(\n",
+    "    inference = mock_llm,\n",
+    "    datasets = \"trismik/MMLUPro:adaptive\",\n",
+    "    experiment_id = \"Getting-Started\",\n",
+    "    project_id = project.id,\n",
+    ")\n",
+    "\n",
+    "# Print the adaptive evaluation results\n",
+    "print(\"✓ Adaptive evaluation complete!\")\n",
+    "print(\"Results: \", results[0][\"score\"])\n",
+    "\n",
+    "print(f\"You can view your results here: https://dashboard.trismik.com/projects/{project.id}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "63d3f34a76e90ac0",
+   "metadata": {},
+   "source": [
+    "### Adaptive Evaluation Results\n",
+    "\n",
+    "The metrics generated by an adaptive evaluation are:\n",
+    "\n",
+    "- Theta (θ): The primary score measuring model ability on the dataset, a higher value represents better performance.\n",
+    "- Standard Error: The theta score is a proxy for the underlying metric, and the standard error is the uncertainty in the theta estimate.\n",
+    "\n",
+    "You can find more information about adaptive testing [here](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/)!\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Next Steps\n",
+    "\n",
+    "**More Quick-Start Guides**:\n",
+    "\n",
+    "1. [HuggingFace Adaptive Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb): For a demo showcasing how Scorebook's adaptive evaluations can be used for local HF-based models. \n",
+    "2. [API-based Adaptive Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb): For a demo showcasing how Scorebook's adaptive evaluations can be used for API-based models, using the OpenAI API.\n",
+    "3. [HuggingFace Full Dataset Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb): For a demo showcasing how Scorebook can be used for full dataset evaluations with familiar metrics, like `Accuracy`.\n",
+    "\n",
+    "**More details on Adaptive Testing and Scorebook**:\n",
+    "\n",
+    "- [Introduction to Adaptive Testing](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): a quick introduction to Adaptive Testing.\n",
+    "- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
+    "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+    "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
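The quick-start runs against `mock_llm`; substituting a real model presumably just means passing an inference function with the signature used in the classical-evaluation notebook above. A sketch composed from the two notebooks in this diff (untested):

import scorebook

# `qwen` is the inference function defined in the classical-evaluation notebook:
# it takes a list of inputs and returns a list of parsed outputs for scoring.
results = scorebook.evaluate(
    inference=qwen,
    datasets="trismik/MMLUPro:adaptive",  # adaptive dataset id from the quick-start
    experiment_id="Getting-Started",
    project_id=project.id,
)
print("Results:", results[0]["score"])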
tutorials/utils/__init__.py
@@ -0,0 +1,35 @@
+"""
+Helper utilities for Scorebook examples.
+
+This module provides common helper functions used across multiple Scorebook examples
+for setup, output handling, and argument parsing.
+"""
+
+# Argument parsing utilities
+from .args_parser import (
+    add_model_selection_arg,
+    create_parser,
+    parse_args_with_config,
+    setup_batch_model_parser,
+    setup_openai_model_parser,
+)
+
+# Output utilities
+from .output import save_results_to_json
+
+# Setup utilities
+from .setup import setup_logging, setup_output_directory
+
+__all__ = [
+    # Setup
+    "setup_logging",
+    "setup_output_directory",
+    # Output
+    "save_results_to_json",
+    # Argument parsing
+    "create_parser",
+    "add_model_selection_arg",
+    "setup_openai_model_parser",
+    "setup_batch_model_parser",
+    "parse_args_with_config",
+]
tutorials/utils/args_parser.py
@@ -0,0 +1,132 @@
+"""
+Generic argument parsing utilities for Scorebook examples.
+
+This module provides reusable argument parsing functions that can be used
+across multiple Scorebook examples for consistent command-line interfaces.
+"""
+
+import argparse
+from typing import Any, Dict, List, Optional
+
+
+def create_parser(description: str) -> argparse.ArgumentParser:
+    """Create a basic argument parser with a description.
+
+    Args:
+        description: Description for the argument parser
+
+    Returns:
+        Configured ArgumentParser instance
+    """
+    return argparse.ArgumentParser(description=description)
+
+
+def add_model_selection_arg(
+    parser: argparse.ArgumentParser,
+    default: str = "gpt-4o-mini",
+    help_text: Optional[str] = None,
+    supported_models: Optional[List[str]] = None,
+) -> argparse.ArgumentParser:
+    """Add model selection argument to parser.
+
+    Args:
+        parser: ArgumentParser to add the argument to
+        default: Default model name
+        help_text: Custom help text for the argument
+        supported_models: List of supported models for validation
+
+    Returns:
+        The modified parser
+    """
+    if help_text is None:
+        help_text = f"OpenAI model to use for inference (default: {default})"
+    if supported_models:
+        help_text += f". Supported models: {', '.join(supported_models)}"
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=default,
+        help=help_text,
+    )
+    return parser
+
+
+def setup_openai_model_parser(
+    description: str = "Select OpenAI model for evaluation.",
+    default: str = "gpt-4o-mini",
+    supported_models: Optional[List[str]] = None,
+) -> str:
+    """Set up and parse OpenAI model selection arguments.
+
+    Args:
+        description: Description for the argument parser
+        default: Default model name
+        supported_models: List of supported models for help text
+
+    Returns:
+        Selected model name
+    """
+    parser = create_parser(description)
+    add_model_selection_arg(parser, default=default, supported_models=supported_models)
+    args = parser.parse_args()
+    return str(args.model)
+
+
+def setup_batch_model_parser(
+    description: str = "Select OpenAI model for batch evaluation.", default: str = "gpt-4o-mini"
+) -> str:
+    """Set up and parse OpenAI model selection arguments for batch inference.
+
+    Args:
+        description: Description for the argument parser
+        default: Default model name
+
+    Returns:
+        Selected model name
+    """
+    supported_models = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"]
+    help_text = (
+        f"OpenAI model to use for batch inference. "
+        f"Note: Only select models support the Batch API. "
+        f"Supported models include: {', '.join(supported_models)}. "
+        f"Default: {default}"
+    )
+
+    parser = create_parser(description)
+    add_model_selection_arg(parser, default=default, help_text=help_text)
+    args = parser.parse_args()
+    return str(args.model)
+
+
+def parse_args_with_config(config: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    """Parse arguments using a configuration dictionary.
+
+    Args:
+        config: Dictionary defining arguments to add. Format:
+            {
+                "arg_name": {
+                    "type": str,
+                    "default": "default_value",
+                    "help": "Help text",
+                    "required": False # optional
+                }
+            }
+
+    Returns:
+        Dictionary of parsed argument values
+    """
+    parser = argparse.ArgumentParser()
+
+    for arg_name, arg_config in config.items():
+        kwargs = {"type": arg_config.get("type", str), "help": arg_config.get("help", "")}
+
+        if "default" in arg_config:
+            kwargs["default"] = arg_config["default"]
+        if "required" in arg_config:
+            kwargs["required"] = arg_config["required"]
+
+        parser.add_argument(f"--{arg_name.replace('_', '-')}", **kwargs)
+
+    args = parser.parse_args()
+    return vars(args)
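A short usage sketch for `parse_args_with_config`, based on the config format documented in its docstring; the argument names here are illustrative only:

from tutorials.utils import parse_args_with_config

# Hypothetical arguments; each entry becomes a --flag on the command line.
args = parse_args_with_config(
    {
        "model": {"type": str, "default": "gpt-4o-mini", "help": "Model to evaluate"},
        "limit": {"type": int, "default": 10, "help": "Number of items to run"},
    }
)
print(args["model"], args["limit"])  # parsed values, keyed by argument name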
tutorials/utils/output.py
@@ -0,0 +1,23 @@
+"""
+Utility functions for saving Scorebook evaluation results.
+
+This module provides common helper functions used across multiple Scorebook examples
+for saving evaluation results to files.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def save_results_to_json(results: Any, output_dir: Path, filename: str) -> None:
+    """Save evaluation results to a JSON file.
+
+    Args:
+        results: The evaluation results to save
+        output_dir: Directory to save the file in
+        filename: Name of the output file (should include .json extension)
+    """
+    output_path = output_dir / filename
+    with open(output_path, "w") as output_file:
+        json.dump(results, output_file, indent=4)
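And a matching usage sketch for `save_results_to_json`; note that the helper writes into an existing directory and does not create it:

from pathlib import Path

from tutorials.utils import save_results_to_json

# `results` is the list of run dicts returned by scorebook.evaluate(...);
# the directory must already exist, since the helper only opens the file for writing.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)  # illustrative; not part of the helper itself
save_results_to_json(results, output_dir, "evaluation_results.json")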