scorebook 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0

tutorials/notebooks/2-evaluating.ipynb

@@ -0,0 +1,316 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluating Models with Scorebook\n",
+    "\n",
+    "This notebook demonstrates how to use Scorebook's `evaluate()` function to run inference and compute metrics in a single step.\n",
+    "\n",
+    "## When to use `evaluate()`\n",
+    "\n",
+    "- You want to run inference on a dataset and score the results\n",
+    "- You're comparing different models on the same dataset\n",
+    "- You want to track hyperparameters alongside results\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "This example uses a local HuggingFace model. For cloud models (OpenAI), see the examples directory."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Import necessary modules:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "from pprint import pprint\n",
+    "from typing import Any, List\n",
+    "import transformers\n",
+    "\n",
+    "from scorebook import EvalDataset, evaluate"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialize Your Model\n",
+    "\n",
+    "Set up a HuggingFace pipeline for inference:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "model_name = \"microsoft/Phi-4-mini-instruct\"\n",
+    "\n",
+    "pipeline = transformers.pipeline(\n",
+    "    \"text-generation\",\n",
+    "    model=model_name,\n",
+    "    model_kwargs={\"torch_dtype\": \"auto\"},\n",
+    "    device_map=\"auto\",\n",
+    ")\n",
+    "\n",
+    "print(f\"Model loaded: {model_name}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define Your Inference Function\n",
+    "\n",
+    "Create a function that processes inputs and returns outputs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+    "    \"\"\"Process inputs through the model.\n",
+    "    \n",
+    "    Args:\n",
+    "        inputs: List of input values from the dataset\n",
+    "        hyperparameters: Model hyperparameters (e.g., temperature, system_message)\n",
+    "    \n",
+    "    Returns:\n",
+    "        List of model outputs\n",
+    "    \"\"\"\n",
+    "    outputs = []\n",
+    "    \n",
+    "    for input_val in inputs:\n",
+    "        # Build messages for the model\n",
+    "        messages = [\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": hyperparameters.get(\"system_message\", \"You are a helpful assistant.\")\n",
+    "            },\n",
+    "            {\"role\": \"user\", \"content\": str(input_val)},\n",
+    "        ]\n",
+    "        \n",
+    "        # Run inference\n",
+    "        result = pipeline(\n",
+    "            messages,\n",
+    "            max_new_tokens=hyperparameters.get(\"max_new_tokens\", 100),\n",
+    "        )\n",
+    "        \n",
+    "        # Extract the answer\n",
+    "        output = str(result[0][\"generated_text\"][-1][\"content\"])\n",
+    "        outputs.append(output)\n",
+    "    \n",
+    "    return outputs"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Your Dataset\n",
+    "\n",
+    "Create an evaluation dataset from a JSON file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Create a sample dataset\n",
+    "sample_data = [\n",
+    "    {\"question\": \"What is 2 + 2?\", \"answer\": \"4\"},\n",
+    "    {\"question\": \"What is the capital of France?\", \"answer\": \"Paris\"},\n",
+    "    {\"question\": \"Who wrote Romeo and Juliet?\", \"answer\": \"William Shakespeare\"},\n",
+    "]\n",
+    "\n",
+    "# Create EvalDataset directly from list\n",
+    "dataset = EvalDataset.from_list(\n",
+    "    name=\"sample_questions\",\n",
+    "    metrics=\"accuracy\",\n",
+    "    items=sample_data,\n",
+    "    input=\"question\",\n",
+    "    label=\"answer\",\n",
+    ")\n",
+    "\n",
+    "print(f\"Loaded dataset with {len(dataset.items)} items\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Evaluation\n",
+    "\n",
+    "Use `evaluate()` to run inference and compute metrics:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "results = evaluate(\n",
+    "    inference,\n",
+    "    dataset,\n",
+    "    hyperparameters={\n",
+    "        \"system_message\": \"Answer the question directly and concisely.\",\n",
+    "        \"max_new_tokens\": 50,\n",
+    "    },\n",
+    "    return_aggregates=True,\n",
+    "    return_items=True,\n",
+    "    return_output=True,\n",
+    "    upload_results=False,  # Set to True to upload to Trismik\n",
+    ")\n",
+    "\n",
+    "pprint(results)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analyze Results\n",
+    "\n",
+    "Examine the outputs and metrics:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Overall accuracy\n",
+    "print(f\"\\nOverall Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}\")\n",
+    "\n",
+    "# Per-item results\n",
+    "print(\"\\nPer-Item Results:\")\n",
+    "for i, item in enumerate(results['item_results'], 1):\n",
+    "    print(f\"\\nQuestion {i}: {item['input']}\")\n",
+    "    print(f\"  Model Output: {item['output']}\")\n",
+    "    print(f\"  Expected: {item['label']}\")\n",
+    "    print(f\"  Correct: {'✓' if item['accuracy'] == 1.0 else '✗'}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## Hyperparameter Sweeps\n",
+    "\n",
+    "Evaluate with different hyperparameters to find optimal settings:"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# Define an inference function\n",
+    "def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+    "    \"\"\"Process inputs through the model.\n",
+    "\n",
+    "    Args:\n",
+    "        inputs: Input values from an EvalDataset.\n",
+    "        hyperparameters: Model hyperparameters including system_message, temperature, top_p, top_k.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of model outputs for all inputs.\n",
+    "    \"\"\"\n",
+    "    outputs = []\n",
+    "    for input_val in inputs:\n",
+    "        # Preprocess: Build messages\n",
+    "        messages = [\n",
+    "            {\"role\": \"system\", \"content\": hyperparameters[\"system_message\"]},\n",
+    "            {\"role\": \"user\", \"content\": str(input_val)},\n",
+    "        ]\n",
+    "\n",
+    "        # Run inference\n",
+    "        result = pipeline(\n",
+    "            messages,\n",
+    "            temperature=hyperparameters[\"temperature\"],\n",
+    "            top_p=hyperparameters.get(\"top_p\"),\n",
+    "            top_k=hyperparameters.get(\"top_k\"),\n",
+    "        )\n",
+    "\n",
+    "        # Postprocess: Extract the answer\n",
+    "        output = str(result[0][\"generated_text\"][-1][\"content\"])\n",
+    "        outputs.append(output)\n",
+    "\n",
+    "    return outputs\n",
+    "\n",
+    "# Define hyperparameters with lists of values to create a sweep\n",
+    "hyperparameters = {\n",
+    "    \"system_message\": \"Answer the question directly and concisely.\",\n",
+    "    \"temperature\": [0.6, 0.7, 0.8],\n",
+    "    \"top_p\": [0.7, 0.8, 0.9],\n",
+    "    \"top_k\": [10, 20, 30],\n",
+    "}\n",
+    "\n",
+    "# Run evaluation across all hyperparameter combinations\n",
+    "results = evaluate(\n",
+    "    inference,\n",
+    "    dataset,\n",
+    "    hyperparameters=hyperparameters,\n",
+    "    return_aggregates=True,\n",
+    "    return_items=True,\n",
+    "    return_output=True,\n",
+    "    upload_results=False,\n",
+    ")\n",
+    "\n",
+    "pprint(results)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Next Steps\n",
+    "\n",
+    "- Try the **Adaptive Evaluations** notebook for efficient testing with fewer questions\n",
+    "- See the **Upload Results** notebook to track results in Trismik's dashboard\n",
+    "- Explore batch processing for faster evaluation of large datasets"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
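
The "Hyperparameter Sweeps" cell in the notebook above passes list-valued hyperparameters and states that `evaluate()` runs every combination. Conceptually that is a Cartesian-product expansion of the list-valued entries; the sketch below illustrates only that expansion, under the assumption that scalar values are shared across all runs. The `expand_sweep` helper is a hypothetical name for illustration and not part of scorebook's API.

```python
from itertools import product
from typing import Any, Dict, List

def expand_sweep(hyperparameters: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Expand list-valued hyperparameters into one config per combination (illustration only)."""
    keys = list(hyperparameters)
    # Treat scalar values as single-element lists so every key appears in every config.
    value_lists = [v if isinstance(v, list) else [v] for v in hyperparameters.values()]
    return [dict(zip(keys, combo)) for combo in product(*value_lists)]

sweep = expand_sweep({
    "system_message": "Answer the question directly and concisely.",
    "temperature": [0.6, 0.7, 0.8],
    "top_p": [0.7, 0.8, 0.9],
    "top_k": [10, 20, 30],
})
print(len(sweep))  # 27 configurations (3 x 3 x 3), each evaluated once
print(sweep[0])    # the scalar system_message is repeated in every configuration
```

With the three-value lists used in the notebook, a sweep of this shape yields 27 runs.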

tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb

@@ -0,0 +1,354 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cell-0",
+   "metadata": {},
+   "source": [
+    "# Adaptive Evaluations with Scorebook (Phi)\n",
+    "\n",
+    "This notebook demonstrates Trismik's adaptive evaluation feature using a **local open-source model** that runs on your machine without requiring API keys or cloud costs.\n",
+    "\n",
+    "## What are Adaptive Evaluations?\n",
+    "\n",
+    "Adaptive evaluations dynamically select questions based on a model's previous responses, similar to adaptive testing in education (like the GRE or GMAT).\n",
+    "\n",
+    "### Benefits:\n",
+    "- **More efficient**: Fewer questions needed to assess capability\n",
+    "- **Precise measurement**: Better statistical confidence intervals\n",
+    "- **Optimal difficulty**: Questions adapt to the model's skill level\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "- **Trismik API key**: Get yours at https://app.trismik.com/settings\n",
+    "- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID\n",
+    "- **Hardware**: GPU recommended but not required (CPU inference will be slower)\n",
+    "- **Packages**: `pip install transformers torch` (or `pip install transformers torch torchvision` for full PyTorch)\n",
+    "\n",
+    "## Note on Model Performance\n",
+    "\n",
+    "⚠️ **Important**: Local models (especially smaller ones) may not perform as well on complex reasoning tasks. This notebook prioritizes **accessibility and reproducibility** over maximum accuracy and uses microsoft Phi-3 on MMLU-Pro."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-1",
+   "metadata": {},
+   "source": [
+    "## Setup Credentials\n",
+    "\n",
+    "Set your Trismik credentials here:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-2",
+   "metadata": {},
+   "source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-3",
+   "metadata": {},
+   "source": [
+    "## Import Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-4",
+   "metadata": {},
+   "source": [
+    "import asyncio\n",
+    "import string\n",
+    "from pprint import pprint\n",
+    "from typing import Any, List\n",
+    "\n",
+    "import torch\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "\n",
+    "from scorebook import evaluate_async, login"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-5",
+   "metadata": {},
+   "source": [
+    "## Login to Trismik\n",
+    "\n",
+    "Authenticate with your Trismik account:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-6",
+   "metadata": {},
+   "source": [
+    "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n",
+    "    raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n",
+    "\n",
+    "login(TRISMIK_API_KEY)\n",
+    "print(\"✓ Logged in to Trismik\")\n",
+    "\n",
+    "if not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n",
+    "    raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n",
+    "\n",
+    "print(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-7",
+   "metadata": {},
+   "source": [
+    "## Initialize Local Model\n",
+    "\n",
+    "We'll use Phi-3-mini, a compact 3.8B parameter model that runs efficiently on consumer hardware.\n",
+    "\n",
+    "**Model Options** (in order of size/performance):\n",
+    "- `microsoft/Phi-3-mini-4k-instruct` (3.8B) - Fast, runs on most hardware\n",
+    "- `microsoft/Phi-3-small-8k-instruct` (7B) - Better performance, needs more memory\n",
+    "- `microsoft/Phi-3-medium-4k-instruct` (14B) - High performance, requires GPU\n",
+    "\n",
+    "Change the model name below based on your hardware capabilities."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-8",
+   "metadata": {},
+   "source": [
+    "# Select model (change based on your hardware)\n",
+    "model_name = \"microsoft/Phi-3-mini-4k-instruct\"\n",
+    "\n",
+    "print(f\"Loading {model_name}...\")\n",
+    "print(\"(This may take a few minutes on first run as the model downloads)\\n\")\n",
+    "\n",
+    "# Check if CUDA is available\n",
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "print(f\"Using device: {device}\")\n",
+    "\n",
+    "# Load tokenizer and model\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_name,\n",
+    "    torch_dtype=torch.float16 if device == \"cuda\" else torch.float32,\n",
+    "    device_map=\"auto\" if device == \"cuda\" else None,\n",
+    "    trust_remote_code=True,\n",
+    ")\n",
+    "\n",
+    "if device == \"cpu\":\n",
+    "    model = model.to(device)\n",
+    "\n",
+    "print(f\"\\n✓ Model loaded successfully on {device}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-9",
+   "metadata": {},
+   "source": [
+    "## Define Async Inference Function\n",
+    "\n",
+    "Create an async function to process inputs through the local model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-10",
+   "metadata": {},
+   "source": [
+    "async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+    "    \"\"\"Process inputs through the local Phi model.\n",
+    "    \n",
+    "    Args:\n",
+    "        inputs: Input values from an EvalDataset. For adaptive MMLU-Pro,\n",
+    "            each input is a dict with 'question' and 'options' keys.\n",
+    "        hyperparameters: Model hyperparameters.\n",
+    "    \n",
+    "    Returns:\n",
+    "        List of model outputs for all inputs.\n",
+    "    \"\"\"\n",
+    "    outputs = []\n",
+    "    \n",
+    "    for input_val in inputs:\n",
+    "        # Handle dict input from adaptive dataset\n",
+    "        if isinstance(input_val, dict):\n",
+    "            prompt = input_val.get(\"question\", \"\")\n",
+    "            if \"options\" in input_val:\n",
+    "                prompt += \"\\nOptions:\\n\" + \"\\n\".join(\n",
+    "                    f\"{letter}: {choice}\"\n",
+    "                    for letter, choice in zip(string.ascii_uppercase, input_val[\"options\"])\n",
+    "                )\n",
+    "        else:\n",
+    "            prompt = str(input_val)\n",
+    "        \n",
+    "        # Build prompt for Phi model\n",
+    "        system_message = \"Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.\"\n",
+    "        \n",
+    "        # Phi-3 uses ChatML format\n",
+    "        messages = [\n",
+    "            {\"role\": \"system\", \"content\": system_message},\n",
+    "            {\"role\": \"user\", \"content\": prompt},\n",
+    "        ]\n",
+    "        \n",
+    "        # Apply chat template\n",
+    "        formatted_prompt = tokenizer.apply_chat_template(\n",
+    "            messages, tokenize=False, add_generation_prompt=True\n",
+    "        )\n",
+    "        \n",
+    "        # Tokenize\n",
+    "        inputs_tokenized = tokenizer(\n",
+    "            formatted_prompt, return_tensors=\"pt\", truncation=True, max_length=2048\n",
+    "        )\n",
+    "        inputs_tokenized = {k: v.to(device) for k, v in inputs_tokenized.items()}\n",
+    "        \n",
+    "        # Generate\n",
+    "        try:\n",
+    "            with torch.no_grad():\n",
+    "                output_ids = model.generate(\n",
+    "                    **inputs_tokenized,\n",
+    "                    max_new_tokens=10,  # We only need 1 letter\n",
+    "                    temperature=0.7,\n",
+    "                    do_sample=True,\n",
+    "                    pad_token_id=tokenizer.eos_token_id,\n",
+    "                )\n",
+    "            \n",
+    "            # Decode only the generated tokens\n",
+    "            generated_tokens = output_ids[0][inputs_tokenized[\"input_ids\"].shape[1]:]\n",
+    "            output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()\n",
+    "            \n",
+    "            # Extract just the first letter if model outputs more\n",
+    "            if output and output[0].upper() in string.ascii_uppercase:\n",
+    "                output = output[0].upper()\n",
+    "        except Exception as e:\n",
+    "            output = f\"Error: {str(e)}\"\n",
+    "        \n",
+    "        outputs.append(output)\n",
+    "    \n",
+    "    return outputs"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-11",
+   "metadata": {},
+   "source": [
+    "## Run Adaptive Evaluation\n",
+    "\n",
+    "Use `evaluate_async()` with an adaptive dataset (indicated by the `:adaptive` suffix):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-12",
+   "metadata": {},
+   "source": [
+    "print(f\"Running adaptive evaluation on Common Sense QA with model: {model_name.split('/')[-1]}\")\n",
+    "print(\"Note: Adaptive evaluation selects questions dynamically based on responses.\")\n",
+    "\n",
+    "# Run adaptive evaluation\n",
+    "results = await evaluate_async(\n",
+    "    inference,\n",
+    "    datasets=\"trismik/CommonSenseQA:adaptive\",  # Adaptive datasets have the \":adaptive\" suffix\n",
+    "    experiment_id=\"Adaptive-Common-Sense-QA-Local-Notebook\",\n",
+    "    project_id=TRISMIK_PROJECT_ID,\n",
+    "    return_dict=True,\n",
+    "    return_aggregates=True,\n",
+    "    return_items=True,\n",
+    "    return_output=True,\n",
+    ")\n",
+    "\n",
+    "print(\"\\n✓ Adaptive evaluation complete!\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-13",
+   "metadata": {},
+   "source": [
+    "## View Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "cell-14",
+   "metadata": {},
+   "source": [
+    "pprint(results)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-17",
+   "metadata": {},
+   "source": [
+    "## View Results on Dashboard\n",
+    "\n",
+    "Your results have been uploaded to Trismik's dashboard for visualization and tracking:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-19",
+   "metadata": {},
+   "source": [
+    "## Understanding Adaptive Testing\n",
+    "\n",
+    "### How it works:\n",
+    "1. **Initial Questions**: Start with medium-difficulty questions\n",
+    "2. **Adaptation**: If the model answers correctly, harder questions follow; if incorrect, easier questions are selected\n",
+    "3. **Convergence**: The test converges to the model's true ability level\n",
+    "4. **Stopping Criteria**: Stops when sufficient confidence is reached\n",
+    "\n",
+    "### Benefits vs. Traditional Testing:\n",
+    "- **Efficiency**: Typically requires 50-70% fewer questions for the same precision\n",
+    "- **Precision**: Better estimates of model capability\n",
+    "- **Engagement**: Questions are appropriately challenging\n",
+    "\n",
+    "## Next Steps\n",
+    "\n",
+    "- Try different local models (Phi-3-small, Phi-3-medium, Llama-3, etc.)\n",
+    "- Compare local model performance with the OpenAI version\n",
+    "- Explore other adaptive datasets available on Trismik\n",
+    "- See the **Upload Results** notebook for non-adaptive result tracking"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
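
The "Understanding Adaptive Testing" cell above describes the loop in four steps: start at medium difficulty, adapt after each answer, converge, and stop once confidence is sufficient. As a rough, self-contained illustration of that idea only (the real item selection runs on Trismik's service and is not part of this package diff; `adaptive_test`, `item_bank`, and `answer_fn` are hypothetical names), a staircase-style loop might look like this:

```python
import random
from typing import Callable, Dict, List, Tuple

def adaptive_test(
    answer_fn: Callable[[str], str],
    item_bank: Dict[float, List[Tuple[str, str]]],  # difficulty (0.0-1.0) -> [(question, correct letter)]
    max_items: int = 30,
    tol: float = 0.05,
) -> float:
    """Toy staircase loop mirroring the four steps above; not Trismik's algorithm."""
    difficulty = 0.5   # 1. start with medium-difficulty questions
    step = 0.25        # how far the difficulty estimate moves after each response
    asked = 0
    while asked < max_items and step > tol:  # 4. stop once the estimate has settled
        # Pick a question from the bank closest to the current difficulty estimate.
        level = min(item_bank, key=lambda d: abs(d - difficulty))
        question, correct = random.choice(item_bank[level])
        is_correct = answer_fn(question).strip().upper() == correct
        # 2. adapt: move toward harder items after a correct answer, easier after a miss
        difficulty = min(max(difficulty + (step if is_correct else -step), 0.0), 1.0)
        step *= 0.9    # 3. shrink the step so the estimate converges on the model's level
        asked += 1
    return difficulty  # final difficulty serves as the ability estimate
```

In the notebook itself none of this is hand-rolled: awaiting `evaluate_async(...)` with the `:adaptive` dataset suffix lets the service drive question selection and stopping.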