scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
  23. scorebook-0.0.16.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb
@@ -0,0 +1,354 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "cell-0",
+ "metadata": {},
+ "source": [
+ "# Adaptive Evaluations with Scorebook (Phi)\n",
+ "\n",
+ "This notebook demonstrates Trismik's adaptive evaluation feature using a **local open-source model** that runs on your machine without requiring API keys or cloud costs.\n",
+ "\n",
+ "## What are Adaptive Evaluations?\n",
+ "\n",
+ "Adaptive evaluations dynamically select questions based on a model's previous responses, similar to adaptive testing in education (like the GRE or GMAT).\n",
+ "\n",
+ "### Benefits:\n",
+ "- **More efficient**: Fewer questions needed to assess capability\n",
+ "- **Precise measurement**: Better statistical confidence intervals\n",
+ "- **Optimal difficulty**: Questions adapt to the model's skill level\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "- **Trismik API key**: Get yours at https://app.trismik.com/settings\n",
+ "- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID\n",
+ "- **Hardware**: GPU recommended but not required (CPU inference will be slower)\n",
+ "- **Packages**: `pip install transformers torch` (or `pip install transformers torch torchvision` for full PyTorch)\n",
+ "\n",
+ "## Note on Model Performance\n",
+ "\n",
+ "⚠️ **Important**: Local models (especially smaller ones) may not perform as well on complex reasoning tasks. This notebook prioritizes **accessibility and reproducibility** over maximum accuracy and uses microsoft Phi-3 on MMLU-Pro."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-1",
+ "metadata": {},
+ "source": [
+ "## Setup Credentials\n",
+ "\n",
+ "Set your Trismik credentials here:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-2",
+ "metadata": {},
+ "source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-3",
+ "metadata": {},
+ "source": [
+ "## Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-4",
+ "metadata": {},
+ "source": [
+ "import asyncio\n",
+ "import string\n",
+ "from pprint import pprint\n",
+ "from typing import Any, List\n",
+ "\n",
+ "import torch\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+ "\n",
+ "from scorebook import evaluate_async, login"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-5",
+ "metadata": {},
+ "source": [
+ "## Login to Trismik\n",
+ "\n",
+ "Authenticate with your Trismik account:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-6",
+ "metadata": {},
+ "source": [
+ "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n",
+ " raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n",
+ "\n",
+ "login(TRISMIK_API_KEY)\n",
+ "print(\"✓ Logged in to Trismik\")\n",
+ "\n",
+ "if not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n",
+ " raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n",
+ "\n",
+ "print(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-7",
+ "metadata": {},
+ "source": [
+ "## Initialize Local Model\n",
+ "\n",
+ "We'll use Phi-3-mini, a compact 3.8B parameter model that runs efficiently on consumer hardware.\n",
+ "\n",
+ "**Model Options** (in order of size/performance):\n",
+ "- `microsoft/Phi-3-mini-4k-instruct` (3.8B) - Fast, runs on most hardware\n",
+ "- `microsoft/Phi-3-small-8k-instruct` (7B) - Better performance, needs more memory\n",
+ "- `microsoft/Phi-3-medium-4k-instruct` (14B) - High performance, requires GPU\n",
+ "\n",
+ "Change the model name below based on your hardware capabilities."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-8",
+ "metadata": {},
+ "source": [
+ "# Select model (change based on your hardware)\n",
+ "model_name = \"microsoft/Phi-3-mini-4k-instruct\"\n",
+ "\n",
+ "print(f\"Loading {model_name}...\")\n",
+ "print(\"(This may take a few minutes on first run as the model downloads)\\n\")\n",
+ "\n",
+ "# Check if CUDA is available\n",
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "print(f\"Using device: {device}\")\n",
+ "\n",
+ "# Load tokenizer and model\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " torch_dtype=torch.float16 if device == \"cuda\" else torch.float32,\n",
+ " device_map=\"auto\" if device == \"cuda\" else None,\n",
+ " trust_remote_code=True,\n",
+ ")\n",
+ "\n",
+ "if device == \"cpu\":\n",
+ " model = model.to(device)\n",
+ "\n",
+ "print(f\"\\n✓ Model loaded successfully on {device}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-9",
+ "metadata": {},
+ "source": [
+ "## Define Async Inference Function\n",
+ "\n",
+ "Create an async function to process inputs through the local model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-10",
+ "metadata": {},
+ "source": [
+ "async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+ " \"\"\"Process inputs through the local Phi model.\n",
+ " \n",
+ " Args:\n",
+ " inputs: Input values from an EvalDataset. For adaptive MMLU-Pro,\n",
+ " each input is a dict with 'question' and 'options' keys.\n",
+ " hyperparameters: Model hyperparameters.\n",
+ " \n",
+ " Returns:\n",
+ " List of model outputs for all inputs.\n",
+ " \"\"\"\n",
+ " outputs = []\n",
+ " \n",
+ " for input_val in inputs:\n",
+ " # Handle dict input from adaptive dataset\n",
+ " if isinstance(input_val, dict):\n",
+ " prompt = input_val.get(\"question\", \"\")\n",
+ " if \"options\" in input_val:\n",
+ " prompt += \"\\nOptions:\\n\" + \"\\n\".join(\n",
+ " f\"{letter}: {choice}\"\n",
+ " for letter, choice in zip(string.ascii_uppercase, input_val[\"options\"])\n",
+ " )\n",
+ " else:\n",
+ " prompt = str(input_val)\n",
+ " \n",
+ " # Build prompt for Phi model\n",
+ " system_message = \"Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.\"\n",
+ " \n",
+ " # Phi-3 uses ChatML format\n",
+ " messages = [\n",
+ " {\"role\": \"system\", \"content\": system_message},\n",
+ " {\"role\": \"user\", \"content\": prompt},\n",
+ " ]\n",
+ " \n",
+ " # Apply chat template\n",
+ " formatted_prompt = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True\n",
+ " )\n",
+ " \n",
+ " # Tokenize\n",
+ " inputs_tokenized = tokenizer(\n",
+ " formatted_prompt, return_tensors=\"pt\", truncation=True, max_length=2048\n",
+ " )\n",
+ " inputs_tokenized = {k: v.to(device) for k, v in inputs_tokenized.items()}\n",
+ " \n",
+ " # Generate\n",
+ " try:\n",
+ " with torch.no_grad():\n",
+ " output_ids = model.generate(\n",
+ " **inputs_tokenized,\n",
+ " max_new_tokens=10, # We only need 1 letter\n",
+ " temperature=0.7,\n",
+ " do_sample=True,\n",
+ " pad_token_id=tokenizer.eos_token_id,\n",
+ " )\n",
+ " \n",
+ " # Decode only the generated tokens\n",
+ " generated_tokens = output_ids[0][inputs_tokenized[\"input_ids\"].shape[1]:]\n",
+ " output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()\n",
+ " \n",
+ " # Extract just the first letter if model outputs more\n",
+ " if output and output[0].upper() in string.ascii_uppercase:\n",
+ " output = output[0].upper()\n",
+ " except Exception as e:\n",
+ " output = f\"Error: {str(e)}\"\n",
+ " \n",
+ " outputs.append(output)\n",
+ " \n",
+ " return outputs"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-11",
+ "metadata": {},
+ "source": [
+ "## Run Adaptive Evaluation\n",
+ "\n",
+ "Use `evaluate_async()` with an adaptive dataset (indicated by the `:adaptive` suffix):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-12",
+ "metadata": {},
+ "source": [
+ "print(f\"Running adaptive evaluation on Common Sense QA with model: {model_name.split('/')[-1]}\")\n",
+ "print(\"Note: Adaptive evaluation selects questions dynamically based on responses.\")\n",
+ "\n",
+ "# Run adaptive evaluation\n",
+ "results = await evaluate_async(\n",
+ " inference,\n",
+ " datasets=\"trismik/CommonSenseQA:adaptive\", # Adaptive datasets have the \":adaptive\" suffix\n",
+ " experiment_id=\"Adaptive-Common-Sense-QA-Local-Notebook\",\n",
+ " project_id=TRISMIK_PROJECT_ID,\n",
+ " return_dict=True,\n",
+ " return_aggregates=True,\n",
+ " return_items=True,\n",
+ " return_output=True,\n",
+ ")\n",
+ "\n",
+ "print(\"\\n✓ Adaptive evaluation complete!\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-13",
+ "metadata": {},
+ "source": [
+ "## View Results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "cell-14",
+ "metadata": {},
+ "source": [
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-17",
+ "metadata": {},
+ "source": [
+ "## View Results on Dashboard\n",
+ "\n",
+ "Your results have been uploaded to Trismik's dashboard for visualization and tracking:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-19",
+ "metadata": {},
+ "source": [
+ "## Understanding Adaptive Testing\n",
+ "\n",
+ "### How it works:\n",
+ "1. **Initial Questions**: Start with medium-difficulty questions\n",
+ "2. **Adaptation**: If the model answers correctly, harder questions follow; if incorrect, easier questions are selected\n",
+ "3. **Convergence**: The test converges to the model's true ability level\n",
+ "4. **Stopping Criteria**: Stops when sufficient confidence is reached\n",
+ "\n",
+ "### Benefits vs. Traditional Testing:\n",
+ "- **Efficiency**: Typically requires 50-70% fewer questions for the same precision\n",
+ "- **Precision**: Better estimates of model capability\n",
+ "- **Engagement**: Questions are appropriately challenging\n",
+ "\n",
+ "## Next Steps\n",
+ "\n",
+ "- Try different local models (Phi-3-small, Phi-3-medium, Llama-3, etc.)\n",
+ "- Compare local model performance with the OpenAI version\n",
+ "- Explore other adaptive datasets available on Trismik\n",
+ "- See the **Upload Results** notebook for non-adaptive result tracking"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
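
Condensed, the pattern this notebook introduces is: define an async `inference(inputs, **hyperparameters)` callable, call `login()`, then hand the callable to `evaluate_async()` with a dataset name carrying the `:adaptive` suffix. The sketch below strips the Phi-3 specifics down to that skeleton; only `login` and `evaluate_async` come from the diff above, while the placeholder answer logic and the `main()` wrapper are illustrative, not part of the package.

```python
# Skeleton of the adaptive-evaluation flow added in 3.1-adaptive_evaluation_phi.ipynb.
# Assumes the scorebook calls shown in the diff; the fixed "A" answer is a stand-in
# for a real model call.
import asyncio
from typing import Any, List

from scorebook import evaluate_async, login


async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
    """Return one answer per input item; a real implementation would prompt a model here."""
    outputs: List[Any] = []
    for item in inputs:
        prompt = item.get("question", "") if isinstance(item, dict) else str(item)
        outputs.append("A" if prompt else "")  # placeholder single-letter answer
    return outputs


async def main() -> None:
    login("your-trismik-api-key")  # from https://app.trismik.com/settings
    results = await evaluate_async(
        inference,
        datasets="trismik/CommonSenseQA:adaptive",  # ":adaptive" selects the adaptive runner
        experiment_id="Adaptive-Common-Sense-QA-Sketch",
        project_id="your-project-id",
        return_dict=True,
        return_aggregates=True,
        return_items=True,
        return_output=True,
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```

Passed to `evaluate_async` like this, the callable is all Scorebook needs; per the notebook text above, the adaptive runner selects questions dynamically and uploads results to the Trismik dashboard.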
tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb
@@ -0,0 +1,243 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Adaptive Evaluations with Scorebook (GPT)\n",
+ "\n",
+ "This notebook demonstrates Trismik's adaptive evaluation feature using **OpenAI's GPT models** for high-accuracy results.\n",
+ "\n",
+ "> **Looking for a version without API costs?** See `3-adaptive_evaluation_local.ipynb` for a version using local open-source models (Phi-3) that runs on your machine without API keys.\n",
+ "\n",
+ "## What are Adaptive Evaluations?\n",
+ "\n",
+ "Adaptive evaluations dynamically select questions based on a model's previous responses, similar to adaptive testing in education (like the GRE or GMAT).\n",
+ "\n",
+ "### Benefits:\n",
+ "- **More efficient**: Fewer questions needed to assess capability\n",
+ "- **Precise measurement**: Better statistical confidence intervals\n",
+ "- **Optimal difficulty**: Questions adapt to the model's skill level\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "- **Trismik API key**: Get yours at https://app.trismik.com/settings\n",
+ "- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID\n",
+ "- **OpenAI API key**: For high-accuracy results on complex reasoning tasks"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "## Setup Credentials\n\nSet your API credentials here:"
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n# STEP 3: Get your OpenAI API key from https://platform.openai.com/api-keys\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"\nOPENAI_API_KEY = \"your-openai-api-key\"",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import asyncio\n",
+ "import string\n",
+ "from pprint import pprint\n",
+ "from typing import Any, List\n",
+ "\n",
+ "from openai import AsyncOpenAI\n",
+ "from scorebook import evaluate_async, login"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Login to Trismik\n",
+ "\n",
+ "Authenticate with your Trismik account:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n\nlogin(TRISMIK_API_KEY)\nprint(\"✓ Logged in to Trismik\")\n\nif not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n\nprint(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialize OpenAI Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": "if not OPENAI_API_KEY or OPENAI_API_KEY == \"your-openai-api-key\":\n raise ValueError(\"Please set OPENAI_API_KEY. Get your API key from https://platform.openai.com/api-keys\")\n\nclient = AsyncOpenAI(api_key=OPENAI_API_KEY) # pragma: allowlist secret\nmodel_name = \"gpt-4o-mini\"\n\nprint(f\"✓ Using model: {model_name}\")",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define Async Inference Function\n",
+ "\n",
+ "Create an async function to process inputs through the OpenAI API:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+ " \"\"\"Process inputs through OpenAI's API.\n",
+ " \n",
+ " Args:\n",
+ " inputs: Input values from an EvalDataset. For adaptive MMLU-Pro,\n",
+ " each input is a dict with 'question' and 'options' keys.\n",
+ " hyperparameters: Model hyperparameters.\n",
+ " \n",
+ " Returns:\n",
+ " List of model outputs for all inputs.\n",
+ " \"\"\"\n",
+ " outputs = []\n",
+ " \n",
+ " for input_val in inputs:\n",
+ " # Handle dict input from adaptive dataset\n",
+ " if isinstance(input_val, dict):\n",
+ " prompt = input_val.get(\"question\", \"\")\n",
+ " if \"options\" in input_val:\n",
+ " prompt += \"\\nOptions:\\n\" + \"\\n\".join(\n",
+ " f\"{letter}: {choice}\"\n",
+ " for letter, choice in zip(string.ascii_uppercase, input_val[\"options\"])\n",
+ " )\n",
+ " else:\n",
+ " prompt = str(input_val)\n",
+ " \n",
+ " # Build messages for OpenAI API\n",
+ " messages = [\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": \"Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.\",\n",
+ " },\n",
+ " {\"role\": \"user\", \"content\": prompt},\n",
+ " ]\n",
+ " \n",
+ " # Call OpenAI API\n",
+ " try:\n",
+ " response = await client.chat.completions.create(\n",
+ " model=model_name,\n",
+ " messages=messages,\n",
+ " temperature=0.7,\n",
+ " )\n",
+ " output = response.choices[0].message.content.strip()\n",
+ " except Exception as e:\n",
+ " output = f\"Error: {str(e)}\"\n",
+ " \n",
+ " outputs.append(output)\n",
+ " \n",
+ " return outputs"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Adaptive Evaluation\n",
+ "\n",
+ "Use `evaluate_async()` with an adaptive dataset (indicated by the `:adaptive` suffix):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "print(f\"Running adaptive evaluation on Common Sense QA with model: {model_name}\")\n",
+ "print(\"Note: Adaptive evaluation selects questions dynamically based on responses.\\n\")\n",
+ "\n",
+ "# Run adaptive evaluation\n",
+ "results = await evaluate_async(\n",
+ " inference,\n",
+ " datasets=\"trismik/CommonSenseQA:adaptive\", # Adaptive datasets have the \":adaptive\" suffix\n",
+ " experiment_id=\"Adaptive-Common-Sense-QA-Notebook\",\n",
+ " project_id=TRISMIK_PROJECT_ID,\n",
+ " return_dict=True,\n",
+ " return_aggregates=True,\n",
+ " return_items=True,\n",
+ " return_output=True,\n",
+ ")\n",
+ "\n",
+ "print(\"\\n✓ Adaptive evaluation complete!\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## View Results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## View Results on Dashboard\n",
+ "\n",
+ "Your results have been uploaded to Trismik's dashboard for visualization and tracking:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "## Understanding Adaptive Testing\n\n### How it works:\n1. **Initial Questions**: Start with medium-difficulty questions\n2. **Adaptation**: If the model answers correctly, harder questions follow; if incorrect, easier questions are selected\n3. **Convergence**: The test converges to the model's true ability level\n4. **Stopping Criteria**: Stops when sufficient confidence is reached\n\n### Benefits vs. Traditional Testing:\n- **Efficiency**: Typically requires 50-70% fewer questions for the same precision\n- **Precision**: Better estimates of model capability\n- **Engagement**: Questions are appropriately challenging\n\n## Next Steps\n\n- Try adaptive evaluation with different models to compare\n- **Don't have an OpenAI API key?** See `3-adaptive_evaluation_local.ipynb` to run adaptive evaluations with local open-source models (Phi-3, Llama, etc.)\n- Explore other adaptive datasets available on Trismik\n- See the **Upload Results** notebook for non-adaptive result tracking"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
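
The GPT notebook follows the same skeleton as the local one; only the inference callable changes, swapping the Phi-3 pipeline for OpenAI's async client. Below is a condensed sketch of that callable, assuming the `AsyncOpenAI` usage shown in the diff; the notebook's option formatting and error handling are trimmed, and the system prompt wording is shortened for illustration.

```python
# Condensed form of the OpenAI-backed inference callable from
# 3.2-adaptive_evaluation_gpt.ipynb; only calls that appear in the diff are used.
from typing import Any, List

from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="your-openai-api-key")
model_name = "gpt-4o-mini"


async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
    """Answer each multiple-choice item with a single letter via the OpenAI API."""
    outputs: List[Any] = []
    for item in inputs:
        prompt = item.get("question", "") if isinstance(item, dict) else str(item)
        response = await client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "Answer with a single letter only."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.7,
        )
        outputs.append(response.choices[0].message.content.strip())
    return outputs
```

This callable drops into the same `evaluate_async()` call shown in both notebooks; everything else in the two adaptive-evaluation flows is identical.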