scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
- scorebook-0.0.16.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0

tutorials/notebooks/4-uploading_results.ipynb
@@ -0,0 +1,175 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": "# Uploading Results to Trismik Dashboard\n\nThis notebook demonstrates three ways to upload evaluation results to Trismik's dashboard for tracking and visualization.\n\n## Why Upload Results?\n\n- **Track Progress**: Monitor model performance over time\n- **Compare Models**: Visualize performance across different models and experiments\n- **Share Results**: Collaborate with your team on evaluation insights\n- **Historical Analysis**: Maintain a record of all evaluations\n\n## Prerequisites\n\n- **Trismik API key**: Get yours at https://app.trismik.com/settings\n- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID"
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": "## Setup Credentials\n\nSet your Trismik credentials here:"
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": [
+"from pprint import pprint\n",
+"from scorebook import score, login\n",
+"from scorebook.metrics.accuracy import Accuracy"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Login to Trismik"
+]
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n    raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n\nlogin(TRISMIK_API_KEY)\nprint(\"✓ Logged in to Trismik\")\n\nif not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n    raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n\nprint(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Method 1: Upload score() Results\n",
+"\n",
+"Score pre-computed outputs and upload to Trismik:"
+]
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "# Prepare items with pre-computed outputs\nitems = [\n    {\"input\": \"What is 2 + 2?\", \"output\": \"4\", \"label\": \"4\"},\n    {\"input\": \"What is the capital of France?\", \"output\": \"Paris\", \"label\": \"Paris\"},\n    {\"input\": \"Who wrote Romeo and Juliet?\", \"output\": \"William Shakespeare\", \"label\": \"William Shakespeare\"},\n    {\"input\": \"What is 5 * 6?\", \"output\": \"30\", \"label\": \"30\"},\n    {\"input\": \"What is the largest planet?\", \"output\": \"Jupiter\", \"label\": \"Jupiter\"},\n]\n\n# Score and upload\nresults = score(\n    items=items,\n    metrics=Accuracy,\n    dataset_name=\"basic_questions\",\n    model_name=\"example-model-v1\",\n    experiment_id=\"Score-Upload-Notebook\",\n    project_id=TRISMIK_PROJECT_ID,\n    metadata={\n        \"description\": \"Example from Jupyter notebook\",\n        \"note\": \"Pre-computed outputs uploaded via score()\",\n    },\n    upload_results=True, # Enable uploading\n)\n\nprint(f\"\\n✓ Results uploaded successfully!\")\nprint(f\"Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}\")",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Method 2: Upload evaluate() Results\n",
+"\n",
+"Run inference and automatically upload results:"
+]
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "from typing import Any, List\nfrom scorebook import EvalDataset, evaluate\n\n# Create a simple dataset\nimport json\nfrom pathlib import Path\n\nsample_data = [\n    {\"question\": \"What is 10 + 5?\", \"answer\": \"15\"},\n    {\"question\": \"What is the capital of Spain?\", \"answer\": \"Madrid\"},\n]\n\ntemp_file = Path(\"temp_eval_dataset.json\")\nwith open(temp_file, \"w\") as f:\n    json.dump(sample_data, f)\n\ndataset = EvalDataset.from_json(\n    path=str(temp_file),\n    metrics=\"accuracy\",\n    input=\"question\",\n    label=\"answer\",\n)\n\n# Define a simple inference function (mock)\ndef mock_inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n    \"\"\"Mock inference that returns the expected answers.\"\"\"\n    # In practice, this would call your model\n    return [\"15\", \"Madrid\"] # Mock perfect answers\n\n# Run evaluation with upload\neval_results = evaluate(\n    mock_inference,\n    dataset,\n    hyperparameters={\"temperature\": 0.7},\n    experiment_id=\"Evaluate-Upload-Notebook\",\n    project_id=TRISMIK_PROJECT_ID,\n    metadata={\n        \"model\": \"mock-model\",\n        \"description\": \"Evaluation results from notebook\",\n    },\n    return_aggregates=True,\n    return_items=True,\n    return_output=True,\n)\n\nprint(f\"\\n✓ Evaluation results uploaded!\")\nprint(f\"Accuracy: {eval_results['aggregate_results'][0]['accuracy']:.2%}\")\n\n# Cleanup\ntemp_file.unlink()",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Method 3: Upload External Results\n",
+"\n",
+"Import results from external evaluation frameworks or historical data:"
+]
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "# Example: Import results from another evaluation framework\nexternal_results = [\n    {\"input\": \"Translate 'hello' to Spanish\", \"output\": \"hola\", \"label\": \"hola\"},\n    {\"input\": \"Translate 'goodbye' to Spanish\", \"output\": \"adiós\", \"label\": \"adiós\"},\n    {\"input\": \"Translate 'thank you' to Spanish\", \"output\": \"gracias\", \"label\": \"gracias\"},\n    {\"input\": \"Translate 'please' to Spanish\", \"output\": \"por favor\", \"label\": \"por favor\"},\n]\n\n# Upload external results\nexternal_upload = score(\n    items=external_results,\n    metrics=\"accuracy\",\n    dataset_name=\"spanish_translation\",\n    model_name=\"external-translator-v2\",\n    experiment_id=\"External-Results-Upload\",\n    project_id=TRISMIK_PROJECT_ID,\n    metadata={\n        \"description\": \"Historical results imported from external framework\",\n        \"source\": \"Custom evaluation pipeline\",\n        \"date\": \"2025-01-15\",\n    },\n    upload_results=True,\n)\n\nprint(f\"\\n✓ External results uploaded!\")\nprint(f\"Accuracy: {external_upload['aggregate_results'][0]['accuracy']:.2%}\")",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## View Results on Dashboard\n",
+"\n",
+"All uploaded results are now visible on your Trismik dashboard:"
+]
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "from IPython.display import display, Markdown\n\ndashboard_url = f\"https://app.trismik.com/projects/{TRISMIK_PROJECT_ID}\"\ndisplay(Markdown(f\"### 📊 [View All Results on Dashboard]({dashboard_url})\"))\nprint(f\"\\nDirect link: {dashboard_url}\")\nprint(\"\\nYou should see three experiments:\")\nprint(\" 1. Score-Upload-Notebook\")\nprint(\" 2. Evaluate-Upload-Notebook\")\nprint(\" 3. External-Results-Upload\")",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Organizing Results with Metadata\n",
+"\n",
+"Use metadata to add context and organization to your results:"
+]
+},
+{
+"cell_type": "code",
+"metadata": {},
+"source": "# Example: Organizing a model comparison experiment\nmodels_to_test = [\n    {\"name\": \"model-a\", \"version\": \"1.0\"},\n    {\"name\": \"model-b\", \"version\": \"2.0\"},\n]\n\ntest_items = [\n    {\"output\": \"positive\", \"label\": \"positive\"},\n    {\"output\": \"negative\", \"label\": \"negative\"},\n]\n\nfor model_info in models_to_test:\n    result = score(\n        items=test_items,\n        metrics=Accuracy,\n        dataset_name=\"sentiment_test\",\n        model_name=model_info[\"name\"],\n        experiment_id=\"Model-Comparison-Notebook\",\n        project_id=TRISMIK_PROJECT_ID,\n        metadata={\n            \"model_version\": model_info[\"version\"],\n            \"comparison_group\": \"sentiment_analysis\",\n            \"date\": \"2025-01-26\",\n            \"notes\": f\"Testing {model_info['name']} v{model_info['version']}\",\n        },\n        upload_results=True,\n    )\n    print(f\"✓ Uploaded results for {model_info['name']} v{model_info['version']}\")",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Best Practices\n",
+"\n",
+"### Experiment Naming\n",
+"- Use descriptive `experiment_id` values (e.g., \"GPT4-MMLU-Baseline\")\n",
+"- Group related runs under the same experiment ID\n",
+"- Use different experiment IDs for different types of tests\n",
+"\n",
+"### Metadata\n",
+"- Include model version, hyperparameters, and configuration\n",
+"- Add timestamps and descriptions for historical tracking\n",
+"- Use consistent keys across experiments for easy comparison\n",
+"\n",
+"### Organization\n",
+"- Create separate projects for different use cases\n",
+"- Use tags or metadata fields to categorize experiments\n",
+"- Document your evaluation methodology in metadata\n",
+"\n",
+"## Next Steps\n",
+"\n",
+"- Explore the Trismik dashboard to visualize trends and comparisons\n",
+"- Set up automated evaluation pipelines with result uploading\n",
+"- Try the **Adaptive Evaluations** notebook for efficient testing with automatic uploads"
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.0"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 4
+}

tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb
@@ -0,0 +1,229 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"id": "bc3ba3cd77800bb4",
+"metadata": {},
+"source": [
+"# Adaptive Evaluations with Scorebook - Evaluating an OpenAI GPT Model\n",
+"\n",
+"This quick-start guide showcases an adaptive evaluation of OpenAI's GPT-4o Mini model.\n",
+"\n",
+"We recommend that you first see our [getting started quick-start guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb) if you have not done so already, for more of a detailed overview on adaptive testing and setting up Trismik credentials.\n",
+"\n",
+"## Prerequisites\n",
+"\n",
+"- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
+"- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
+"- **OpenAI API key**: Generate an OpenAI API key from [OpenAI's API Platform](https://openai.com/api/).\n",
+"\n",
+"## Install Scorebook"
+]
+},
+{
+"metadata": {},
+"cell_type": "code",
+"source": [
+"!pip install scorebook\n",
+"# if you're running this locally, please run !pip install scorebook\"[examples, providers]\""
+],
+"id": "f454e876551a4a0c",
+"outputs": [],
+"execution_count": null
+},
+{
+"metadata": {},
+"cell_type": "markdown",
+"source": [
+"\n",
+"## Setup Credentials\n",
+"\n",
+"Enter your Trismik API key, project id and OpenAI API Key below."
+],
+"id": "cad992b287d4d0ac"
+},
+{
+"cell_type": "code",
+"id": "14e576282749edb7",
+"metadata": {},
+"source": [
+"# Set your credentials here\n",
+"TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
+"TRISMIK_PROJECT_ID = \"your-trismik-project-id-here\"\n",
+"OPENAI_API_KEY = \"your-openai-api-key-here\""
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "700950d039e4c0f6",
+"metadata": {},
+"source": [
+"## Login with Trismik API Key"
+]
+},
+{
+"cell_type": "code",
+"id": "initial_id",
+"metadata": {},
+"source": [
+"from scorebook import login\n",
+"\n",
+"# Login to Trismik\n",
+"login(TRISMIK_API_KEY)\n",
+"print(\"✓ Logged in to Trismik\")"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "13084db21e549ccf",
+"metadata": {},
+"source": [
+"## Define an Inference Function\n",
+"\n",
+"To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
+"\n",
+"An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
+"\n",
+"Accept:\n",
+"\n",
+"- A list of model inputs.\n",
+"- Hyperparameters which can be optionally accessed via kwargs.\n",
+"\n",
+"Return\n",
+"\n",
+"- A list of parsed model outputs for scoring."
+]
+},
+{
+"cell_type": "code",
+"id": "8aa99f513db6241a",
+"metadata": {},
+"source": [
+"from openai import OpenAI\n",
+"from typing import Any, List\n",
+"import string\n",
+"\n",
+"client = OpenAI(api_key=OPENAI_API_KEY)\n",
+"\n",
+"# define an inference function for GPT-4o Mini.\n",
+"def gpt4o_mini(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+"    \"\"\"Process inputs through OpenAI's API\"\"\"\n",
+"\n",
+"    outputs = []\n",
+"    for idx, input_item in enumerate(inputs):\n",
+"\n",
+"        # Format prompt\n",
+"        choices = input_item.get(\"options\", [])\n",
+"        prompt = (\n",
+"            str(input_item.get(\"question\", \"\"))\n",
+"            + \"\\nOptions:\\n\"\n",
+"            + \"\\n\".join(\n",
+"                f\"{letter}: {choice['text'] if isinstance(choice, dict) else choice}\"\n",
+"                for letter, choice in zip(string.ascii_uppercase, choices)\n",
+"            )\n",
+"        )\n",
+"\n",
+"        # Build messages for OpenAI API\n",
+"        messages = [\n",
+"            {\n",
+"                \"role\": \"system\",\n",
+"                \"content\": hyperparameters[\"system_message\"]\n",
+"            },\n",
+"            {\"role\": \"user\", \"content\": prompt},\n",
+"        ]\n",
+"\n",
+"        # Call OpenAI API and extract output from the response\n",
+"        try:\n",
+"            response = client.chat.completions.create(\n",
+"                model=\"gpt-4o-mini\",\n",
+"                messages=messages,\n",
+"                temperature=0.7,\n",
+"            )\n",
+"            output = response.choices[0].message.content.strip()\n",
+"\n",
+"        except Exception as e:\n",
+"            output = f\"Error: {str(e)}\"\n",
+"\n",
+"        outputs.append(output)\n",
+"\n",
+"    return outputs"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "efa5c3ea791bbcd1",
+"metadata": {},
+"source": [
+"## Run an Adaptive Evaluation\n",
+"\n",
+"When running an adaptive evaluation, we can use any single or multiple adaptive datasets and specify a split to be evaluated."
+]
+},
+{
+"cell_type": "code",
+"id": "3cbf1b2f13d5553e",
+"metadata": {},
+"source": [
+"from scorebook import evaluate\n",
+"\n",
+"# Run adaptive evaluation\n",
+"results = evaluate(\n",
+"    inference = gpt4o_mini,\n",
+"    datasets = \"trismik/figQA:adaptive\",\n",
+"    hyperparameters = {\"system_message\": \"Answer the question with only the letter of the correct option. No additional text or context\"},\n",
+"    split = \"validation\",\n",
+"    experiment_id = \"GPT-4o-Mini-Adaptive-Evaluation\",\n",
+"    project_id = TRISMIK_PROJECT_ID,\n",
+")\n",
+"\n",
+"# Print the adaptive evaluation results\n",
+"print(\"✓ Adaptive evaluation complete!\")\n",
+"print(\"Results: \", results[0][\"score\"])"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "d37cb5e87cc297fe",
+"metadata": {},
+"source": [
+"---\n",
+"\n",
+"## Next Steps\n",
+"\n",
+"- [Adaptive Testing White Paper](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): An in depth overview of the science behind the adaptive testing methodology.\n",
+"- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
+"- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+"- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.13.5"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}

tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb
@@ -0,0 +1,256 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"id": "bc3ba3cd77800bb4",
+"metadata": {},
+"source": [
+"# Adaptive Evaluations with Scorebook - Evaluating a Local Qwen Model\n",
+"\n",
+"This quick-start guide showcases an adaptive evaluation of Qwen's Qwen2.5 0.5B Instruct model.\n",
+"\n",
+"We recommend that you first see our [getting started quick-start guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb) if you have not done so already, for more of a detailed overview on adaptive testing and setting up Trismik credentials.\n",
+"\n",
+"## Prerequisites\n",
+"\n",
+"- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
+"- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
+"\n",
+"## Install Scorebook"
+]
+},
+{
+"metadata": {},
+"cell_type": "code",
+"source": [
+"!pip install scorebook\n",
+"# if you're running this locally, please run !pip install scorebook\"[examples]\""
+],
+"id": "90146caef86f19ee",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "cad992b287d4d0ac",
+"metadata": {},
+"source": [
+"## Setup Credentials\n",
+"\n",
+"Enter your Trismik API key and project id below."
+]
+},
+{
+"cell_type": "code",
+"id": "14e576282749edb7",
+"metadata": {},
+"source": [
+"# Set your credentials here\n",
+"TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
+"TRISMIK_PROJECT_ID = \"your-trismik-project-id-here\""
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "700950d039e4c0f6",
+"metadata": {},
+"source": [
+"## Login with Trismik API Key"
+]
+},
+{
+"cell_type": "code",
+"id": "initial_id",
+"metadata": {},
+"source": [
+"from scorebook import login\n",
+"\n",
+"# Login to Trismik\n",
+"login(TRISMIK_API_KEY)\n",
+"print(\"✓ Logged in to Trismik\")"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "609a95a43d8cfc2c",
+"metadata": {},
+"source": [
+"## Instantiate a Local Qwen Model\n",
+"\n",
+"For this quick-start guide, we will use the lightweight Qwen2.5 0.5B instruct model, via Hugging Face's transformers package."
+]
+},
+{
+"cell_type": "code",
+"id": "d1da8af72ef8de6f",
+"metadata": {},
+"source": [
+"import transformers\n",
+"\n",
+"# Instantiate a model\n",
+"pipeline = transformers.pipeline(\n",
+"    \"text-generation\",\n",
+"    model=\"Qwen/Qwen2.5-0.5B-Instruct\",\n",
+"    model_kwargs={\"torch_dtype\": \"auto\"},\n",
+"    device_map=\"auto\",\n",
+")\n",
+"\n",
+"print(\"✓ Transformers pipeline instantiated\")"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "13084db21e549ccf",
+"metadata": {},
+"source": [
+"## Define an Inference Function\n",
+"\n",
+"To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
+"\n",
+"An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
+"\n",
+"Accept:\n",
+"\n",
+"- A list of model inputs.\n",
+"- Hyperparameters which can be optionally accessed via kwargs.\n",
+"\n",
+"Return\n",
+"\n",
+"- A list of parsed model outputs for scoring."
+]
+},
+{
+"cell_type": "code",
+"id": "8aa99f513db6241a",
+"metadata": {},
+"source": [
+"from typing import Any, List\n",
+"import string\n",
+"\n",
+"# Define an inference function for the Qwen model.\n",
+"def qwen(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+"    \"\"\"Process inputs through Qwen model\"\"\"\n",
+"\n",
+"    outputs = []\n",
+"    for idx, input_item in enumerate(inputs):\n",
+"\n",
+"        # Format prompt\n",
+"        choices = input_item.get(\"options\", [])\n",
+"        prompt = (\n",
+"            str(input_item.get(\"question\", \"\"))\n",
+"            + \"\\nOptions:\\n\"\n",
+"            + \"\\n\".join(\n",
+"                f\"{letter}: {choice['text'] if isinstance(choice, dict) else choice}\"\n",
+"                for letter, choice in zip(string.ascii_uppercase, choices)\n",
+"            )\n",
+"        )\n",
+"\n",
+"        # Build messages for Qwen model\n",
+"        messages = [\n",
+"            {\n",
+"                \"role\": \"system\",\n",
+"                \"content\": hyperparameters[\"system_message\"]\n",
+"            },\n",
+"            {\"role\": \"user\", \"content\": prompt},\n",
+"        ]\n",
+"\n",
+"        # Run inference using the pipeline\n",
+"        try:\n",
+"            output = pipeline(\n",
+"                messages,\n",
+"                temperature = hyperparameters.get(\"temperature\", 0.7),\n",
+"                top_p = hyperparameters.get(\"top_p\", 0.9),\n",
+"                top_k = hyperparameters.get(\"top_k\", 50),\n",
+"                max_new_tokens = 512,\n",
+"                do_sample = hyperparameters.get(\"temperature\", 0.7) > 0,\n",
+"            )\n",
+"            response = output[0][\"generated_text\"][-1][\"content\"].strip()\n",
+"\n",
+"        except Exception as e:\n",
+"            response = f\"Error: {str(e)}\"\n",
+"\n",
+"        outputs.append(response)\n",
+"\n",
+"    return outputs"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "efa5c3ea791bbcd1",
+"metadata": {},
+"source": [
+"## Run an Adaptive Evaluation\n",
+"\n",
+"When running an adaptive evaluation, we can use any single or multiple adaptive datasets and specify a split to be evaluated."
+]
+},
+{
+"cell_type": "code",
+"id": "3cbf1b2f13d5553e",
+"metadata": {},
+"source": [
+"from scorebook import evaluate\n",
+"\n",
+"# Run adaptive evaluation\n",
+"results = evaluate(\n",
+"    inference = qwen,\n",
+"    datasets = \"trismik/figQA:adaptive\",\n",
+"    hyperparameters = {\"system_message\": \"Answer the question with only the letter of the correct option. No additional text or context\"},\n",
+"    split = \"validation\",\n",
+"    experiment_id = \"Qwen-2.5-0.5B-Adaptive-Evaluation\",\n",
+"    project_id = TRISMIK_PROJECT_ID,\n",
+")\n",
+"\n",
+"# Print the adaptive evaluation results\n",
+"print(\"✓ Adaptive evaluation complete!\")\n",
+"print(\"Results: \", results[0][\"score\"])"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "d37cb5e87cc297fe",
+"metadata": {},
+"source": [
+"---\n",
+"\n",
+"## Next Steps\n",
+"\n",
+"- [Adaptive Testing White Paper](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): An in depth overview of the science behind the adaptive testing methodology.\n",
+"- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
+"- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+"- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.12"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}