scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb
@@ -0,0 +1,256 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bc3ba3cd77800bb4",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Adaptive Evaluations with Scorebook - Evaluating a Local Qwen Model\n",
9
+ "\n",
10
+ "This quick-start guide showcases an adaptive evaluation of Qwen's Qwen2.5 0.5B Instruct model.\n",
11
+ "\n",
12
+ "We recommend that you first see our [getting started quick-start guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb) if you have not done so already, for more of a detailed overview on adaptive testing and setting up Trismik credentials.\n",
13
+ "\n",
14
+ "## Prerequisites\n",
15
+ "\n",
16
+ "- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
17
+ "- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
18
+ "\n",
19
+ "## Install Scorebook"
20
+ ]
21
+ },
22
+ {
23
+ "metadata": {},
24
+ "cell_type": "code",
25
+ "source": [
26
+ "!pip install scorebook\n",
27
+ "# if you're running this locally, please run !pip install scorebook\"[examples]\""
28
+ ],
29
+ "id": "90146caef86f19ee",
30
+ "outputs": [],
31
+ "execution_count": null
32
+ },
33
+ {
34
+ "cell_type": "markdown",
35
+ "id": "cad992b287d4d0ac",
36
+ "metadata": {},
37
+ "source": [
38
+ "## Setup Credentials\n",
39
+ "\n",
40
+ "Enter your Trismik API key and project id below."
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "id": "14e576282749edb7",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Set your credentials here\n",
49
+ "TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
50
+ "TRISMIK_PROJECT_ID = \"your-trismik-project-id-here\""
51
+ ],
52
+ "outputs": [],
53
+ "execution_count": null
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "id": "700950d039e4c0f6",
58
+ "metadata": {},
59
+ "source": [
60
+ "## Login with Trismik API Key"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "id": "initial_id",
66
+ "metadata": {},
67
+ "source": [
68
+ "from scorebook import login\n",
69
+ "\n",
70
+ "# Login to Trismik\n",
71
+ "login(TRISMIK_API_KEY)\n",
72
+ "print(\"✓ Logged in to Trismik\")"
73
+ ],
74
+ "outputs": [],
75
+ "execution_count": null
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "id": "609a95a43d8cfc2c",
80
+ "metadata": {},
81
+ "source": [
82
+ "## Instantiate a Local Qwen Model\n",
83
+ "\n",
84
+ "For this quick-start guide, we will use the lightweight Qwen2.5 0.5B instruct model, via Hugging Face's transformers package."
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "id": "d1da8af72ef8de6f",
90
+ "metadata": {},
91
+ "source": [
92
+ "import transformers\n",
93
+ "\n",
94
+ "# Instantiate a model\n",
95
+ "pipeline = transformers.pipeline(\n",
96
+ " \"text-generation\",\n",
97
+ " model=\"Qwen/Qwen2.5-0.5B-Instruct\",\n",
98
+ " model_kwargs={\"torch_dtype\": \"auto\"},\n",
99
+ " device_map=\"auto\",\n",
100
+ ")\n",
101
+ "\n",
102
+ "print(\"✓ Transformers pipeline instantiated\")"
103
+ ],
104
+ "outputs": [],
105
+ "execution_count": null
106
+ },
107
+ {
108
+ "cell_type": "markdown",
109
+ "id": "13084db21e549ccf",
110
+ "metadata": {},
111
+ "source": [
112
+ "## Define an Inference Function\n",
113
+ "\n",
114
+ "To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
115
+ "\n",
116
+ "An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
117
+ "\n",
118
+ "Accept:\n",
119
+ "\n",
120
+ "- A list of model inputs.\n",
121
+ "- Hyperparameters which can be optionally accessed via kwargs.\n",
122
+ "\n",
123
+ "Return\n",
124
+ "\n",
125
+ "- A list of parsed model outputs for scoring."
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "id": "8aa99f513db6241a",
131
+ "metadata": {},
132
+ "source": [
133
+ "from typing import Any, List\n",
134
+ "import string\n",
135
+ "\n",
136
+ "# Define an inference function for the Qwen model.\n",
137
+ "def qwen(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
138
+ " \"\"\"Process inputs through Qwen model\"\"\"\n",
139
+ "\n",
140
+ " outputs = []\n",
141
+ " for idx, input_item in enumerate(inputs):\n",
142
+ "\n",
143
+ " # Format prompt\n",
144
+ " choices = input_item.get(\"options\", [])\n",
145
+ " prompt = (\n",
146
+ " str(input_item.get(\"question\", \"\"))\n",
147
+ " + \"\\nOptions:\\n\"\n",
148
+ " + \"\\n\".join(\n",
149
+ " f\"{letter}: {choice['text'] if isinstance(choice, dict) else choice}\"\n",
150
+ " for letter, choice in zip(string.ascii_uppercase, choices)\n",
151
+ " )\n",
152
+ " )\n",
153
+ "\n",
154
+ " # Build messages for Qwen model\n",
155
+ " messages = [\n",
156
+ " {\n",
157
+ " \"role\": \"system\",\n",
158
+ " \"content\": hyperparameters[\"system_message\"]\n",
159
+ " },\n",
160
+ " {\"role\": \"user\", \"content\": prompt},\n",
161
+ " ]\n",
162
+ "\n",
163
+ " # Run inference using the pipeline\n",
164
+ " try:\n",
165
+ " output = pipeline(\n",
166
+ " messages,\n",
167
+ " temperature = hyperparameters.get(\"temperature\", 0.7),\n",
168
+ " top_p = hyperparameters.get(\"top_p\", 0.9),\n",
169
+ " top_k = hyperparameters.get(\"top_k\", 50),\n",
170
+ " max_new_tokens = 512,\n",
171
+ " do_sample = hyperparameters.get(\"temperature\", 0.7) > 0,\n",
172
+ " )\n",
173
+ " response = output[0][\"generated_text\"][-1][\"content\"].strip()\n",
174
+ "\n",
175
+ " except Exception as e:\n",
176
+ " response = f\"Error: {str(e)}\"\n",
177
+ "\n",
178
+ " outputs.append(response)\n",
179
+ "\n",
180
+ " return outputs"
181
+ ],
182
+ "outputs": [],
183
+ "execution_count": null
184
+ },
185
+ {
186
+ "cell_type": "markdown",
187
+ "id": "efa5c3ea791bbcd1",
188
+ "metadata": {},
189
+ "source": [
190
+ "## Run an Adaptive Evaluation\n",
191
+ "\n",
192
+ "When running an adaptive evaluation, we can use any single or multiple adaptive datasets and specify a split to be evaluated."
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "id": "3cbf1b2f13d5553e",
198
+ "metadata": {},
199
+ "source": [
200
+ "from scorebook import evaluate\n",
201
+ "\n",
202
+ "# Run adaptive evaluation\n",
203
+ "results = evaluate(\n",
204
+ " inference = qwen,\n",
205
+ " datasets = \"trismik/figQA:adaptive\",\n",
206
+ " hyperparameters = {\"system_message\": \"Answer the question with only the letter of the correct option. No additional text or context\"},\n",
207
+ " split = \"validation\",\n",
208
+ " experiment_id = \"Qwen-2.5-0.5B-Adaptive-Evaluation\",\n",
209
+ " project_id = TRISMIK_PROJECT_ID,\n",
210
+ ")\n",
211
+ "\n",
212
+ "# Print the adaptive evaluation results\n",
213
+ "print(\"✓ Adaptive evaluation complete!\")\n",
214
+ "print(\"Results: \", results[0][\"score\"])"
215
+ ],
216
+ "outputs": [],
217
+ "execution_count": null
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "id": "d37cb5e87cc297fe",
222
+ "metadata": {},
223
+ "source": [
224
+ "---\n",
225
+ "\n",
226
+ "## Next Steps\n",
227
+ "\n",
228
+ "- [Adaptive Testing White Paper](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): An in depth overview of the science behind the adaptive testing methodology.\n",
229
+ "- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
230
+ "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
231
+ "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
232
+ ]
233
+ }
234
+ ],
235
+ "metadata": {
236
+ "kernelspec": {
237
+ "display_name": "Python 3 (ipykernel)",
238
+ "language": "python",
239
+ "name": "python3"
240
+ },
241
+ "language_info": {
242
+ "codemirror_mode": {
243
+ "name": "ipython",
244
+ "version": 3
245
+ },
246
+ "file_extension": ".py",
247
+ "mimetype": "text/x-python",
248
+ "name": "python",
249
+ "nbconvert_exporter": "python",
250
+ "pygments_lexer": "ipython3",
251
+ "version": "3.11.12"
252
+ }
253
+ },
254
+ "nbformat": 4,
255
+ "nbformat_minor": 5
256
+ }
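Aside: the `qwen` inference function in the adaptive-evaluation notebook above builds a lettered multiple-choice prompt from each dataset item. The snippet below is a standalone sketch of that formatting step only; the example `item` is hypothetical and simply mirrors the `question`/`options` shape the notebook assumes.

```python
import string

# Hypothetical item in the shape the notebook's inference function expects:
# a dict with a "question" and a list of "options".
item = {
    "question": "Which planet is known as the Red Planet?",
    "options": ["Venus", "Mars", "Jupiter", "Saturn"],
}

# Same pattern as the notebook: pair each option with a letter (A, B, C, ...)
# and append the lettered options below the question text.
choices = item.get("options", [])
prompt = (
    str(item.get("question", ""))
    + "\nOptions:\n"
    + "\n".join(
        f"{letter}: {choice['text'] if isinstance(choice, dict) else choice}"
        for letter, choice in zip(string.ascii_uppercase, choices)
    )
)

print(prompt)
# Which planet is known as the Red Planet?
# Options:
# A: Venus
# B: Mars
# C: Jupiter
# D: Saturn
```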
tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb
@@ -0,0 +1,277 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b1ef68792a1c82f2",
6
+ "metadata": {},
7
+ "source": [
8
+ "# _Classical_ Evaluations with Scorebook\n",
9
+ "\n",
10
+ "Scorebook, developed by Trismik, is an open-source Python library for model evaluation. It supports both Trismik’s adaptive testing and traditional classical evaluations. In a classical evaluation, a model runs inference on every item in a dataset, and the results are scored using Scorebook’s built-in metrics, such as accuracy, to produce evaluation results. Evaluation results can be automatically uploaded to the Scorebook dashboard, organized by project, for storing, managing, and visualizing model evaluation experiments.\n",
11
+ "\n",
12
+ "## Prerequisites\n",
13
+ "\n",
14
+ "- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
15
+ "- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
16
+ "\n",
17
+ "### Install Scorebook\n"
18
+ ]
19
+ },
20
+ {
21
+ "metadata": {},
22
+ "cell_type": "code",
23
+ "source": "!pip install scorebook",
24
+ "id": "c2fecbb3d7b699b3",
25
+ "outputs": [],
26
+ "execution_count": null
27
+ },
28
+ {
29
+ "metadata": {},
30
+ "cell_type": "markdown",
31
+ "source": [
32
+ "\n",
33
+ "\n",
34
+ "### Setup Credentials\n",
35
+ "\n",
36
+ "Enter your Trismik API key and project id below."
37
+ ],
38
+ "id": "8e18d0e628fdb944"
39
+ },
40
+ {
41
+ "metadata": {},
42
+ "cell_type": "code",
43
+ "source": [
44
+ "# Set your credentials here\n",
45
+ "TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
46
+ "TRISMIK_PROJECT_ID = \"your-trismik-project-id-key-here\""
47
+ ],
48
+ "id": "4cb833e7b092ae4",
49
+ "outputs": [],
50
+ "execution_count": null
51
+ },
52
+ {
53
+ "metadata": {},
54
+ "cell_type": "markdown",
55
+ "source": "### Login with Trismik API Key",
56
+ "id": "ed4e8281ecb99685"
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "id": "f59fb493a575d361",
61
+ "metadata": {},
62
+ "source": [
63
+ "from scorebook import login\n",
64
+ "\n",
65
+ "login(TRISMIK_API_KEY)\n",
66
+ "print(\"✓ Logged in to Trismik\")"
67
+ ],
68
+ "outputs": [],
69
+ "execution_count": null
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "id": "cb58f57296229115",
74
+ "metadata": {},
75
+ "source": [
76
+ "\n",
77
+ "\n",
78
+ "## Evaluation Datasets\n",
79
+ "\n",
80
+ "A scorebook evaluation requires an evaluation dataset, represented by the `EvalDataset` class. Evaluation datasets can be constructed via a number of factory methods. In this example we will create a basic evaluation dataset from a list of evaluation items."
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "id": "fa7c936b75c83cad",
86
+ "metadata": {},
87
+ "source": [
88
+ "from scorebook import EvalDataset\n",
89
+ "from scorebook.metrics.accuracy import Accuracy\n",
90
+ "\n",
91
+ "# Create a sample dataset from a list of multiple-choice questions\n",
92
+ "evaluation_items = [\n",
93
+ " {\"question\": \"What is 2 + 2?\", \"answer\": \"4\"},\n",
94
+ " {\"question\": \"What is the capital of France?\", \"answer\": \"Paris\"},\n",
95
+ " {\"question\": \"Who wrote Romeo and Juliet?\", \"answer\": \"William Shakespeare\"},\n",
96
+ " {\"question\": \"What is the chemical symbol for gold?\", \"answer\": \"Au\"}\n",
97
+ "]\n",
98
+ "\n",
99
+ "# Create an EvalDataset from the list\n",
100
+ "dataset = EvalDataset.from_list(\n",
101
+ " name = \"sample_multiple_choice\",\n",
102
+ " metrics = Accuracy,\n",
103
+ " items = evaluation_items,\n",
104
+ " input = \"question\",\n",
105
+ " label = \"answer\",\n",
106
+ ")\n",
107
+ "\n",
108
+ "print(f\"✓ Created dataset with {len(dataset.items)} items\")"
109
+ ],
110
+ "outputs": [],
111
+ "execution_count": null
112
+ },
113
+ {
114
+ "cell_type": "markdown",
115
+ "id": "609a95a43d8cfc2c",
116
+ "metadata": {},
117
+ "source": [
118
+ "## Preparing Models for Evaluation\n",
119
+ "\n",
120
+ "To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
121
+ "\n",
122
+ "### Instantiate a Local Qwen Model\n",
123
+ "\n",
124
+ "For this quick-start guide, we will use the lightweight Qwen2.5 0.5B instruct model, via Hugging Face's transformers package."
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "id": "d1da8af72ef8de6f",
130
+ "metadata": {},
131
+ "source": [
132
+ "import transformers\n",
133
+ "\n",
134
+ "# Instantiate a model\n",
135
+ "pipeline = transformers.pipeline(\n",
136
+ " \"text-generation\",\n",
137
+ " model=\"Qwen/Qwen2.5-0.5B-Instruct\",\n",
138
+ " model_kwargs={\"torch_dtype\": \"auto\"},\n",
139
+ " device_map=\"auto\",\n",
140
+ ")\n",
141
+ "\n",
142
+ "print(\"✓ Transformers pipeline instantiated\")"
143
+ ],
144
+ "outputs": [],
145
+ "execution_count": null
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "id": "3b56a72374920220",
150
+ "metadata": {},
151
+ "source": [
152
+ "### Define an Inference Function\n",
153
+ "\n",
154
+ "An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
155
+ "\n",
156
+ "Accept:\n",
157
+ "\n",
158
+ "- A list of model inputs.\n",
159
+ "- Hyperparameters which can be optionally accessed via kwargs.\n",
160
+ "\n",
161
+ "Return\n",
162
+ "\n",
163
+ "- A list of parsed model outputs for scoring.\n"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "id": "55f6d1eee2fa886e",
169
+ "metadata": {},
170
+ "source": [
171
+ "from typing import Any, List\n",
172
+ "\n",
173
+ "# Define an inference function for the Qwen model.\n",
174
+ "def qwen(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
175
+ " \"\"\"Run inference on a list of inputs using the 0.5B Qwen model.\"\"\"\n",
176
+ " inference_outputs = []\n",
177
+ "\n",
178
+ " for model_input in inputs:\n",
179
+ " messages = [\n",
180
+ " {\"role\": \"system\", \"content\": hyperparameters.get(\"system_message\", \"You are a helpful assistant.\")},\n",
181
+ " {\"role\": \"user\", \"content\": str(model_input)},\n",
182
+ " ]\n",
183
+ "\n",
184
+ " output = pipeline(\n",
185
+ " messages,\n",
186
+ " temperature = hyperparameters.get(\"temperature\", 0.7),\n",
187
+ " top_p = hyperparameters.get(\"top_p\", 0.9),\n",
188
+ " top_k = hyperparameters.get(\"top_k\", 50),\n",
189
+ " max_new_tokens = 512,\n",
190
+ " do_sample = hyperparameters.get(\"temperature\", 0.7) > 0,\n",
191
+ " )\n",
192
+ "\n",
193
+ " inference_outputs.append(output[0][\"generated_text\"][-1][\"content\"])\n",
194
+ "\n",
195
+ " return inference_outputs\n",
196
+ "\n",
197
+ "print(\"✓ Inference function for Qwen2.5 0.5B defined\")\n",
198
+ "print(qwen([\"Hello!\"]))"
199
+ ],
200
+ "outputs": [],
201
+ "execution_count": null
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "id": "1230b9ad762482af",
206
+ "metadata": {},
207
+ "source": [
208
+ "## Running an Evaluation\n",
209
+ "\n",
210
+ "Running a scorebook evaluation with `evaluate` only requires an inference function and a dataset. When uploading results to Trismik's dashboard, an experiment and project id are also required. We can also specify in hyperparameters, which are passed to the inference function."
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "id": "7031045c655d1062",
216
+ "metadata": {},
217
+ "source": [
218
+ "from scorebook import evaluate\n",
219
+ "\n",
220
+ "# Run evaluation\n",
221
+ "results = evaluate(\n",
222
+ " inference=qwen,\n",
223
+ " datasets = dataset,\n",
224
+ " hyperparameters = {\n",
225
+ " 'temperature': 0.9,\n",
226
+ " 'top_p': 0.8,\n",
227
+ " 'top_k': 40,\n",
228
+ " 'system_message': \"Answer the question directly, provide no additional context.\"\n",
229
+ " },\n",
230
+ " experiment_id = \"Qwen-Classical-Evaluation\",\n",
231
+ " project_id = TRISMIK_PROJECT_ID,\n",
232
+ ")\n",
233
+ "\n",
234
+ "print(\"Qwen2.5 0.5B Evaluation Results:\")\n",
235
+ "print(f\"accuracy: {results[0]['accuracy']}\")"
236
+ ],
237
+ "outputs": [],
238
+ "execution_count": null
239
+ },
240
+ {
241
+ "metadata": {},
242
+ "cell_type": "markdown",
243
+ "source": [
244
+ "The results are encapsulated within a list of dictionaries, with a dict for each evaluation run. the above example only executes a single run as only 1 dataset, and hyperparameter configuration is evaluated.\n",
245
+ "\n",
246
+ "---\n",
247
+ "\n",
248
+ "## Next Steps\n",
249
+ "\n",
250
+ "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
251
+ "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
252
+ ],
253
+ "id": "61d9ff67d63624d6"
254
+ }
255
+ ],
256
+ "metadata": {
257
+ "kernelspec": {
258
+ "display_name": "Python 3 (ipykernel)",
259
+ "language": "python",
260
+ "name": "python3"
261
+ },
262
+ "language_info": {
263
+ "codemirror_mode": {
264
+ "name": "ipython",
265
+ "version": 3
266
+ },
267
+ "file_extension": ".py",
268
+ "mimetype": "text/x-python",
269
+ "name": "python",
270
+ "nbconvert_exporter": "python",
271
+ "pygments_lexer": "ipython3",
272
+ "version": "3.13.5"
273
+ }
274
+ },
275
+ "nbformat": 4,
276
+ "nbformat_minor": 5
277
+ }
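To recap the contract both notebooks rely on, here is a minimal end-to-end sketch. It assumes the `EvalDataset.from_list`, `Accuracy`, and `evaluate` usage shown in the diffs above, and that `evaluate` can run without `experiment_id`/`project_id` when results are not uploaded, as the classical notebook's prose suggests; `canned_model` and its canned answers are illustrative stand-ins for a real model.

```python
from typing import Any, List

from scorebook import EvalDataset, evaluate
from scorebook.metrics.accuracy import Accuracy

# Hypothetical stand-in "model": looks up a canned answer per question.
CANNED = {
    "What is 2 + 2?": "4",
    "What is the capital of France?": "Paris",
}


def canned_model(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
    """Inference-function contract: a list of inputs in, a list of outputs out."""
    # Assumes Scorebook passes the "question" field values as inputs
    # (because input="question" below).
    return [CANNED.get(str(model_input), "unknown") for model_input in inputs]


# Build a small dataset with the same factory call as the classical notebook.
dataset = EvalDataset.from_list(
    name="canned_questions",
    metrics=Accuracy,
    items=[
        {"question": "What is 2 + 2?", "answer": "4"},
        {"question": "What is the capital of France?", "answer": "Paris"},
    ],
    input="question",
    label="answer",
)

# Per the notebook's prose, experiment_id/project_id are only needed when
# uploading results to the Trismik dashboard, so they are omitted here.
results = evaluate(inference=canned_model, datasets=dataset)

# Results are a list of dicts, one per evaluation run, keyed as in the notebook.
print(results[0]["accuracy"])
```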