scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
  23. scorebook-0.0.16.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
tutorials/notebooks/4-uploading_results.ipynb
@@ -0,0 +1,175 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": "# Uploading Results to Trismik Dashboard\n\nThis notebook demonstrates three ways to upload evaluation results to Trismik's dashboard for tracking and visualization.\n\n## Why Upload Results?\n\n- **Track Progress**: Monitor model performance over time\n- **Compare Models**: Visualize performance across different models and experiments\n- **Share Results**: Collaborate with your team on evaluation insights\n- **Historical Analysis**: Maintain a record of all evaluations\n\n## Prerequisites\n\n- **Trismik API key**: Get yours at https://app.trismik.com/settings\n- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID"
7
+ },
8
+ {
9
+ "cell_type": "markdown",
10
+ "metadata": {},
11
+ "source": "## Setup Credentials\n\nSet your Trismik credentials here:"
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "metadata": {},
16
+ "source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"",
17
+ "outputs": [],
18
+ "execution_count": null
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "metadata": {},
23
+ "source": [
24
+ "from pprint import pprint\n",
25
+ "from scorebook import score, login\n",
26
+ "from scorebook.metrics.accuracy import Accuracy"
27
+ ],
28
+ "outputs": [],
29
+ "execution_count": null
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {},
34
+ "source": [
35
+ "## Login to Trismik"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "metadata": {},
41
+ "source": "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n\nlogin(TRISMIK_API_KEY)\nprint(\"✓ Logged in to Trismik\")\n\nif not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n\nprint(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")",
42
+ "outputs": [],
43
+ "execution_count": null
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "metadata": {},
48
+ "source": [
49
+ "## Method 1: Upload score() Results\n",
50
+ "\n",
51
+ "Score pre-computed outputs and upload to Trismik:"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "metadata": {},
57
+ "source": "# Prepare items with pre-computed outputs\nitems = [\n {\"input\": \"What is 2 + 2?\", \"output\": \"4\", \"label\": \"4\"},\n {\"input\": \"What is the capital of France?\", \"output\": \"Paris\", \"label\": \"Paris\"},\n {\"input\": \"Who wrote Romeo and Juliet?\", \"output\": \"William Shakespeare\", \"label\": \"William Shakespeare\"},\n {\"input\": \"What is 5 * 6?\", \"output\": \"30\", \"label\": \"30\"},\n {\"input\": \"What is the largest planet?\", \"output\": \"Jupiter\", \"label\": \"Jupiter\"},\n]\n\n# Score and upload\nresults = score(\n items=items,\n metrics=Accuracy,\n dataset_name=\"basic_questions\",\n model_name=\"example-model-v1\",\n experiment_id=\"Score-Upload-Notebook\",\n project_id=TRISMIK_PROJECT_ID,\n metadata={\n \"description\": \"Example from Jupyter notebook\",\n \"note\": \"Pre-computed outputs uploaded via score()\",\n },\n upload_results=True, # Enable uploading\n)\n\nprint(f\"\\n✓ Results uploaded successfully!\")\nprint(f\"Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}\")",
58
+ "outputs": [],
59
+ "execution_count": null
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "metadata": {},
64
+ "source": [
65
+ "## Method 2: Upload evaluate() Results\n",
66
+ "\n",
67
+ "Run inference and automatically upload results:"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "metadata": {},
73
+ "source": "from typing import Any, List\nfrom scorebook import EvalDataset, evaluate\n\n# Create a simple dataset\nimport json\nfrom pathlib import Path\n\nsample_data = [\n {\"question\": \"What is 10 + 5?\", \"answer\": \"15\"},\n {\"question\": \"What is the capital of Spain?\", \"answer\": \"Madrid\"},\n]\n\ntemp_file = Path(\"temp_eval_dataset.json\")\nwith open(temp_file, \"w\") as f:\n json.dump(sample_data, f)\n\ndataset = EvalDataset.from_json(\n path=str(temp_file),\n metrics=\"accuracy\",\n input=\"question\",\n label=\"answer\",\n)\n\n# Define a simple inference function (mock)\ndef mock_inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n \"\"\"Mock inference that returns the expected answers.\"\"\"\n # In practice, this would call your model\n return [\"15\", \"Madrid\"] # Mock perfect answers\n\n# Run evaluation with upload\neval_results = evaluate(\n mock_inference,\n dataset,\n hyperparameters={\"temperature\": 0.7},\n experiment_id=\"Evaluate-Upload-Notebook\",\n project_id=TRISMIK_PROJECT_ID,\n metadata={\n \"model\": \"mock-model\",\n \"description\": \"Evaluation results from notebook\",\n },\n return_aggregates=True,\n return_items=True,\n return_output=True,\n)\n\nprint(f\"\\n✓ Evaluation results uploaded!\")\nprint(f\"Accuracy: {eval_results['aggregate_results'][0]['accuracy']:.2%}\")\n\n# Cleanup\ntemp_file.unlink()",
74
+ "outputs": [],
75
+ "execution_count": null
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "metadata": {},
80
+ "source": [
81
+ "## Method 3: Upload External Results\n",
82
+ "\n",
83
+ "Import results from external evaluation frameworks or historical data:"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "metadata": {},
89
+ "source": "# Example: Import results from another evaluation framework\nexternal_results = [\n {\"input\": \"Translate 'hello' to Spanish\", \"output\": \"hola\", \"label\": \"hola\"},\n {\"input\": \"Translate 'goodbye' to Spanish\", \"output\": \"adiós\", \"label\": \"adiós\"},\n {\"input\": \"Translate 'thank you' to Spanish\", \"output\": \"gracias\", \"label\": \"gracias\"},\n {\"input\": \"Translate 'please' to Spanish\", \"output\": \"por favor\", \"label\": \"por favor\"},\n]\n\n# Upload external results\nexternal_upload = score(\n items=external_results,\n metrics=\"accuracy\",\n dataset_name=\"spanish_translation\",\n model_name=\"external-translator-v2\",\n experiment_id=\"External-Results-Upload\",\n project_id=TRISMIK_PROJECT_ID,\n metadata={\n \"description\": \"Historical results imported from external framework\",\n \"source\": \"Custom evaluation pipeline\",\n \"date\": \"2025-01-15\",\n },\n upload_results=True,\n)\n\nprint(f\"\\n✓ External results uploaded!\")\nprint(f\"Accuracy: {external_upload['aggregate_results'][0]['accuracy']:.2%}\")",
90
+ "outputs": [],
91
+ "execution_count": null
92
+ },
93
+ {
94
+ "cell_type": "markdown",
95
+ "metadata": {},
96
+ "source": [
97
+ "## View Results on Dashboard\n",
98
+ "\n",
99
+ "All uploaded results are now visible on your Trismik dashboard:"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "metadata": {},
105
+ "source": "from IPython.display import display, Markdown\n\ndashboard_url = f\"https://app.trismik.com/projects/{TRISMIK_PROJECT_ID}\"\ndisplay(Markdown(f\"### 📊 [View All Results on Dashboard]({dashboard_url})\"))\nprint(f\"\\nDirect link: {dashboard_url}\")\nprint(\"\\nYou should see three experiments:\")\nprint(\" 1. Score-Upload-Notebook\")\nprint(\" 2. Evaluate-Upload-Notebook\")\nprint(\" 3. External-Results-Upload\")",
106
+ "outputs": [],
107
+ "execution_count": null
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "metadata": {},
112
+ "source": [
113
+ "## Organizing Results with Metadata\n",
114
+ "\n",
115
+ "Use metadata to add context and organization to your results:"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "metadata": {},
121
+ "source": "# Example: Organizing a model comparison experiment\nmodels_to_test = [\n {\"name\": \"model-a\", \"version\": \"1.0\"},\n {\"name\": \"model-b\", \"version\": \"2.0\"},\n]\n\ntest_items = [\n {\"output\": \"positive\", \"label\": \"positive\"},\n {\"output\": \"negative\", \"label\": \"negative\"},\n]\n\nfor model_info in models_to_test:\n result = score(\n items=test_items,\n metrics=Accuracy,\n dataset_name=\"sentiment_test\",\n model_name=model_info[\"name\"],\n experiment_id=\"Model-Comparison-Notebook\",\n project_id=TRISMIK_PROJECT_ID,\n metadata={\n \"model_version\": model_info[\"version\"],\n \"comparison_group\": \"sentiment_analysis\",\n \"date\": \"2025-01-26\",\n \"notes\": f\"Testing {model_info['name']} v{model_info['version']}\",\n },\n upload_results=True,\n )\n print(f\"✓ Uploaded results for {model_info['name']} v{model_info['version']}\")",
122
+ "outputs": [],
123
+ "execution_count": null
124
+ },
125
+ {
126
+ "cell_type": "markdown",
127
+ "metadata": {},
128
+ "source": [
129
+ "## Best Practices\n",
130
+ "\n",
131
+ "### Experiment Naming\n",
132
+ "- Use descriptive `experiment_id` values (e.g., \"GPT4-MMLU-Baseline\")\n",
133
+ "- Group related runs under the same experiment ID\n",
134
+ "- Use different experiment IDs for different types of tests\n",
135
+ "\n",
136
+ "### Metadata\n",
137
+ "- Include model version, hyperparameters, and configuration\n",
138
+ "- Add timestamps and descriptions for historical tracking\n",
139
+ "- Use consistent keys across experiments for easy comparison\n",
140
+ "\n",
141
+ "### Organization\n",
142
+ "- Create separate projects for different use cases\n",
143
+ "- Use tags or metadata fields to categorize experiments\n",
144
+ "- Document your evaluation methodology in metadata\n",
145
+ "\n",
146
+ "## Next Steps\n",
147
+ "\n",
148
+ "- Explore the Trismik dashboard to visualize trends and comparisons\n",
149
+ "- Set up automated evaluation pipelines with result uploading\n",
150
+ "- Try the **Adaptive Evaluations** notebook for efficient testing with automatic uploads"
151
+ ]
152
+ }
153
+ ],
154
+ "metadata": {
155
+ "kernelspec": {
156
+ "display_name": "Python 3",
157
+ "language": "python",
158
+ "name": "python3"
159
+ },
160
+ "language_info": {
161
+ "codemirror_mode": {
162
+ "name": "ipython",
163
+ "version": 3
164
+ },
165
+ "file_extension": ".py",
166
+ "mimetype": "text/x-python",
167
+ "name": "python",
168
+ "nbconvert_exporter": "python",
169
+ "pygments_lexer": "ipython3",
170
+ "version": "3.11.0"
171
+ }
172
+ },
173
+ "nbformat": 4,
174
+ "nbformat_minor": 4
175
+ }
tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb
@@ -0,0 +1,229 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bc3ba3cd77800bb4",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Adaptive Evaluations with Scorebook - Evaluating an OpenAI GPT Model\n",
9
+ "\n",
10
+ "This quick-start guide showcases an adaptive evaluation of OpenAI's GPT-4o Mini model.\n",
11
+ "\n",
12
+ "We recommend that you first see our [getting started quick-start guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb) if you have not done so already, for more of a detailed overview on adaptive testing and setting up Trismik credentials.\n",
13
+ "\n",
14
+ "## Prerequisites\n",
15
+ "\n",
16
+ "- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
17
+ "- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
18
+ "- **OpenAI API key**: Generate an OpenAI API key from [OpenAI's API Platform](https://openai.com/api/).\n",
19
+ "\n",
20
+ "## Install Scorebook"
21
+ ]
22
+ },
23
+ {
24
+ "metadata": {},
25
+ "cell_type": "code",
26
+ "source": [
27
+ "!pip install scorebook\n",
28
+ "# if you're running this locally, please run !pip install scorebook\"[examples, providers]\""
29
+ ],
30
+ "id": "f454e876551a4a0c",
31
+ "outputs": [],
32
+ "execution_count": null
33
+ },
34
+ {
35
+ "metadata": {},
36
+ "cell_type": "markdown",
37
+ "source": [
38
+ "\n",
39
+ "## Setup Credentials\n",
40
+ "\n",
41
+ "Enter your Trismik API key, project id and OpenAI API Key below."
42
+ ],
43
+ "id": "cad992b287d4d0ac"
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "id": "14e576282749edb7",
48
+ "metadata": {},
49
+ "source": [
50
+ "# Set your credentials here\n",
51
+ "TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
52
+ "TRISMIK_PROJECT_ID = \"your-trismik-project-id-here\"\n",
53
+ "OPENAI_API_KEY = \"your-openai-api-key-here\""
54
+ ],
55
+ "outputs": [],
56
+ "execution_count": null
57
+ },
58
+ {
59
+ "cell_type": "markdown",
60
+ "id": "700950d039e4c0f6",
61
+ "metadata": {},
62
+ "source": [
63
+ "## Login with Trismik API Key"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "id": "initial_id",
69
+ "metadata": {},
70
+ "source": [
71
+ "from scorebook import login\n",
72
+ "\n",
73
+ "# Login to Trismik\n",
74
+ "login(TRISMIK_API_KEY)\n",
75
+ "print(\"✓ Logged in to Trismik\")"
76
+ ],
77
+ "outputs": [],
78
+ "execution_count": null
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "id": "13084db21e549ccf",
83
+ "metadata": {},
84
+ "source": [
85
+ "## Define an Inference Function\n",
86
+ "\n",
87
+ "To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
88
+ "\n",
89
+ "An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
90
+ "\n",
91
+ "Accept:\n",
92
+ "\n",
93
+ "- A list of model inputs.\n",
94
+ "- Hyperparameters which can be optionally accessed via kwargs.\n",
95
+ "\n",
96
+ "Return\n",
97
+ "\n",
98
+ "- A list of parsed model outputs for scoring."
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "id": "8aa99f513db6241a",
104
+ "metadata": {},
105
+ "source": [
106
+ "from openai import OpenAI\n",
107
+ "from typing import Any, List\n",
108
+ "import string\n",
109
+ "\n",
110
+ "client = OpenAI(api_key=OPENAI_API_KEY)\n",
111
+ "\n",
112
+ "# define an inference function for GPT-4o Mini.\n",
113
+ "def gpt4o_mini(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
114
+ " \"\"\"Process inputs through OpenAI's API\"\"\"\n",
115
+ "\n",
116
+ " outputs = []\n",
117
+ " for idx, input_item in enumerate(inputs):\n",
118
+ "\n",
119
+ " # Format prompt\n",
120
+ " choices = input_item.get(\"options\", [])\n",
121
+ " prompt = (\n",
122
+ " str(input_item.get(\"question\", \"\"))\n",
123
+ " + \"\\nOptions:\\n\"\n",
124
+ " + \"\\n\".join(\n",
125
+ " f\"{letter}: {choice['text'] if isinstance(choice, dict) else choice}\"\n",
126
+ " for letter, choice in zip(string.ascii_uppercase, choices)\n",
127
+ " )\n",
128
+ " )\n",
129
+ "\n",
130
+ " # Build messages for OpenAI API\n",
131
+ " messages = [\n",
132
+ " {\n",
133
+ " \"role\": \"system\",\n",
134
+ " \"content\": hyperparameters[\"system_message\"]\n",
135
+ " },\n",
136
+ " {\"role\": \"user\", \"content\": prompt},\n",
137
+ " ]\n",
138
+ "\n",
139
+ " # Call OpenAI API and extract output from the response\n",
140
+ " try:\n",
141
+ " response = client.chat.completions.create(\n",
142
+ " model=\"gpt-4o-mini\",\n",
143
+ " messages=messages,\n",
144
+ " temperature=0.7,\n",
145
+ " )\n",
146
+ " output = response.choices[0].message.content.strip()\n",
147
+ "\n",
148
+ " except Exception as e:\n",
149
+ " output = f\"Error: {str(e)}\"\n",
150
+ "\n",
151
+ " outputs.append(output)\n",
152
+ "\n",
153
+ " return outputs"
154
+ ],
155
+ "outputs": [],
156
+ "execution_count": null
157
+ },
158
+ {
159
+ "cell_type": "markdown",
160
+ "id": "efa5c3ea791bbcd1",
161
+ "metadata": {},
162
+ "source": [
163
+ "## Run an Adaptive Evaluation\n",
164
+ "\n",
165
+ "When running an adaptive evaluation, we can use any single or multiple adaptive datasets and specify a split to be evaluated."
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "id": "3cbf1b2f13d5553e",
171
+ "metadata": {},
172
+ "source": [
173
+ "from scorebook import evaluate\n",
174
+ "\n",
175
+ "# Run adaptive evaluation\n",
176
+ "results = evaluate(\n",
177
+ " inference = gpt4o_mini,\n",
178
+ " datasets = \"trismik/figQA:adaptive\",\n",
179
+ " hyperparameters = {\"system_message\": \"Answer the question with only the letter of the correct option. No additional text or context\"},\n",
180
+ " split = \"validation\",\n",
181
+ " experiment_id = \"GPT-4o-Mini-Adaptive-Evaluation\",\n",
182
+ " project_id = TRISMIK_PROJECT_ID,\n",
183
+ ")\n",
184
+ "\n",
185
+ "# Print the adaptive evaluation results\n",
186
+ "print(\"✓ Adaptive evaluation complete!\")\n",
187
+ "print(\"Results: \", results[0][\"score\"])"
188
+ ],
189
+ "outputs": [],
190
+ "execution_count": null
191
+ },
192
+ {
193
+ "cell_type": "markdown",
194
+ "id": "d37cb5e87cc297fe",
195
+ "metadata": {},
196
+ "source": [
197
+ "---\n",
198
+ "\n",
199
+ "## Next Steps\n",
200
+ "\n",
201
+ "- [Adaptive Testing White Paper](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): An in depth overview of the science behind the adaptive testing methodology.\n",
202
+ "- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
203
+ "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
204
+ "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
205
+ ]
206
+ }
207
+ ],
208
+ "metadata": {
209
+ "kernelspec": {
210
+ "display_name": "Python 3 (ipykernel)",
211
+ "language": "python",
212
+ "name": "python3"
213
+ },
214
+ "language_info": {
215
+ "codemirror_mode": {
216
+ "name": "ipython",
217
+ "version": 3
218
+ },
219
+ "file_extension": ".py",
220
+ "mimetype": "text/x-python",
221
+ "name": "python",
222
+ "nbconvert_exporter": "python",
223
+ "pygments_lexer": "ipython3",
224
+ "version": "3.13.5"
225
+ }
226
+ },
227
+ "nbformat": 4,
228
+ "nbformat_minor": 5
229
+ }
tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb
@@ -0,0 +1,256 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bc3ba3cd77800bb4",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Adaptive Evaluations with Scorebook - Evaluating a Local Qwen Model\n",
9
+ "\n",
10
+ "This quick-start guide showcases an adaptive evaluation of Qwen's Qwen2.5 0.5B Instruct model.\n",
11
+ "\n",
12
+ "We recommend that you first see our [getting started quick-start guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb) if you have not done so already, for more of a detailed overview on adaptive testing and setting up Trismik credentials.\n",
13
+ "\n",
14
+ "## Prerequisites\n",
15
+ "\n",
16
+ "- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
17
+ "- **Trismik Project Id**: We recommend you use the project id generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
18
+ "\n",
19
+ "## Install Scorebook"
20
+ ]
21
+ },
22
+ {
23
+ "metadata": {},
24
+ "cell_type": "code",
25
+ "source": [
26
+ "!pip install scorebook\n",
27
+ "# if you're running this locally, please run !pip install scorebook\"[examples]\""
28
+ ],
29
+ "id": "90146caef86f19ee",
30
+ "outputs": [],
31
+ "execution_count": null
32
+ },
33
+ {
34
+ "cell_type": "markdown",
35
+ "id": "cad992b287d4d0ac",
36
+ "metadata": {},
37
+ "source": [
38
+ "## Setup Credentials\n",
39
+ "\n",
40
+ "Enter your Trismik API key and project id below."
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "id": "14e576282749edb7",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Set your credentials here\n",
49
+ "TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
50
+ "TRISMIK_PROJECT_ID = \"your-trismik-project-id-here\""
51
+ ],
52
+ "outputs": [],
53
+ "execution_count": null
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "id": "700950d039e4c0f6",
58
+ "metadata": {},
59
+ "source": [
60
+ "## Login with Trismik API Key"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "id": "initial_id",
66
+ "metadata": {},
67
+ "source": [
68
+ "from scorebook import login\n",
69
+ "\n",
70
+ "# Login to Trismik\n",
71
+ "login(TRISMIK_API_KEY)\n",
72
+ "print(\"✓ Logged in to Trismik\")"
73
+ ],
74
+ "outputs": [],
75
+ "execution_count": null
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "id": "609a95a43d8cfc2c",
80
+ "metadata": {},
81
+ "source": [
82
+ "## Instantiate a Local Qwen Model\n",
83
+ "\n",
84
+ "For this quick-start guide, we will use the lightweight Qwen2.5 0.5B instruct model, via Hugging Face's transformers package."
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "id": "d1da8af72ef8de6f",
90
+ "metadata": {},
91
+ "source": [
92
+ "import transformers\n",
93
+ "\n",
94
+ "# Instantiate a model\n",
95
+ "pipeline = transformers.pipeline(\n",
96
+ " \"text-generation\",\n",
97
+ " model=\"Qwen/Qwen2.5-0.5B-Instruct\",\n",
98
+ " model_kwargs={\"torch_dtype\": \"auto\"},\n",
99
+ " device_map=\"auto\",\n",
100
+ ")\n",
101
+ "\n",
102
+ "print(\"✓ Transformers pipeline instantiated\")"
103
+ ],
104
+ "outputs": [],
105
+ "execution_count": null
106
+ },
107
+ {
108
+ "cell_type": "markdown",
109
+ "id": "13084db21e549ccf",
110
+ "metadata": {},
111
+ "source": [
112
+ "## Define an Inference Function\n",
113
+ "\n",
114
+ "To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass these to the model for inference, collect and return outputs generated.\n",
115
+ "\n",
116
+ "An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined, the only requirements are the function signature. An inference function must,\n",
117
+ "\n",
118
+ "Accept:\n",
119
+ "\n",
120
+ "- A list of model inputs.\n",
121
+ "- Hyperparameters which can be optionally accessed via kwargs.\n",
122
+ "\n",
123
+ "Return\n",
124
+ "\n",
125
+ "- A list of parsed model outputs for scoring."
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "id": "8aa99f513db6241a",
131
+ "metadata": {},
132
+ "source": [
133
+ "from typing import Any, List\n",
134
+ "import string\n",
135
+ "\n",
136
+ "# Define an inference function for the Qwen model.\n",
137
+ "def qwen(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
138
+ " \"\"\"Process inputs through Qwen model\"\"\"\n",
139
+ "\n",
140
+ " outputs = []\n",
141
+ " for idx, input_item in enumerate(inputs):\n",
142
+ "\n",
143
+ " # Format prompt\n",
144
+ " choices = input_item.get(\"options\", [])\n",
145
+ " prompt = (\n",
146
+ " str(input_item.get(\"question\", \"\"))\n",
147
+ " + \"\\nOptions:\\n\"\n",
148
+ " + \"\\n\".join(\n",
149
+ " f\"{letter}: {choice['text'] if isinstance(choice, dict) else choice}\"\n",
150
+ " for letter, choice in zip(string.ascii_uppercase, choices)\n",
151
+ " )\n",
152
+ " )\n",
153
+ "\n",
154
+ " # Build messages for Qwen model\n",
155
+ " messages = [\n",
156
+ " {\n",
157
+ " \"role\": \"system\",\n",
158
+ " \"content\": hyperparameters[\"system_message\"]\n",
159
+ " },\n",
160
+ " {\"role\": \"user\", \"content\": prompt},\n",
161
+ " ]\n",
162
+ "\n",
163
+ " # Run inference using the pipeline\n",
164
+ " try:\n",
165
+ " output = pipeline(\n",
166
+ " messages,\n",
167
+ " temperature = hyperparameters.get(\"temperature\", 0.7),\n",
168
+ " top_p = hyperparameters.get(\"top_p\", 0.9),\n",
169
+ " top_k = hyperparameters.get(\"top_k\", 50),\n",
170
+ " max_new_tokens = 512,\n",
171
+ " do_sample = hyperparameters.get(\"temperature\", 0.7) > 0,\n",
172
+ " )\n",
173
+ " response = output[0][\"generated_text\"][-1][\"content\"].strip()\n",
174
+ "\n",
175
+ " except Exception as e:\n",
176
+ " response = f\"Error: {str(e)}\"\n",
177
+ "\n",
178
+ " outputs.append(response)\n",
179
+ "\n",
180
+ " return outputs"
181
+ ],
182
+ "outputs": [],
183
+ "execution_count": null
184
+ },
185
+ {
186
+ "cell_type": "markdown",
187
+ "id": "efa5c3ea791bbcd1",
188
+ "metadata": {},
189
+ "source": [
190
+ "## Run an Adaptive Evaluation\n",
191
+ "\n",
192
+ "When running an adaptive evaluation, we can use any single or multiple adaptive datasets and specify a split to be evaluated."
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "id": "3cbf1b2f13d5553e",
198
+ "metadata": {},
199
+ "source": [
200
+ "from scorebook import evaluate\n",
201
+ "\n",
202
+ "# Run adaptive evaluation\n",
203
+ "results = evaluate(\n",
204
+ " inference = qwen,\n",
205
+ " datasets = \"trismik/figQA:adaptive\",\n",
206
+ " hyperparameters = {\"system_message\": \"Answer the question with only the letter of the correct option. No additional text or context\"},\n",
207
+ " split = \"validation\",\n",
208
+ " experiment_id = \"Qwen-2.5-0.5B-Adaptive-Evaluation\",\n",
209
+ " project_id = TRISMIK_PROJECT_ID,\n",
210
+ ")\n",
211
+ "\n",
212
+ "# Print the adaptive evaluation results\n",
213
+ "print(\"✓ Adaptive evaluation complete!\")\n",
214
+ "print(\"Results: \", results[0][\"score\"])"
215
+ ],
216
+ "outputs": [],
217
+ "execution_count": null
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "id": "d37cb5e87cc297fe",
222
+ "metadata": {},
223
+ "source": [
224
+ "---\n",
225
+ "\n",
226
+ "## Next Steps\n",
227
+ "\n",
228
+ "- [Adaptive Testing White Paper](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): An in depth overview of the science behind the adaptive testing methodology.\n",
229
+ "- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
230
+ "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
231
+ "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
232
+ ]
233
+ }
234
+ ],
235
+ "metadata": {
236
+ "kernelspec": {
237
+ "display_name": "Python 3 (ipykernel)",
238
+ "language": "python",
239
+ "name": "python3"
240
+ },
241
+ "language_info": {
242
+ "codemirror_mode": {
243
+ "name": "ipython",
244
+ "version": 3
245
+ },
246
+ "file_extension": ".py",
247
+ "mimetype": "text/x-python",
248
+ "name": "python",
249
+ "nbconvert_exporter": "python",
250
+ "pygments_lexer": "ipython3",
251
+ "version": "3.11.12"
252
+ }
253
+ },
254
+ "nbformat": 4,
255
+ "nbformat_minor": 5
256
+ }