scorebook 0.0.14-py3-none-any.whl → 0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
  23. scorebook-0.0.15.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
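
Most of the new surface in 0.0.15 is the expanded metrics package (BLEU, ROUGE, F1, exact match, recall, BERTScore, plus a core registry) and the tutorials that exercise it. As a quick orientation before the per-file hunks below, here is a minimal sketch of the scoring workflow those tutorials demonstrate; it mirrors the code cells of tutorials/notebooks/1-scoring.ipynb as added in this release, and the score() keyword arguments and the aggregate_results / item_results keys are taken from that notebook rather than from separate API documentation.

from pprint import pprint

from scorebook import score
from scorebook.metrics.accuracy import Accuracy

# Pre-generated predictions: each item pairs a model output with its ground-truth label.
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "Paris", "label": "Paris"},
]

# Compute metrics over existing outputs; no inference is run here.
results = score(
    items=items,
    metrics=Accuracy,
    dataset_name="basic_questions",
    model_name="example-model",
    upload_results=False,  # True would upload to the Trismik dashboard
)

pprint(results["aggregate_results"])  # aggregate metrics, e.g. overall accuracy
pprint(results["item_results"][:2])   # per-item scores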
tutorials/examples/6-providers/vertex/messages_example.py
@@ -0,0 +1,142 @@
+ """
+ Google Cloud Vertex AI Model Inference Example.
+
+ This example demonstrates how to evaluate language models using Google Cloud
+ Vertex AI's Gemini models with Scorebook for real-time API calls.
+
+ Prerequisites: Google Cloud SDK (gcloud) authenticated and GOOGLE_CLOUD_PROJECT
+ environment variable set, or pass project ID as command line argument.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict
+
+ from dotenv import load_dotenv
+
+ from scorebook import EvalDataset, InferencePipeline, evaluate
+ from scorebook.inference.clients.vertex import responses
+ from scorebook.metrics.accuracy import Accuracy
+
+
+ def main() -> None:
+     """Run the Vertex AI inference example."""
+     # Load environment variables from .env file for configuration
+     load_dotenv()
+
+     output_dir, model_name, project_id = setup_arguments()
+
+     # Step 1: Load the evaluation dataset
+     # Create an EvalDataset from local JSON file
+     # - Uses 'answer' field as ground truth labels
+     # - Configures Accuracy metric for evaluation
+     # - Loads from examples/example_datasets/dataset.json
+     dataset = EvalDataset.from_json(
+         "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+     )
+
+     # Step 2: Define the preprocessing function
+     # Convert raw dataset items into Vertex AI API-compatible format
+     # This function formats the question for the Gemini model
+     def preprocessor(eval_item: Dict[str, str]) -> str:
+         """Pre-process dataset items into Vertex AI string format."""
+         return eval_item["question"]
+
+     # Step 3: Define the postprocessing function
+     # Extract the final answer from Vertex AI API response
+     # Handles response parsing and returns the response text
+     def postprocessor(response: Any) -> str:
+         """Post-process Vertex AI response to extract the answer."""
+         return str(response.text.strip())
+
+     # Step 4: Create the inference pipeline for cloud-based evaluation
+     # Combine preprocessing, Vertex AI inference, and postprocessing
+     # Uses scorebook's built-in Vertex AI responses function for API calls
+
+     # Create a system message with instructions for direct answers
+     system_prompt = """
+     Answer the question directly and concisely.
+     Do not provide lengthy explanations unless specifically asked.
+     """.strip()
+
+     async def inference_function(items: list, **hyperparams: Any) -> Any:
+         return await responses(
+             items,
+             model=model_name,
+             project_id=project_id,
+             system_instruction=system_prompt,
+             **hyperparams,
+         )
+
+     inference_pipeline = InferencePipeline(
+         model=model_name,
+         preprocessor=preprocessor,
+         inference_function=inference_function,
+         postprocessor=postprocessor,
+     )
+
+     # Step 5: Run the cloud-based evaluation
+     # Execute evaluation using Vertex AI with the inference pipeline
+     # - Uses score_type="all" to get both aggregate and per-item results
+     # - Limits to 10 items for quick demonstration and cost control
+     print(f"Running Vertex AI evaluation with model: {model_name}")
+     print(f"Project ID: {project_id}")
+     print("Evaluating 10 items from local dataset...")
+
+     results = evaluate(inference_pipeline, dataset, item_limit=10, score_type="all")
+     print(results)
+
+     # Step 6: Save results to file
+     # Export evaluation results as JSON for later analysis
+     output_file = output_dir / "vertex_messages_output.json"
+     with open(output_file, "w") as f:
+         json.dump(results, f, indent=4)
+     print(f"Results saved in {output_file}")
+
+
+ # ============================================================================
+ # Utility Functions
+ # ============================================================================
+
+
+ def setup_arguments() -> tuple[Path, str, str]:
+     """Parse command line arguments."""
+     import argparse
+     import os
+
+     parser = argparse.ArgumentParser(description="Run Vertex AI evaluation and save results.")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "results"),
+         help="Directory to save evaluation outputs (JSON).",
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help="Gemini model to use for inference (e.g., gemini-2.0-flash-001)",
+     )
+     parser.add_argument(
+         "--project-id",
+         type=str,
+         help="Google Cloud Project ID (defaults to GOOGLE_CLOUD_PROJECT env var)",
+     )
+     args = parser.parse_args()
+
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Handle project ID fallback
+     project_id = args.project_id or os.getenv("GOOGLE_CLOUD_PROJECT")
+     if not project_id:
+         raise ValueError(
+             "Project ID must be provided via --project-id or "
+             "GOOGLE_CLOUD_PROJECT environment variable"
+         )
+
+     return output_dir, str(args.model), str(project_id)
+
+
+ if __name__ == "__main__":
+     main()
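
A note on running this example: --model is required, while the project ID can come from either --project-id or the GOOGLE_CLOUD_PROJECT environment variable, so a typical invocation (with gcloud authentication already configured and a placeholder project ID) looks like python messages_example.py --model gemini-2.0-flash-001 --project-id my-gcp-project. Results land in ./results/vertex_messages_output.json unless --output-dir is overridden, and the dataset path examples/example_datasets/dataset.json is resolved relative to the working directory.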
tutorials/notebooks/1-scoring.ipynb
@@ -0,0 +1,162 @@
+ {
+ "cells": [
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "# Scoring Model Outputs with Scorebook\n\nThis notebook demonstrates how to use Scorebook's `score()` function to evaluate pre-generated model predictions.\n\n## When to use `score()`\n\n- You already have model predictions and want to compute metrics\n- You want to re-score existing results with different metrics\n- You're importing evaluation results from another framework\n"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "## Setup\n",
+ "\n",
+ "First, let's import the necessary modules:"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "from pprint import pprint\n",
+ "from scorebook import score\n",
+ "from scorebook.metrics.accuracy import Accuracy"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare Your Data\n",
+ "\n",
+ "The `score()` function expects a list of items, where each item is a dictionary with:\n",
+ "- `input`: The input to the model (optional, for reference)\n",
+ "- `output`: The model's prediction\n",
+ "- `label`: The ground truth answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Example: Pre-generated model outputs\n",
+ "items = [\n",
+ " {\n",
+ " \"input\": \"What is 2 + 2?\",\n",
+ " \"output\": \"4\",\n",
+ " \"label\": \"4\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"What is the capital of France?\",\n",
+ " \"output\": \"Paris\",\n",
+ " \"label\": \"Paris\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"Who wrote Romeo and Juliet?\",\n",
+ " \"output\": \"William Shakespeare\",\n",
+ " \"label\": \"William Shakespeare\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"What is 5 * 6?\",\n",
+ " \"output\": \"30\",\n",
+ " \"label\": \"30\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"What is the largest planet in our solar system?\",\n",
+ " \"output\": \"Jupiter\",\n",
+ " \"label\": \"Jupiter\"\n",
+ " },\n",
+ "]\n",
+ "\n",
+ "print(f\"Prepared {len(items)} items for scoring\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Score the Results\n",
+ "\n",
+ "Now we'll use the `score()` function to compute accuracy metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "results = score(\n",
+ " items=items,\n",
+ " metrics=Accuracy,\n",
+ " dataset_name=\"basic_questions\",\n",
+ " model_name=\"example-model\",\n",
+ " upload_results=False, # Set to True to upload to Trismik\n",
+ ")\n",
+ "\n",
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Understanding the Results\n",
+ "\n",
+ "The results dictionary contains:\n",
+ "- `aggregates`: Overall metrics (e.g., accuracy across all items)\n",
+ "- `items`: Per-item scores and predictions\n",
+ "- `metadata`: Information about the dataset and model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# View aggregate metrics\n",
+ "print(\"\\nAggregate Metrics:\")\n",
+ "print(f\"\\nAccuracy: {results['aggregate_results'][0]['accuracy']}\")\n",
+ "\n",
+ "# View per-item scores\n",
+ "print(\"\\nPer-Item Scores:\")\n",
+ "for i, item in enumerate(results['item_results'][:3], 1):\n",
+ " print(f\"\\nItem {i}:\")\n",
+ " print(f\" Output: {item['output']}\")\n",
+ " print(f\" Label: {item['label']}\")\n",
+ " print(f\" Accuracy: {item['accuracy']}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "source": "## Next Steps\n\n- Try the **Evaluate** notebook to learn how to run inference and scoring together\n- See the **Upload Results** notebook to upload your scores to Trismik's dashboard\n- Explore custom metrics in the Scorebook documentation",
+ "metadata": {}
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
tutorials/notebooks/2-evaluating.ipynb
@@ -0,0 +1,316 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluating Models with Scorebook\n",
+ "\n",
+ "This notebook demonstrates how to use Scorebook's `evaluate()` function to run inference and compute metrics in a single step.\n",
+ "\n",
+ "## When to use `evaluate()`\n",
+ "\n",
+ "- You want to run inference on a dataset and score the results\n",
+ "- You're comparing different models on the same dataset\n",
+ "- You want to track hyperparameters alongside results\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "This example uses a local HuggingFace model. For cloud models (OpenAI), see the examples directory."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup\n",
+ "\n",
+ "Import necessary modules:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "from pprint import pprint\n",
+ "from typing import Any, List\n",
+ "import transformers\n",
+ "\n",
+ "from scorebook import EvalDataset, evaluate"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialize Your Model\n",
+ "\n",
+ "Set up a HuggingFace pipeline for inference:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "model_name = \"microsoft/Phi-4-mini-instruct\"\n",
+ "\n",
+ "pipeline = transformers.pipeline(\n",
+ " \"text-generation\",\n",
+ " model=model_name,\n",
+ " model_kwargs={\"torch_dtype\": \"auto\"},\n",
+ " device_map=\"auto\",\n",
+ ")\n",
+ "\n",
+ "print(f\"Model loaded: {model_name}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define Your Inference Function\n",
+ "\n",
+ "Create a function that processes inputs and returns outputs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+ " \"\"\"Process inputs through the model.\n",
+ " \n",
+ " Args:\n",
+ " inputs: List of input values from the dataset\n",
+ " hyperparameters: Model hyperparameters (e.g., temperature, system_message)\n",
+ " \n",
+ " Returns:\n",
+ " List of model outputs\n",
+ " \"\"\"\n",
+ " outputs = []\n",
+ " \n",
+ " for input_val in inputs:\n",
+ " # Build messages for the model\n",
+ " messages = [\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": hyperparameters.get(\"system_message\", \"You are a helpful assistant.\")\n",
+ " },\n",
+ " {\"role\": \"user\", \"content\": str(input_val)},\n",
+ " ]\n",
+ " \n",
+ " # Run inference\n",
+ " result = pipeline(\n",
+ " messages,\n",
+ " max_new_tokens=hyperparameters.get(\"max_new_tokens\", 100),\n",
+ " )\n",
+ " \n",
+ " # Extract the answer\n",
+ " output = str(result[0][\"generated_text\"][-1][\"content\"])\n",
+ " outputs.append(output)\n",
+ " \n",
+ " return outputs"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Your Dataset\n",
+ "\n",
+ "Create an evaluation dataset from a JSON file:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Create a sample dataset\n",
+ "sample_data = [\n",
+ " {\"question\": \"What is 2 + 2?\", \"answer\": \"4\"},\n",
+ " {\"question\": \"What is the capital of France?\", \"answer\": \"Paris\"},\n",
+ " {\"question\": \"Who wrote Romeo and Juliet?\", \"answer\": \"William Shakespeare\"},\n",
+ "]\n",
+ "\n",
+ "# Create EvalDataset directly from list\n",
+ "dataset = EvalDataset.from_list(\n",
+ " name=\"sample_questions\",\n",
+ " metrics=\"accuracy\",\n",
+ " items=sample_data,\n",
+ " input=\"question\",\n",
+ " label=\"answer\",\n",
+ ")\n",
+ "\n",
+ "print(f\"Loaded dataset with {len(dataset.items)} items\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Evaluation\n",
+ "\n",
+ "Use `evaluate()` to run inference and compute metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "results = evaluate(\n",
+ " inference,\n",
+ " dataset,\n",
+ " hyperparameters={\n",
+ " \"system_message\": \"Answer the question directly and concisely.\",\n",
+ " \"max_new_tokens\": 50,\n",
+ " },\n",
+ " return_aggregates=True,\n",
+ " return_items=True,\n",
+ " return_output=True,\n",
+ " upload_results=False, # Set to True to upload to Trismik\n",
+ ")\n",
+ "\n",
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analyze Results\n",
+ "\n",
+ "Examine the outputs and metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Overall accuracy\n",
+ "print(f\"\\nOverall Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}\")\n",
+ "\n",
+ "# Per-item results\n",
+ "print(\"\\nPer-Item Results:\")\n",
+ "for i, item in enumerate(results['item_results'], 1):\n",
+ " print(f\"\\nQuestion {i}: {item['input']}\")\n",
+ " print(f\" Model Output: {item['output']}\")\n",
+ " print(f\" Expected: {item['label']}\")\n",
+ " print(f\" Correct: {'✓' if item['accuracy'] == 1.0 else '✗'}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "## Hyperparameter Sweeps\n",
+ "\n",
+ "Evaluate with different hyperparameters to find optimal settings:"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "# Define an inference function\n",
+ "def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+ " \"\"\"Process inputs through the model.\n",
+ "\n",
+ " Args:\n",
+ " inputs: Input values from an EvalDataset.\n",
+ " hyperparameters: Model hyperparameters including system_message, temperature, top_p, top_k.\n",
+ "\n",
+ " Returns:\n",
+ " List of model outputs for all inputs.\n",
+ " \"\"\"\n",
+ " outputs = []\n",
+ " for input_val in inputs:\n",
+ " # Preprocess: Build messages\n",
+ " messages = [\n",
+ " {\"role\": \"system\", \"content\": hyperparameters[\"system_message\"]},\n",
+ " {\"role\": \"user\", \"content\": str(input_val)},\n",
+ " ]\n",
+ "\n",
+ " # Run inference\n",
+ " result = pipeline(\n",
+ " messages,\n",
+ " temperature=hyperparameters[\"temperature\"],\n",
+ " top_p=hyperparameters.get(\"top_p\"),\n",
+ " top_k=hyperparameters.get(\"top_k\"),\n",
+ " )\n",
+ "\n",
+ " # Postprocess: Extract the answer\n",
+ " output = str(result[0][\"generated_text\"][-1][\"content\"])\n",
+ " outputs.append(output)\n",
+ "\n",
+ " return outputs\n",
+ "\n",
+ "# Define hyperparameters with lists of values to create a sweep\n",
+ "hyperparameters = {\n",
+ " \"system_message\": \"Answer the question directly and concisely.\",\n",
+ " \"temperature\": [0.6, 0.7, 0.8],\n",
+ " \"top_p\": [0.7, 0.8, 0.9],\n",
+ " \"top_k\": [10, 20, 30],\n",
+ "}\n",
+ "\n",
+ "# Run evaluation across all hyperparameter combinations\n",
+ "results = evaluate(\n",
+ " inference,\n",
+ " dataset,\n",
+ " hyperparameters=hyperparameters,\n",
+ " return_aggregates=True,\n",
+ " return_items=True,\n",
+ " return_output=True,\n",
+ " upload_results=False,\n",
+ ")\n",
+ "\n",
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Next Steps\n",
+ "\n",
+ "- Try the **Adaptive Evaluations** notebook for efficient testing with fewer questions\n",
+ "- See the **Upload Results** notebook to track results in Trismik's dashboard\n",
+ "- Explore batch processing for faster evaluation of large datasets"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
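
One detail worth spelling out in the sweep cell above: system_message is fixed while temperature, top_p, and top_k each list three candidate values, so (assuming, as the notebook's comment says, that every combination of listed values is evaluated) evaluate() runs the full cross-product of 3 × 3 × 3 = 27 hyperparameter configurations over the three-item dataset, with aggregate_results then presumably carrying one entry per configuration.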