scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,170 @@
+"""Tutorials - Evaluate - Example 4 - Evaluating Cloud Models with Batching."""
+
+import asyncio
+import json
+import tempfile
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run evaluation using OpenAI's Batch API.
+
+    This example demonstrates how to use OpenAI's Batch API for cost-effective,
+    large-scale model evaluation. The Batch API offers 50% cost savings compared
+    to standard API calls, with results typically delivered within 24 hours.
+
+    Prerequisites:
+        - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async batch inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's Batch API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including system_message and temperature.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        # Step 1: Create batch requests in JSONL format
+        batch_requests = []
+        for idx, input_val in enumerate(inputs):
+            request = {
+                "custom_id": f"request-{idx}",
+                "method": "POST",
+                "url": "/v1/chat/completions",
+                "body": {
+                    "model": model_name,
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": hyperparameters.get(
+                                "system_message", "You are a helpful assistant."
+                            ),
+                        },
+                        {"role": "user", "content": str(input_val)},
+                    ],
+                    "temperature": hyperparameters.get("temperature", 0.7),
+                },
+            }
+            batch_requests.append(request)
+
+        # Step 2: Write requests to a temporary JSONL file
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            for request in batch_requests:
+                f.write(json.dumps(request) + "\n")
+            temp_file_path = f.name
+
+        try:
+            # Step 3: Upload the batch file
+            print(f"Uploading batch file with {len(inputs)} requests...")
+            with open(temp_file_path, "rb") as f:
+                batch_file = await client.files.create(file=f, purpose="batch")
+
+            # Step 4: Create the batch job
+            print(f"Creating batch job...")
+            batch_job = await client.batches.create(
+                input_file_id=batch_file.id,
+                endpoint="/v1/chat/completions",
+                completion_window="24h",
+            )
+
+            # Step 5: Wait for batch completion (with polling)
+            print(f"Waiting for batch to complete (ID: {batch_job.id})...")
+            while batch_job.status not in ["completed", "failed", "cancelled"]:
+                await asyncio.sleep(10)  # Poll every 10 seconds
+                batch_job = await client.batches.retrieve(batch_job.id)
+                print(f"Status: {batch_job.status}")
+
+            if batch_job.status != "completed":
+                raise Exception(f"Batch job failed with status: {batch_job.status}")
+
+            # Step 6: Download and parse results
+            print("Batch completed! Downloading results...")
+            result_file_id = batch_job.output_file_id
+            result_content = await client.files.content(result_file_id)
+            result_text = result_content.text
+
+            # Step 7: Parse results and extract outputs
+            results_by_id = {}
+            for line in result_text.strip().split("\n"):
+                result = json.loads(line)
+                custom_id = result["custom_id"]
+                try:
+                    output = result["response"]["body"]["choices"][0]["message"]["content"]
+                    results_by_id[custom_id] = output.strip()
+                except (KeyError, IndexError):
+                    results_by_id[custom_id] = "Error: Failed to extract response"
+
+            # Step 8: Return outputs in original order
+            outputs = []
+            for idx in range(len(inputs)):
+                custom_id = f"request-{idx}"
+                outputs.append(results_by_id.get(custom_id, "Error: Missing response"))
+
+            return outputs
+
+        finally:
+            # Clean up the temporary file
+            Path(temp_file_path).unlink(missing_ok=True)
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",
+        metrics="accuracy",
+        items=evaluation_items,
+        input="question",
+        label="answer",
+    )
+
+    print(f"\nRunning OpenAI Batch API evaluation with model: {model_name}")
+    print("Note: Batch processing may take several minutes to complete.\n")
+
+    # Run evaluation
+    results = await evaluate_async(
+        inference,
+        evaluation_dataset,
+        hyperparameters={
+            "temperature": 0.7,
+            "system_message": "Answer the question directly and concisely",
+        },
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    print("\nBatch evaluation completed:\n")
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="4-evaluating_cloud_models_with_batching", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "4-evaluating_cloud_models_with_batching_output.json")
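
The polling loop in Step 5 waits indefinitely and only treats completed, failed, and cancelled as terminal states. For unattended runs, a bounded wait that also covers the Batch API's expired state may be safer. A minimal sketch, assuming the same AsyncOpenAI client; the helper name, timeout, and poll interval below are illustrative and not part of Scorebook:

import asyncio
import time

from openai import AsyncOpenAI


async def wait_for_batch(client: AsyncOpenAI, batch_id: str, timeout_s: float = 3600.0, poll_s: float = 10.0):
    """Poll a batch job until it reaches a terminal state or the timeout elapses."""
    deadline = time.monotonic() + timeout_s
    while True:
        batch = await client.batches.retrieve(batch_id)
        if batch.status in ("completed", "failed", "cancelled", "expired"):
            return batch
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Batch {batch_id} still '{batch.status}' after {timeout_s:.0f}s")
        await asyncio.sleep(poll_s)

With such a helper, Step 5 could reduce to a single call, for example batch_job = await wait_for_batch(client, batch_job.id), before the completion check in the example above.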
@@ -0,0 +1,122 @@
+"""Tutorials - Evaluate - Example 5 - Hyperparameter Sweeps."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run a Scorebook evaluation with a hyperparameter sweep.
+
+    This example demonstrates how Scorebook can automatically test multiple
+    hyperparameter configurations in a single evaluation.
+
+    How Hyperparameter Sweeping Works:
+        - Define hyperparameters with lists of values to test
+        - Scorebook generates all possible combinations (Cartesian product)
+        - Each configuration is evaluated separately on the same dataset
+
+    Example Hyperparameters:
+        - system_message: "Answer the question directly and concisely." (1 value)
+        - temperature: [0.6, 0.7, 0.8] (3 values)
+        - top_p: [0.7, 0.8, 0.9] (3 values)
+        - top_k: [10, 20, 30] (3 values)
+
+    Total configurations = 1 × 3 × 3 × 3 = 27 hyperparameter configurations
+    """
+
+    # Initialize HuggingFace model pipeline
+    model_name = "microsoft/Phi-4-mini-instruct"
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_name,
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through the model.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including system_message, temperature, top_p, top_k.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Preprocess: Build messages
+            messages = [
+                {"role": "system", "content": hyperparameters["system_message"]},
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Run inference
+            result = pipeline(
+                messages,
+                temperature=hyperparameters["temperature"],
+                top_p=hyperparameters.get("top_p"),
+                top_k=hyperparameters.get("top_k"),
+            )
+
+            # Postprocess: Extract the answer
+            output = str(result[0]["generated_text"][-1]["content"])
+            outputs.append(output)
+
+        return outputs
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",
+        metrics="accuracy",
+        items=evaluation_items,
+        input="question",
+        label="answer",
+    )
+
+    # Define hyperparameters with lists of values to create a sweep
+    hyperparameters = {
+        "system_message": "Answer the question directly and concisely.",
+        "temperature": [0.6, 0.7, 0.8],
+        "top_p": [0.7, 0.8, 0.9],
+        "top_k": [10, 20, 30],
+    }
+
+    # Run evaluation across all hyperparameter combinations
+    results = evaluate(
+        inference,
+        evaluation_dataset,
+        hyperparameters=hyperparameters,
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="5-hyperparameter_sweeps", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "5-hyperparameter_sweeps_output.json")
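
The 27-configuration count in the docstring above follows from the Cartesian product of the swept values. The expansion can be reproduced standalone with itertools.product, as in the sketch below (Scorebook performs this expansion internally; its exact mechanism may differ):

from itertools import product

hyperparameters = {
    "system_message": "Answer the question directly and concisely.",
    "temperature": [0.6, 0.7, 0.8],
    "top_p": [0.7, 0.8, 0.9],
    "top_k": [10, 20, 30],
}

# Treat scalar values as single-element lists, then take the Cartesian product.
value_lists = {key: val if isinstance(val, list) else [val] for key, val in hyperparameters.items()}
configs = [dict(zip(value_lists.keys(), combo)) for combo in product(*value_lists.values())]

print(len(configs))  # 1 * 3 * 3 * 3 = 27
print(configs[0])    # e.g. {'system_message': '...', 'temperature': 0.6, 'top_p': 0.7, 'top_k': 10}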
@@ -0,0 +1,141 @@
+"""Tutorials - Evaluate - Example 6 - Inference Pipelines."""
+
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+from pathlib import Path
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+
+
+def main() -> Any:
+    """Run a simple Scorebook evaluation using an InferencePipeline.
+
+    This example demonstrates how to use Scorebook's InferencePipeline in evaluations.
+
+    Inference pipelines separate the evaluation workflow into three distinct stages:
+        1. Pre-processing: Convert raw dataset items into model-ready input format
+        2. Inference: Execute model predictions on preprocessed data
+        3. Post-processing: Extract final answers from raw model outputs
+
+    These stages can be encapsulated in reusable functions and used to create pipelines.
+    An inference pipeline can be passed into the evaluate function's inference parameter.
+    """
+
+    # === Pre-Processing ===
+
+    # The preprocessor function is responsible for mapping items in an Eval Dataset to model inputs
+    def preprocessor(input_value: str, **hyperparameter_config: Any) -> List[Any]:
+        """Convert an evaluation input to a valid model input.
+
+        Args:
+            input_value: The input value from the dataset.
+            hyperparameter_config: Model hyperparameters.
+
+        Returns:
+            A structured representation of an evaluation item for model input.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": hyperparameter_config["system_message"],
+            },
+            {"role": "user", "content": input_value},
+        ]
+
+        return messages
+
+    # === Inference ===
+
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model="microsoft/Phi-4-mini-instruct",
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # An inference function for an InferencePipeline that returns a list of raw outputs
+    def inference(preprocessed_items: List[Any], **hyperparameter_config: Any) -> List[Any]:
+        """Run model inference on preprocessed eval items.
+
+        Args:
+            preprocessed_items: The list of evaluation items for an EvalDataset.
+            hyperparameter_config: Model hyperparameters.
+
+        Returns:
+            A list of model outputs for an EvalDataset.
+        """
+        return [
+            pipeline(model_input, temperature=hyperparameter_config["temperature"])
+            for model_input in preprocessed_items
+        ]
+
+    # === Post-Processing ===
+
+    # The postprocessor function parses model output for metric scoring
+    def postprocessor(model_output: Any, **hyperparameter_config: Any) -> str:
+        """Extract the final parsed answer from the model output.
+
+        Args:
+            model_output: An evaluation item from an EvalDataset.
+            hyperparameter_config: Model hyperparameters.
+
+        Returns:
+            Parsed answer from the model output to be used for scoring.
+        """
+        return str(model_output[0]["generated_text"][-1]["content"])
+
+    # === Evaluation With An InferencePipeline ===
+
+    # Step 1: Create an inference pipeline, using the 3 functions defined
+    inference_pipeline = InferencePipeline(
+        model="microsoft/Phi-4-mini-instruct",
+        preprocessor=preprocessor,
+        inference_function=inference,
+        postprocessor=postprocessor,
+    )
+
+    # Step 2: Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",
+        metrics="accuracy",
+        items=evaluation_items,
+        input="question",
+        label="answer",
+    )
+
+    # Step 3: Run the evaluation using the inference pipeline and dataset
+    results = evaluate(
+        inference_pipeline,
+        evaluation_dataset,
+        hyperparameters={
+            "temperature": 0.7,
+            "system_message": "Answer the question directly and concisely.",
+        },
+        return_items=True,  # Enable to include results for individual items in the dict returned.
+        return_output=True,  # Enable to include the model's output for individual items.
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="6-inference_output", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "6-inference_output_output.json")
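
For a single item, the three stages above compose as preprocess, then inference, then postprocess. A minimal sketch of that composition with plain functions; the helper below is illustrative and not part of the Scorebook API, and InferencePipeline additionally handles batching the items of a dataset:

from typing import Any, Callable, List


def run_pipeline_once(
    input_value: Any,
    preprocessor: Callable[..., Any],
    inference: Callable[..., List[Any]],
    postprocessor: Callable[..., str],
    **hyperparameter_config: Any,
) -> str:
    """Apply preprocess -> inference -> postprocess to one input value."""
    model_input = preprocessor(input_value, **hyperparameter_config)
    raw_outputs = inference([model_input], **hyperparameter_config)
    return postprocessor(raw_outputs[0], **hyperparameter_config)

With the three functions defined in the example, run_pipeline_once("What is 2 + 2?", preprocessor, inference, postprocessor, temperature=0.7, system_message="Answer the question directly and concisely.") would return the parsed answer string for that one question.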
@@ -0,0 +1,110 @@
+"""Tutorials - Evaluation Datasets - Example 1 - Loading Datasets from Files."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run evaluations using datasets loaded from local files.
+
+    This example demonstrates how to load evaluation datasets from files:
+        - from_json: Load datasets from JSON files
+        - from_csv: Load datasets from CSV files
+
+    Both methods support loading data from local files with custom field mappings.
+    """
+
+    # Initialize HuggingFace model pipeline
+    model_name = "microsoft/Phi-4-mini-instruct"
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_name,
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through the model.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question directly and concisely. Provide only the answer, no additional context or text.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Run inference
+            result = pipeline(messages, temperature=0.7)
+
+            # Extract the answer
+            output = str(result[0]["generated_text"][-1]["content"])
+            outputs.append(output)
+
+        return outputs
+
+    # Construct paths to example data files
+    example_datasets_dir = Path(__file__).parent / "example_datasets"
+    json_path = example_datasets_dir / "basic_questions.json"
+    csv_path = example_datasets_dir / "basic_questions.csv"
+
+    # Load dataset from JSON file
+    json_dataset = EvalDataset.from_json(
+        name="basic_questions_json",
+        path=str(json_path),
+        metrics="accuracy",
+        input="question",
+        label="answer",
+    )
+    print(f"Loaded {json_dataset.name} from JSON file: {len(json_dataset.items)} items")
+
+    # Load dataset from CSV file
+    csv_dataset = EvalDataset.from_csv(
+        name="basic_questions_csv",
+        path=str(csv_path),
+        metrics="accuracy",
+        input="question",
+        label="answer",
+    )
+    print(f"Loaded {csv_dataset.name} from CSV file: {len(csv_dataset.items)} items")
+
+    # Run evaluation on both datasets
+    results = evaluate(
+        inference,
+        datasets=[json_dataset, csv_dataset],
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-evaluation_datasets_from_files", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-evaluation_datasets_from_files_output.json")
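
The bundled basic_questions.json and basic_questions.csv files are not shown in this hunk. A minimal sketch of writing and loading a JSON file compatible with the input="question" / label="answer" mapping used above; the actual example files may contain additional fields, and the file name here is only illustrative:

import json
from pathlib import Path

from scorebook import EvalDataset

# Write a small list of question/answer records to disk.
questions_path = Path("my_questions.json")
questions_path.write_text(
    json.dumps(
        [
            {"question": "What is 2 + 2?", "answer": "4"},
            {"question": "What is the capital of France?", "answer": "Paris"},
        ]
    )
)

# Load it with the same field mapping as the tutorial above.
dataset = EvalDataset.from_json(
    name="my_questions",
    path=str(questions_path),
    metrics="accuracy",
    input="question",
    label="answer",
)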
@@ -0,0 +1,101 @@
+"""Tutorials - Evaluation Datasets - Example 2 - Loading from HuggingFace."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run evaluations using datasets loaded from HuggingFace Hub.
+
+    This example demonstrates how to load evaluation datasets directly from
+    HuggingFace Hub using the from_huggingface method. This allows you to
+    easily evaluate on standard benchmarks and datasets.
+
+    We'll evaluate on the SimpleQA dataset, which tests factual question answering.
+
+    Prerequisites:
+        - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question directly and concisely. Provide only the answer, no additional context or text.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Load dataset from HuggingFace Hub
+    simple_qa = EvalDataset.from_huggingface(
+        path="basicv8vc/SimpleQA",
+        metrics="accuracy",
+        input="problem",
+        label="answer",
+        split="test",
+    )
+    print(f"Loaded {simple_qa.name} from HuggingFace Hub: {len(simple_qa.items)} items")
+
+    # Run evaluation with a sample to avoid long runtime
+    results = await evaluate_async(
+        inference,
+        simple_qa,
+        sample_size=10,  # Sample 10 items for quick testing
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-evaluation_datasets_from_huggingface", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "2-evaluation_datasets_from_huggingface_output.json")
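
Before spending API credits on the SimpleQA run above, the dataset wiring can be checked with an offline stand-in for the inference function. A sketch reusing the same evaluate_async arguments shown in the example; the stand-in returns a constant answer, so its scores will be near zero and only the plumbing is exercised:

from typing import Any, List


async def dry_run_inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
    """Return a placeholder answer for every input, making no API calls."""
    return ["placeholder answer" for _ in inputs]


# Inside main(), the real inference function could be swapped for the stand-in:
# results = await evaluate_async(dry_run_inference, simple_qa, sample_size=10, upload_results=False)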