scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
- scorebook-0.0.16.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py
ADDED
@@ -0,0 +1,122 @@
+"""Tutorials - Evaluate - Example 5 - Hyperparameter Sweeps."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run a Scorebook evaluation with a hyperparameter sweep.
+
+    This example demonstrates how Scorebook can automatically test multiple
+    hyperparameter configurations in a single evaluation.
+
+    How Hyperparameter Sweeping Works:
+    - Define hyperparameters with lists of values to test
+    - Scorebook generates all possible combinations (Cartesian product)
+    - Each configuration is evaluated separately on the same dataset
+
+    Example Hyperparameters:
+    - system_message: "Answer the question directly and concisely." (1 value)
+    - temperature: [0.6, 0.7, 0.8] (3 values)
+    - top_p: [0.7, 0.8, 0.9] (3 values)
+    - top_k: [10, 20, 30] (3 values)
+
+    Total configurations = 1 × 3 × 3 × 3 = 27 hyperparameter configurations
+    """
+
+    # Initialize HuggingFace model pipeline
+    model_name = "microsoft/Phi-4-mini-instruct"
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_name,
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through the model.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters including system_message, temperature, top_p, top_k.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Preprocess: Build messages
+            messages = [
+                {"role": "system", "content": hyperparameters["system_message"]},
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Run inference
+            result = pipeline(
+                messages,
+                temperature=hyperparameters["temperature"],
+                top_p=hyperparameters.get("top_p"),
+                top_k=hyperparameters.get("top_k"),
+            )
+
+            # Postprocess: Extract the answer
+            output = str(result[0]["generated_text"][-1]["content"])
+            outputs.append(output)
+
+        return outputs
+
+    # Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",
+        metrics="accuracy",
+        items=evaluation_items,
+        input="question",
+        label="answer",
+    )
+
+    # Define hyperparameters with lists of values to create a sweep
+    hyperparameters = {
+        "system_message": "Answer the question directly and concisely.",
+        "temperature": [0.6, 0.7, 0.8],
+        "top_p": [0.7, 0.8, 0.9],
+        "top_k": [10, 20, 30],
+    }
+
+    # Run evaluation across all hyperparameter combinations
+    results = evaluate(
+        inference,
+        evaluation_dataset,
+        hyperparameters=hyperparameters,
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="5-hyperparameter_sweeps", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "5-hyperparameter_sweeps_output.json")
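The docstring's sweep arithmetic (1 × 3 × 3 × 3 = 27 configurations) is just a Cartesian product over the list-valued hyperparameters. As a reference point, here is a minimal sketch of that expansion using `itertools.product`; it only illustrates the counting and is not Scorebook's actual sweep implementation:

```python
# Sketch only: illustrates the Cartesian-product expansion described in the
# tutorial docstring; Scorebook's internal sweep logic may differ.
from itertools import product

hyperparameters = {
    "system_message": "Answer the question directly and concisely.",
    "temperature": [0.6, 0.7, 0.8],
    "top_p": [0.7, 0.8, 0.9],
    "top_k": [10, 20, 30],
}

# Treat scalar values as single-element lists so every key takes part in the sweep.
grid = {k: v if isinstance(v, list) else [v] for k, v in hyperparameters.items()}
configs = [dict(zip(grid.keys(), combo)) for combo in product(*grid.values())]
print(len(configs))  # 1 * 3 * 3 * 3 = 27
```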
tutorials/examples/2-evaluate/6-inference_pipelines.py
ADDED
@@ -0,0 +1,141 @@
+"""Tutorials - Evaluate - Example 6 - Inference Pipelines."""
+
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+from pathlib import Path
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+
+
+def main() -> Any:
+    """Run a simple Scorebook evaluation using an InferencePipeline.
+
+    This example demonstrates how to use Scorebook's InferencePipeline in evaluations.
+
+    Inference pipelines separate the evaluation workflow into three distinct stages:
+    1. Pre-processing: Convert raw dataset items into model-ready input format
+    2. Inference: Execute model predictions on preprocessed data
+    3. Post-processing: Extract final answers from raw model outputs
+
+    These stages can be encapsulated in reusable functions and used to create pipelines.
+    An inference pipeline can be passed into the evaluate function's inference parameter.
+    """
+
+    # === Pre-Processing ===
+
+    # The preprocessor function is responsible for mapping items in an Eval Dataset to model inputs
+    def preprocessor(input_value: str, **hyperparameter_config: Any) -> List[Any]:
+        """Convert an evaluation input to a valid model input.
+
+        Args:
+            input_value: The input value from the dataset.
+            hyperparameter_config: Model hyperparameters.
+
+        Returns:
+            A structured representation of an evaluation item for model input.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": hyperparameter_config["system_message"],
+            },
+            {"role": "user", "content": input_value},
+        ]
+
+        return messages
+
+    # === Inference ===
+
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model="microsoft/Phi-4-mini-instruct",
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # An inference function for an InferencePipeline that returns a list of raw outputs
+    def inference(preprocessed_items: List[Any], **hyperparameter_config: Any) -> List[Any]:
+        """Run model inference on preprocessed eval items.
+
+        Args:
+            preprocessed_items: The list of evaluation items for an EvalDataset.
+            hyperparameter_config: Model hyperparameters.
+
+        Returns:
+            A list of model outputs for an EvalDataset.
+        """
+        return [
+            pipeline(model_input, temperature=hyperparameter_config["temperature"])
+            for model_input in preprocessed_items
+        ]
+
+    # === Post-Processing ===
+
+    # The postprocessor function parses model output for metric scoring
+    def postprocessor(model_output: Any, **hyperparameter_config: Any) -> str:
+        """Extract the final parsed answer from the model output.
+
+        Args:
+            model_output: An evaluation item from an EvalDataset.
+            hyperparameter_config: Model hyperparameters.
+
+        Returns:
+            Parsed answer from the model output to be used for scoring.
+        """
+        return str(model_output[0]["generated_text"][-1]["content"])
+
+    # === Evaluation With An InferencePipeline ===
+
+    # Step 1: Create an inference pipeline, using the 3 functions defined
+    inference_pipeline = InferencePipeline(
+        model="microsoft/Phi-4-mini-instruct",
+        preprocessor=preprocessor,
+        inference_function=inference,
+        postprocessor=postprocessor,
+    )
+
+    # Step 2: Create a list of evaluation items
+    evaluation_items = [
+        {"question": "What is 2 + 2?", "answer": "4"},
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+    ]
+
+    # Create an evaluation dataset
+    evaluation_dataset = EvalDataset.from_list(
+        name="basic_questions",
+        metrics="accuracy",
+        items=evaluation_items,
+        input="question",
+        label="answer",
+    )
+
+    # Step 3: Run the evaluation using the inference pipeline and dataset
+    results = evaluate(
+        inference_pipeline,
+        evaluation_dataset,
+        hyperparameters={
+            "temperature": 0.7,
+            "system_message": "Answer the question directly and concisely.",
+        },
+        return_items=True,  # Enable to include results for individual items in the dict returned.
+        return_output=True,  # Enable to include the model's output for individual items.
+        upload_results=False,  # Disable uploading for this example
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="6-inference_output", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "6-inference_output_output.json")
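The three stages named in this example's docstring compose in a straightforward way: preprocess each item, run inference on the batch, then postprocess each raw output. A rough sketch of that wiring, under the assumption that `InferencePipeline` chains the stages roughly like this (its real batching, error handling, and hyperparameter plumbing are not shown in the diff):

```python
# Sketch only: assumed composition of the three pipeline stages; not
# InferencePipeline's actual implementation.
from typing import Any, Callable, List


def run_pipeline(
    items: List[Any],
    preprocessor: Callable[..., Any],
    inference_function: Callable[..., List[Any]],
    postprocessor: Callable[..., Any],
    **hyperparameters: Any,
) -> List[Any]:
    # Stage 1: map each dataset item to a model-ready input
    preprocessed = [preprocessor(item, **hyperparameters) for item in items]
    # Stage 2: run the model over the whole batch of preprocessed inputs
    raw_outputs = inference_function(preprocessed, **hyperparameters)
    # Stage 3: extract a scoreable answer from each raw output
    return [postprocessor(output, **hyperparameters) for output in raw_outputs]
```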
tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py
ADDED
@@ -0,0 +1,110 @@
+"""Tutorials - Evaluation Datasets - Example 1 - Loading Datasets from Files."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+import transformers
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate
+
+
+def main() -> Any:
+    """Run evaluations using datasets loaded from local files.
+
+    This example demonstrates how to load evaluation datasets from files:
+    - from_json: Load datasets from JSON files
+    - from_csv: Load datasets from CSV files
+
+    Both methods support loading data from local files with custom field mappings.
+    """
+
+    # Initialize HuggingFace model pipeline
+    model_name = "microsoft/Phi-4-mini-instruct"
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_name,
+        model_kwargs={"torch_dtype": "auto"},
+        device_map="auto",
+    )
+
+    # Define an inference function
+    def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through the model.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question directly and concisely. Provide only the answer, no additional context or text.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Run inference
+            result = pipeline(messages, temperature=0.7)
+
+            # Extract the answer
+            output = str(result[0]["generated_text"][-1]["content"])
+            outputs.append(output)
+
+        return outputs
+
+    # Construct paths to example data files
+    example_datasets_dir = Path(__file__).parent / "example_datasets"
+    json_path = example_datasets_dir / "basic_questions.json"
+    csv_path = example_datasets_dir / "basic_questions.csv"
+
+    # Load dataset from JSON file
+    json_dataset = EvalDataset.from_json(
+        name="basic_questions_json",
+        path=str(json_path),
+        metrics="accuracy",
+        input="question",
+        label="answer",
+    )
+    print(f"Loaded {json_dataset.name} from JSON file: {len(json_dataset.items)} items")
+
+    # Load dataset from CSV file
+    csv_dataset = EvalDataset.from_csv(
+        name="basic_questions_csv",
+        path=str(csv_path),
+        metrics="accuracy",
+        input="question",
+        label="answer",
+    )
+    print(f"Loaded {csv_dataset.name} from CSV file: {len(csv_dataset.items)} items")
+
+    # Run evaluation on both datasets
+    results = evaluate(
+        inference,
+        datasets=[json_dataset, csv_dataset],
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="1-evaluation_datasets_from_files", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "1-evaluation_datasets_from_files_output.json")
tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py
ADDED
@@ -0,0 +1,101 @@
+"""Tutorials - Evaluation Datasets - Example 2 - Loading from HuggingFace."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run evaluations using datasets loaded from HuggingFace Hub.
+
+    This example demonstrates how to load evaluation datasets directly from
+    HuggingFace Hub using the from_huggingface method. This allows you to
+    easily evaluate on standard benchmarks and datasets.
+
+    We'll evaluate on the SimpleQA dataset, which tests factual question answering.
+
+    Prerequisites:
+    - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the question directly and concisely. Provide only the answer, no additional context or text.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Load dataset from HuggingFace Hub
+    simple_qa = EvalDataset.from_huggingface(
+        path="basicv8vc/SimpleQA",
+        metrics="accuracy",
+        input="problem",
+        label="answer",
+        split="test",
+    )
+    print(f"Loaded {simple_qa.name} from HuggingFace Hub: {len(simple_qa.items)} items")
+
+    # Run evaluation with a sample to avoid long runtime
+    results = await evaluate_async(
+        inference,
+        simple_qa,
+        sample_size=10,  # Sample 10 items for quick testing
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="2-evaluation_datasets_from_huggingface", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "2-evaluation_datasets_from_huggingface_output.json")
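For context, `input="problem"` and `label="answer"` above refer to columns of the underlying HuggingFace dataset. A quick, optional way to inspect those columns with the `datasets` library (independent of Scorebook; shown only to clarify the field mapping):

```python
# Sketch only: inspects the raw HuggingFace dataset behind from_huggingface
# to show where the "problem" and "answer" fields come from.
from datasets import load_dataset

simple_qa = load_dataset("basicv8vc/SimpleQA", split="test")
print(simple_qa.column_names)  # expected to include "problem" and "answer"
print(simple_qa[0]["problem"], "->", simple_qa[0]["answer"])
```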
tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py
ADDED
@@ -0,0 +1,110 @@
+"""Tutorials - Evaluation Datasets - Example 3 - Loading from YAML Config."""
+
+import asyncio
+from pathlib import Path
+from pprint import pprint
+from typing import Any, List
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import EvalDataset, evaluate_async
+
+
+async def main() -> Any:
+    """Run evaluations using datasets loaded from YAML configuration files.
+
+    This example demonstrates how to use YAML configuration files to define
+    dataset loading parameters. YAML configs are useful for:
+    - Storing dataset configurations in version control
+    - Reusing the same dataset configuration across projects
+    - Defining complex prompt templates and field mappings
+
+    The YAML files contain:
+    - HuggingFace dataset path and split information
+    - Metrics to use for evaluation
+    - Jinja2 templates for input and label formatting
+    - Metadata about the dataset
+
+    Prerequisites:
+    - OpenAI API key set in environment variable OPENAI_API_KEY
+    """
+
+    # Initialize OpenAI client
+    client = AsyncOpenAI()
+    model_name = "gpt-4o-mini"
+
+    # Define an async inference function
+    async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+        """Process inputs through OpenAI's API.
+
+        Args:
+            inputs: Input values from an EvalDataset.
+            hyperparameters: Model hyperparameters.
+
+        Returns:
+            List of model outputs for all inputs.
+        """
+        outputs = []
+        for input_val in inputs:
+            # Build messages for OpenAI API
+            messages = [
+                {
+                    "role": "system",
+                    "content": "Answer the multiple choice question by selecting the correct letter (A, B, C, D, etc.). Provide ONLY the letter of your answer, no additional text or explanation.",
+                },
+                {"role": "user", "content": str(input_val)},
+            ]
+
+            # Call OpenAI API
+            try:
+                response = await client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=0.7,
+                )
+                output = response.choices[0].message.content.strip()
+            except Exception as e:
+                output = f"Error: {str(e)}"
+
+            outputs.append(output)
+
+        return outputs
+
+    # Construct paths to YAML config files
+    yaml_configs_dir = Path(__file__).parent / "example_yaml_configs"
+    cais_mmlu_yaml = yaml_configs_dir / "Cais-MMLU.yaml"
+    tiger_mmlu_pro_yaml = yaml_configs_dir / "TIGER-Lab-MMLU-Pro.yaml"
+
+    # Load Cais-MMLU dataset from YAML configuration
+    cais_mmlu = EvalDataset.from_yaml(str(cais_mmlu_yaml))
+    print(f"Loaded {cais_mmlu.name} from YAML config: {len(cais_mmlu.items)} items")
+
+    # Load TIGER-Lab MMLU-Pro dataset from YAML configuration
+    tiger_mmlu_pro = EvalDataset.from_yaml(str(tiger_mmlu_pro_yaml))
+    print(f"Loaded {tiger_mmlu_pro.name} from YAML config: {len(tiger_mmlu_pro.items)} items")
+
+    # Run evaluation on both datasets
+    results = await evaluate_async(
+        inference,
+        datasets=[cais_mmlu, tiger_mmlu_pro],
+        sample_size=5,  # Sample 5 items from each dataset for quick testing
+        return_aggregates=True,
+        return_items=True,
+        return_output=True,
+        upload_results=False,
+    )
+
+    pprint(results)
+    return results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-evaluation_datasets_from_yaml", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = asyncio.run(main())
+    save_results_to_json(results_dict, output_dir, "3-evaluation_datasets_from_yaml_output.json")
tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv
ADDED
@@ -0,0 +1,11 @@
+question,answer
+What is 2 + 2?,4
+What is the capital of France?,Paris
+Who wrote Romeo and Juliet?,William Shakespeare
+What is 5 * 6?,30
+What is the largest planet in our solar system?,Jupiter
+Who painted the Mona Lisa?,Leonardo da Vinci
+What is the square root of 64?,8
+What is the capital of Japan?,Tokyo
+Who invented the telephone?,Alexander Graham Bell
+What is 12 - 7?,5
tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json
ADDED
@@ -0,0 +1,42 @@
+[
+  {
+    "question": "What is 2 + 2?",
+    "answer": "4"
+  },
+  {
+    "question": "What is the capital of France?",
+    "answer": "Paris"
+  },
+  {
+    "question": "Who wrote Romeo and Juliet?",
+    "answer": "William Shakespeare"
+  },
+  {
+    "question": "What is 5 * 6?",
+    "answer": "30"
+  },
+  {
+    "question": "What is the largest planet in our solar system?",
+    "answer": "Jupiter"
+  },
+  {
+    "question": "Who painted the Mona Lisa?",
+    "answer": "Leonardo da Vinci"
+  },
+  {
+    "question": "What is the square root of 64?",
+    "answer": "8"
+  },
+  {
+    "question": "What is the capital of Japan?",
+    "answer": "Tokyo"
+  },
+  {
+    "question": "Who invented the telephone?",
+    "answer": "Alexander Graham Bell"
+  },
+  {
+    "question": "What is 12 - 7?",
+    "answer": "5"
+  }
+]
tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml
ADDED
@@ -0,0 +1,19 @@
+path: "cais/mmlu"
+name: "Cais-MMLU"
+split: "test"
+config: "all"
+metrics:
+  - "accuracy"
+
+templates:
+  input: |
+    {{ question }}
+
+    A. {{ choices[0] }}
+    B. {{ choices[1] }}
+    C. {{ choices[2] }}
+    D. {{ choices[3] }}
+  label: "{{ answer }}"
+
+metadata:
+  description: "MMLU multiple choice questions from Cais"
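The `templates.input` block is a Jinja2 template rendered against each dataset row. A small sketch of how it expands for one made-up MMLU-style item, rendered here with `jinja2` directly (Scorebook's own rendering path may differ):

```python
# Sketch only: renders the Cais-MMLU input template against a hypothetical item
# to show the prompt shape; real items come from the cais/mmlu dataset.
from jinja2 import Template

input_template = Template(
    "{{ question }}\n\n"
    "A. {{ choices[0] }}\n"
    "B. {{ choices[1] }}\n"
    "C. {{ choices[2] }}\n"
    "D. {{ choices[3] }}"
)

item = {
    "question": "What is the capital of France?",  # hypothetical example row
    "choices": ["Berlin", "Paris", "Madrid", "Rome"],
    "answer": 1,
}
print(input_template.render(**item))
```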
tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml
ADDED
@@ -0,0 +1,18 @@
+path: "TIGER-Lab/MMLU-Pro"
+name: "TIGER-Lab/MMLU-Pro"
+split: "validation"
+config: "default"
+metrics:
+  - "accuracy"
+
+templates:
+  input: |
+    {{ question }}
+    Options:
+    {% for option in options %}
+    {{ number_to_letter(loop.index0) }} : {{ option }}
+    {% endfor %}
+  label: "{{ answer }}"
+
+metadata:
+  description: "MMLU-Pro multiple choice questions"
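The MMLU-Pro template calls a `number_to_letter` helper inside the Jinja2 loop. That helper is presumably provided by Scorebook's template environment; the diff does not show how. One way such a helper can be exposed to Jinja2, purely as an illustrative assumption:

```python
# Sketch only: shows one way a number_to_letter helper can be made available to
# Jinja2 templates; how Scorebook actually registers it is not shown in this diff.
from jinja2 import Environment

def number_to_letter(index: int) -> str:
    """Map 0 -> 'A', 1 -> 'B', and so on."""
    return chr(ord("A") + index)

env = Environment()
env.globals["number_to_letter"] = number_to_letter

template = env.from_string(
    "{% for option in options %}{{ number_to_letter(loop.index0) }} : {{ option }}\n{% endfor %}"
)
print(template.render(options=["apple", "banana", "cherry"]))
```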