scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
- scorebook-0.0.16.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
tutorials/examples/6-providers/aws/batch_example.py
@@ -0,0 +1,219 @@
+"""
+AWS Bedrock Batch Inference Example.
+
+This example demonstrates how to leverage AWS Bedrock's Model Invocation Jobs for
+cost-effective, large-scale model evaluation using Scorebook. It uses Claude models
+for batch processing with automatic S3 upload/download and job management.
+
+This example requires AWS CLI to be configured with appropriate credentials and
+permissions for Bedrock and S3. Set up your AWS profile and ensure you have
+the necessary IAM roles configured.
+
+Prerequisites:
+    - AWS CLI configured with appropriate profile
+    - S3 bucket with proper permissions
+    - IAM role for Bedrock execution with S3 access
+    - Minimum 100 items for batch processing (AWS requirement)
+"""
+
+import json
+from pathlib import Path
+from typing import Any, List, Optional
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.bedrock import batch
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the AWS Bedrock batch inference example."""
+    # Load environment variables from .env file for configuration
+    load_dotenv()
+
+    args = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json",
+        metrics=[Accuracy],
+        input="question",
+        label="answer",
+    )
+
+    # Ensure minimum batch size requirement (AWS Bedrock requires 100+ items)
+    if len(dataset) < 100:
+        # Cycle through items to reach 100 - default minimum size of AWS batch jobs
+        original_items = dataset.items
+        expanded_items: List[Any] = []
+        while len(expanded_items) < 100:
+            items_needed = 100 - len(expanded_items)
+            expanded_items.extend(original_items[: min(len(original_items), items_needed)])
+
+        # Create new dataset with expanded items
+        # Items already have "input" and "label" columns from the original dataset
+        dataset = EvalDataset.from_list(
+            name=dataset.name,
+            metrics=[Accuracy],
+            items=expanded_items,
+            input="input",
+            label="label",
+        )
+
+    # Step 2: Define the preprocessing function for AWS Bedrock Batch API
+    def preprocessor(input_value: str) -> list:
+        """Pre-process dataset inputs into AWS Bedrock Batch API format."""
+        # Create the batch API request messages format for Bedrock
+        messages = [
+            {
+                "role": "system",
+                "content": "Answer the question directly and concisely as a single word",
+            },
+            {"role": "user", "content": input_value},
+        ]
+
+        return messages
+
+    # Step 3: Define the postprocessing function
+    def postprocessor(response: str) -> str:
+        """Post-process AWS Bedrock batch response to extract the answer."""
+        # The batch function returns the message content directly
+        return response.strip()
+
+    # Step 4: Create the inference pipeline for batch processing
+    async def inference_function(items: list, **hyperparams: Any) -> Any:
+        return await batch(
+            items,
+            model=args.model,
+            aws_region=args.aws_region,
+            aws_profile=args.aws_profile,
+            bucket=args.bucket,
+            input_prefix=args.input_prefix,
+            output_prefix=args.output_prefix,
+            role_arn=args.role_arn,
+            **hyperparams,
+        )
+
+    inference_pipeline = InferencePipeline(
+        model=args.model or "claude-model",
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the batch evaluation
+    print("Running AWS Bedrock Batch API evaluation")
+    print(f"Model: {args.model or 'Not specified'}")
+    print(f"AWS Region: {args.aws_region or 'Not specified'}")
+    print(f"AWS Profile: {args.aws_profile or 'Not specified'}")
+    print(f"S3 Bucket: {args.bucket or 'Not specified'}")
+    print(f"Role ARN: {args.role_arn or 'Not specified'}")
+    print(f"Processing {len(dataset)} items using batch inference...")
+    print("Note: Batch processing may take several minutes to complete.")
+
+    results = evaluate(inference_pipeline, dataset, return_items=True, return_output=True)
+    print("\nBatch evaluation completed!")
+    print(results)
+
+    # Step 6: Save results to file
+    output_file = args.output_dir / "bedrock_batch_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+class Args:
+    """Simple container for parsed arguments."""
+
+    def __init__(self) -> None:
+        """Parse command line arguments."""
+        self.output_dir: Path = Path(".")  # Will be overridden in setup_arguments
+        self.model: Optional[str] = None
+        self.aws_region: Optional[str] = None
+        self.aws_profile: Optional[str] = None
+        self.bucket: Optional[str] = None
+        self.input_prefix: Optional[str] = None
+        self.output_prefix: Optional[str] = None
+        self.role_arn: Optional[str] = None


+def setup_arguments() -> Args:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Run AWS Bedrock Batch API evaluation and save results."
+    )
+
+    # Required argument
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=False,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+
+    # All optional AWS parameters
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="Bedrock model ID (e.g., 'us.anthropic.claude-3-5-sonnet-20241022-v2:0')",
+    )
+    parser.add_argument(
+        "--aws-region",
+        type=str,
+        help="AWS region for Bedrock and S3 operations",
+    )
+    parser.add_argument(
+        "--aws-profile",
+        type=str,
+        help="AWS profile name for authentication",
+    )
+    parser.add_argument(
+        "--bucket",
+        type=str,
+        help="S3 bucket name for input/output data",
+    )
+    parser.add_argument(
+        "--input-prefix",
+        type=str,
+        help="S3 prefix for input data (e.g., 'batch/input/')",
+    )
+    parser.add_argument(
+        "--output-prefix",
+        type=str,
+        help="S3 prefix for output data (e.g., 'batch/output/')",
+    )
+    parser.add_argument(
+        "--role-arn",
+        type=str,
+        help="IAM role ARN for Bedrock execution",
+    )
+
+    parsed_args = parser.parse_args()
+
+    # Create Args object and populate
+    args = Args()
+    args.output_dir = Path(parsed_args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    args.model = parsed_args.model
+    args.aws_region = parsed_args.aws_region
+    args.aws_profile = parsed_args.aws_profile
+    args.bucket = parsed_args.bucket
+    args.input_prefix = parsed_args.input_prefix
+    args.output_prefix = parsed_args.output_prefix
+    args.role_arn = parsed_args.role_arn
+
+    return args
+
+
+if __name__ == "__main__":
+    main()
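
The item-cycling loop in batch_example.py exists only to satisfy Bedrock's 100-item minimum for Model Invocation Jobs. As a side note, the same padding step can be written as a small stand-alone helper; the sketch below is hypothetical (the name pad_to_minimum is not part of Scorebook) and only restates the logic shown in the diff:

from itertools import cycle, islice
from typing import Any, List


def pad_to_minimum(items: List[Any], minimum: int = 100) -> List[Any]:
    """Repeat items in order until the list reaches the minimum batch size."""
    if len(items) >= minimum:
        return list(items)
    # cycle() walks the list repeatedly; islice() cuts it off at exactly `minimum`
    return list(islice(cycle(items), minimum))


# e.g. expanded_items = pad_to_minimum(dataset.items) before building the expanded EvalDataset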

tutorials/examples/6-providers/portkey/__init__.py
@@ -0,0 +1 @@
+"""Example usage of the Portkey API with the scorebook library."""

tutorials/examples/6-providers/portkey/batch_example.py
@@ -0,0 +1,120 @@
+"""
+Portkey Cloud Batch Inference Example.
+
+This example demonstrates how to leverage Portkey's Batch API for cost-effective,
+large-scale model evaluation using Scorebook. The backend provider of choice for this example is
+OpenAI, but it's easy to adapt to any other provider.
+
+This example requires a portkey account linked to an OpenAI account and
+a portkey API key set in environment variable PORTKEY_API_KEY.
+
+Compare with messages_example.py to understand the differences
+between real-time and batch processing approaches.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.portkey import batch
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Portkey batch inference example."""
+    # Load environment variables from .env file for API keys
+    load_dotenv()
+
+    output_dir, model_name = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function for batch API
+    def preprocessor(eval_item: dict) -> list:
+        """Pre-process dataset items into Portkey Batch API format."""
+        prompt = eval_item["question"]
+
+        # Create the batch API request messages format
+        messages = [
+            {
+                "role": "system",
+                "content": "Answer the question directly and concisely as a single word",
+            },
+            {"role": "user", "content": prompt},
+        ]
+
+        return messages
+
+    # Step 3: Define the postprocessing function
+    def postprocessor(response: str) -> str:
+        """Post-process Portkey batch response to extract the answer."""
+        # The batch function returns the message content directly
+        return response.strip()
+
+    # Step 4: Create the inference pipeline for batch processing
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:  # noqa
+        return await batch(items, model=model_name, **hyperparams)
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the batch evaluation
+    print(f"Running Portkey Batch API evaluation with model: {model_name}")
+    print(f"Processing {len(dataset)} items using batch inference...")
+    print("Note: Batch processing may take several minutes to complete.")
+
+    # For demonstration, limit to 25 items
+    results = evaluate(inference_pipeline, dataset, item_limit=25, score_type="all")
+    print("\nBatch evaluation completed!")
+    print(results)
+
+    # Step 6: Save results to file
+    output_file = output_dir / "portkey_batch_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str]:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Run Portkey Batch API evaluation and save results."
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Model to use for batch inference via Portkey (e.g., @openai/gpt-4.1-mini)",
+    )
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    return output_dir, str(args.model)
+
+
+if __name__ == "__main__":
+    main()
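
Because a Portkey batch job can take minutes and costs money, it may be worth sanity-checking the two hooks offline first. The sketch below re-declares the preprocessor and postprocessor with the same shapes the example defines and runs them on a made-up item; the sample question is invented and nothing is sent to Portkey:

def preprocessor(eval_item: dict) -> list:
    """Same shape as the hook in batch_example.py: question -> chat messages."""
    return [
        {"role": "system", "content": "Answer the question directly and concisely as a single word"},
        {"role": "user", "content": eval_item["question"]},
    ]


def postprocessor(response: str) -> str:
    """The batch client hands back message text; only strip whitespace."""
    return response.strip()


sample = {"question": "What is the capital of France?", "answer": "Paris"}
assert preprocessor(sample)[-1] == {"role": "user", "content": "What is the capital of France?"}
assert postprocessor("  Paris \n") == "Paris"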

tutorials/examples/6-providers/portkey/messages_example.py
@@ -0,0 +1,121 @@
+"""
+Portkey Cloud Model Inference Example.
+
+This example demonstrates how to evaluate language models using Portkey's inference
+services with Scorebook for real-time API calls.
+
+Prerequisites: PORTKEY_API_KEY environment variable and active Portkey account.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.portkey import responses
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Portkey inference example."""
+    # Load environment variables from .env file for API keys
+    load_dotenv()
+
+    output_dir, model_name = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    # Create an EvalDataset from local JSON file
+    # - Uses 'answer' field as ground truth labels
+    # - Configures Accuracy metric for evaluation
+    # - Loads from examples/example_datasets/dataset.json
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function
+    # Convert raw dataset items into Portkey API-compatible format
+    # This function formats the question for the cloud model
+    def preprocessor(eval_item: dict) -> list:
+        """Pre-process dataset items into Portkey messages format."""
+        prompt = eval_item["question"]
+
+        # Create a system message with instructions for direct answers
+        system_prompt = """
+        Answer the question directly and concisely.
+        Do not provide lengthy explanations unless specifically asked.
+        """.strip()
+
+        # Format as messages for Portkey API
+        return [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
+
+    # Step 3: Define the postprocessing function
+    # Extract the final answer from Portkey API response
+    # Handles response parsing and returns the response text
+    def postprocessor(response: Any) -> str:
+        """Post-process Portkey response to extract the answer."""
+        return str(response.choices[0].message.content.strip())
+
+    # Step 4: Create the inference pipeline for cloud-based evaluation
+    # Combine preprocessing, Portkey API inference, and postprocessing
+    # Uses scorebook's built-in Portkey responses function for API calls
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:
+        return await responses(items, model=model_name, **hyperparams)
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the cloud-based evaluation
+    # Execute evaluation using Portkey API with the inference pipeline
+    # - Uses score_type="all" to get both aggregate and per-item results
+    # - Limits to 10 items for quick demonstration and cost control
+    print(f"Running Portkey evaluation with model: {model_name}")
+    print("Evaluating 10 items from local dataset...")
+
+    results = evaluate(inference_pipeline, dataset, item_limit=10, score_type="all")
+    print(results)
+
+    # Step 6: Save results to file
+    # Export evaluation results as JSON for later analysis
+    output_file = output_dir / "portkey_messages_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str]:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run Portkey evaluation and save results.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Model to use for inference via Portkey (e.g., @openai/gpt-4.1-mini)",
+    )
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    return output_dir, str(args.model)
+
+
+if __name__ == "__main__":
+    main()
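
One design note on the postprocessor above: response.choices[0].message.content follows the OpenAI-style chat-completion shape, and content can be None in some cases (for example refusals or tool calls), in which case .strip() raises. A slightly more defensive variant, offered purely as an assumption and not part of the package, could look like:

from typing import Any


def safe_postprocessor(response: Any) -> str:
    """Hypothetical defensive variant: tolerate missing or None message content."""
    try:
        content = response.choices[0].message.content
    except (AttributeError, IndexError):
        return ""
    return content.strip() if isinstance(content, str) else ""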

tutorials/examples/6-providers/vertex/__init__.py
@@ -0,0 +1 @@
+"""Example usage of the Google Cloud Platform with the scorebook library."""

tutorials/examples/6-providers/vertex/batch_example.py
@@ -0,0 +1,166 @@
+"""
+Google Cloud Vertex AI Batch Inference Example.
+
+This example demonstrates how to leverage Google Cloud Vertex AI's Batch API for
+cost-effective, large-scale model evaluation using Scorebook. It uses Gemini models
+for batch processing with automatic GCS upload/download and job management.
+
+This example requires Google Cloud SDK (gsutil) to be installed and authenticated,
+and a Google Cloud project with Vertex AI enabled. Set the project ID in the
+GOOGLE_CLOUD_PROJECT environment variable or pass it as a command line argument.
+
+Compare with the Portkey batch example to understand the differences
+between different cloud providers' batch processing approaches.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.vertex import batch
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Vertex AI batch inference example."""
+    # Load environment variables from .env file for configuration
+    load_dotenv()
+
+    output_dir, model_name, input_bucket, output_bucket, project_id = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function for Vertex AI Batch API
+    def preprocessor(eval_item: dict) -> list:
+        """Pre-process dataset items into Vertex AI Batch API format."""
+        prompt = eval_item["question"]
+
+        # Create the batch API request messages format for Vertex AI
+        messages = [
+            {
+                "role": "system",
+                "content": "Answer the question directly and concisely as a single word",
+            },
+            {"role": "user", "content": prompt},
+        ]
+
+        return messages
+
+    # Step 3: Define the postprocessing function
+    def postprocessor(response: str) -> str:
+        """Post-process Vertex AI batch response to extract the answer."""
+        # The batch function returns the message content directly
+        return response.strip()
+
+    # Step 4: Create the inference pipeline for batch processing
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:  # noqa
+        return await batch(
+            items,
+            model=model_name,
+            project_id=project_id,
+            input_bucket=input_bucket,
+            output_bucket=output_bucket,
+            **hyperparams,
+        )
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the batch evaluation
+    print(f"Running Vertex AI Batch API evaluation with model: {model_name}")
+    print(f"Project ID: {project_id}")
+    print(f"Input bucket: {input_bucket}")
+    print(f"Output bucket: {output_bucket}")
+    print(f"Processing {len(dataset)} items using batch inference...")
+    print("Note: Batch processing may take several minutes to complete.")
+
+    # For demonstration, limit to 25 items
+    results = evaluate(inference_pipeline, dataset, item_limit=25, score_type="all")
+    print("\nBatch evaluation completed!")
+    print(results)
+
+    # Step 6: Save results to file
+    output_file = output_dir / "vertex_batch_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str, str, str, str]:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Run Vertex AI Batch API evaluation and save results."
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Gemini model to use for batch inference (e.g., gemini-2.0-flash-001)",
+    )
+    parser.add_argument(
+        "--input-bucket",
+        type=str,
+        required=True,
+        help="GCS bucket name for input data (without gs:// prefix)",
+    )
+    parser.add_argument(
+        "--output-bucket",
+        type=str,
+        required=True,
+        help="GCS bucket name for output data (without gs:// prefix)",
+    )
+    parser.add_argument(
+        "--project-id",
+        type=str,
+        help="Google Cloud Project ID (defaults to GOOGLE_CLOUD_PROJECT env var)",
+    )
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Handle project ID fallback
+    project_id = args.project_id or os.getenv("GOOGLE_CLOUD_PROJECT")
+    if not project_id:
+        raise ValueError(
+            "Project ID must be provided via --project-id or "
+            "GOOGLE_CLOUD_PROJECT environment variable"
+        )
+
+    return (
+        output_dir,
+        str(args.model),
+        str(args.input_bucket),
+        str(args.output_bucket),
+        str(project_id),
+    )
+
+
+if __name__ == "__main__":
+    main()
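
The Vertex example resolves its project ID from either the --project-id flag or the GOOGLE_CLOUD_PROJECT environment variable. That fallback is easy to reuse on its own; the helper below is a hypothetical extraction of the same check (the name resolve_project_id is not part of Scorebook):

import os
from typing import Optional


def resolve_project_id(cli_value: Optional[str] = None) -> str:
    """CLI flag wins; otherwise fall back to GOOGLE_CLOUD_PROJECT, else fail loudly."""
    project_id = cli_value or os.getenv("GOOGLE_CLOUD_PROJECT")
    if not project_id:
        raise ValueError(
            "Project ID must be provided via --project-id or "
            "GOOGLE_CLOUD_PROJECT environment variable"
        )
    return project_id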