scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
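For orientation before the full file diffs below, here is a condensed sketch of the evaluate() flow that the new tutorial files exercise, using only APIs that appear in this diff (EvalDataset, InferencePipeline, evaluate, Accuracy). It is illustrative only and not part of the package diff; the toy dataset and the stub inference function are hypothetical stand-ins for a real model call.

from typing import Any

from scorebook import EvalDataset, InferencePipeline, evaluate
from scorebook.metrics.accuracy import Accuracy

# Hypothetical toy dataset; the real tutorials load JSON/CSV/Hugging Face datasets.
dataset = EvalDataset.from_list(
    name="toy_math",
    metrics=[Accuracy],
    items=[{"input": "2 + 2 = ?", "label": "4"}],
    input="input",
    label="label",
)


def preprocessor(input_value: str) -> str:
    # The real tutorials build chat-style message lists here.
    return input_value


async def inference_function(items: list, **hyperparams: Any) -> Any:
    # Hypothetical stub: a real pipeline calls a local or cloud model here.
    return ["4" for _ in items]


def postprocessor(response: str) -> str:
    return response.strip()


pipeline = InferencePipeline(
    model="stub-model",
    preprocessor=preprocessor,
    inference_function=inference_function,
    postprocessor=postprocessor,
)

results = evaluate(pipeline, dataset, score_type="all")
print(results)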
tutorials/examples/5-upload_results/3-uploading_your_results.py
@@ -0,0 +1,153 @@
+"""Tutorials - Upload Results - Example 3 - Uploading Pre-Scored Results."""
+
+from pathlib import Path
+from pprint import pprint
+from typing import Any
+
+from dotenv import load_dotenv
+
+from tutorials.utils import save_results_to_json, setup_logging
+
+from scorebook import login, upload_result
+
+
+def main() -> Any:
+    """Upload pre-scored results directly to Trismik's dashboard.
+
+    This example demonstrates how to upload results where metrics are ALREADY computed.
+    This is different from score() or evaluate() which compute metrics for you.
+
+    Use upload_result() when you:
+    - Already have metric scores calculated
+    - Used a custom evaluation framework that computed metrics
+    - Want to import historical evaluation data with existing scores
+    - Have results from external tools (e.g., other eval frameworks)
+
+    The key difference from Examples 1 & 2:
+    - Example 1 (score): You have outputs/labels → Scorebook computes metrics
+    - Example 2 (evaluate): Scorebook runs inference AND computes metrics
+    - Example 3 (upload_result): You have EVERYTHING including metrics → Just upload
+
+    Prerequisites:
+    - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+    - A Trismik project ID
+    """
+
+    # Step 1: Log in with your Trismik API key
+    # login() reads TRISMIK_API_KEY from environment variables or .env file
+    login()
+
+    # Step 2: Format your pre-scored results
+    # This is the structure that upload_result() expects:
+    # - aggregate_results: List with one dict containing overall metrics
+    # - item_results: List of dicts with per-item data and metric scores
+
+    # Example: You already ran an evaluation with your custom framework
+    # and computed accuracy, f1_score, etc.
+    my_pre_scored_results = {
+        "aggregate_results": [
+            {
+                "dataset": "spanish_translation",
+                "accuracy": 0.8,  # Your pre-computed aggregate accuracy
+                "bleu_score": 0.75,  # Your pre-computed BLEU score
+                # Add any hyperparameters used (optional)
+                "temperature": 0.7,
+                "max_tokens": 100,
+            }
+        ],
+        "item_results": [
+            {
+                "id": 0,
+                "dataset": "spanish_translation",
+                "input": "Translate 'hello' to Spanish",
+                "output": "hola",
+                "label": "hola",
+                "accuracy": 1.0,  # Item-level metric scores
+                "bleu_score": 1.0,
+                "temperature": 0.7,
+                "max_tokens": 100,
+            },
+            {
+                "id": 1,
+                "dataset": "spanish_translation",
+                "input": "Translate 'goodbye' to Spanish",
+                "output": "adiós",
+                "label": "adiós",
+                "accuracy": 1.0,
+                "bleu_score": 0.95,
+                "temperature": 0.7,
+                "max_tokens": 100,
+            },
+            {
+                "id": 2,
+                "dataset": "spanish_translation",
+                "input": "Translate 'thank you' to Spanish",
+                "output": "gracias",
+                "label": "gracias",
+                "accuracy": 1.0,
+                "bleu_score": 1.0,
+                "temperature": 0.7,
+                "max_tokens": 100,
+            },
+            {
+                "id": 3,
+                "dataset": "spanish_translation",
+                "input": "Translate 'please' to Spanish",
+                "output": "por favor",
+                "label": "por favor",
+                "accuracy": 1.0,
+                "bleu_score": 1.0,
+                "temperature": 0.7,
+                "max_tokens": 100,
+            },
+            {
+                "id": 4,
+                "dataset": "spanish_translation",
+                "input": "Translate 'good morning' to Spanish",
+                "output": "buenos dias",  # Missing accent - wrong answer
+                "label": "buenos días",
+                "accuracy": 0.0,
+                "bleu_score": 0.85,
+                "temperature": 0.7,
+                "max_tokens": 100,
+            },
+        ],
+    }
+
+    # Step 3: Upload your pre-scored results directly
+    print("\nUploading pre-scored results to Trismik...")
+    print("Metrics are already computed - just uploading to dashboard.\n")
+
+    run_id = upload_result(
+        run_result=my_pre_scored_results,
+        experiment_id="Pre-Scored-Results-Example",
+        project_id="TRISMIK_PROJECT_ID",  # TODO: ADD YOUR TRISMIK PROJECT ID
+        dataset_name="spanish_translation",
+        hyperparameters={
+            "temperature": 0.7,
+            "max_tokens": 100,
+        },
+        metadata={
+            "description": "Results with pre-computed metrics from custom framework",
+            "source": "Custom evaluation tool",
+            "evaluation_date": "2025-01-15",
+        },
+        model_name="my-custom-translator-v2",
+    )
+
+    print(f"\nResults uploaded successfully with run_id: {run_id}")
+
+    # Add run_id to results for reference
+    my_pre_scored_results["run_id"] = run_id
+
+    pprint(my_pre_scored_results)
+    return my_pre_scored_results
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    log_file = setup_logging(experiment_id="3-uploading_your_results", base_dir=Path(__file__).parent)
+    output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(exist_ok=True)
+    results_dict = main()
+    save_results_to_json(results_dict, output_dir, "3-uploading_your_results_output.json")
tutorials/examples/6-providers/aws/__init__.py
@@ -0,0 +1 @@
+"""Example usage of the AWS API with the scorebook library."""
tutorials/examples/6-providers/aws/batch_example.py
@@ -0,0 +1,219 @@
+"""
+AWS Bedrock Batch Inference Example.
+
+This example demonstrates how to leverage AWS Bedrock's Model Invocation Jobs for
+cost-effective, large-scale model evaluation using Scorebook. It uses Claude models
+for batch processing with automatic S3 upload/download and job management.
+
+This example requires AWS CLI to be configured with appropriate credentials and
+permissions for Bedrock and S3. Set up your AWS profile and ensure you have
+the necessary IAM roles configured.
+
+Prerequisites:
+- AWS CLI configured with appropriate profile
+- S3 bucket with proper permissions
+- IAM role for Bedrock execution with S3 access
+- Minimum 100 items for batch processing (AWS requirement)
+"""
+
+import json
+from pathlib import Path
+from typing import Any, List, Optional
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.bedrock import batch
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the AWS Bedrock batch inference example."""
+    # Load environment variables from .env file for configuration
+    load_dotenv()
+
+    args = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json",
+        metrics=[Accuracy],
+        input="question",
+        label="answer",
+    )
+
+    # Ensure minimum batch size requirement (AWS Bedrock requires 100+ items)
+    if len(dataset) < 100:
+        # Cycle through items to reach 100 - default minimum size of AWS batch jobs
+        original_items = dataset.items
+        expanded_items: List[Any] = []
+        while len(expanded_items) < 100:
+            items_needed = 100 - len(expanded_items)
+            expanded_items.extend(original_items[: min(len(original_items), items_needed)])
+
+        # Create new dataset with expanded items
+        # Items already have "input" and "label" columns from the original dataset
+        dataset = EvalDataset.from_list(
+            name=dataset.name,
+            metrics=[Accuracy],
+            items=expanded_items,
+            input="input",
+            label="label",
+        )
+
+    # Step 2: Define the preprocessing function for AWS Bedrock Batch API
+    def preprocessor(input_value: str) -> list:
+        """Pre-process dataset inputs into AWS Bedrock Batch API format."""
+        # Create the batch API request messages format for Bedrock
+        messages = [
+            {
+                "role": "system",
+                "content": "Answer the question directly and concisely as a single word",
+            },
+            {"role": "user", "content": input_value},
+        ]
+
+        return messages
+
+    # Step 3: Define the postprocessing function
+    def postprocessor(response: str) -> str:
+        """Post-process AWS Bedrock batch response to extract the answer."""
+        # The batch function returns the message content directly
+        return response.strip()
+
+    # Step 4: Create the inference pipeline for batch processing
+    async def inference_function(items: list, **hyperparams: Any) -> Any:
+        return await batch(
+            items,
+            model=args.model,
+            aws_region=args.aws_region,
+            aws_profile=args.aws_profile,
+            bucket=args.bucket,
+            input_prefix=args.input_prefix,
+            output_prefix=args.output_prefix,
+            role_arn=args.role_arn,
+            **hyperparams,
+        )
+
+    inference_pipeline = InferencePipeline(
+        model=args.model or "claude-model",
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the batch evaluation
+    print("Running AWS Bedrock Batch API evaluation")
+    print(f"Model: {args.model or 'Not specified'}")
+    print(f"AWS Region: {args.aws_region or 'Not specified'}")
+    print(f"AWS Profile: {args.aws_profile or 'Not specified'}")
+    print(f"S3 Bucket: {args.bucket or 'Not specified'}")
+    print(f"Role ARN: {args.role_arn or 'Not specified'}")
+    print(f"Processing {len(dataset)} items using batch inference...")
+    print("Note: Batch processing may take several minutes to complete.")
+
+    results = evaluate(inference_pipeline, dataset, return_items=True, return_output=True)
+    print("\nBatch evaluation completed!")
+    print(results)
+
+    # Step 6: Save results to file
+    output_file = args.output_dir / "bedrock_batch_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+class Args:
+    """Simple container for parsed arguments."""
+
+    def __init__(self) -> None:
+        """Parse command line arguments."""
+        self.output_dir: Path = Path(".")  # Will be overridden in setup_arguments
+        self.model: Optional[str] = None
+        self.aws_region: Optional[str] = None
+        self.aws_profile: Optional[str] = None
+        self.bucket: Optional[str] = None
+        self.input_prefix: Optional[str] = None
+        self.output_prefix: Optional[str] = None
+        self.role_arn: Optional[str] = None
+
+
+def setup_arguments() -> Args:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Run AWS Bedrock Batch API evaluation and save results."
+    )
+
+    # Required argument
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=False,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+
+    # All optional AWS parameters
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="Bedrock model ID (e.g., 'us.anthropic.claude-3-5-sonnet-20241022-v2:0')",
+    )
+    parser.add_argument(
+        "--aws-region",
+        type=str,
+        help="AWS region for Bedrock and S3 operations",
+    )
+    parser.add_argument(
+        "--aws-profile",
+        type=str,
+        help="AWS profile name for authentication",
+    )
+    parser.add_argument(
+        "--bucket",
+        type=str,
+        help="S3 bucket name for input/output data",
+    )
+    parser.add_argument(
+        "--input-prefix",
+        type=str,
+        help="S3 prefix for input data (e.g., 'batch/input/')",
+    )
+    parser.add_argument(
+        "--output-prefix",
+        type=str,
+        help="S3 prefix for output data (e.g., 'batch/output/')",
+    )
+    parser.add_argument(
+        "--role-arn",
+        type=str,
+        help="IAM role ARN for Bedrock execution",
+    )
+
+    parsed_args = parser.parse_args()
+
+    # Create Args object and populate
+    args = Args()
+    args.output_dir = Path(parsed_args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    args.model = parsed_args.model
+    args.aws_region = parsed_args.aws_region
+    args.aws_profile = parsed_args.aws_profile
+    args.bucket = parsed_args.bucket
+    args.input_prefix = parsed_args.input_prefix
+    args.output_prefix = parsed_args.output_prefix
+    args.role_arn = parsed_args.role_arn
+
+    return args
+
+
+if __name__ == "__main__":
+    main()
tutorials/examples/6-providers/portkey/__init__.py
@@ -0,0 +1 @@
+"""Example usage of the Portkey API with the scorebook library."""
tutorials/examples/6-providers/portkey/batch_example.py
@@ -0,0 +1,120 @@
+"""
+Portkey Cloud Batch Inference Example.
+
+This example demonstrates how to leverage Portkey's Batch API for cost-effective,
+large-scale model evaluation using Scorebook. The backend provider of choice for this example is
+OpenAI, but it's easy to adapt to any other provider.
+
+This example requires a Portkey account linked to an OpenAI account and
+a Portkey API key set in the environment variable PORTKEY_API_KEY.
+
+Compare with messages_example.py to understand the differences
+between real-time and batch processing approaches.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.portkey import batch
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Portkey batch inference example."""
+    # Load environment variables from .env file for API keys
+    load_dotenv()
+
+    output_dir, model_name = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function for batch API
+    def preprocessor(eval_item: dict) -> list:
+        """Pre-process dataset items into Portkey Batch API format."""
+        prompt = eval_item["question"]
+
+        # Create the batch API request messages format
+        messages = [
+            {
+                "role": "system",
+                "content": "Answer the question directly and concisely as a single word",
+            },
+            {"role": "user", "content": prompt},
+        ]
+
+        return messages
+
+    # Step 3: Define the postprocessing function
+    def postprocessor(response: str) -> str:
+        """Post-process Portkey batch response to extract the answer."""
+        # The batch function returns the message content directly
+        return response.strip()
+
+    # Step 4: Create the inference pipeline for batch processing
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:  # noqa
+        return await batch(items, model=model_name, **hyperparams)
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the batch evaluation
+    print(f"Running Portkey Batch API evaluation with model: {model_name}")
+    print(f"Processing {len(dataset)} items using batch inference...")
+    print("Note: Batch processing may take several minutes to complete.")
+
+    # For demonstration, limit to 25 items
+    results = evaluate(inference_pipeline, dataset, item_limit=25, score_type="all")
+    print("\nBatch evaluation completed!")
+    print(results)
+
+    # Step 6: Save results to file
+    output_file = output_dir / "portkey_batch_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str]:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Run Portkey Batch API evaluation and save results."
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Model to use for batch inference via Portkey (e.g., @openai/gpt-4.1-mini)",
+    )
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    return output_dir, str(args.model)
+
+
+if __name__ == "__main__":
+    main()
tutorials/examples/6-providers/portkey/messages_example.py
@@ -0,0 +1,121 @@
+"""
+Portkey Cloud Model Inference Example.
+
+This example demonstrates how to evaluate language models using Portkey's inference
+services with Scorebook for real-time API calls.
+
+Prerequisites: PORTKEY_API_KEY environment variable and active Portkey account.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.portkey import responses
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Portkey inference example."""
+    # Load environment variables from .env file for API keys
+    load_dotenv()
+
+    output_dir, model_name = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    # Create an EvalDataset from local JSON file
+    # - Uses 'answer' field as ground truth labels
+    # - Configures Accuracy metric for evaluation
+    # - Loads from examples/example_datasets/dataset.json
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function
+    # Convert raw dataset items into Portkey API-compatible format
+    # This function formats the question for the cloud model
+    def preprocessor(eval_item: dict) -> list:
+        """Pre-process dataset items into Portkey messages format."""
+        prompt = eval_item["question"]
+
+        # Create a system message with instructions for direct answers
+        system_prompt = """
+        Answer the question directly and concisely.
+        Do not provide lengthy explanations unless specifically asked.
+        """.strip()
+
+        # Format as messages for Portkey API
+        return [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
+
+    # Step 3: Define the postprocessing function
+    # Extract the final answer from Portkey API response
+    # Handles response parsing and returns the response text
+    def postprocessor(response: Any) -> str:
+        """Post-process Portkey response to extract the answer."""
+        return str(response.choices[0].message.content.strip())
+
+    # Step 4: Create the inference pipeline for cloud-based evaluation
+    # Combine preprocessing, Portkey API inference, and postprocessing
+    # Uses scorebook's built-in Portkey responses function for API calls
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:
+        return await responses(items, model=model_name, **hyperparams)
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the cloud-based evaluation
+    # Execute evaluation using Portkey API with the inference pipeline
+    # - Uses score_type="all" to get both aggregate and per-item results
+    # - Limits to 10 items for quick demonstration and cost control
+    print(f"Running Portkey evaluation with model: {model_name}")
+    print("Evaluating 10 items from local dataset...")
+
+    results = evaluate(inference_pipeline, dataset, item_limit=10, score_type="all")
+    print(results)
+
+    # Step 6: Save results to file
+    # Export evaluation results as JSON for later analysis
+    output_file = output_dir / "portkey_messages_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str]:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run Portkey evaluation and save results.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Model to use for inference via Portkey (e.g., @openai/gpt-4.1-mini)",
+    )
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    return output_dir, str(args.model)
+
+
+if __name__ == "__main__":
+    main()
tutorials/examples/6-providers/vertex/__init__.py
@@ -0,0 +1 @@
+"""Example usage of the Google Cloud Platform with the scorebook library."""