scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,153 @@
+ """Tutorials - Upload Results - Example 3 - Uploading Pre-Scored Results."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import login, upload_result
+
+
+ def main() -> Any:
+     """Upload pre-scored results directly to Trismik's dashboard.
+
+     This example demonstrates how to upload results where metrics are ALREADY computed.
+     This is different from score() or evaluate() which compute metrics for you.
+
+     Use upload_result() when you:
+     - Already have metric scores calculated
+     - Used a custom evaluation framework that computed metrics
+     - Want to import historical evaluation data with existing scores
+     - Have results from external tools (e.g., other eval frameworks)
+
+     The key difference from Examples 1 & 2:
+     - Example 1 (score): You have outputs/labels → Scorebook computes metrics
+     - Example 2 (evaluate): Scorebook runs inference AND computes metrics
+     - Example 3 (upload_result): You have EVERYTHING including metrics → Just upload
+
+     Prerequisites:
+     - Valid Trismik API key set in TRISMIK_API_KEY environment variable
+     - A Trismik project ID
+     """
+
+     # Step 1: Log in with your Trismik API key
+     # login() reads TRISMIK_API_KEY from environment variables or .env file
+     login()
+
+     # Step 2: Format your pre-scored results
+     # This is the structure that upload_result() expects:
+     # - aggregate_results: List with one dict containing overall metrics
+     # - item_results: List of dicts with per-item data and metric scores
+
+     # Example: You already ran an evaluation with your custom framework
+     # and computed accuracy, f1_score, etc.
+     my_pre_scored_results = {
+         "aggregate_results": [
+             {
+                 "dataset": "spanish_translation",
+                 "accuracy": 0.8,  # Your pre-computed aggregate accuracy
+                 "bleu_score": 0.75,  # Your pre-computed BLEU score
+                 # Add any hyperparameters used (optional)
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             }
+         ],
+         "item_results": [
+             {
+                 "id": 0,
+                 "dataset": "spanish_translation",
+                 "input": "Translate 'hello' to Spanish",
+                 "output": "hola",
+                 "label": "hola",
+                 "accuracy": 1.0,  # Item-level metric scores
+                 "bleu_score": 1.0,
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             },
+             {
+                 "id": 1,
+                 "dataset": "spanish_translation",
+                 "input": "Translate 'goodbye' to Spanish",
+                 "output": "adiós",
+                 "label": "adiós",
+                 "accuracy": 1.0,
+                 "bleu_score": 0.95,
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             },
+             {
+                 "id": 2,
+                 "dataset": "spanish_translation",
+                 "input": "Translate 'thank you' to Spanish",
+                 "output": "gracias",
+                 "label": "gracias",
+                 "accuracy": 1.0,
+                 "bleu_score": 1.0,
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             },
+             {
+                 "id": 3,
+                 "dataset": "spanish_translation",
+                 "input": "Translate 'please' to Spanish",
+                 "output": "por favor",
+                 "label": "por favor",
+                 "accuracy": 1.0,
+                 "bleu_score": 1.0,
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             },
+             {
+                 "id": 4,
+                 "dataset": "spanish_translation",
+                 "input": "Translate 'good morning' to Spanish",
+                 "output": "buenos dias",  # Missing accent - wrong answer
+                 "label": "buenos días",
+                 "accuracy": 0.0,
+                 "bleu_score": 0.85,
+                 "temperature": 0.7,
+                 "max_tokens": 100,
+             },
+         ],
+     }
+
+     # Step 3: Upload your pre-scored results directly
+     print("\nUploading pre-scored results to Trismik...")
+     print("Metrics are already computed - just uploading to dashboard.\n")
+
+     run_id = upload_result(
+         run_result=my_pre_scored_results,
+         experiment_id="Pre-Scored-Results-Example",
+         project_id="TRISMIK_PROJECT_ID",  # TODO: ADD YOUR TRISMIK PROJECT ID
+         dataset_name="spanish_translation",
+         hyperparameters={
+             "temperature": 0.7,
+             "max_tokens": 100,
+         },
+         metadata={
+             "description": "Results with pre-computed metrics from custom framework",
+             "source": "Custom evaluation tool",
+             "evaluation_date": "2025-01-15",
+         },
+         model_name="my-custom-translator-v2",
+     )
+
+     print(f"\nResults uploaded successfully with run_id: {run_id}")
+
+     # Add run_id to results for reference
+     my_pre_scored_results["run_id"] = run_id
+
+     pprint(my_pre_scored_results)
+     return my_pre_scored_results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="3-uploading_your_results", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "3-uploading_your_results_output.json")
@@ -0,0 +1 @@
+ """Example usage of the AWS API with the scorebook library."""
@@ -0,0 +1,219 @@
+ """
+ AWS Bedrock Batch Inference Example.
+
+ This example demonstrates how to leverage AWS Bedrock's Model Invocation Jobs for
+ cost-effective, large-scale model evaluation using Scorebook. It uses Claude models
+ for batch processing with automatic S3 upload/download and job management.
+
+ This example requires AWS CLI to be configured with appropriate credentials and
+ permissions for Bedrock and S3. Set up your AWS profile and ensure you have
+ the necessary IAM roles configured.
+
+ Prerequisites:
+ - AWS CLI configured with appropriate profile
+ - S3 bucket with proper permissions
+ - IAM role for Bedrock execution with S3 access
+ - Minimum 100 items for batch processing (AWS requirement)
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any, List, Optional
+
+ from dotenv import load_dotenv
+
+ from scorebook import EvalDataset, InferencePipeline, evaluate
+ from scorebook.inference.clients.bedrock import batch
+ from scorebook.metrics.accuracy import Accuracy
+
+
+ def main() -> None:
+     """Run the AWS Bedrock batch inference example."""
+     # Load environment variables from .env file for configuration
+     load_dotenv()
+
+     args = setup_arguments()
+
+     # Step 1: Load the evaluation dataset
+     dataset = EvalDataset.from_json(
+         "examples/example_datasets/dataset.json",
+         metrics=[Accuracy],
+         input="question",
+         label="answer",
+     )
+
+     # Ensure minimum batch size requirement (AWS Bedrock requires 100+ items)
+     if len(dataset) < 100:
+         # Cycle through items to reach 100 - default minimum size of AWS batch jobs
+         original_items = dataset.items
+         expanded_items: List[Any] = []
+         while len(expanded_items) < 100:
+             items_needed = 100 - len(expanded_items)
+             expanded_items.extend(original_items[: min(len(original_items), items_needed)])
+
+         # Create new dataset with expanded items
+         # Items already have "input" and "label" columns from the original dataset
+         dataset = EvalDataset.from_list(
+             name=dataset.name,
+             metrics=[Accuracy],
+             items=expanded_items,
+             input="input",
+             label="label",
+         )
+
+     # Step 2: Define the preprocessing function for AWS Bedrock Batch API
+     def preprocessor(input_value: str) -> list:
+         """Pre-process dataset inputs into AWS Bedrock Batch API format."""
+         # Create the batch API request messages format for Bedrock
+         messages = [
+             {
+                 "role": "system",
+                 "content": "Answer the question directly and concisely as a single word",
+             },
+             {"role": "user", "content": input_value},
+         ]
+
+         return messages
+
+     # Step 3: Define the postprocessing function
+     def postprocessor(response: str) -> str:
+         """Post-process AWS Bedrock batch response to extract the answer."""
+         # The batch function returns the message content directly
+         return response.strip()
+
+     # Step 4: Create the inference pipeline for batch processing
+     async def inference_function(items: list, **hyperparams: Any) -> Any:
+         return await batch(
+             items,
+             model=args.model,
+             aws_region=args.aws_region,
+             aws_profile=args.aws_profile,
+             bucket=args.bucket,
+             input_prefix=args.input_prefix,
+             output_prefix=args.output_prefix,
+             role_arn=args.role_arn,
+             **hyperparams,
+         )
+
+     inference_pipeline = InferencePipeline(
+         model=args.model or "claude-model",
+         preprocessor=preprocessor,
+         inference_function=inference_function,
+         postprocessor=postprocessor,
+     )
+
+     # Step 5: Run the batch evaluation
+     print("Running AWS Bedrock Batch API evaluation")
+     print(f"Model: {args.model or 'Not specified'}")
+     print(f"AWS Region: {args.aws_region or 'Not specified'}")
+     print(f"AWS Profile: {args.aws_profile or 'Not specified'}")
+     print(f"S3 Bucket: {args.bucket or 'Not specified'}")
+     print(f"Role ARN: {args.role_arn or 'Not specified'}")
+     print(f"Processing {len(dataset)} items using batch inference...")
+     print("Note: Batch processing may take several minutes to complete.")
+
+     results = evaluate(inference_pipeline, dataset, return_items=True, return_output=True)
+     print("\nBatch evaluation completed!")
+     print(results)
+
+     # Step 6: Save results to file
+     output_file = args.output_dir / "bedrock_batch_output.json"
+     with open(output_file, "w") as f:
+         json.dump(results, f, indent=4)
+     print(f"Results saved in {output_file}")
+
+
+ # ============================================================================
+ # Utility Functions
+ # ============================================================================
+
+
+ class Args:
+     """Simple container for parsed arguments."""
+
+     def __init__(self) -> None:
+         """Parse command line arguments."""
+         self.output_dir: Path = Path(".")  # Will be overridden in setup_arguments
+         self.model: Optional[str] = None
+         self.aws_region: Optional[str] = None
+         self.aws_profile: Optional[str] = None
+         self.bucket: Optional[str] = None
+         self.input_prefix: Optional[str] = None
+         self.output_prefix: Optional[str] = None
+         self.role_arn: Optional[str] = None
+
+
+ def setup_arguments() -> Args:
+     """Parse command line arguments."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Run AWS Bedrock Batch API evaluation and save results."
+     )
+
+     # Required argument
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         required=False,
+         default=str(Path.cwd() / "results"),
+         help="Directory to save evaluation outputs (JSON).",
+     )
+
+     # All optional AWS parameters
+     parser.add_argument(
+         "--model",
+         type=str,
+         help="Bedrock model ID (e.g., 'us.anthropic.claude-3-5-sonnet-20241022-v2:0')",
+     )
+     parser.add_argument(
+         "--aws-region",
+         type=str,
+         help="AWS region for Bedrock and S3 operations",
+     )
+     parser.add_argument(
+         "--aws-profile",
+         type=str,
+         help="AWS profile name for authentication",
+     )
+     parser.add_argument(
+         "--bucket",
+         type=str,
+         help="S3 bucket name for input/output data",
+     )
+     parser.add_argument(
+         "--input-prefix",
+         type=str,
+         help="S3 prefix for input data (e.g., 'batch/input/')",
+     )
+     parser.add_argument(
+         "--output-prefix",
+         type=str,
+         help="S3 prefix for output data (e.g., 'batch/output/')",
+     )
+     parser.add_argument(
+         "--role-arn",
+         type=str,
+         help="IAM role ARN for Bedrock execution",
+     )
+
+     parsed_args = parser.parse_args()
+
+     # Create Args object and populate
+     args = Args()
+     args.output_dir = Path(parsed_args.output_dir)
+     args.output_dir.mkdir(parents=True, exist_ok=True)
+
+     args.model = parsed_args.model
+     args.aws_region = parsed_args.aws_region
+     args.aws_profile = parsed_args.aws_profile
+     args.bucket = parsed_args.bucket
+     args.input_prefix = parsed_args.input_prefix
+     args.output_prefix = parsed_args.output_prefix
+     args.role_arn = parsed_args.role_arn
+
+     return args
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1 @@
+ """Example usage of the Portkey API with the scorebook library."""
@@ -0,0 +1,120 @@
+ """
+ Portkey Cloud Batch Inference Example.
+
+ This example demonstrates how to leverage Portkey's Batch API for cost-effective,
+ large-scale model evaluation using Scorebook. The backend provider of choice for this example is
+ OpenAI, but it's easy to adapt to any other provider.
+
+ This example requires a Portkey account linked to an OpenAI account and
+ a Portkey API key set in the PORTKEY_API_KEY environment variable.
+
+ Compare with messages_example.py to understand the differences
+ between real-time and batch processing approaches.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from scorebook import EvalDataset, InferencePipeline, evaluate
+ from scorebook.inference.clients.portkey import batch
+ from scorebook.metrics.accuracy import Accuracy
+
+
+ def main() -> None:
+     """Run the Portkey batch inference example."""
+     # Load environment variables from .env file for API keys
+     load_dotenv()
+
+     output_dir, model_name = setup_arguments()
+
+     # Step 1: Load the evaluation dataset
+     dataset = EvalDataset.from_json(
+         "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+     )
+
+     # Step 2: Define the preprocessing function for batch API
+     def preprocessor(eval_item: dict) -> list:
+         """Pre-process dataset items into Portkey Batch API format."""
+         prompt = eval_item["question"]
+
+         # Create the batch API request messages format
+         messages = [
+             {
+                 "role": "system",
+                 "content": "Answer the question directly and concisely as a single word",
+             },
+             {"role": "user", "content": prompt},
+         ]
+
+         return messages
+
+     # Step 3: Define the postprocessing function
+     def postprocessor(response: str) -> str:
+         """Post-process Portkey batch response to extract the answer."""
+         # The batch function returns the message content directly
+         return response.strip()
+
+     # Step 4: Create the inference pipeline for batch processing
+
+     async def inference_function(items: list, **hyperparams: Any) -> Any:  # noqa
+         return await batch(items, model=model_name, **hyperparams)
+
+     inference_pipeline = InferencePipeline(
+         model=model_name,
+         preprocessor=preprocessor,
+         inference_function=inference_function,
+         postprocessor=postprocessor,
+     )
+
+     # Step 5: Run the batch evaluation
+     print(f"Running Portkey Batch API evaluation with model: {model_name}")
+     print(f"Processing {len(dataset)} items using batch inference...")
+     print("Note: Batch processing may take several minutes to complete.")
+
+     # For demonstration, limit to 25 items
+     results = evaluate(inference_pipeline, dataset, item_limit=25, score_type="all")
+     print("\nBatch evaluation completed!")
+     print(results)
+
+     # Step 6: Save results to file
+     output_file = output_dir / "portkey_batch_output.json"
+     with open(output_file, "w") as f:
+         json.dump(results, f, indent=4)
+     print(f"Results saved in {output_file}")
+
+
+ # ============================================================================
+ # Utility Functions
+ # ============================================================================
+
+
+ def setup_arguments() -> tuple[Path, str]:
+     """Parse command line arguments."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Run Portkey Batch API evaluation and save results."
+     )
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "results"),
+         help="Directory to save evaluation outputs (JSON).",
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help="Model to use for batch inference via Portkey (e.g., @openai/gpt-4.1-mini)",
+     )
+     args = parser.parse_args()
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     return output_dir, str(args.model)
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,121 @@
+ """
+ Portkey Cloud Model Inference Example.
+
+ This example demonstrates how to evaluate language models using Portkey's inference
+ services with Scorebook for real-time API calls.
+
+ Prerequisites: PORTKEY_API_KEY environment variable and an active Portkey account.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from scorebook import EvalDataset, InferencePipeline, evaluate
+ from scorebook.inference.clients.portkey import responses
+ from scorebook.metrics.accuracy import Accuracy
+
+
+ def main() -> None:
+     """Run the Portkey inference example."""
+     # Load environment variables from .env file for API keys
+     load_dotenv()
+
+     output_dir, model_name = setup_arguments()
+
+     # Step 1: Load the evaluation dataset
+     # Create an EvalDataset from local JSON file
+     # - Uses 'answer' field as ground truth labels
+     # - Configures Accuracy metric for evaluation
+     # - Loads from examples/example_datasets/dataset.json
+     dataset = EvalDataset.from_json(
+         "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+     )
+
+     # Step 2: Define the preprocessing function
+     # Convert raw dataset items into Portkey API-compatible format
+     # This function formats the question for the cloud model
+     def preprocessor(eval_item: dict) -> list:
+         """Pre-process dataset items into Portkey messages format."""
+         prompt = eval_item["question"]
+
+         # Create a system message with instructions for direct answers
+         system_prompt = """
+         Answer the question directly and concisely.
+         Do not provide lengthy explanations unless specifically asked.
+         """.strip()
+
+         # Format as messages for Portkey API
+         return [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
+
+     # Step 3: Define the postprocessing function
+     # Extract the final answer from Portkey API response
+     # Handles response parsing and returns the response text
+     def postprocessor(response: Any) -> str:
+         """Post-process Portkey response to extract the answer."""
+         return str(response.choices[0].message.content.strip())
+
+     # Step 4: Create the inference pipeline for cloud-based evaluation
+     # Combine preprocessing, Portkey API inference, and postprocessing
+     # Uses scorebook's built-in Portkey responses function for API calls
+
+     async def inference_function(items: list, **hyperparams: Any) -> Any:
+         return await responses(items, model=model_name, **hyperparams)
+
+     inference_pipeline = InferencePipeline(
+         model=model_name,
+         preprocessor=preprocessor,
+         inference_function=inference_function,
+         postprocessor=postprocessor,
+     )
+
+     # Step 5: Run the cloud-based evaluation
+     # Execute evaluation using Portkey API with the inference pipeline
+     # - Uses score_type="all" to get both aggregate and per-item results
+     # - Limits to 10 items for quick demonstration and cost control
+     print(f"Running Portkey evaluation with model: {model_name}")
+     print("Evaluating 10 items from local dataset...")
+
+     results = evaluate(inference_pipeline, dataset, item_limit=10, score_type="all")
+     print(results)
+
+     # Step 6: Save results to file
+     # Export evaluation results as JSON for later analysis
+     output_file = output_dir / "portkey_messages_output.json"
+     with open(output_file, "w") as f:
+         json.dump(results, f, indent=4)
+     print(f"Results saved in {output_file}")
+
+
+ # ============================================================================
+ # Utility Functions
+ # ============================================================================
+
+
+ def setup_arguments() -> tuple[Path, str]:
+     """Parse command line arguments."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Run Portkey evaluation and save results.")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "results"),
+         help="Directory to save evaluation outputs (JSON).",
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help="Model to use for inference via Portkey (e.g., @openai/gpt-4.1-mini)",
+     )
+     args = parser.parse_args()
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     return output_dir, str(args.model)
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1 @@
+ """Example usage of the Google Cloud Platform with the scorebook library."""