scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
  23. scorebook-0.0.16.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,84 @@
+ """Tutorials - Score - Example 5 - Scoring Models with Exact Match."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import score
+ from scorebook.metrics.exactmatch import ExactMatch
+
+
+ def main() -> Any:
+     """Score text predictions using Exact Match metric.
+
+     This example demonstrates how to compare model outputs against
+     reference labels using exact string matching with configurable
+     preprocessing options.
+     """
+
+     # Prepare a list of items with model outputs and expected labels
+     # Note: outputs may have different casing or extra whitespace
+     model_predictions = [
+         {"output": "Paris", "label": "Paris"},  # Exact match
+         {"output": "LONDON", "label": "London"},  # Different case
+         {"output": " Berlin ", "label": "Berlin"},  # Extra whitespace
+         {"output": " NEW YORK ", "label": "new york"},  # Both case and whitespace
+         {"output": "Tokyo", "label": "Kyoto"},  # No match
+     ]
+
+     print(f"Scoring {len(model_predictions)} predictions\n")
+
+     # Score with default settings (case_insensitive=True, strip=True)
+     print("Default settings (case_insensitive=True, strip=True):")
+     results_default = score(
+         items=model_predictions,
+         metrics=ExactMatch(),
+         upload_results=False,
+     )
+     pprint(results_default["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_default['item_results']]}")
+
+     # Score with case-sensitive matching
+     print("\nCase-sensitive matching (case_insensitive=False, strip=True):")
+     results_case_sensitive = score(
+         items=model_predictions,
+         metrics=ExactMatch(case_insensitive=False),
+         upload_results=False,
+     )
+     pprint(results_case_sensitive["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_case_sensitive['item_results']]}")
+
+     # Score without stripping whitespace
+     print("\nWithout stripping (case_insensitive=True, strip=False):")
+     results_no_strip = score(
+         items=model_predictions,
+         metrics=ExactMatch(strip=False),
+         upload_results=False,
+     )
+     pprint(results_no_strip["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_no_strip['item_results']]}")
+
+     # Score with strict matching (no preprocessing)
+     print("\nStrict matching (case_insensitive=False, strip=False):")
+     results_strict = score(
+         items=model_predictions,
+         metrics=ExactMatch(case_insensitive=False, strip=False),
+         upload_results=False,
+     )
+     pprint(results_strict["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_strict['item_results']]}")
+
+     return results_default
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="5-scoring_model_exact_match", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "5-scoring_model_exact_match_output.json")
@@ -0,0 +1,57 @@
+ """Tutorials - Score - Example 6 - Scoring with BertScore."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from scorebook.metrics.bertscore import BertScore
+
+ from tutorials.utils import save_results_to_json, setup_logging
+ from scorebook import score
+
+
+ def main() -> Any:
+     """Score pre-computed model predictions using Scorebook.
+
+     This example demonstrates how to score generated model predictions.
+     """
+
+     # Prepare a list of items with generated summaries and reference summaries
+     model_predictions = [
+         {
+             "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+             "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+         },
+         {
+             "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+             "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+         },
+         {
+             "output": "The technology company released its quarterly earnings report showing strong growth.",
+             "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+         },
+     ]
+
+     # Score the predictions against labels using the BertScore metric
+     results = score(
+         items=model_predictions,
+         metrics=BertScore,
+         upload_results=False,  # Disable uploading for this example
+     )
+
+     print("\nResults:")
+     pprint(results)
+
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="6-scoring_model_bertscore", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "6-scoring_model_bertscore_output.json")
+
File without changes
@@ -0,0 +1,106 @@
+ """Tutorials - Evaluate - Example 1 - Evaluating Local Models."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ import transformers
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate
+
+
+ def main() -> Any:
+     """Run a simple Scorebook evaluation on a local model.
+
+     This example demonstrates the fundamental workflow for evaluating a model using Scorebook.
+
+     It shows how to:
+     1. Create an evaluation dataset from a list of evaluation items
+     2. Define an inference function using Hugging Face's transformers library
+     3. Run the evaluation and collect results
+
+     This serves as a starting point for understanding Scorebook's core evaluation capabilities.
+     """
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",  # Dataset name
+         metrics="accuracy",  # Metric/Metrics used to calculate scores
+         items=evaluation_items,  # List of evaluation items
+         input="question",  # Key for the input field in evaluation items
+         label="answer",  # Key for the label field in evaluation items
+     )
+
+     # Create a model
+     pipeline = transformers.pipeline(
+         "text-generation",
+         model="microsoft/Phi-4-mini-instruct",
+         model_kwargs={"torch_dtype": "auto"},
+         device_map="auto",
+     )
+
+     # Define an inference function
+     def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Return a list of model outputs for a list of inputs.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters.
+
+         Returns:
+             The model outputs for a list of inputs.
+         """
+         inference_outputs = []
+         for model_input in inputs:
+
+             # Wrap inputs in the model's message format
+             messages = [
+                 {
+                     "role": "system",
+                     "content": hyperparameters.get("system_message"),
+                 },
+                 {"role": "user", "content": model_input},
+             ]
+
+             # Run inference on the item
+             output = pipeline(messages, temperature=hyperparameters.get("temperature"))
+
+             # Extract and collect the output generated from the model's response
+             inference_outputs.append(output[0]["generated_text"][-1]["content"])
+
+         return inference_outputs
+
+     # Evaluate a model against an evaluation dataset
+     results = evaluate(
+         inference,
+         evaluation_dataset,
+         hyperparameters={
+             "temperature": 0.7,
+             "system_message": "Answer the question directly and concisely.",
+         },
+         return_items=True,
+         upload_results=False,  # Disable uploading for this example
+     )
+
+     print("\nEvaluation Results:")
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="1-evaluating_local_models", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "1-evaluating_local_models_output.json")
@@ -0,0 +1,108 @@
+ """Tutorials - Evaluate - Example 2 - Evaluating Local Models with Batching."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ import transformers
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate
+
+
+ def main() -> Any:
+     """Run a Scorebook evaluation using local batch inference.
+
+     This example demonstrates how to perform batch inference locally.
+
+     This approach offers several benefits:
+     1. Improved throughput by processing multiple items in parallel
+     2. Better GPU utilization through batched tensor operations
+     3. More efficient memory usage compared to sequential processing
+     """
+
+     # Initialize the pipeline with appropriate settings for batch processing
+     model_name = "google/flan-t5-small"
+
+     # Task is text2text-generation for seq2seq models
+     pipeline = transformers.pipeline(
+         "text2text-generation",
+         model=model_name,
+         torch_dtype="auto",
+         device_map="auto",  # will pick up gpu if available
+     )
+
+     # Define a batch inference function
+     def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Process multiple inputs through the model in batches.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters including batch_size and max_new_tokens.
+
+         Returns:
+             List of model outputs for all inputs.
+         """
+         # Preprocess: Convert inputs to strings
+         preprocessed_inputs = [str(input_val) for input_val in inputs]
+
+         # Run batch inference
+         raw_results = pipeline(
+             preprocessed_inputs,
+             batch_size=hyperparameters["batch_size"],
+             max_new_tokens=hyperparameters["max_new_tokens"],
+             pad_token_id=pipeline.tokenizer.eos_token_id,
+         )
+
+         # Postprocess: Extract and clean the generated text
+         final_outputs = [str(result["generated_text"]).strip() for result in raw_results]
+
+         return final_outputs
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",  # Dataset name
+         metrics="accuracy",  # Metric/Metrics used to calculate scores
+         items=evaluation_items,  # List of evaluation items
+         input="question",  # Key for the input field in evaluation items
+         label="answer",  # Key for the label field in evaluation items
+     )
+
+
+     # Define hyperparameters
+     hyperparameters = {
+         "max_new_tokens": 128,
+         "batch_size": 2,
+     }
+
+     # Run the evaluation with batch inference
+     results = evaluate(
+         inference,
+         evaluation_dataset,
+         hyperparameters=hyperparameters,
+         return_aggregates=True,  # Include aggregate results for each configuration
+         return_items=True,  # Include results for individual items
+         return_output=True,  # Include model outputs for debugging
+         upload_results=False,  # Disable uploading for this example
+     )
+
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="2-evaluating_local_models_with_batching", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "2-evaluating_local_models_with_batching_output.json")
@@ -0,0 +1,109 @@
+ """Tutorials - Evaluate - Example 3 - Evaluating Cloud Models."""
+
+ import asyncio
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ from dotenv import load_dotenv
+ from openai import AsyncOpenAI
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate_async
+
+
+ async def main() -> Any:
+     """Run an evaluation using a cloud-hosted model.
+
+     This example demonstrates how to evaluate cloud-hosted models using OpenAI's API directly.
+
+     Prerequisites:
+         - OpenAI API key set in environment variable OPENAI_API_KEY
+     """
+
+     # Initialize OpenAI client
+     client = AsyncOpenAI()
+     model_name = "gpt-4o-mini"
+
+     # Define an async inference function
+     async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Process inputs through OpenAI's API.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters including system_message and temperature.
+
+         Returns:
+             List of model outputs for all inputs.
+         """
+         outputs = []
+         for input_val in inputs:
+             # Build messages for OpenAI API
+             messages = [
+                 {
+                     "role": "system",
+                     "content": hyperparameters.get(
+                         "system_message", "You are a helpful assistant."
+                     ),
+                 },
+                 {"role": "user", "content": str(input_val)},
+             ]
+
+             # Call OpenAI API
+             try:
+                 response = await client.chat.completions.create(
+                     model=model_name,
+                     messages=messages,
+                     temperature=hyperparameters.get("temperature", 0.7),
+                 )
+                 output = response.choices[0].message.content.strip()
+             except Exception as e:
+                 output = f"Error: {str(e)}"
+
+             outputs.append(output)
+
+         return outputs
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",  # Dataset name
+         metrics="accuracy",  # Metric/Metrics used to calculate scores
+         items=evaluation_items,  # List of evaluation items
+         input="question",  # Key for the input field in evaluation items
+         label="answer",  # Key for the label field in evaluation items
+     )
+
+     # Run evaluation
+     results = await evaluate_async(
+         inference,
+         evaluation_dataset,
+         hyperparameters={
+             "system_message": (
+                 "Answer the question directly. Provide only the answer, without context."
+             ),
+             "temperature": 0.7,
+         },
+         return_items=True,
+         return_output=True,
+         upload_results=False,
+     )
+
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="3-evaluating_cloud_models", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = asyncio.run(main())
+     save_results_to_json(results_dict, output_dir, "3-evaluating_cloud_models_output.json")
@@ -0,0 +1,170 @@
+ """Tutorials - Evaluate - Example 4 - Evaluating Cloud Models with Batching."""
+
+ import asyncio
+ import json
+ import tempfile
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ from dotenv import load_dotenv
+ from openai import AsyncOpenAI
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate_async
+
+
+ async def main() -> Any:
+     """Run evaluation using OpenAI's Batch API.
+
+     This example demonstrates how to use OpenAI's Batch API for cost-effective,
+     large-scale model evaluation. The Batch API offers 50% cost savings compared
+     to standard API calls, with results typically delivered within 24 hours.
+
+     Prerequisites:
+         - OpenAI API key set in environment variable OPENAI_API_KEY
+     """
+
+     # Initialize OpenAI client
+     client = AsyncOpenAI()
+     model_name = "gpt-4o-mini"
+
+     # Define an async batch inference function
+     async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Process inputs through OpenAI's Batch API.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters including system_message and temperature.
+
+         Returns:
+             List of model outputs for all inputs.
+         """
+         # Step 1: Create batch requests in JSONL format
+         batch_requests = []
+         for idx, input_val in enumerate(inputs):
+             request = {
+                 "custom_id": f"request-{idx}",
+                 "method": "POST",
+                 "url": "/v1/chat/completions",
+                 "body": {
+                     "model": model_name,
+                     "messages": [
+                         {
+                             "role": "system",
+                             "content": hyperparameters.get(
+                                 "system_message", "You are a helpful assistant."
+                             ),
+                         },
+                         {"role": "user", "content": str(input_val)},
+                     ],
+                     "temperature": hyperparameters.get("temperature", 0.7),
+                 },
+             }
+             batch_requests.append(request)
+
+         # Step 2: Write requests to a temporary JSONL file
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+             for request in batch_requests:
+                 f.write(json.dumps(request) + "\n")
+             temp_file_path = f.name
+
+         try:
+             # Step 3: Upload the batch file
+             print(f"Uploading batch file with {len(inputs)} requests...")
+             with open(temp_file_path, "rb") as f:
+                 batch_file = await client.files.create(file=f, purpose="batch")
+
+             # Step 4: Create the batch job
+             print("Creating batch job...")
+             batch_job = await client.batches.create(
+                 input_file_id=batch_file.id,
+                 endpoint="/v1/chat/completions",
+                 completion_window="24h",
+             )
+
+             # Step 5: Wait for batch completion (with polling)
+             print(f"Waiting for batch to complete (ID: {batch_job.id})...")
+             while batch_job.status not in ["completed", "failed", "cancelled"]:
+                 await asyncio.sleep(10)  # Poll every 10 seconds
+                 batch_job = await client.batches.retrieve(batch_job.id)
+                 print(f"Status: {batch_job.status}")
+
+             if batch_job.status != "completed":
+                 raise Exception(f"Batch job failed with status: {batch_job.status}")
+
+             # Step 6: Download and parse results
+             print("Batch completed! Downloading results...")
+             result_file_id = batch_job.output_file_id
+             result_content = await client.files.content(result_file_id)
+             result_text = result_content.text
+
+             # Step 7: Parse results and extract outputs
+             results_by_id = {}
+             for line in result_text.strip().split("\n"):
+                 result = json.loads(line)
+                 custom_id = result["custom_id"]
+                 try:
+                     output = result["response"]["body"]["choices"][0]["message"]["content"]
+                     results_by_id[custom_id] = output.strip()
+                 except (KeyError, IndexError):
+                     results_by_id[custom_id] = "Error: Failed to extract response"
+
+             # Step 8: Return outputs in original order
+             outputs = []
+             for idx in range(len(inputs)):
+                 custom_id = f"request-{idx}"
+                 outputs.append(results_by_id.get(custom_id, "Error: Missing response"))
+
+             return outputs
+
+         finally:
+             # Clean up the temporary file
+             Path(temp_file_path).unlink(missing_ok=True)
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",
+         metrics="accuracy",
+         items=evaluation_items,
+         input="question",
+         label="answer",
+     )
+
+     print(f"\nRunning OpenAI Batch API evaluation with model: {model_name}")
+     print("Note: Batch processing may take several minutes to complete.\n")
+
+     # Run evaluation
+     results = await evaluate_async(
+         inference,
+         evaluation_dataset,
+         hyperparameters={
+             "temperature": 0.7,
+             "system_message": "Answer the question directly and concisely",
+         },
+         return_aggregates=True,
+         return_items=True,
+         return_output=True,
+         upload_results=False,
+     )
+
+     print("\nBatch evaluation completed:\n")
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="4-evaluating_cloud_models_with_batching", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = asyncio.run(main())
+     save_results_to_json(results_dict, output_dir, "4-evaluating_cloud_models_with_batching_output.json")