scorebook 0.0.14-py3-none-any.whl → 0.0.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
  23. scorebook-0.0.16.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,84 @@
+ """Tutorials - Score - Example 5 - Scoring Models with Exact Match."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import score
+ from scorebook.metrics.exactmatch import ExactMatch
+
+
+ def main() -> Any:
+     """Score text predictions using Exact Match metric.
+
+     This example demonstrates how to compare model outputs against
+     reference labels using exact string matching with configurable
+     preprocessing options.
+     """
+
+     # Prepare a list of items with model outputs and expected labels
+     # Note: outputs may have different casing or extra whitespace
+     model_predictions = [
+         {"output": "Paris", "label": "Paris"},  # Exact match
+         {"output": "LONDON", "label": "London"},  # Different case
+         {"output": " Berlin ", "label": "Berlin"},  # Extra whitespace
+         {"output": " NEW YORK ", "label": "new york"},  # Both case and whitespace
+         {"output": "Tokyo", "label": "Kyoto"},  # No match
+     ]
+
+     print(f"Scoring {len(model_predictions)} predictions\n")
+
+     # Score with default settings (case_insensitive=True, strip=True)
+     print("Default settings (case_insensitive=True, strip=True):")
+     results_default = score(
+         items=model_predictions,
+         metrics=ExactMatch(),
+         upload_results=False,
+     )
+     pprint(results_default["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_default['item_results']]}")
+
+     # Score with case-sensitive matching
+     print("\nCase-sensitive matching (case_insensitive=False, strip=True):")
+     results_case_sensitive = score(
+         items=model_predictions,
+         metrics=ExactMatch(case_insensitive=False),
+         upload_results=False,
+     )
+     pprint(results_case_sensitive["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_case_sensitive['item_results']]}")
+
+     # Score without stripping whitespace
+     print("\nWithout stripping (case_insensitive=True, strip=False):")
+     results_no_strip = score(
+         items=model_predictions,
+         metrics=ExactMatch(strip=False),
+         upload_results=False,
+     )
+     pprint(results_no_strip["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_no_strip['item_results']]}")
+
+     # Score with strict matching (no preprocessing)
+     print("\nStrict matching (case_insensitive=False, strip=False):")
+     results_strict = score(
+         items=model_predictions,
+         metrics=ExactMatch(case_insensitive=False, strip=False),
+         upload_results=False,
+     )
+     pprint(results_strict["aggregate_results"])
+     print(f"Item matches: {[item['exact_match'] for item in results_strict['item_results']]}")
+
+     return results_default
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="5-scoring_model_exact_match", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "5-scoring_model_exact_match_output.json")
@@ -0,0 +1,57 @@
+ """Tutorials - Score - Example 6 - Scoring with BertScore."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any
+
+ from dotenv import load_dotenv
+
+ from scorebook.metrics.bertscore import BertScore
+
+ from tutorials.utils import save_results_to_json, setup_logging
+ from scorebook import score
+
+
+ def main() -> Any:
+     """Score pre-computed model predictions using Scorebook.
+
+     This example demonstrates how to score generated model predictions.
+     """
+
+     # Prepare a list of items with generated summaries and reference summaries
+     model_predictions = [
+         {
+             "output": "A woman donated her kidney to a stranger. This sparked a chain of six kidney transplants.",
+             "label": "Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.",
+         },
+         {
+             "output": "Scientists discovered a new species of frog in the Amazon rainforest. The frog has unique markings that distinguish it from other species.",
+             "label": "A new frog species with distinctive blue and yellow stripes was found in the Amazon. Researchers say this discovery highlights the biodiversity of the region.",
+         },
+         {
+             "output": "The technology company released its quarterly earnings report showing strong growth.",
+             "label": "Tech giant announces record quarterly revenue driven by cloud services and AI products.",
+         },
+     ]
+
+     # Score the predictions against labels using the BertScore metric
+     results = score(
+         items=model_predictions,
+         metrics=BertScore,
+         upload_results=False,  # Disable uploading for this example
+     )
+
+     print("\nResults:")
+     pprint(results)
+
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="6-scoring_model_bertscore", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "6-scoring_model_bertscore_output.json")
+
File without changes
@@ -0,0 +1,106 @@
+ """Tutorials - Evaluate - Example 1 - Evaluating Local Models."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ import transformers
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate
+
+
+ def main() -> Any:
+     """Run a simple Scorebook evaluation on a local model.
+
+     This example demonstrates the fundamental workflow for evaluating a model using Scorebook.
+
+     It shows how to:
+     1. Create an evaluation dataset from a list of evaluation items
+     2. Define an inference function using Hugging Face's transformers library
+     3. Run the evaluation and collect results
+
+     This serves as a starting point for understanding Scorebook's core evaluation capabilities.
+     """
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",  # Dataset name
+         metrics="accuracy",  # Metric/Metrics used to calculate scores
+         items=evaluation_items,  # List of evaluation items
+         input="question",  # Key for the input field in evaluation items
+         label="answer",  # Key for the label field in evaluation items
+     )
+
+     # Create a model
+     pipeline = transformers.pipeline(
+         "text-generation",
+         model="microsoft/Phi-4-mini-instruct",
+         model_kwargs={"torch_dtype": "auto"},
+         device_map="auto",
+     )
+
+     # Define an inference function
+     def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Return a list of model outputs for a list of inputs.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters.
+
+         Returns:
+             The model outputs for a list of inputs.
+         """
+         inference_outputs = []
+         for model_input in inputs:
+
+             # Wrap inputs in the model's message format
+             messages = [
+                 {
+                     "role": "system",
+                     "content": hyperparameters.get("system_message"),
+                 },
+                 {"role": "user", "content": model_input},
+             ]
+
+             # Run inference on the item
+             output = pipeline(messages, temperature=hyperparameters.get("temperature"))
+
+             # Extract and collect the output generated from the model's response
+             inference_outputs.append(output[0]["generated_text"][-1]["content"])
+
+         return inference_outputs
+
+     # Evaluate a model against an evaluation dataset
+     results = evaluate(
+         inference,
+         evaluation_dataset,
+         hyperparameters={
+             "temperature": 0.7,
+             "system_message": "Answer the question directly and concisely.",
+         },
+         return_items=True,
+         upload_results=False,  # Disable uploading for this example
+     )
+
+     print("\nEvaluation Results:")
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="1-evaluating_local_models", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "1-evaluating_local_models_output.json")
@@ -0,0 +1,108 @@
+ """Tutorials - Evaluate - Example 2 - Evaluating Local Models with Batching."""
+
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ import transformers
+ from dotenv import load_dotenv
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate
+
+
+ def main() -> Any:
+     """Run a Scorebook evaluation using local batch inference.
+
+     This example demonstrates how to perform batch inference locally.
+
+     This approach offers several benefits:
+     1. Improved throughput by processing multiple items in parallel
+     2. Better GPU utilization through batched tensor operations
+     3. More efficient memory usage compared to sequential processing
+     """
+
+     # Initialize the pipeline with appropriate settings for batch processing
+     model_name = "google/flan-t5-small"
+
+     # Task is text2text-generation for seq2seq models
+     pipeline = transformers.pipeline(
+         "text2text-generation",
+         model=model_name,
+         torch_dtype="auto",
+         device_map="auto",  # will pick up gpu if available
+     )
+
+     # Define a batch inference function
+     def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Process multiple inputs through the model in batches.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters including batch_size and max_new_tokens.
+
+         Returns:
+             List of model outputs for all inputs.
+         """
+         # Preprocess: Convert inputs to strings
+         preprocessed_inputs = [str(input_val) for input_val in inputs]
+
+         # Run batch inference
+         raw_results = pipeline(
+             preprocessed_inputs,
+             batch_size=hyperparameters["batch_size"],
+             max_new_tokens=hyperparameters["max_new_tokens"],
+             pad_token_id=pipeline.tokenizer.eos_token_id,
+         )
+
+         # Postprocess: Extract and clean the generated text
+         final_outputs = [str(result["generated_text"]).strip() for result in raw_results]
+
+         return final_outputs
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",  # Dataset name
+         metrics="accuracy",  # Metric/Metrics used to calculate scores
+         items=evaluation_items,  # List of evaluation items
+         input="question",  # Key for the input field in evaluation items
+         label="answer",  # Key for the label field in evaluation items
+     )
+
+
+     # Define hyperparameters
+     hyperparameters = {
+         "max_new_tokens": 128,
+         "batch_size": 2,
+     }
+
+     # Run the evaluation with batch inference
+     results = evaluate(
+         inference,
+         evaluation_dataset,
+         hyperparameters=hyperparameters,
+         return_aggregates=True,  # Include aggregate results for each configuration
+         return_items=True,  # Include results for individual items
+         return_output=True,  # Include model outputs for debugging
+         upload_results=False,  # Disable uploading for this example
+     )
+
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="2-evaluating_local_models_with_batching", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = main()
+     save_results_to_json(results_dict, output_dir, "2-evaluating_local_models_with_batching_output.json")
@@ -0,0 +1,109 @@
+ """Tutorials - Evaluate - Example 3 - Evaluating Cloud Models."""
+
+ import asyncio
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ from dotenv import load_dotenv
+ from openai import AsyncOpenAI
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate_async
+
+
+ async def main() -> Any:
+     """Run an evaluation using a cloud-hosted model.
+
+     This example demonstrates how to evaluate cloud-hosted models using OpenAI's API directly.
+
+     Prerequisites:
+         - OpenAI API key set in environment variable OPENAI_API_KEY
+     """
+
+     # Initialize OpenAI client
+     client = AsyncOpenAI()
+     model_name = "gpt-4o-mini"
+
+     # Define an async inference function
+     async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Process inputs through OpenAI's API.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters including system_message and temperature.
+
+         Returns:
+             List of model outputs for all inputs.
+         """
+         outputs = []
+         for input_val in inputs:
+             # Build messages for OpenAI API
+             messages = [
+                 {
+                     "role": "system",
+                     "content": hyperparameters.get(
+                         "system_message", "You are a helpful assistant."
+                     ),
+                 },
+                 {"role": "user", "content": str(input_val)},
+             ]
+
+             # Call OpenAI API
+             try:
+                 response = await client.chat.completions.create(
+                     model=model_name,
+                     messages=messages,
+                     temperature=hyperparameters.get("temperature", 0.7),
+                 )
+                 output = response.choices[0].message.content.strip()
+             except Exception as e:
+                 output = f"Error: {str(e)}"
+
+             outputs.append(output)
+
+         return outputs
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",  # Dataset name
+         metrics="accuracy",  # Metric/Metrics used to calculate scores
+         items=evaluation_items,  # List of evaluation items
+         input="question",  # Key for the input field in evaluation items
+         label="answer",  # Key for the label field in evaluation items
+     )
+
+     # Run evaluation
+     results = await evaluate_async(
+         inference,
+         evaluation_dataset,
+         hyperparameters={
+             "system_message": (
+                 "Answer the question directly. Provide only the answer, without context."
+             ),
+             "temperature": 0.7,
+         },
+         return_items=True,
+         return_output=True,
+         upload_results=False,
+     )
+
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="3-evaluating_cloud_models", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = asyncio.run(main())
+     save_results_to_json(results_dict, output_dir, "3-evaluating_cloud_models_output.json")
@@ -0,0 +1,170 @@
+ """Tutorials - Evaluate - Example 4 - Evaluating Cloud Models with Batching."""
+
+ import asyncio
+ import json
+ import tempfile
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, List
+
+ from dotenv import load_dotenv
+ from openai import AsyncOpenAI
+
+ from tutorials.utils import save_results_to_json, setup_logging
+
+ from scorebook import EvalDataset, evaluate_async
+
+
+ async def main() -> Any:
+     """Run evaluation using OpenAI's Batch API.
+
+     This example demonstrates how to use OpenAI's Batch API for cost-effective,
+     large-scale model evaluation. The Batch API offers 50% cost savings compared
+     to standard API calls, with results typically delivered within 24 hours.
+
+     Prerequisites:
+         - OpenAI API key set in environment variable OPENAI_API_KEY
+     """
+
+     # Initialize OpenAI client
+     client = AsyncOpenAI()
+     model_name = "gpt-4o-mini"
+
+     # Define an async batch inference function
+     async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:
+         """Process inputs through OpenAI's Batch API.
+
+         Args:
+             inputs: Input values from an EvalDataset.
+             hyperparameters: Model hyperparameters including system_message and temperature.
+
+         Returns:
+             List of model outputs for all inputs.
+         """
+         # Step 1: Create batch requests in JSONL format
+         batch_requests = []
+         for idx, input_val in enumerate(inputs):
+             request = {
+                 "custom_id": f"request-{idx}",
+                 "method": "POST",
+                 "url": "/v1/chat/completions",
+                 "body": {
+                     "model": model_name,
+                     "messages": [
+                         {
+                             "role": "system",
+                             "content": hyperparameters.get(
+                                 "system_message", "You are a helpful assistant."
+                             ),
+                         },
+                         {"role": "user", "content": str(input_val)},
+                     ],
+                     "temperature": hyperparameters.get("temperature", 0.7),
+                 },
+             }
+             batch_requests.append(request)
+
+         # Step 2: Write requests to a temporary JSONL file
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+             for request in batch_requests:
+                 f.write(json.dumps(request) + "\n")
+             temp_file_path = f.name
+
+         try:
+             # Step 3: Upload the batch file
+             print(f"Uploading batch file with {len(inputs)} requests...")
+             with open(temp_file_path, "rb") as f:
+                 batch_file = await client.files.create(file=f, purpose="batch")
+
+             # Step 4: Create the batch job
+             print("Creating batch job...")
+             batch_job = await client.batches.create(
+                 input_file_id=batch_file.id,
+                 endpoint="/v1/chat/completions",
+                 completion_window="24h",
+             )
+
+             # Step 5: Wait for batch completion (with polling)
+             print(f"Waiting for batch to complete (ID: {batch_job.id})...")
+             while batch_job.status not in ["completed", "failed", "cancelled"]:
+                 await asyncio.sleep(10)  # Poll every 10 seconds
+                 batch_job = await client.batches.retrieve(batch_job.id)
+                 print(f"Status: {batch_job.status}")
+
+             if batch_job.status != "completed":
+                 raise Exception(f"Batch job failed with status: {batch_job.status}")
+
+             # Step 6: Download and parse results
+             print("Batch completed! Downloading results...")
+             result_file_id = batch_job.output_file_id
+             result_content = await client.files.content(result_file_id)
+             result_text = result_content.text
+
+             # Step 7: Parse results and extract outputs
+             results_by_id = {}
+             for line in result_text.strip().split("\n"):
+                 result = json.loads(line)
+                 custom_id = result["custom_id"]
+                 try:
+                     output = result["response"]["body"]["choices"][0]["message"]["content"]
+                     results_by_id[custom_id] = output.strip()
+                 except (KeyError, IndexError):
+                     results_by_id[custom_id] = "Error: Failed to extract response"
+
+             # Step 8: Return outputs in original order
+             outputs = []
+             for idx in range(len(inputs)):
+                 custom_id = f"request-{idx}"
+                 outputs.append(results_by_id.get(custom_id, "Error: Missing response"))
+
+             return outputs
+
+         finally:
+             # Clean up the temporary file
+             Path(temp_file_path).unlink(missing_ok=True)
+
+     # Create a list of evaluation items
+     evaluation_items = [
+         {"question": "What is 2 + 2?", "answer": "4"},
+         {"question": "What is the capital of France?", "answer": "Paris"},
+         {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
+     ]
+
+     # Create an evaluation dataset
+     evaluation_dataset = EvalDataset.from_list(
+         name="basic_questions",
+         metrics="accuracy",
+         items=evaluation_items,
+         input="question",
+         label="answer",
+     )
+
+     print(f"\nRunning OpenAI Batch API evaluation with model: {model_name}")
+     print("Note: Batch processing may take several minutes to complete.\n")
+
+     # Run evaluation
+     results = await evaluate_async(
+         inference,
+         evaluation_dataset,
+         hyperparameters={
+             "temperature": 0.7,
+             "system_message": "Answer the question directly and concisely",
+         },
+         return_aggregates=True,
+         return_items=True,
+         return_output=True,
+         upload_results=False,
+     )
+
+     print("\nBatch evaluation completed:\n")
+     pprint(results)
+     return results
+
+
+ if __name__ == "__main__":
+     load_dotenv()
+     log_file = setup_logging(experiment_id="4-evaluating_cloud_models_with_batching", base_dir=Path(__file__).parent)
+     output_dir = Path(__file__).parent / "results"
+     output_dir.mkdir(exist_ok=True)
+     results_dict = asyncio.run(main())
+     save_results_to_json(results_dict, output_dir, "4-evaluating_cloud_models_with_batching_output.json")