scorebook-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +15 -0
- scorebook/evaluator.py +228 -0
- scorebook/inference/__init__.py +11 -0
- scorebook/inference/openai.py +185 -0
- scorebook/inference/portkey.py +186 -0
- scorebook/metrics/__init__.py +18 -0
- scorebook/metrics/accuracy.py +42 -0
- scorebook/metrics/metric_base.py +28 -0
- scorebook/metrics/metric_registry.py +105 -0
- scorebook/metrics/precision.py +19 -0
- scorebook/types/__init__.py +11 -0
- scorebook/types/eval_dataset.py +310 -0
- scorebook/types/eval_result.py +129 -0
- scorebook/types/inference_pipeline.py +84 -0
- scorebook/utils/__init__.py +8 -0
- scorebook/utils/async_utils.py +27 -0
- scorebook/utils/io_helpers.py +28 -0
- scorebook/utils/mappers.py +36 -0
- scorebook/utils/progress_bars.py +89 -0
- scorebook/utils/transform_helpers.py +25 -0
- scorebook-0.0.1.dist-info/LICENSE +21 -0
- scorebook-0.0.1.dist-info/METADATA +376 -0
- scorebook-0.0.1.dist-info/RECORD +24 -0
- scorebook-0.0.1.dist-info/WHEEL +4 -0
scorebook/__init__.py
ADDED
@@ -0,0 +1,15 @@
"""
Scorebook package.

A Python project for scorebook functionality.
"""

import importlib.metadata

# get version from pyproject.toml
__version__ = importlib.metadata.version(__package__ or __name__)

from scorebook.evaluator import evaluate
from scorebook.types.eval_dataset import EvalDataset

__all__ = ["EvalDataset", "evaluate"]
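The public surface at this version is just `evaluate`, `EvalDataset`, and `__version__`. A minimal, hedged usage sketch follows; the dataset source, label field, and echo-style inference function are invented for illustration, only the imported names come from the wheel:

```python
# Hedged sketch of the public API above; "some_org/some_dataset" and the
# trivial inference function are hypothetical, not part of the package.
import scorebook
from scorebook import EvalDataset, evaluate
from scorebook.metrics import Accuracy

print(scorebook.__version__)  # resolves to "0.0.1" for this wheel

dataset = EvalDataset.from_huggingface(
    "some_org/some_dataset", label="answer", metrics=[Accuracy]
)

def inference_fn(items, **hyperparameters):
    # A real callable would run a model; here we just echo a field per item.
    return [item.get("question") for item in items]

results = evaluate(inference_fn, dataset, item_limit=10)
print(results)
```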
scorebook/evaluator.py
ADDED
@@ -0,0 +1,228 @@
"""
Model evaluation functionality for the Scorebook framework.

This module provides the core evaluation logic to assess model predictions
against ground truth labels using configurable metrics. It supports:

- Batch evaluation of models across multiple datasets
- Flexible metric computation and aggregation
- Optional parameter sweeping and experiment tracking
- Customizable inference functions

The main entry point is the `evaluate()` function which handles running
models on datasets and computing metric scores.
"""

import asyncio
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

from scorebook.types.eval_dataset import EvalDataset
from scorebook.types.eval_result import EvalResult
from scorebook.utils import evaluation_progress, expand_dict, is_awaitable


async def _evaluate_async(
    inference_callable: Callable,
    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
    hyperparameters: Optional[Dict[str, Any]] = None,
    experiment_id: Optional[str] = None,
    item_limit: Optional[int] = None,
    return_type: str = "dict",
    score_type: str = "aggregate",
) -> Union[Dict, List]:
    """Run inference across datasets/hyperparams, compute metrics, and format results."""
    _validate_score_type(score_type)

    normalized_datasets = _normalize_datasets(eval_datasets)
    hyperparam_grid = _expand_hyperparams(hyperparameters)

    eval_results: List[EvalResult] = []

    with evaluation_progress(normalized_datasets, len(hyperparam_grid)) as progress_bars:
        # Loop through datasets, then hyperparameters for clear progress tracking
        for dataset_idx, eval_dataset in enumerate(normalized_datasets):
            with progress_bars.hyperparam_progress_context():
                # Run inference for each hyperparameter configuration on this dataset
                for hp_idx, hyperparam_config in enumerate(hyperparam_grid):
                    items = _clip_items(eval_dataset.items, item_limit)
                    labels = _labels_for(items, eval_dataset.label)

                    # 1) Run inference
                    outputs = await _run_inference_callable(
                        inference_callable, items, hyperparam_config
                    )

                    # 2) Score metrics
                    metric_scores = _score_metrics(eval_dataset, outputs, labels)

                    # 3) Wrap into EvalResult
                    eval_results.append(
                        EvalResult(eval_dataset, outputs, metric_scores, hyperparam_config)
                    )

                    # Update inner progress bar
                    progress_bars.update_hyperparam_progress()

            # Update the outer progress bar
            progress_bars.update_dataset_progress()

    # TODO: experiment_id handling (left as passthrough to preserve behavior)
    if experiment_id:
        pass

    # 4) Format as requested
    return _format_results(eval_results, return_type, score_type)


def evaluate(
    inference_callable: Callable,
    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
    hyperparameters: Optional[Dict[str, Any]] = None,
    experiment_id: Optional[str] = None,
    item_limit: Optional[int] = None,
    return_type: str = "dict",
    score_type: str = "aggregate",
) -> Union[Dict, List]:
    """
    Evaluate model predictions using specified metrics on given datasets.

    This function runs the provided inference callable on one or more evaluation datasets,
    computes metric scores, and returns the evaluation results. It supports batch processing,
    parameter sweeping, and different result formatting options.

    Args:
        inference_callable: A callable function or object that takes (items, hyperparameters)
            and returns predictions. Can be a regular function, async function,
            or callable instance (like a class with __call__ method).
        eval_datasets: One or more evaluation datasets to run evaluation on. Can be:
            - A single EvalDataset instance
            - A list of EvalDataset instances
            - A string identifier (for future dataset registry support)
            - A list of string identifiers
        hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
        experiment_id: Optional string identifier for tracking multiple evaluation runs.
        item_limit: Optional integer limiting the number of items to evaluate per dataset.
        return_type: Format of the return value. Currently only "dict" is supported.
        score_type: Type of score aggregation to return. Options:
            - "aggregate": Return aggregated metrics
            - "item": Return per-item scores
            - "all": Return both aggregate and per-item scores

    Returns:
        Dictionary mapping dataset names to their evaluation results. For each dataset,
        returns a dictionary containing:
        - items: List of EvalResult objects with predictions and ground truth
        - metrics: Dictionary mapping metric names to their computed scores

    Example:
        ```python
        dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])

        def inference_fn(items):
            # Model inference logic here - process all items at once
            return [prediction for item in items]

        results = evaluate(inference_fn, dataset, item_limit=100)
        ```
    """
    return asyncio.run(
        _evaluate_async(
            inference_callable=inference_callable,
            eval_datasets=eval_datasets,
            hyperparameters=hyperparameters,
            experiment_id=experiment_id,
            item_limit=item_limit,
            return_type=return_type,
            score_type=score_type,
        )
    )


# ===== Helper Functions =====


def _normalize_datasets(
    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]]
) -> List[EvalDataset]:
    if not isinstance(datasets, list):
        datasets = [datasets]
    # TODO: handle other types (string registry, etc.)
    return [d for d in datasets if isinstance(d, EvalDataset)]


def _validate_score_type(score_type: str) -> None:
    if score_type not in {"aggregate", "item", "all"}:
        raise ValueError("score_type must be 'aggregate', 'item', or 'all'")


def _expand_hyperparams(hyperparameters: Optional[Dict[str, Any]]) -> Any:
    return expand_dict(hyperparameters or {})


def _clip_items(items: List[Dict[str, Any]], item_limit: Optional[int]) -> List[Dict[str, Any]]:
    return items[:item_limit] if item_limit else items


def _labels_for(items: List[Dict[str, Any]], label_key: str) -> List[Any]:
    return [item.get(label_key) for item in items]


async def _run_inference_callable(
    inference_callable: Callable,
    items: List[Dict[str, Any]],
    hyperparams: Dict[str, Any],
) -> Any:
    if is_awaitable(inference_callable):
        return await inference_callable(items, **hyperparams)
    else:
        return inference_callable(items, **hyperparams)


# Yields (eval_dataset, items, labels, hyperparams) for every dataset x hyperparam combo.
def _iter_dataset_jobs(
    datasets: List[EvalDataset],
    hyperparam_grid: List[Dict[str, Any]],
    item_limit: Optional[int],
) -> Iterable[Tuple[EvalDataset, List[Dict[str, Any]], List[Any], Dict[str, Any]]]:
    for eval_dataset in datasets:
        for hp in hyperparam_grid:
            items = _clip_items(eval_dataset.items, item_limit)
            labels = _labels_for(items, eval_dataset.label)
            yield eval_dataset, items, labels, hp


def _score_metrics(
    eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
) -> Dict[str, Dict[str, Any]]:
    metric_scores: Dict[str, Dict[str, Any]] = {}
    for metric in eval_dataset.metrics:
        aggregate_scores, item_scores = metric.score(outputs, labels)
        metric_scores[metric.name] = {
            "aggregate_scores": aggregate_scores,
            "item_scores": item_scores,
        }
    return metric_scores


def _format_results(
    eval_results: List[EvalResult], return_type: str, score_type: str
) -> Union[Dict, List]:

    if return_type != "dict":
        return {er.eval_dataset.name: er for er in eval_results}

    if score_type == "all":
        combined: Dict[str, List[Dict[str, Any]]] = {"aggregate": [], "per_sample": []}
        for er in eval_results:
            d = er.to_dict()
            combined["aggregate"].extend(d["aggregate"])
            combined["per_sample"].extend(d["per_sample"])
        return combined

    if score_type == "aggregate":
        return [er.aggregate_scores for er in eval_results]

    if score_type == "item":
        return [item for er in eval_results for item in er.item_scores]

    # Should be unreachable due to validation
    return {}
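The hyperparameter sweep hinges on `expand_dict`, which lives in `scorebook/utils` and is not shown in this section. Judging by how `_evaluate_async` iterates `hyperparam_grid`, it presumably turns a dict of candidate values into a list of concrete configurations; the following is a hedged sketch of that assumed behavior, not the package's actual implementation:

```python
# Assumed behavior of expand_dict: a Cartesian product over list-valued entries.
# Illustrative stand-in only; scorebook's real implementation is not in this diff.
from itertools import product
from typing import Any, Dict, List


def expand_dict_sketch(params: Dict[str, Any]) -> List[Dict[str, Any]]:
    if not params:
        return [{}]  # one "empty" config so the evaluation loop still runs once
    keys = list(params)
    value_lists = [v if isinstance(v, list) else [v] for v in (params[k] for k in keys)]
    return [dict(zip(keys, combo)) for combo in product(*value_lists)]


# {"temperature": [0.0, 0.7], "top_p": 0.9} ->
# [{"temperature": 0.0, "top_p": 0.9}, {"temperature": 0.7, "top_p": 0.9}]
print(expand_dict_sketch({"temperature": [0.0, 0.7], "top_p": 0.9}))
```

Each resulting config is splatted into the inference callable as keyword arguments by `_run_inference_callable`, which is why user-provided callables accept `**hyperparameters`.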
scorebook/inference/__init__.py
ADDED
@@ -0,0 +1,11 @@
"""
Inference module for model execution and predictions.

This module provides functionality for running inference with various models
and processing their responses. It includes utilities for both single and
batch inference operations.
"""

from scorebook.inference.openai import batch, responses

__all__ = ["responses", "batch"]
scorebook/inference/openai.py
ADDED
@@ -0,0 +1,185 @@
"""
OpenAI inference implementation for Scorebook.

This module provides utilities for running inference using OpenAI's models,
supporting both single response and batch inference operations. It handles
API communication, request formatting, and response processing.
"""

import asyncio
import json
import tempfile
from typing import Any, List

from openai import OpenAI
from tqdm.asyncio import tqdm


async def responses(
    items: List[Any], model: str = "gpt-4.1-nano", client: Any = None, **hyperparameters: Any
) -> List[Any]:
    """Process multiple inference requests using OpenAI's API.

    This asynchronous function handles multiple inference requests,
    manages the API communication, and processes the responses.

    Args:
        items: List of preprocessed items to process.
        model: OpenAI model to use.
        client: Optional OpenAI client instance.
        hyperparameters: Dictionary of hyperparameters for inference.

    Returns:
        List of raw model responses.

    Raises:
        NotImplementedError: Currently not implemented.
    """
    if client is None:
        client = OpenAI()

    results = []
    for item in items:
        response = client.responses.create(model=model, input=item)
        results.append(response)

    return results


async def batch(
    items: List[Any],
    model: str = "gpt-4.1-nano",
    client: Any = None,
    **hyperparameters: Any,
) -> List[Any]:
    """Process multiple inference requests in batch using OpenAI's API.

    This asynchronous function handles batch processing of inference requests,
    optimizing for throughput while respecting API rate limits.

    Args:
        items: List of preprocessed items to process.
        model: OpenAI model to use.
        client: Optional OpenAI client instance.
        hyperparameters: Dictionary of hyperparameters for inference.

    Returns:
        A list of raw model responses.

    Raises:
        NotImplementedError: Currently not implemented.
    """
    if client is None:
        client = OpenAI()

    file_id = _upload_batch(items, client)
    batch_id = _start_batch(file_id, client)

    # Initialize progress bar
    pbar = tqdm(total=len(items), desc="Batch processing", unit="requests")

    awaiting_batch = True
    while awaiting_batch:
        batch_object = await _get_batch(batch_id, client)
        batch_status = batch_object.status

        if hasattr(batch_object, "request_counts") and batch_object.request_counts:
            completed = batch_object.request_counts.completed
            total = batch_object.request_counts.total
            pbar.n = completed
            pbar.set_postfix(status=batch_status, completed=f"{completed}/{total}")
        else:
            pbar.set_postfix(status=batch_status)

        pbar.refresh()

        if batch_status == "completed":
            awaiting_batch = False
            pbar.n = pbar.total
            pbar.set_postfix(status="completed")
        elif batch_status == "failed":
            raise Exception("Batch processing failed")
        else:
            await asyncio.sleep(60)

    pbar.close()

    # Get the final batch object to access output_file_id
    final_batch_object = await _get_batch(batch_id, client)
    output_file_id = final_batch_object.output_file_id

    batch_result = await _get_results_file(output_file_id, client)
    return batch_result


def _upload_batch(items: List[Any], client: Any) -> str:
    """Create a .jsonl file from preprocessed items and upload to OpenAI for batch processing.

    Args:
        items: A list of preprocessed items, each representing a single dataset eval item.

    Returns:
        The file ID returned by OpenAI after uploading.
    """
    print("Uploading batch...")
    # Instantiate OpenAI client
    if client is None:
        client = OpenAI()

    # Create temp .jsonl file
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".jsonl", delete=False) as f:
        for i, item in enumerate(items):
            # Construct each batch line
            payload = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": item,
            }
            f.write(json.dumps(payload) + "\n")
        file_path = f.name

    # Upload file to OpenAI
    with open(file_path, "rb") as upload_file:
        response = client.files.create(file=upload_file, purpose="batch")

    return str(response.id)


def _start_batch(file_id: str, client: Any) -> str:
    batch_response = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )
    return str(batch_response.id)


async def _get_batch(batch_id: str, client: Any) -> Any:
    batch_object = client.batches.retrieve(batch_id)
    return batch_object


async def _get_results_file(output_file_id: str, client: Any) -> List[str]:
    """Download and parse the batch results file from OpenAI."""
    response = client.files.content(output_file_id)

    # Parse the JSONL content
    content = response.content.decode("utf-8")
    results = []

    for line in content.strip().split("\n"):
        if line.strip():
            result_obj = json.loads(line)
            # Extract the response from the batch result structure
            if "response" in result_obj and "body" in result_obj["response"]:
                response_body = result_obj["response"]["body"]
                if "choices" in response_body and len(response_body["choices"]) > 0:
                    message_content = response_body["choices"][0]["message"]["content"]
                    results.append(message_content)
                else:
                    results.append("")
            else:
                results.append("")

    return results
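The JSONL that `_upload_batch` writes follows the OpenAI Batch API request envelope: one line per item carrying a `custom_id`, the HTTP method and endpoint, and the item itself as the request body, which means callers must hand `batch` items that are already full chat-completions payloads. A small illustration of one such line (the chat payload is invented for the example):

```python
# Illustration of the Batch API request line that _upload_batch emits per item.
# The chat payload is hypothetical; the envelope fields mirror the code above.
import json

item = {"model": "gpt-4.1-nano", "messages": [{"role": "user", "content": "2 + 2 = ?"}]}
line = {
    "custom_id": "request-0",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": item,  # `batch` expects items to already be complete request bodies
}
print(json.dumps(line))
```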
scorebook/inference/portkey.py
ADDED
@@ -0,0 +1,186 @@
"""
Portkey inference implementation for Scorebook.

This module provides utilities for running inference using Portkey's API,
supporting both single response and batch inference operations. It handles
API communication, request formatting, and response processing.
"""

import asyncio
import json
import os
import tempfile
from typing import Any, List, Optional

from portkey_ai import AsyncPortkey
from tqdm.asyncio import tqdm


async def responses(
    items: List[Any], model: str, client: Optional[AsyncPortkey] = None, **hyperparameters: Any
) -> List[Any]:
    """Process multiple inference requests using Portkey's API.

    This asynchronous function handles multiple inference requests,
    manages the API communication, and processes the responses.

    Args:
        items: List of preprocessed items to process.
        model: Model to use via Portkey.
        client: Optional Portkey client instance.
        hyperparameters: Dictionary of hyperparameters for inference.

    Returns:
        List of raw model responses.
    """

    if client is None:
        client = AsyncPortkey(api_key=os.getenv("PORTKEY_API_KEY"))

    results = []
    for item in items:
        response = await client.chat.completions.create(
            model=model,
            messages=item if isinstance(item, list) else [{"role": "user", "content": str(item)}],
        )
        results.append(response)

    return results


async def batch(
    items: List[Any],
    model: str,
    client: Optional[AsyncPortkey] = None,
    **hyperparameters: Any,
) -> List[Any]:
    """Process multiple inference requests in batch using Portkey's API.

    This asynchronous function handles batch processing of inference requests,
    optimizing for throughput while respecting API rate limits.

    Args:
        items: List of preprocessed items to process.
        model: Model to use via Portkey.
        client: Optional Portkey client instance.
        hyperparameters: Dictionary of hyperparameters for inference.

    Returns:
        A list of raw model responses.
    """

    provider, model = model.split("/")

    if client is None:
        client = AsyncPortkey(provider=provider, api_key=os.getenv("PORTKEY_API_KEY"))

    file_id = await _upload_batch(items, client, model, **hyperparameters)
    batch_id = await _start_batch(file_id, client)

    # Initialize progress bar
    pbar = tqdm(total=len(items), desc="Batch processing", unit="requests")

    awaiting_batch = True

    while awaiting_batch:
        batch_object = await _get_batch(batch_id, client)
        batch_status = batch_object.status

        if hasattr(batch_object, "request_counts") and batch_object.request_counts:
            completed = batch_object.request_counts.completed
            total = batch_object.request_counts.total
            pbar.n = completed
            pbar.set_postfix(status=batch_status, completed=f"{completed}/{total}")
        else:
            pbar.set_postfix(status=batch_status)

        pbar.refresh()

        if batch_status == "completed":
            awaiting_batch = False
            pbar.n = pbar.total
            pbar.set_postfix(status="completed")
        elif batch_status == "failed":
            raise Exception("Batch processing failed")
        else:
            await asyncio.sleep(60)

    pbar.close()

    # Use the final batch object to access output_file_id
    output_file_id = batch_object.output_file_id

    batch_result = await _get_results_file(output_file_id, client)
    return batch_result


async def _upload_batch(
    items: List[Any], client: AsyncPortkey, model: str, **hyperparameters: Any
) -> str:
    """Create a .jsonl file from preprocessed items and upload to Portkey for batch processing.

    Args:
        items: A list of preprocessed items, each representing a single dataset eval item.
        client: Portkey client instance.
        model: Model to use for batch processing.
        hyperparameters: Additional parameters for the batch requests.

    Returns:
        The file ID returned by Portkey after uploading.
    """
    print("Uploading batch...")

    # Create temp .jsonl file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for i, item in enumerate(items):
            # Construct each batch line
            payload = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": (
                        item if isinstance(item, list) else [{"role": "user", "content": str(item)}]
                    ),
                    **hyperparameters,
                },
            }
            f.write(json.dumps(payload) + "\n")
        file_path = f.name

    # Upload file to Portkey
    with open(file_path, "rb") as upload_file:
        response = await client.files.create(file=upload_file, purpose="batch")

    return str(response.id)


async def _start_batch(file_id: str, client: Any) -> str:
    batch_response = await client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )
    return str(batch_response.id)


async def _get_batch(batch_id: str, client: Any) -> Any:
    batch_object = await client.batches.retrieve(batch_id)
    return batch_object


async def _get_results_file(output_file_id: str, client: Any) -> List[str]:
    """Download and parse the batch results file from Portkey."""
    response = await client.files.content(output_file_id)

    # Parse the JSONL content
    content = response.content.decode("utf-8")
    results = []

    for line in content.strip().split("\n"):
        result_obj = json.loads(line)
        message_content = result_obj["response"]["body"]["choices"][0]["message"]["content"]
        results.append(message_content)

    return results
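Two differences from the OpenAI module stand out in `portkey.batch`: the `model` argument is a `provider/model` string that gets split to configure the client, and `_upload_batch` builds the request body itself, folding `**hyperparameters` into it. A hedged illustration of the resulting body (the spec, prompt, and temperature are invented; the structure mirrors the code above):

```python
# Hedged illustration of portkey._upload_batch's per-item request body.
# "openai/gpt-4.1-nano", the prompt, and the temperature are example values only.
import json

provider, model = "openai/gpt-4.1-nano".split("/")  # provider="openai", model="gpt-4.1-nano"
item = "2 + 2 = ?"
hyperparameters = {"temperature": 0.2}

body = {
    "model": model,
    "messages": [{"role": "user", "content": str(item)}],  # non-list items become one user message
    **hyperparameters,
}
print(json.dumps({"custom_id": "request-0", "method": "POST",
                  "url": "/v1/chat/completions", "body": body}))
```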
scorebook/metrics/__init__.py
ADDED
@@ -0,0 +1,18 @@
"""
Metrics for evaluating model predictions.

This module provides a collection of evaluation metrics for comparing model outputs
against ground truth labels. Available metrics include standard classification and
generation metrics like accuracy, precision, recall, F1-score, etc.

Metrics can be accessed by name through the `get_metrics()` function or used
directly by instantiating specific metric classes. All metrics implement a
common interface for scoring predictions against references.
"""

from scorebook.metrics.accuracy import Accuracy
from scorebook.metrics.metric_base import MetricBase
from scorebook.metrics.metric_registry import MetricRegistry
from scorebook.metrics.precision import Precision

__all__ = ["MetricBase", "Precision", "Accuracy", "MetricRegistry"]
scorebook/metrics/accuracy.py
ADDED
@@ -0,0 +1,42 @@
"""Accuracy metric implementation for Scorebook."""

from typing import Any, Dict, List, Tuple

from scorebook.metrics.metric_base import MetricBase
from scorebook.metrics.metric_registry import MetricRegistry


@MetricRegistry.register()
class Accuracy(MetricBase):
    """Accuracy metric for evaluating model predictions of any type.

    Accuracy = correct predictions / total predictions
    """

    @staticmethod
    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        """Calculate accuracy score between predictions and references.

        Args:
            outputs: A list of inference outputs.
            labels: A list of ground truth labels.

        Returns:
            The aggregate accuracy score for all items (correct predictions / total predictions).
            The item scores for each output-label pair (true/false).
        """
        if len(outputs) != len(labels):
            raise ValueError("Number of outputs must match number of labels")

        if not outputs:  # Handle empty lists
            return {"accuracy": 0.0}, []

        # Calculate item scores
        item_scores = [output == label for output, label in zip(outputs, labels)]

        # Calculate aggregate score
        correct_predictions = sum(item_scores)
        total_predictions = len(outputs)
        aggregate_scores = {"accuracy": correct_predictions / total_predictions}

        return aggregate_scores, item_scores
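Because `Accuracy.score` is a static method returning an (aggregate, per-item) pair, it can be exercised directly against small lists; the values below are just sample inputs:

```python
# Direct use of the Accuracy metric defined above, with made-up outputs and labels.
from scorebook.metrics import Accuracy

aggregate, per_item = Accuracy.score(["a", "b", "c"], ["a", "x", "c"])
print(aggregate)  # {"accuracy": 0.6666666666666666} -- 2 of 3 outputs match their labels
print(per_item)   # [True, False, True]
```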