scorebook 0.0.5-py3-none-any.whl → 0.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/__init__.py CHANGED
@@ -12,7 +12,7 @@ __version__ = importlib.metadata.version(__package__ or __name__)
12
12
  from scorebook.eval_dataset import EvalDataset
13
13
  from scorebook.evaluate import evaluate
14
14
  from scorebook.inference_pipeline import InferencePipeline
15
- from scorebook.trismik.login import login, whoami
15
+ from scorebook.trismik_services.login import login, whoami
16
16
  from scorebook.utils.build_prompt import build_prompt
17
17
 
18
18
  __all__ = ["EvalDataset", "evaluate", "build_prompt", "login", "whoami", "InferencePipeline"]
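The internal move from scorebook.trismik to scorebook.trismik_services does not change the package-level surface; the names in __all__ import exactly as before. A minimal sketch, using only what __all__ re-exports:

```python
# Public imports are unchanged between 0.0.5 and 0.0.7 despite the internal
# trismik -> trismik_services rename; these names all come from __all__ above.
from scorebook import EvalDataset, InferencePipeline, build_prompt, evaluate, login, whoami
```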
scorebook/evaluate.py CHANGED
@@ -15,7 +15,7 @@ models on datasets and computing metric scores.
15
15
 
16
16
  import asyncio
17
17
  import logging
18
- from typing import Any, Callable, Dict, List, Optional, Union
18
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
19
19
 
20
20
  from scorebook.eval_dataset import EvalDataset
21
21
  from scorebook.exceptions import (
@@ -23,8 +23,11 @@ from scorebook.exceptions import (
23
23
  MetricComputationError,
24
24
  ParallelExecutionError,
25
25
  ParameterValidationError,
26
+ ScoreBookError,
26
27
  )
27
- from scorebook.trismik import run_adaptive_evaluation
28
+ from scorebook.trismik_services import run_adaptive_evaluation
29
+ from scorebook.trismik_services.login import get_token
30
+ from scorebook.trismik_services.upload_classic_eval_run import upload_classic_eval_run
28
31
  from scorebook.types import (
29
32
  AdaptiveEvalDataset,
30
33
  AdaptiveEvalRunResult,
@@ -39,60 +42,43 @@ logger = logging.getLogger(__name__)
39
42
 
40
43
 
41
44
  def evaluate(
42
- inference_callable: Callable,
43
- eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
45
+ inference: Callable,
46
+ datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
44
47
  hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
45
- metadata: Optional[Dict[str, Any]] = None,
46
48
  experiment_id: Optional[str] = None,
47
49
  project_id: Optional[str] = None,
50
+ metadata: Optional[Dict[str, Any]] = None,
51
+ upload_results: Union[Literal["auto"], bool] = "auto",
52
+ sample_size: Optional[int] = None,
48
53
  parallel: bool = False,
49
54
  return_dict: bool = True,
50
55
  return_aggregates: bool = True,
51
56
  return_items: bool = False,
52
57
  return_output: bool = False,
53
- sample_size: Optional[int] = None,
54
58
  ) -> Union[Dict, List]:
55
59
  """
56
- Evaluate model predictions using specified metrics on given datasets.
57
-
58
- This function runs the provided inference callable on one or more evaluation datasets,
59
- computes metric scores, and returns the evaluation results. It supports batch processing,
60
- parameter sweeping, and different result formatting options.
60
+ Evaluate a model over one or more datasets, optionally sweeping hyperparameter configurations, scoring with each dataset's metrics.
61
61
 
62
62
  Args:
63
- inference_callable: A callable function or object that takes (items, hyperparameters)
64
- and returns predictions. Can be a regular function, async function,
65
- or callable instance (like a class with __call__ method).
66
- eval_datasets: One or more evaluation datasets to run evaluation on. Can be:
67
- - A single EvalDataset instance
68
- - A list of EvalDataset instances
69
- - A string identifier (for future dataset registry support)
70
- - A list of string identifiers
71
- hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
72
- metadata: Optional dictionary containing evaluation metadata.
73
- experiment_id: Optional string identifier for tracking multiple evaluation runs.
63
+ inference: A callable that runs model inference over a list of evaluation items.
64
+ datasets: One or more evaluation datasets to run evaluation on.
65
+ hyperparameters: Optional list of hyperparameter configurations or grid to evaluate.
66
+ experiment_id: Optional ID of the experiment to upload results to on Trismik's dashboard.
67
+ project_id: Optional ID of the project to upload results to on Trismik's dashboard.
68
+ metadata: Optional metadata to attach to the evaluation.
69
+ upload_results: If True, uploads results to Trismik's dashboard; "auto" (the default) uploads only when a Trismik login token is found.
70
+ sample_size: Optional number of items to sample from each dataset.
71
+ parallel: If True, runs evaluation in parallel. Requires the inference callable to be async.
74
72
  return_dict: If True, returns eval results as a dict
75
73
  return_aggregates: If True, returns aggregate scores for each dataset
76
74
  return_items: If True, returns individual items for each dataset
77
75
  return_output: If True, returns model outputs for each dataset item evaluated
78
- sample_size: If set, only return a sample of the dataset items (for debugging)
79
- parallel: If True, run inference functions in parallel (requires all functions to be async)
80
76
 
81
77
  Returns:
82
- Dictionary mapping dataset names to their evaluation results. For each dataset,
83
- returns a dictionary containing:
84
- - items: List of EvalResult objects with predictions and ground truth
85
- - metrics: Dictionary mapping metric names to their computed scores
86
-
87
- Example:
88
-
89
- python
90
- dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])
91
- def inference_fn(items):
92
- # Model inference logic here - process all items at once
93
- return [prediction for item in items]
94
-
95
- results = evaluate(inference_fn, dataset, item_limit=100)
78
+ Union[Dict, List, EvalResult]:
79
+ The evaluation results in the format specified by return parameters:
80
+ - If return_dict=False: Returns an EvalResult object containing all run results
81
+ - If return_dict=True: Returns the evaluation results as a dict
96
82
  """
97
83
 
98
84
  logger.info(
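The 0.0.5 docstring example was dropped in the rewrite above; a usage sketch against the renamed signature may help readers tracking the change. The dataset name, IDs, and hyperparameter value are placeholders, and the Precision import path is inferred from this wheel's RECORD rather than shown in the diff:

```python
# Sketch only: placeholder dataset name, IDs, and hyperparameters; the Precision
# import path is an assumption, not confirmed by this diff.
from scorebook import EvalDataset, evaluate
from scorebook.metrics.precision import Precision  # assumed module path

dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])

def inference_fn(items, **hyperparameters):
    # Model inference logic here - process all items at once.
    return ["prediction" for _ in items]

results = evaluate(
    inference_fn,                   # 0.0.5 called this parameter inference_callable
    dataset,                        # 0.0.5 called this parameter eval_datasets
    hyperparameters={"temperature": 0.7},
    experiment_id="my-experiment",  # required whenever results are uploaded
    project_id="my-project",
    upload_results="auto",          # resolves to True only when a Trismik token is found
    sample_size=100,
)
```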
@@ -104,8 +90,8 @@ def evaluate(
104
90
 
105
91
  return asyncio.run(
106
92
  _evaluate_async(
107
- inference_callable=inference_callable,
108
- eval_datasets=eval_datasets,
93
+ inference=inference,
94
+ datasets=datasets,
109
95
  hyperparameters=hyperparameters,
110
96
  metadata=metadata,
111
97
  experiment_id=experiment_id,
@@ -115,14 +101,15 @@ def evaluate(
115
101
  return_aggregates=return_aggregates,
116
102
  return_items=return_items,
117
103
  return_output=return_output,
104
+ upload_results=upload_results,
118
105
  sample_size=sample_size,
119
106
  )
120
107
  )
121
108
 
122
109
 
123
110
  async def _evaluate_async(
124
- inference_callable: Callable,
125
- eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
111
+ inference: Callable,
112
+ datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
126
113
  hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
127
114
  metadata: Optional[Dict[str, Any]] = None,
128
115
  experiment_id: Optional[str] = None,
@@ -132,10 +119,15 @@ async def _evaluate_async(
132
119
  return_items: bool = False,
133
120
  return_output: bool = False,
134
121
  parallel: bool = False,
122
+ upload_results: Union[Literal["auto"], bool] = "auto",
135
123
  sample_size: Optional[int] = None,
136
124
  ) -> Union[Dict, List]:
125
+ """Run evaluation asynchronously."""
126
+
127
+ upload_results = _resolve_upload_results(upload_results)
128
+
137
129
  _validate_parameters(locals())
138
- datasets = _prepare_datasets(eval_datasets, sample_size)
130
+ datasets = _prepare_datasets(datasets, sample_size)
139
131
  hyperparameter_configs = _prepare_hyperparameter_configs(hyperparameters)
140
132
 
141
133
  logger.info(
@@ -155,29 +147,29 @@ async def _evaluate_async(
155
147
  datasets, len(hyperparameter_configs), parallel, len(eval_run_specs)
156
148
  ) as progress_bars:
157
149
  if parallel:
158
- eval_results = await _run_parallel(
159
- inference_callable,
150
+ eval_result = await _run_parallel(
151
+ inference,
160
152
  eval_run_specs,
161
153
  progress_bars,
162
154
  experiment_id,
163
155
  project_id,
164
156
  metadata,
157
+ upload_results,
165
158
  )
166
159
  else:
167
- eval_results = await _run_sequential(
168
- inference_callable,
160
+ eval_result = await _run_sequential(
161
+ inference,
169
162
  eval_run_specs,
170
163
  progress_bars,
171
164
  experiment_id,
172
165
  project_id,
173
166
  metadata,
167
+ upload_results,
174
168
  )
175
169
 
176
170
  logger.info("Evaluation completed successfully")
177
171
 
178
- return _format_results(
179
- eval_results, return_dict, return_aggregates, return_items, return_output
180
- )
172
+ return _format_results(eval_result, return_dict, return_aggregates, return_items, return_output)
181
173
 
182
174
 
183
175
  # ===== ORCHESTRATION PATHS =====
@@ -190,16 +182,35 @@ async def _run_parallel(
190
182
  experiment_id: Optional[str] = None,
191
183
  project_id: Optional[str] = None,
192
184
  metadata: Optional[Dict[str, Any]] = None,
185
+ upload_results: bool = False,
193
186
  ) -> EvalResult:
187
+ """Run evaluation in parallel."""
188
+
194
189
  logger.debug("Running inference in parallel")
195
190
 
191
+ # Worker function to execute individual runs and handle uploads
196
192
  async def worker(
197
193
  run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
198
194
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
199
195
  run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
200
196
  progress_bars.on_eval_run_completed(run.dataset_index)
197
+
198
+ if (
199
+ upload_results
200
+ and isinstance(run_result, ClassicEvalRunResult)
201
+ and experiment_id
202
+ and project_id
203
+ ):
204
+ # Only upload runs that completed successfully
205
+ if run_result.run_completed:
206
+ run_id = await _upload_classic_run(
207
+ run_result, experiment_id, project_id, inference, metadata
208
+ )
209
+ run_result.run_id = run_id
210
+
201
211
  return run_result
202
212
 
213
+ # Execute all runs concurrently
203
214
  run_results = await asyncio.gather(*[worker(run) for run in runs])
204
215
  # Return in canonical (dataset_idx, hp_idx) order for stability
205
216
  run_results.sort(
@@ -215,13 +226,32 @@ async def _run_sequential(
215
226
  experiment_id: Optional[str] = None,
216
227
  project_id: Optional[str] = None,
217
228
  metadata: Optional[Dict[str, Any]] = None,
229
+ upload_results: bool = False,
218
230
  ) -> EvalResult:
231
+ """Run evaluation sequentially."""
232
+
219
233
  logger.debug("Running inference sequentially")
234
+
220
235
  run_results: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]] = []
221
236
  for run in runs:
222
237
  run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
223
238
  run_results.append(run_result)
224
239
  progress_bars.on_hyperparam_completed(run_result.run_spec.dataset_index)
240
+
241
+ # Upload a classic eval run result immediately if upload_results is enabled
242
+ if (
243
+ upload_results
244
+ and isinstance(run_result, ClassicEvalRunResult)
245
+ and experiment_id
246
+ and project_id
247
+ ):
248
+ # Only upload runs that completed successfully
249
+ if run_result.run_completed:
250
+ run_id = await _upload_classic_run(
251
+ run_result, experiment_id, project_id, inference, metadata
252
+ )
253
+ run_result.run_id = run_id
254
+
225
255
  return EvalResult(run_results)
226
256
 
227
257
 
@@ -236,29 +266,39 @@ async def _execute_run(
236
266
  metadata: Optional[Dict[str, Any]] = None,
237
267
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
238
268
  """Execute a single evaluation run."""
269
+
239
270
  if isinstance(run, EvalRunSpec):
240
271
  return await _execute_classic_eval_run(inference, run)
272
+
241
273
  elif isinstance(run, AdaptiveEvalRunSpec):
242
- if experiment_id is None or project_id is None:
243
- raise ParameterValidationError(
244
- "experiment_id and project_id are required for adaptive evaluation runs"
274
+ if not experiment_id or not project_id:
275
+ raise ScoreBookError(
276
+ "experiment_id and project_id are required for adaptive evaluations"
245
277
  )
246
278
  return await _execute_adaptive_eval_run(inference, run, experiment_id, project_id, metadata)
279
+
247
280
  else:
248
- raise ParameterValidationError(f"Unrecognized run type: {type(run)}")
281
+ raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
249
282
 
250
283
 
251
284
  async def _execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
252
285
  """Execute a classic evaluation run."""
253
286
  logger.debug("Executing classic eval run for %s", run)
254
287
 
255
- inference_outputs = await _run_inference_callable(
256
- inference, run.dataset.items, run.hyperparameter_config
257
- )
258
- metric_scores = _score_metrics(run.dataset, inference_outputs, run.labels)
288
+ inference_outputs = None
289
+ metric_scores = None
290
+
291
+ try:
292
+ inference_outputs = await _run_inference_callable(
293
+ inference, run.dataset.items, run.hyperparameter_config
294
+ )
295
+ metric_scores = _score_metrics(run.dataset, inference_outputs, run.labels)
296
+ logger.debug("Classic evaluation completed for run %s", run)
297
+ return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
259
298
 
260
- logger.debug("Classic evaluation completed for run %s", run)
261
- return ClassicEvalRunResult(run, inference_outputs, metric_scores)
299
+ except Exception as e:
300
+ logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
301
+ return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
262
302
 
263
303
 
264
304
  async def _execute_adaptive_eval_run(
@@ -282,20 +322,41 @@ async def _execute_adaptive_eval_run(
282
322
  # ===== HELPER FUNCTIONS =====
283
323
 
284
324
 
325
+ def _resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
326
+ """Resolve the upload_results parameter based on trismik login status."""
327
+
328
+ if upload_results == "auto":
329
+ upload_results = get_token() is not None
330
+ logger.debug("Auto upload results resolved to: %s", upload_results)
331
+
332
+ return upload_results
333
+
334
+
285
335
  def _validate_parameters(params: Dict[str, Any]) -> None:
286
336
  """Validate all parameters for evaluation."""
287
337
 
338
+ # If returning a dict, it must contain items and/or aggregates
288
339
  if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
289
340
  raise ParameterValidationError(
290
341
  "When return_dict=True, at least one of return_aggregates or return_items must be True"
291
342
  )
292
343
 
293
- if params["parallel"] and not is_awaitable(params["inference_callable"]):
344
+ # Parallel runs require an asynchronous inference callable
345
+ if params["parallel"] and not is_awaitable(params["inference"]):
294
346
  raise ParallelExecutionError(
295
347
  "parallel=True requires the inference_callable to be async. "
296
348
  "Please make your inference function async or set parallel=False."
297
349
  )
298
350
 
351
+ # If uploading results, experiment_id and project_id must be specified
352
+ if params["upload_results"]:
353
+ if params["experiment_id"] is None or params["project_id"] is None:
354
+ raise ParameterValidationError(
355
+ "experiment_id and project_id are required for upload_results=True"
356
+ )
357
+
358
+ logger.debug("Parameter validation successful")
359
+
299
360
 
300
361
  def _prepare_datasets(
301
362
  datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
@@ -354,7 +415,8 @@ def _build_eval_run_specs(
354
415
  project_id: Optional[str],
355
416
  metadata: Optional[Dict[str, Any]] = None,
356
417
  ) -> List[Union[EvalRunSpec, AdaptiveEvalRunSpec]]:
357
- """Build RunSpec objects for each dataset/hyperparameter combination."""
418
+ """Build All RunSpec objects for each dataset/hyperparameter combination."""
419
+
358
420
  eval_run_specs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]] = []
359
421
  for dataset_index, dataset in enumerate(datasets):
360
422
  for hyperparameters_index, hyperparameter_config in enumerate(hyperparameters):
@@ -369,9 +431,9 @@ def _build_eval_run_specs(
369
431
 
370
432
  # Create adaptive eval run spec from string
371
433
  elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
372
- if experiment_id is None or project_id is None:
373
- raise ParameterValidationError(
374
- "experiment_id and project_id are required for adaptive evaluation"
434
+ if not experiment_id or not project_id:
435
+ raise ScoreBookError(
436
+ "experiment_id and project_id are required for adaptive evaluations"
375
437
  )
376
438
  eval_run_specs.append(
377
439
  _build_adaptive_eval_run_spec(
@@ -385,24 +447,6 @@ def _build_eval_run_specs(
385
447
  )
386
448
  )
387
449
 
388
- # Create adaptive eval run spec from AdaptiveEvalDataset
389
- elif isinstance(dataset, AdaptiveEvalDataset):
390
- if experiment_id is None or project_id is None:
391
- raise ParameterValidationError(
392
- "experiment_id and project_id are required for adaptive evaluation"
393
- )
394
- eval_run_specs.append(
395
- _build_adaptive_eval_run_spec(
396
- dataset.name,
397
- dataset_index,
398
- hyperparameter_config,
399
- hyperparameters_index,
400
- experiment_id,
401
- project_id,
402
- metadata,
403
- )
404
- )
405
-
406
450
  # Log warning - should never happen
407
451
  else:
408
452
  logger.warning("Unrecognized dataset type: %s", dataset)
@@ -416,7 +460,7 @@ def _build_classic_eval_run_spec(
416
460
  hyperparameters: Dict[str, Any],
417
461
  hyperparameters_index: int,
418
462
  ) -> EvalRunSpec:
419
- """Build RunSpec objects for each dataset/hyperparameter combination."""
463
+ """Build EvalRunSpec objects for a classic dataset and hyperparameter combination."""
420
464
  items = dataset.items
421
465
  labels = [item.get(dataset.label) for item in items]
422
466
  eval_run_spec = EvalRunSpec(
@@ -440,6 +484,7 @@ def _build_adaptive_eval_run_spec(
440
484
  project_id: str,
441
485
  metadata: Optional[Dict[str, Any]] = None,
442
486
  ) -> AdaptiveEvalRunSpec:
487
+ """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
443
488
  dataset = adaptive_dataset.replace(":adaptive", "")
444
489
  adaptive_eval_run_spec = AdaptiveEvalRunSpec(
445
490
  dataset,
@@ -459,6 +504,7 @@ async def _run_inference_callable(
459
504
  items: List[Dict[str, Any]],
460
505
  hyperparameter_config: Dict[str, Any],
461
506
  ) -> Any:
507
+ """Run inference on a given dataset and hyperparameter configuration."""
462
508
  if is_awaitable(inference):
463
509
  return await inference(items, **hyperparameter_config)
464
510
  else:
@@ -493,6 +539,50 @@ def _score_metrics(
493
539
  return metric_scores
494
540
 
495
541
 
542
+ async def _upload_classic_run(
543
+ run_result: ClassicEvalRunResult,
544
+ experiment_id: str,
545
+ project_id: str,
546
+ inference_callable: Optional[Callable] = None,
547
+ metadata: Optional[Dict[str, Any]] = None,
548
+ ) -> Optional[str]:
549
+ """Upload a ClassicEvalRunResult to Trismik."""
550
+
551
+ logger.debug("Uploading classic eval run: %s", run_result.run_spec)
552
+ try:
553
+ model_name = _get_model_name(inference_callable, metadata)
554
+ response = await upload_classic_eval_run(
555
+ run=run_result,
556
+ experiment_id=experiment_id,
557
+ project_id=project_id,
558
+ model=model_name,
559
+ metadata=metadata,
560
+ )
561
+ logger.info("Successfully uploaded classic eval run: %s", response.id)
562
+ return str(response.id)
563
+
564
+ except Exception as e:
565
+ logger.error("Failed to upload classic eval run: %s", str(e))
566
+ return None
567
+
568
+
569
+ def _get_model_name(
570
+ inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
571
+ ) -> str:
572
+ """Determine a model's name with the fallback "unspecified"."""
573
+
574
+ # First priority: metadata.model
575
+ if metadata and "model" in metadata:
576
+ return str(metadata["model"])
577
+
578
+ # Second priority: inference_pipeline.model (if callable is an InferencePipeline)
579
+ if inference_callable and hasattr(inference_callable, "model"):
580
+ return str(inference_callable.model)
581
+
582
+ # Fallback: "unspecified"
583
+ return "unspecified"
584
+
585
+
496
586
  def _format_results(
497
587
  eval_result: EvalResult,
498
588
  return_dict: bool,
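The precedence implemented by _get_model_name is worth spelling out; an illustrative check, assuming it runs inside this module (PipelineStub is a stand-in for anything exposing a .model attribute, such as an InferencePipeline):

```python
class PipelineStub:
    model = "my-org/my-model"

    def __call__(self, items, **hyperparameters):
        return [None for _ in items]

assert _get_model_name(PipelineStub(), {"model": "gpt-x"}) == "gpt-x"  # metadata["model"] wins
assert _get_model_name(PipelineStub(), None) == "my-org/my-model"      # then the callable's .model
assert _get_model_name(None, None) == "unspecified"                    # final fallback
```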
@@ -510,10 +600,12 @@ def _format_results(
510
600
 
511
601
  if return_items:
512
602
  item_scores = eval_result.item_scores
603
+
513
604
  # Remove inference output if not requested
514
605
  if not return_output:
515
606
  for item in item_scores:
516
607
  item.pop("inference_output", None)
608
+
517
609
  results["item_results"] = item_scores
518
610
 
519
611
  # If both are requested, return the combined structure
scorebook/trismik_services/upload_classic_eval_run.py ADDED
@@ -0,0 +1,102 @@
1
+ """Upload classic evaluation run results to Trismik platform."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from trismik.adaptive_test import AdaptiveTest
7
+ from trismik.client_async import TrismikAsyncClient
8
+ from trismik.types import (
9
+ TrismikClassicEvalItem,
10
+ TrismikClassicEvalMetric,
11
+ TrismikClassicEvalRequest,
12
+ TrismikClassicEvalResponse,
13
+ )
14
+
15
+ from scorebook.trismik_services.login import get_token
16
+ from scorebook.types import ClassicEvalRunResult
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ async def upload_classic_eval_run(
22
+ run: ClassicEvalRunResult,
23
+ experiment_id: str,
24
+ project_id: str,
25
+ model: str,
26
+ metadata: Optional[Dict[str, Any]],
27
+ ) -> TrismikClassicEvalResponse:
28
+ """Upload a classic evaluation run result to Trismik platform.
29
+
30
+ Args:
31
+ run: The evaluation run result to upload
32
+ experiment_id: Trismik experiment identifier
33
+ project_id: Trismik project identifier
34
+ model: Model name used for evaluation
35
+ metadata: Optional metadata dictionary
36
+
37
+ Returns:
38
+ Response from Trismik API containing the upload result
39
+ """
40
+ runner = AdaptiveTest(
41
+ lambda x: None,
42
+ client=TrismikAsyncClient(
43
+ service_url="https://api-stage.trismik.com/adaptive-testing", api_key=get_token()
44
+ ),
45
+ )
46
+
47
+ # Create eval items from run_spec items, outputs, and labels
48
+ items: List[TrismikClassicEvalItem] = []
49
+ for idx, (item, output) in enumerate(zip(run.run_spec.items, run.outputs)):
50
+ label = run.run_spec.labels[idx] if idx < len(run.run_spec.labels) else ""
51
+
52
+ # Calculate item-level metrics for this item
53
+ item_metrics: Dict[str, Any] = {}
54
+ for metric_name, metric_data in run.scores.items():
55
+ if isinstance(metric_data, dict) and "item_scores" in metric_data:
56
+ if idx < len(metric_data["item_scores"]):
57
+ item_metrics[metric_name] = metric_data["item_scores"][idx]
58
+ else:
59
+ # If scores is just a single value, use it for all items
60
+ item_metrics[metric_name] = metric_data
61
+
62
+ eval_item = TrismikClassicEvalItem(
63
+ datasetItemId=str(idx),
64
+ modelInput=str(item),
65
+ modelOutput=str(output),
66
+ goldOutput=str(label),
67
+ metrics=item_metrics,
68
+ )
69
+ items.append(eval_item)
70
+
71
+ # Create eval metrics from run aggregate scores
72
+ metrics: List[TrismikClassicEvalMetric] = []
73
+ for metric_name, metric_data in run.scores.items():
74
+ if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
75
+ # Handle structured metric data with aggregate scores
76
+ for agg_name, agg_value in metric_data["aggregate_scores"].items():
77
+ metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
78
+ metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
79
+ metrics.append(metric)
80
+ else:
81
+ # Handle simple metric data (single value)
82
+ metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
83
+ metrics.append(metric)
84
+
85
+ classic_eval_request = TrismikClassicEvalRequest(
86
+ project_id,
87
+ experiment_id,
88
+ run.run_spec.dataset.name,
89
+ model,
90
+ run.run_spec.hyperparameter_config,
91
+ items,
92
+ metrics,
93
+ )
94
+
95
+ response: TrismikClassicEvalResponse = await runner.submit_classic_eval_async(
96
+ classic_eval_request
97
+ )
98
+
99
+ run_id: str = response.id
100
+ logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
101
+
102
+ return response
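The aggregate-score flattening above fixes the metricId naming Trismik receives; a standalone sketch of just that rule, with illustrative scores in the dict shape the surrounding code expects:

```python
# Illustrative scores dict; keys and values are made up for the example.
scores = {
    "accuracy": {"aggregate_scores": {"accuracy": 0.92, "f1": 0.88}, "item_scores": [1, 0, 1]},
    "latency_ms": 143.0,  # simple single-value metric
}

flattened = {}
for metric_name, metric_data in scores.items():
    if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
        for agg_name, agg_value in metric_data["aggregate_scores"].items():
            metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
            flattened[metric_id] = agg_value
    else:
        flattened[metric_name] = metric_data

print(flattened)  # {'accuracy': 0.92, 'accuracy_f1': 0.88, 'latency_ms': 143.0}
```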
scorebook/types.py CHANGED
@@ -52,35 +52,43 @@ class ClassicEvalRunResult:
52
52
  """Results from executing a classic evaluation run."""
53
53
 
54
54
  run_spec: EvalRunSpec
55
- outputs: List[Any]
56
- scores: Dict[str, Any]
55
+ run_completed: bool
56
+ outputs: Optional[List[Any]]
57
+ scores: Optional[Dict[str, Any]]
58
+ run_id: Optional[str] = None
57
59
 
58
60
  @property
59
61
  def item_scores(self) -> List[Dict[str, Any]]:
60
62
  """Return a list of dictionaries containing scores for each evaluated item."""
61
63
  results = []
62
64
 
63
- for idx, output in enumerate(self.outputs):
64
- if idx >= len(self.run_spec.items):
65
- break
66
-
67
- result = {
68
- "item_id": idx,
69
- "dataset_name": self.run_spec.dataset.name,
70
- "inference_output": output,
71
- **self.run_spec.hyperparameter_config,
72
- }
73
-
74
- # Add individual item scores if available
75
- for metric_name, metric_data in self.scores.items():
76
- if isinstance(metric_data, dict) and "item_scores" in metric_data:
77
- if idx < len(metric_data["item_scores"]):
78
- result[metric_name] = metric_data["item_scores"][idx]
79
- else:
80
- # If scores is just a single value, replicate it for each item
81
- result[metric_name] = metric_data
82
-
83
- results.append(result)
65
+ if self.outputs:
66
+ for idx, output in enumerate(self.outputs):
67
+ if idx >= len(self.run_spec.items):
68
+ break
69
+
70
+ result = {
71
+ "item_id": idx,
72
+ "dataset_name": self.run_spec.dataset.name,
73
+ "inference_output": output,
74
+ **self.run_spec.hyperparameter_config,
75
+ }
76
+
77
+ # Add run_id if available
78
+ if self.run_id is not None:
79
+ result["run_id"] = self.run_id
80
+
81
+ # Add individual item scores if available
82
+ if self.scores is not None:
83
+ for metric_name, metric_data in self.scores.items():
84
+ if isinstance(metric_data, dict) and "item_scores" in metric_data:
85
+ if idx < len(metric_data["item_scores"]):
86
+ result[metric_name] = metric_data["item_scores"][idx]
87
+ else:
88
+ # If scores is just a single value, replicate it for each item
89
+ result[metric_name] = metric_data
90
+
91
+ results.append(result)
84
92
 
85
93
  return results
86
94
 
@@ -89,19 +97,25 @@ class ClassicEvalRunResult:
89
97
  """Return the aggregated scores for this run."""
90
98
  result = {
91
99
  "dataset": self.run_spec.dataset.name,
100
+ "run_completed": self.run_completed,
92
101
  **self.run_spec.hyperparameter_config,
93
102
  }
94
103
 
104
+ # Add run_id if available
105
+ if self.run_id is not None:
106
+ result["run_id"] = self.run_id
107
+
95
108
  # Add aggregate scores from metrics
96
- for metric_name, metric_data in self.scores.items():
97
- if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
98
- # Flatten the aggregate scores from each metric
99
- for key, value in metric_data["aggregate_scores"].items():
100
- score_key = key if key == metric_name else f"{metric_name}_{key}"
101
- result[score_key] = value
102
- else:
103
- # If scores is just a single value, use it as is
104
- result[metric_name] = metric_data
109
+ if self.scores is not None:
110
+ for metric_name, metric_data in self.scores.items():
111
+ if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
112
+ # Flatten the aggregate scores from each metric
113
+ for key, value in metric_data["aggregate_scores"].items():
114
+ score_key = key if key == metric_name else f"{metric_name}_{key}"
115
+ result[score_key] = value
116
+ else:
117
+ # If scores is just a single value, use it as is
118
+ result[metric_name] = metric_data
105
119
 
106
120
  return result
107
121
 
@@ -149,7 +163,7 @@ class EvalResult:
149
163
  results = []
150
164
 
151
165
  for run_result in self.run_results:
152
- if isinstance(run_result, ClassicEvalRunResult):
166
+ if isinstance(run_result, ClassicEvalRunResult) and run_result.run_completed:
153
167
  results.extend(run_result.item_scores)
154
168
 
155
169
  return results
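With run_completed, run_id, and the now-Optional outputs/scores fields, failed runs are recorded instead of raising; a sketch of reading them back, assuming result holds the EvalResult returned by evaluate(..., return_dict=False):

```python
from scorebook.types import ClassicEvalRunResult

# `result` is assumed to be the EvalResult from evaluate(..., return_dict=False).
for run in result.run_results:
    if not isinstance(run, ClassicEvalRunResult):
        continue  # adaptive runs use their own result type
    if run.run_completed:
        print(run.run_spec.dataset.name, run.run_id, run.scores)
    else:
        print("run failed:", run.run_spec.dataset.name)  # outputs/scores may be None
```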
scorebook-0.0.5.dist-info/METADATA → scorebook-0.0.7.dist-info/METADATA CHANGED
@@ -1,7 +1,8 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: scorebook
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: A Python project for LLM evaluation.
5
+ License-File: LICENSE
5
6
  Author: Euan Campbell
6
7
  Author-email: euan@trismik.com
7
8
  Requires-Python: >=3.9
@@ -11,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.10
11
12
  Classifier: Programming Language :: Python :: 3.11
12
13
  Classifier: Programming Language :: Python :: 3.12
13
14
  Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
14
16
  Provides-Extra: bedrock
15
17
  Provides-Extra: examples
16
18
  Provides-Extra: openai
@@ -35,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
35
37
  Requires-Dist: torchaudio ; extra == "examples"
36
38
  Requires-Dist: torchvision ; extra == "examples"
37
39
  Requires-Dist: transformers ; extra == "examples"
38
- Requires-Dist: trismik
40
+ Requires-Dist: trismik (>=0.9.4)
39
41
  Description-Content-Type: text/markdown
40
42
 
41
43
  # Scorebook
scorebook-0.0.5.dist-info/RECORD → scorebook-0.0.7.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- scorebook/__init__.py,sha256=yHhNIHeLeRwjdyfSg3jtCz-NbQXMlN9fLhHJ1QzPQGQ,548
1
+ scorebook/__init__.py,sha256=30kyXG8sVbThtCt6cjPtkx7hiaUEukRQC-RsgunlkL4,557
2
2
  scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
3
3
  scorebook/cli/auth.py,sha256=bv3imsgmY_t52wFoMJt9iu-cKPwvKYkVqZ7nE8EVc6E,2931
4
4
  scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
5
5
  scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
6
- scorebook/evaluate.py,sha256=0Begs5Py9rpapoMixpqjlS2ofigQaGofbjqucABRfuM,19088
6
+ scorebook/evaluate.py,sha256=7tj1qFogcQJtCTAKK3oFnjV1xCMpxaHVnK0EGa9M4Hg,21912
7
7
  scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
8
8
  scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
9
9
  scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
@@ -16,10 +16,11 @@ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo
16
16
  scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
17
17
  scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
18
18
  scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
19
- scorebook/trismik/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
20
- scorebook/trismik/adaptive_testing_service.py,sha256=S1yAVnrzqtVWprsiNS_l3q5FibQkMuAs7I7YaSFNtKM,5109
21
- scorebook/trismik/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
22
- scorebook/types.py,sha256=wQNFewn9Ji7nQJhXwRS-hVAL4XV6ePcLVdVQiMfWYzg,5149
19
+ scorebook/trismik_services/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
20
+ scorebook/trismik_services/adaptive_testing_service.py,sha256=S1yAVnrzqtVWprsiNS_l3q5FibQkMuAs7I7YaSFNtKM,5109
21
+ scorebook/trismik_services/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
22
+ scorebook/trismik_services/upload_classic_eval_run.py,sha256=oHxELymEPxikVbtC6EQ06LfuNKEFV4Ijnth5v78bAmQ,3664
23
+ scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
23
24
  scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
24
25
  scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
25
26
  scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
@@ -28,8 +29,8 @@ scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcK
28
29
  scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
29
30
  scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
30
31
  scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
31
- scorebook-0.0.5.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
32
- scorebook-0.0.5.dist-info/METADATA,sha256=t8ADsF_Eul8RVxZ1HS6VF1omw5RMWeWdSQdB0G8Czhg,11432
33
- scorebook-0.0.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
34
- scorebook-0.0.5.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
35
- scorebook-0.0.5.dist-info/RECORD,,
32
+ scorebook-0.0.7.dist-info/METADATA,sha256=rVXvTh-2fP1H6xMoRvzphrVdy14MirVjnA844Sgb1zA,11515
33
+ scorebook-0.0.7.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
34
+ scorebook-0.0.7.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
35
+ scorebook-0.0.7.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
36
+ scorebook-0.0.7.dist-info/RECORD,,
scorebook-0.0.5.dist-info/WHEEL → scorebook-0.0.7.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.3
2
+ Generator: poetry-core 2.2.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
scorebook-0.0.5.dist-info/entry_points.txt → scorebook-0.0.7.dist-info/entry_points.txt RENAMED
File without changes
scorebook-0.0.5.dist-info/LICENSE → scorebook-0.0.7.dist-info/licenses/LICENSE RENAMED
File without changes