scorebook 0.0.11.tar.gz → 0.0.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {scorebook-0.0.11 → scorebook-0.0.13}/PKG-INFO +2 -2
  2. {scorebook-0.0.11 → scorebook-0.0.13}/pyproject.toml +18 -12
  3. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/__init__.py +8 -1
  4. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py +18 -0
  5. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py +116 -126
  6. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py +116 -127
  7. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py +98 -25
  8. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/exceptions.py +6 -2
  9. scorebook-0.0.13/src/scorebook/score/__init__.py +6 -0
  10. scorebook-0.0.13/src/scorebook/score/_async/__init__.py +0 -0
  11. scorebook-0.0.13/src/scorebook/score/_async/score_async.py +145 -0
  12. scorebook-0.0.13/src/scorebook/score/_sync/__init__.py +0 -0
  13. scorebook-0.0.13/src/scorebook/score/_sync/score.py +145 -0
  14. scorebook-0.0.13/src/scorebook/score/score_helpers.py +207 -0
  15. scorebook-0.0.13/src/scorebook/trismik/upload_results.py +254 -0
  16. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/types.py +35 -54
  17. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/__init__.py +8 -1
  18. scorebook-0.0.13/src/scorebook/utils/common_helpers.py +41 -0
  19. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/progress_bars.py +67 -0
  20. {scorebook-0.0.11 → scorebook-0.0.13}/LICENSE +0 -0
  21. {scorebook-0.0.11 → scorebook-0.0.13}/README.md +0 -0
  22. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/__init__.py +0 -0
  23. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/auth.py +0 -0
  24. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/main.py +0 -0
  25. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/__init__.py +0 -0
  26. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/__init__.py +0 -0
  27. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/__init__.py +0 -0
  28. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  29. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/__init__.py +0 -0
  30. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/__init__.py +0 -0
  31. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/bedrock.py +0 -0
  32. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/openai.py +0 -0
  33. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/portkey.py +0 -0
  34. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/vertex.py +0 -0
  35. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/inference_pipeline.py +0 -0
  36. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/__init__.py +0 -0
  37. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/accuracy.py +0 -0
  38. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/metric_base.py +0 -0
  39. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/metric_registry.py +0 -0
  40. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/precision.py +0 -0
  41. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/settings.py +0 -0
  42. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/trismik/__init__.py +0 -0
  43. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/trismik/credentials.py +0 -0
  44. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/async_utils.py +0 -0
  45. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/io_helpers.py +0 -0
  46. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/jinja_helpers.py +0 -0
  47. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/mappers.py +0 -0
  48. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/render_template.py +0 -0
  49. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.11 → scorebook-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scorebook
- Version: 0.0.11
+ Version: 0.0.13
  Summary: A Python project for LLM evaluation.
  License-File: LICENSE
  Author: Euan Campbell
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
  Requires-Dist: transformers ; extra == "examples"
- Requires-Dist: trismik (>=1.0.1,<2.0.0)
+ Requires-Dist: trismik (==1.0.2)
  Description-Content-Type: text/markdown

  # Scorebook
{scorebook-0.0.11 → scorebook-0.0.13}/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.9, <3.14"
  dependencies = [
  "datasets>=3.6.0",
  "notebook (>=7.4.5,<8.0.0)",
- "trismik (>=1.0.1, <2.0.0)",
+ "trismik==1.0.2",
  "ipywidgets>=8.0.0",
  ]

@@ -19,14 +19,14 @@ dependencies = [
  scorebook = "scorebook.cli.main:main"

  [tool.poetry]
- version = "0.0.11" # base version
+ version = "0.0.13" # base version
  packages = [{ include = "scorebook", from = "src" }]

  [tool.poetry.dependencies]
  python = ">=3.9,<3.14"
  datasets = ">=3.6.0"
  notebook = ">=7.4.5,<8.0.0"
- trismik = ">=1.0.1,<2.0.0"
+ trismik = "1.0.2"
  ipywidgets = ">=8.0.0"

  # Optional dependencies
@@ -119,16 +119,22 @@ install_types = true
  [tool.flake8] # note that this depends on Flake8-pyproject
  ignore = ["D202", "W503", "W504"]

- [tool.unasync]
  [[tool.unasync.rules]]
  fromdir = "src/scorebook/evaluate/_async/"
  todir = "src/scorebook/evaluate/_sync/"
+ replacements."scorebook.score._async.score_async" = "scorebook.score._sync.score"
+ replacements."scorebook.score._async" = "scorebook.score._sync"
+ replacements.evaluate_async = "evaluate"
+ replacements."Asynchronous evaluation complete" = "Synchronous evaluation complete"
+ replacements." run_results = asyncio.gather(*[worker(run) for run in runs])" = " run_results = [worker(run) for run in runs]"
+ replacements.async_nullcontext = "nullcontext"
+ replacements.create_trismik_async_client = "create_trismik_sync_client"
+ replacements.score_async = "score"

-
- # Custom replacements beyond default async/await transformations
- [tool.unasync.rules.replacements]
- "evaluate_async" = "evaluate"
- "Asynchronous evaluation complete" = "Synchronous evaluation complete"
- " run_results = asyncio.gather(*[worker(run) for run in runs])" = " run_results = [worker(run) for run in runs]"
- "async_nullcontext" = "nullcontext"
- "create_trismik_async_client" = "create_trismik_sync_client"
+ [[tool.unasync.rules]]
+ fromdir = "src/scorebook/score/_async/"
+ todir = "src/scorebook/score/_sync/"
+ replacements.score_async = "score"
+ replacements."Async scoring complete" = "Scoring complete"
+ replacements.calculate_metric_scores_async = "calculate_metric_scores"
+ replacements.upload_result_async = "upload_result"
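Note: a [[tool.unasync.rules]] table like the one added above is typically consumed by the unasync code generator at build time. A minimal sketch of an equivalent programmatic invocation, assuming a hypothetical helper script (the script and its file list are illustrations, not part of this package):

# generate_sync.py (hypothetical helper, not shipped with scorebook):
# mirrors the [[tool.unasync.rules]] entries above via unasync's Rule API.
import unasync

rules = [
    unasync.Rule(
        fromdir="src/scorebook/evaluate/_async/",
        todir="src/scorebook/evaluate/_sync/",
        additional_replacements={
            "evaluate_async": "evaluate",
            "async_nullcontext": "nullcontext",
            "create_trismik_async_client": "create_trismik_sync_client",
            "score_async": "score",
        },
    ),
    unasync.Rule(
        fromdir="src/scorebook/score/_async/",
        todir="src/scorebook/score/_sync/",
        additional_replacements={
            "score_async": "score",
            "calculate_metric_scores_async": "calculate_metric_scores",
            "upload_result_async": "upload_result",
        },
    ),
]

# Regenerate the _sync packages from their _async sources.
unasync.unasync_files(
    [
        "src/scorebook/evaluate/_async/evaluate_async.py",
        "src/scorebook/score/_async/score_async.py",
    ],
    rules,
)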
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/__init__.py
@@ -12,15 +12,22 @@ __version__ = importlib.metadata.version(__package__ or __name__)
  from scorebook.eval_datasets import EvalDataset
  from scorebook.evaluate import evaluate, evaluate_async
  from scorebook.inference.inference_pipeline import InferencePipeline
- from scorebook.trismik.credentials import login, whoami
+ from scorebook.score import score, score_async
+ from scorebook.trismik.credentials import login, logout, whoami
+ from scorebook.trismik.upload_results import upload_result, upload_result_async
  from scorebook.utils.render_template import render_template

  __all__ = [
  "EvalDataset",
  "evaluate",
  "evaluate_async",
+ "score",
+ "score_async",
  "render_template",
  "login",
+ "logout",
  "whoami",
  "InferencePipeline",
+ "upload_result",
+ "upload_result_async",
  ]
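The new top-level score / score_async exports mirror the keyword arguments used at the score_async call site further down in this diff. A hedged usage sketch (argument defaults, accepted metric formats, and the exact return shape are assumptions, not confirmed by this diff):

from scorebook import score

# Plain dict items; the column names match the output_column / label_column /
# input_column parameters that evaluate's internals pass explicitly.
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=["accuracy"],  # assumed: a metric name registered in scorebook.metrics
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,  # True (plus experiment_id/project_id) would push to Trismik
)

# Based on the call site below, the result appears to carry per-item and
# aggregate entries, e.g. results["aggregate_results"][0] including a
# "run_id" key when an upload happened.
print(results)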
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py
@@ -137,6 +137,24 @@ class EvalDataset:
  raise DatasetNotInitializedError("Dataset is not initialized")
  return list(map(str, self._hf_dataset.column_names))

+ @property
+ def split(self) -> Optional[str]:
+ """Return the split name of the underlying HuggingFace dataset, if available.
+
+ Returns:
+ The split name (e.g., "train", "test", "validation") if the dataset was loaded
+ from HuggingFace with a specific split. Returns None if the dataset was created
+ from a list, CSV, JSON, or loaded without a split specification.
+
+ Raises:
+ DatasetNotInitializedError: If the dataset is not initialized.
+ """
+ if self._hf_dataset is None:
+ raise DatasetNotInitializedError("Dataset is not initialized")
+
+ split = self._hf_dataset.split
+ return str(split) if split is not None else None
+
  def shuffle(self) -> None:
  """Randomly shuffle the dataset items."""
  if self._hf_dataset is None:
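The new EvalDataset.split property simply stringifies the underlying Hugging Face Dataset.split attribute. A quick illustration of that underlying behaviour with the datasets library (the dataset name is only an example):

from datasets import Dataset, load_dataset

# Loaded with an explicit split: .split is a NamedSplit, so the new
# property would return "validation".
hf_ds = load_dataset("squad", split="validation")
print(str(hf_ds.split))  # "validation"

# Built in memory (the list/CSV/JSON path mentioned in the docstring):
# .split is None, so EvalDataset.split would return None.
in_memory = Dataset.from_list([{"question": "2 + 2?", "label": "4"}])
print(in_memory.split)  # None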
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py
@@ -3,13 +3,8 @@ import logging
  from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

  from trismik import TrismikAsyncClient, TrismikClient
- from trismik.types import (
- TrismikClassicEvalItem,
- TrismikClassicEvalMetric,
- TrismikClassicEvalRequest,
- TrismikClassicEvalResponse,
- TrismikRunMetadata,
- )
+ from trismik.settings import evaluation_settings
+ from trismik.types import TrismikRunMetadata

  from scorebook.eval_datasets import EvalDataset
  from scorebook.evaluate.evaluate_helpers import (
@@ -20,12 +15,12 @@ from scorebook.evaluate.evaluate_helpers import (
  make_trismik_inference,
  prepare_datasets,
  prepare_hyperparameter_configs,
- resolve_show_progress,
- resolve_upload_results,
- score_metrics,
+ resolve_adaptive_split,
  validate_parameters,
  )
  from scorebook.exceptions import InferenceError, ScoreBookError
+ from scorebook.inference.inference_pipeline import InferencePipeline
+ from scorebook.score._async.score_async import score_async
  from scorebook.types import (
  AdaptiveEvalRunResult,
  AdaptiveEvalRunSpec,
@@ -33,14 +28,20 @@
  EvalResult,
  EvalRunSpec,
  )
- from scorebook.utils import async_nullcontext, evaluation_progress_context
+ from scorebook.utils import (
+ async_nullcontext,
+ evaluation_progress_context,
+ resolve_show_progress,
+ resolve_upload_results,
+ )

  logger = logging.getLogger(__name__)


  async def evaluate_async(
- inference: Callable,
+ inference: Union[Callable, InferencePipeline],
  datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+ split: Optional[str] = None,
  hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
  metadata: Optional[Dict[str, Any]] = None,
  experiment_id: Optional[str] = None,
@@ -59,6 +60,7 @@
  Args:
  inference: The inference callable to evaluate
  datasets: Dataset(s) to evaluate on
+ split: Split to use for evaluation (default: "validation")
  hyperparameters: Hyperparameter configuration(s) to evaluate with
  metadata: Optional metadata to attach to the evaluation
  experiment_id: Optional experiment identifier
@@ -83,14 +85,14 @@
  validate_parameters(locals(), evaluate_async)

  # Prepare datasets, hyperparameters, and eval run specs
- datasets = prepare_datasets(datasets, sample_size)
+ datasets = prepare_datasets(datasets, split, sample_size)
  hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
  eval_run_specs = sorted(
  build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
  key=lambda run: (run.dataset_index, run.hyperparameters_index),
  )

- # Create Trismik client if needed (for adaptive evals or uploads)
+ # Create a Trismik client if needed (for adaptive evals or uploads)
  needs_client = upload_results or any(
  isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
  )
@@ -101,7 +103,14 @@
  async with trismik_client or async_nullcontext():
  # Execute evaluation runs
  # Calculate total items across all runs
- total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+ total_items = sum(
+ (
+ len(run.dataset.items)
+ if isinstance(run, EvalRunSpec)
+ else evaluation_settings["max_iterations"]
+ ) # Adaptive evals use max_iterations
+ for run in eval_run_specs
+ )
  model_display = get_model_name(inference)

  with evaluation_progress_context(
@@ -145,34 +154,32 @@ async def execute_runs(
  async def worker(
  run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+ # Execute run (score_async handles upload internally for classic evals)
  run_result = await execute_run(
- inference, run, experiment_id, project_id, metadata, trismik_client
+ inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
  )
+
  # Update progress bars with items processed and success status
  if progress_bars is not None:
- items_processed = len(run.dataset.items)
+ # Classic evals have .items; adaptive evals use max_iterations
+ items_processed = (
+ len(run.dataset.items)
+ if isinstance(run, EvalRunSpec)
+ else evaluation_settings["max_iterations"]
+ )
  progress_bars.on_run_completed(items_processed, run_result.run_completed)

+ # Update upload progress for classic evals
  if (
  upload_results
  and isinstance(run_result, ClassicEvalRunResult)
- and experiment_id
- and project_id
  and run_result.run_completed
- and trismik_client is not None
  ):
- try:
- run_id = await upload_classic_run_results(
- run_result, experiment_id, project_id, inference, metadata, trismik_client
- )
- run_result.run_id = run_id
+ # Check if upload succeeded by checking for run_id
+ if experiment_id and project_id:
+ upload_succeeded = run_result.run_id is not None
  if progress_bars is not None:
- progress_bars.on_upload_completed(succeeded=True)
- except Exception as e:
- logger.warning(f"Failed to upload run results: {e}")
- if progress_bars is not None:
- progress_bars.on_upload_completed(succeeded=False)
- # Continue evaluation even if upload fails
+ progress_bars.on_upload_completed(succeeded=upload_succeeded)

  return run_result

@@ -191,6 +198,7 @@ async def execute_runs(
  async def execute_run(
  inference: Callable,
  run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+ upload_results: bool, # NEW PARAMETER
  experiment_id: Optional[str] = None,
  project_id: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +207,9 @@
  """Execute a single evaluation run."""

  if isinstance(run, EvalRunSpec):
- return await execute_classic_eval_run(inference, run)
+ return await execute_classic_eval_run(
+ inference, run, upload_results, experiment_id, project_id, metadata
+ )

  elif isinstance(run, AdaptiveEvalRunSpec):
  resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +227,79 @@
  raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


- async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
- """Execute a classic evaluation run."""
+ async def execute_classic_eval_run(
+ inference: Callable,
+ run: EvalRunSpec,
+ upload_results: bool,
+ experiment_id: Optional[str],
+ project_id: Optional[str],
+ metadata: Optional[Dict[str, Any]],
+ ) -> ClassicEvalRunResult:
+ """Execute a classic evaluation run using score_async() for scoring and uploading."""
  logger.debug("Executing classic eval run for %s", run)

  inference_outputs = None
- metric_scores = None
+ scores = None

  try:
+ # 1. Run inference
  inference_outputs = await run_inference_callable(
  inference, run.inputs, run.hyperparameter_config
  )
- metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
- logger.debug("Classic evaluation completed for run %s", run)
- return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+ # 2. Build items for score_async
+ items = [
+ {
+ "input": run.inputs[i] if i < len(run.inputs) else None,
+ "output": inference_outputs[i],
+ "label": run.labels[i] if i < len(run.labels) else "",
+ }
+ for i in range(len(inference_outputs))
+ ]
+
+ # 3. Get the model name for upload
+ model_name = get_model_name(inference, metadata)
+
+ # 4. Call score_async
+ scores = await score_async(
+ items=items,
+ metrics=run.dataset.metrics,
+ output_column="output", # Explicit parameter
+ label_column="label", # Explicit parameter
+ input_column="input", # Explicit parameter
+ hyperparameters=run.hyperparameter_config,
+ dataset_name=run.dataset.name,
+ model_name=model_name,
+ metadata=metadata,
+ experiment_id=experiment_id,
+ project_id=project_id,
+ upload_results=upload_results,
+ show_progress=False,
+ )
+
+ # 5. Extract run_id if upload succeeded
+ run_id = None
+ if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+ run_id = scores["aggregate_results"][0].get("run_id")
+
+ logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+ return ClassicEvalRunResult(
+ run_spec=run,
+ run_completed=True,
+ outputs=inference_outputs,
+ scores=scores,
+ run_id=run_id,
+ )

  except Exception as e:
  logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
- return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+ return ClassicEvalRunResult(
+ run_spec=run,
+ run_completed=False,
+ outputs=inference_outputs,
+ scores=scores,
+ run_id=None,
+ )


  async def run_inference_callable(
@@ -296,93 +361,6 @@ async def execute_adaptive_eval_run(
  return AdaptiveEvalRunResult(run, False, {})


- async def upload_classic_run_results(
- run_result: ClassicEvalRunResult,
- experiment_id: str,
- project_id: str,
- inference_callable: Optional[Callable],
- metadata: Optional[Dict[str, Any]],
- trismik_client: Union[TrismikClient, TrismikAsyncClient],
- ) -> str:
- """Upload a classic evaluation run result to Trismik platform.
-
- Args:
- run: The evaluation run result to upload
- experiment_id: Trismik experiment identifier
- project_id: Trismik project identifier
- model: Model name used for evaluation
- metadata: Optional metadata dictionary
- trismik_client: Trismik client instance
-
- Returns:
- Run id
- """
- model = get_model_name(inference_callable)
-
- # Create eval items from run_spec inputs, outputs, and labels
- items: List[TrismikClassicEvalItem] = []
- inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
- for idx, (input_value, output) in enumerate(inputs_outputs):
- labels = run_result.run_spec.labels
- label = labels[idx] if idx < len(labels) else ""
-
- # Calculate item-level metrics for this item
- item_metrics: Dict[str, Any] = {}
- if run_result.scores:
- for metric_name, metric_data in run_result.scores.items():
- if isinstance(metric_data, dict) and "item_scores" in metric_data:
- if idx < len(metric_data["item_scores"]):
- item_metrics[metric_name] = metric_data["item_scores"][idx]
- else:
- # If scores is just a single value, use it for all items
- item_metrics[metric_name] = metric_data
-
- eval_item = TrismikClassicEvalItem(
- datasetItemId=str(idx),
- modelInput=str(input_value),
- modelOutput=str(output),
- goldOutput=str(label),
- metrics=item_metrics,
- )
- items.append(eval_item)
-
- # Create eval metrics from run aggregate scores
- metrics: List[TrismikClassicEvalMetric] = []
- if run_result.scores:
- for metric_name, metric_data in run_result.scores.items():
- if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
- # Handle structured metric data with aggregate scores
- for agg_name, agg_value in metric_data["aggregate_scores"].items():
- metric_id = (
- f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
- )
- metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
- metrics.append(metric)
- else:
- # Handle simple metric data (single value)
- metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
- metrics.append(metric)
-
- classic_eval_request = TrismikClassicEvalRequest(
- project_id,
- experiment_id,
- run_result.run_spec.dataset.name,
- model,
- run_result.run_spec.hyperparameter_config,
- items,
- metrics,
- )
-
- response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
- classic_eval_request
- )
-
- run_id: str = response.id
- logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
- return run_id
-
-
  async def run_adaptive_evaluation(
  inference: Callable,
  adaptive_run_spec: AdaptiveEvalRunSpec,
@@ -403,8 +381,20 @@
  Returns:
  Results from the adaptive evaluation
  """
+ # Fetch available splits from Trismik
+ dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+ available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+ # Resolve the split to use (with fallback: user-specified -> validation -> test)
+ resolved_split = resolve_adaptive_split(
+ test_id=adaptive_run_spec.dataset,
+ user_specified_split=adaptive_run_spec.split,
+ available_splits=available_splits,
+ )
+
  trismik_results = await trismik_client.run(
  test_id=adaptive_run_spec.dataset,
+ split=resolved_split,
  project_id=project_id,
  experiment=experiment_id,
  run_metadata=TrismikRunMetadata(