scorebook 0.0.12.tar.gz → 0.0.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {scorebook-0.0.12 → scorebook-0.0.13}/PKG-INFO +2 -2
  2. {scorebook-0.0.12 → scorebook-0.0.13}/pyproject.toml +3 -3
  3. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py +18 -0
  4. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py +16 -1
  5. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py +16 -1
  6. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py +75 -2
  7. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/types.py +2 -0
  8. {scorebook-0.0.12 → scorebook-0.0.13}/LICENSE +0 -0
  9. {scorebook-0.0.12 → scorebook-0.0.13}/README.md +0 -0
  10. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/__init__.py +0 -0
  11. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/cli/__init__.py +0 -0
  12. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/cli/auth.py +0 -0
  13. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/cli/main.py +0 -0
  14. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/eval_datasets/__init__.py +0 -0
  15. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/__init__.py +0 -0
  16. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_async/__init__.py +0 -0
  17. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  18. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/exceptions.py +0 -0
  19. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/__init__.py +0 -0
  20. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/__init__.py +0 -0
  21. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/bedrock.py +0 -0
  22. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/openai.py +0 -0
  23. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/portkey.py +0 -0
  24. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/vertex.py +0 -0
  25. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/inference_pipeline.py +0 -0
  26. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/__init__.py +0 -0
  27. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/accuracy.py +0 -0
  28. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/metric_base.py +0 -0
  29. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/metric_registry.py +0 -0
  30. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/precision.py +0 -0
  31. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/__init__.py +0 -0
  32. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_async/__init__.py +0 -0
  33. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_async/score_async.py +0 -0
  34. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_sync/__init__.py +0 -0
  35. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_sync/score.py +0 -0
  36. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/score_helpers.py +0 -0
  37. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/settings.py +0 -0
  38. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/trismik/__init__.py +0 -0
  39. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/trismik/credentials.py +0 -0
  40. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/trismik/upload_results.py +0 -0
  41. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/__init__.py +0 -0
  42. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/async_utils.py +0 -0
  43. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/common_helpers.py +0 -0
  44. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/io_helpers.py +0 -0
  45. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/jinja_helpers.py +0 -0
  46. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/mappers.py +0 -0
  47. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/progress_bars.py +0 -0
  48. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/render_template.py +0 -0
  49. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.12 → scorebook-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.12
+Version: 0.0.13
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (==1.0.1)
+Requires-Dist: trismik (==1.0.2)
 Description-Content-Type: text/markdown
 
 # Scorebook
{scorebook-0.0.12 → scorebook-0.0.13}/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.9, <3.14"
 dependencies = [
     "datasets>=3.6.0",
     "notebook (>=7.4.5,<8.0.0)",
-    "trismik==1.0.1",
+    "trismik==1.0.2",
     "ipywidgets>=8.0.0",
 ]
 
@@ -19,14 +19,14 @@ dependencies = [
 scorebook = "scorebook.cli.main:main"
 
 [tool.poetry]
-version = "0.0.12" # base version
+version = "0.0.13" # base version
 packages = [{ include = "scorebook", from = "src" }]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
 datasets = ">=3.6.0"
 notebook = ">=7.4.5,<8.0.0"
-trismik = "1.0.1"
+trismik = "1.0.2"
 ipywidgets = ">=8.0.0"
 
 # Optional dependencies
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py
@@ -137,6 +137,24 @@ class EvalDataset:
             raise DatasetNotInitializedError("Dataset is not initialized")
         return list(map(str, self._hf_dataset.column_names))
 
+    @property
+    def split(self) -> Optional[str]:
+        """Return the split name of the underlying HuggingFace dataset, if available.
+
+        Returns:
+            The split name (e.g., "train", "test", "validation") if the dataset was loaded
+            from HuggingFace with a specific split. Returns None if the dataset was created
+            from a list, CSV, JSON, or loaded without a split specification.
+
+        Raises:
+            DatasetNotInitializedError: If the dataset is not initialized.
+        """
+        if self._hf_dataset is None:
+            raise DatasetNotInitializedError("Dataset is not initialized")
+
+        split = self._hf_dataset.split
+        return str(split) if split is not None else None
+
     def shuffle(self) -> None:
         """Randomly shuffle the dataset items."""
         if self._hf_dataset is None:
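The new EvalDataset.split property simply surfaces the split attribute of the wrapped HuggingFace dataset. A minimal sketch of where that value comes from, using the datasets library directly rather than Scorebook's own loader (which is not part of this diff); the dataset name is only an example:

from datasets import load_dataset

# A HuggingFace dataset loaded with a named split carries that split on the object.
hf = load_dataset("glue", "mrpc", split="validation")
print(str(hf.split))  # "validation" -- the value EvalDataset.split now exposes
# Datasets built from a list, CSV, or JSON have no HF split, so the property returns None.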
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py
@@ -15,6 +15,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
@@ -40,6 +41,7 @@ logger = logging.getLogger(__name__)
 async def evaluate_async(
     inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -58,6 +60,7 @@
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -82,7 +85,7 @@
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
@@ -378,8 +381,20 @@ async def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = await trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
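For callers, the only visible change is the new optional split argument. A hedged sketch of the async entry point, imported by the module path shown in this diff; the inference callable and test id are illustrative, since their exact contracts are not part of this diff:

import asyncio

from scorebook.evaluate._async.evaluate_async import evaluate_async

def my_inference(items, **hyperparameters):  # hypothetical inference callable
    return ["<prediction>" for _ in items]

results = asyncio.run(
    evaluate_async(
        my_inference,
        datasets="my-trismik-test:adaptive",  # hypothetical adaptive test id
        split="validation",                   # new in 0.0.13
    )
)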
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py
@@ -14,6 +14,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
@@ -39,6 +40,7 @@ logger = logging.getLogger(__name__)
 def evaluate(
     inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -57,6 +59,7 @@
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -81,7 +84,7 @@
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
@@ -377,8 +380,20 @@ def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
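The synchronous evaluate gains the identical parameter. A minimal sketch under the same assumptions as the async example above (illustrative names throughout):

from scorebook.evaluate._sync.evaluate import evaluate

def my_inference(items, **hyperparameters):  # hypothetical inference callable
    return ["<prediction>" for _ in items]

# For "<test_id>:adaptive" datasets the split is checked against the splits Trismik
# reports for that test; for EvalDataset objects the dataset's own split wins.
results = evaluate(
    my_inference,
    datasets="my-trismik-test:adaptive",  # hypothetical adaptive test id
    split="validation",
)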
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py
@@ -91,6 +91,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> N
 
 def prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     sample_size: Optional[int] = None,
 ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
@@ -104,6 +105,12 @@
 
         # Prepare classic datasets
         if isinstance(dataset, EvalDataset):
+            # Warn if dataset split differs from provided split parameter
+            if split is not None and dataset.split is not None and dataset.split != split:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                    f"parameter is '{split}'. The dataset split will be used."
+                )
 
             if sample_size is not None:
                 dataset = dataset.sample(sample_size)
@@ -111,8 +118,17 @@
             datasets_out.append(dataset)
 
         # Prepare adaptive datasets
-        elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-            datasets_out.append(AdaptiveEvalDataset(dataset))
+        elif isinstance(dataset, str) and ":adaptive" in dataset:
+            # Parse adaptive dataset
+            parts = dataset.split(":")
+            if len(parts) != 2 or parts[1] != "adaptive":
+                raise ParameterValidationError(
+                    f"Invalid adaptive dataset format: '{dataset}'. "
+                    f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                )
+
+            # Use the split parameter for all adaptive datasets
+            datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):
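Two behaviours fall out of this hunk: a mismatch between an EvalDataset's own split and the split argument only logs a warning (the dataset's split is used), and adaptive dataset strings must be exactly '<test_id>:adaptive'. An illustrative sketch of the accepted and rejected forms (identifiers are hypothetical):

# Accepted: two-part form; the split comes from evaluate's `split` parameter.
datasets = "my-trismik-test:adaptive"

# Rejected with ParameterValidationError: any other string containing ":adaptive",
# e.g. an attempt to embed the split in the identifier itself.
# datasets = "my-trismik-test:validation:adaptive"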
@@ -174,6 +190,7 @@ def build_eval_run_specs(
                 hyperparameters_index,
                 experiment_id,
                 project_id,
+                dataset.split,
                 metadata,
             )
         )
@@ -220,6 +237,7 @@ def build_adaptive_eval_run_spec(
     hyperparameter_config_index: int,
     experiment_id: str,
     project_id: str,
+    split: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
@@ -231,6 +249,7 @@
         hyperparameter_config_index,
         experiment_id,
         project_id,
+        split,
         metadata,
     )
     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
@@ -386,3 +405,57 @@ def make_trismik_inference(
     )
 
     return sync_trismik_inference_function
+
+
+def resolve_adaptive_split(
+    test_id: str,
+    user_specified_split: Optional[str],
+    available_splits: List[str],
+) -> str:
+    """Resolve the dataset split to use for adaptive evaluation.
+
+    Resolution order:
+    1. If user specified a split, validate it exists and use it
+    2. If not specified and exactly one split is available, use it
+    3. If not specified and multiple splits are available, raise an error
+    4. If no splits are available, raise an error
+
+    Args:
+        test_id: The test dataset ID (without ":adaptive" suffix)
+        user_specified_split: Optional split name specified by the user
+        available_splits: List of available split names for this dataset
+
+    Returns:
+        The resolved split name to use
+
+    Raises:
+        ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+            user specification, or no splits are available
+    """
+    logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+    # If user specified a split, validate and use it
+    if user_specified_split is not None:
+        if user_specified_split in available_splits:
+            logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+            return user_specified_split
+        else:
+            raise ScoreBookError(
+                f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                f"Available splits: {available_splits}"
+            )
+
+    # No split specified - check available splits
+    if len(available_splits) == 0:
+        raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+    elif len(available_splits) == 1:
+        # Exactly one split - auto-select it
+        selected_split = available_splits[0]
+        logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+        return selected_split
+    else:
+        # Multiple splits available - user must specify
+        raise ScoreBookError(
+            f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+            f"Please specify which split to use via evaluate's 'split' parameter."
+        )
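Note that, despite the "user-specified -> validation -> test" comment at the call sites above, this helper never falls back silently when several splits exist; it raises instead. A behaviour sketch consistent with the code (test id and split names are illustrative):

from scorebook.evaluate.evaluate_helpers import resolve_adaptive_split

splits = ["validation", "test"]

resolve_adaptive_split("my-test", "validation", splits)  # -> "validation"
resolve_adaptive_split("my-test", "dev", splits)         # raises ScoreBookError (unknown split)
resolve_adaptive_split("my-test", None, ["test"])        # -> "test" (only split, auto-selected)
resolve_adaptive_split("my-test", None, splits)          # raises ScoreBookError (must pass split=)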
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/types.py
@@ -17,6 +17,7 @@ class AdaptiveEvalDataset:
     """Represents a dataset configured for adaptive evaluation."""
 
     name: str
+    split: Optional[str] = None
 
 
 @dataclass
@@ -50,6 +51,7 @@ class AdaptiveEvalRunSpec:
     hyperparameters_index: int
     experiment_id: str
     project_id: str
+    split: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
 
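These dataclass fields carry the resolved split from evaluate down to the Trismik run spec. A minimal construction sketch (values are illustrative; the keyword form matches the call in prepare_datasets above):

from scorebook.types import AdaptiveEvalDataset

ds = AdaptiveEvalDataset(name="my-trismik-test:adaptive", split="validation")
print(ds.split)  # "validation"; the field defaults to None when not supplied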