scorebook-0.0.4-py3-none-any.whl → scorebook-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/trismik/adaptive_testing_service.py ADDED
@@ -0,0 +1,141 @@
+ """Trismik adaptive testing service integration."""
+
+ import asyncio
+ import dataclasses
+ import inspect
+ import logging
+ from typing import Any, Callable, Iterable, Mapping
+
+ from trismik.adaptive_test import AdaptiveTest
+ from trismik.client_async import TrismikAsyncClient
+ from trismik.types import TrismikMultipleChoiceTextItem, TrismikRunMetadata
+
+ from scorebook.types import AdaptiveEvalRunResult, AdaptiveEvalRunSpec
+
+ from .login import get_token
+
+ logger = logging.getLogger(__name__)
+
+
+ async def run_adaptive_evaluation(
+     inference: Callable,
+     adaptive_run_spec: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Any,
+ ) -> AdaptiveEvalRunResult:
+     """Run an adaptive evaluation using the Trismik API.
+
+     Args:
+         inference: Function to run inference.
+         adaptive_run_spec: Specification for the adaptive evaluation.
+         experiment_id: Experiment identifier.
+         project_id: Trismik project ID.
+         metadata: Additional metadata.
+
+     Returns:
+         Results from the adaptive evaluation.
+     """
+     runner = AdaptiveTest(
+         make_trismik_inference(inference),
+         client=TrismikAsyncClient(
+             service_url="https://api-stage.trismik.com/adaptive-testing", api_key=get_token()
+         ),
+     )
+
+     logger.debug(
+         "test_id: %s, project_id: %s, experiment_id: %s",
+         adaptive_run_spec.dataset,
+         project_id,
+         experiment_id,
+     )
+     trismik_results = runner.run(
+         adaptive_run_spec.dataset,
+         project_id,
+         experiment_id,
+         run_metadata=TrismikRunMetadata(
+             model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
+             test_configuration={},
+             inference_setup={},
+         ),
+         return_dict=False,
+     )
+
+     # Convert TrismikRunResults to AdaptiveEvalRunResult by extracting
+     # the scores from the Trismik results.
+     scores = {}
+     if hasattr(trismik_results, "scores") and trismik_results.scores:
+         scores = trismik_results.scores
+     elif hasattr(trismik_results, "__dict__"):
+         # If scores aren't directly available, include all public attributes as scores.
+         scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
+
+     # Convert AdaptiveTestScore objects to JSON-serializable dictionaries.
+     def make_json_serializable(obj: Any) -> Any:
+         if hasattr(obj, "theta") and hasattr(obj, "std_error"):
+             # This is likely an AdaptiveTestScore object.
+             return {"theta": obj.theta, "std_error": obj.std_error}
+         elif isinstance(obj, dict):
+             return {k: make_json_serializable(v) for k, v in obj.items()}
+         elif isinstance(obj, (list, tuple)):
+             return [make_json_serializable(item) for item in obj]
+         else:
+             return obj
+
+     # Make the scores JSON serializable.
+     scores = make_json_serializable(scores)
+
+     return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+
+
+ def make_trismik_inference(
+     inference_function: Callable,
+     return_list: bool = False,
+ ) -> Callable[[Any], Any]:
+     """Wrap an inference function for flexible input handling.
+
+     Takes a function expecting a list of dicts and makes it also accept a
+     single dict or TrismikMultipleChoiceTextItem.
+     """
+
+     # Check if the inference function is async.
+     is_async = inspect.iscoroutinefunction(inference_function) or (
+         hasattr(inference_function, "__call__")
+         and inspect.iscoroutinefunction(inference_function.__call__)
+     )
+
+     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
+         # Single TrismikMultipleChoiceTextItem dataclass.
+         if isinstance(eval_items, TrismikMultipleChoiceTextItem):
+             eval_item_dict = dataclasses.asdict(eval_items)
+             results = inference_function([eval_item_dict], **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results if return_list else results[0]
+
+         # Single item (a mapping).
+         if isinstance(eval_items, Mapping):
+             results = inference_function([eval_items], **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results if return_list else results[0]
+
+         # Iterable of items (but not a string/bytes).
+         if isinstance(eval_items, Iterable) and not isinstance(eval_items, (str, bytes)):
+             # Convert any TrismikMultipleChoiceTextItem instances to dicts.
+             converted_items = []
+             for item in eval_items:
+                 if isinstance(item, TrismikMultipleChoiceTextItem):
+                     converted_items.append(dataclasses.asdict(item))
+                 else:
+                     converted_items.append(item)
+             results = inference_function(converted_items, **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results
+
+         raise TypeError(
+             "Expected a single item (Mapping[str, Any] or TrismikMultipleChoiceTextItem) "
+             "or an iterable of such items."
+         )
+
+     return sync_trismik_inference_function
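
To make the wrapper's dispatch behavior concrete, here is a small usage sketch. The my_inference function and its item keys are invented for illustration; only make_trismik_inference comes from the module above.

    # Hypothetical batch inference function: expects a list of dicts,
    # returns one answer per item.
    def my_inference(items):
        return [item["question"].upper() for item in items]

    wrapped = make_trismik_inference(my_inference)

    item = {"question": "capital of France?"}
    print(wrapped(item))    # single mapping -> unwrapped result: "CAPITAL OF FRANCE?"
    print(wrapped([item]))  # iterable -> list of results: ["CAPITAL OF FRANCE?"]

Because return_list defaults to False, a single input yields a single result rather than a one-element list.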
scorebook/trismik/login.py ADDED
@@ -0,0 +1,120 @@
+ """Authentication and token management for the Trismik API."""
+
+ import logging
+ import os
+ import pathlib
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_scorebook_config_dir() -> str:
+     """Get the scorebook config directory."""
+     return os.path.join(os.path.expanduser("~"), ".scorebook")
+
+
+ def get_token_path() -> str:
+     """Get the path where the Trismik token is stored."""
+     if "TRISMIK_TOKEN_PATH" in os.environ:
+         return os.environ["TRISMIK_TOKEN_PATH"]
+     return os.path.join(get_scorebook_config_dir(), "config")
+
+
+ def save_token(token: str) -> None:
+     """Save the token to the local cache directory."""
+     token_path = get_token_path()
+
+     # Create the config directory if it doesn't exist.
+     os.makedirs(os.path.dirname(token_path), exist_ok=True)
+
+     # Write the token to the file.
+     pathlib.Path(token_path).write_text(token.strip())
+
+     # Set restrictive permissions (owner read/write only).
+     os.chmod(token_path, 0o600)
+
+
+ def get_stored_token() -> Optional[str]:
+     """Retrieve the stored token from the cache directory."""
+     token_path = get_token_path()
+
+     if not os.path.exists(token_path):
+         return None
+
+     try:
+         token = pathlib.Path(token_path).read_text().strip()
+         return token if token else None
+     except (OSError, IOError) as e:
+         logger.warning(f"Failed to read token from {token_path}: {e}")
+         return None
+
+
+ def get_token() -> Optional[str]:
+     """Get the Trismik API token in order of priority.
+
+     Priority order:
+     1. TRISMIK_API_KEY environment variable
+     2. Stored token file
+     """
+     # Check the environment variable first.
+     env_token = os.environ.get("TRISMIK_API_KEY")
+     if env_token:
+         return env_token.strip()
+
+     # Fall back to the stored token.
+     return get_stored_token()
+
+
+ def validate_token(token: str) -> bool:
+     """Validate the token by making a test API call to Trismik."""
+     # TODO: Implement actual API validation once an endpoint is available.
+     # This would typically make a request like:
+     #   response = requests.get("https://api.trismik.com/whoami",
+     #                           headers={"Authorization": f"Bearer {token}"})
+     #   return response.status_code == 200
+
+     # For now, just check that the token isn't empty.
+     return bool(token and token.strip())
+
+
+ def login(trismik_api_key: str) -> None:
+     """Log in to Trismik by saving the API key locally.
+
+     Args:
+         trismik_api_key: The API key to use.
+
+     Raises:
+         ValueError: If the API key is empty or invalid.
+     """
+     if not trismik_api_key:
+         raise ValueError("API key cannot be empty")
+
+     # Validate the token.
+     if not validate_token(trismik_api_key):
+         raise ValueError("Invalid API key provided")
+
+     # Save the token.
+     save_token(trismik_api_key)
+
+
+ def logout() -> bool:
+     """Remove the stored token.
+
+     Returns:
+         bool: True if a token was removed, False if no token was found.
+     """
+     token_path = get_token_path()
+
+     if os.path.exists(token_path):
+         os.remove(token_path)
+         return True
+     else:
+         return False
+
+
+ def whoami() -> Optional[str]:
+     """Return information about the current user/token.
+
+     Returns:
+         str: The stored token if logged in, None if not logged in.
+     """
+     return get_stored_token()
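
The precedence implemented by get_token (environment variable first, then the stored file) can be exercised with a short script. This is a sketch assuming a clean environment; the key values are placeholders.

    import os
    from scorebook.trismik.login import get_token, login, logout

    login("placeholder-key")                  # validates, then writes the token file
    assert get_token() == "placeholder-key"

    os.environ["TRISMIK_API_KEY"] = "env-key"
    assert get_token() == "env-key"           # the environment variable takes priority

    del os.environ["TRISMIK_API_KEY"]
    logout()                                  # removes the stored token file
    assert get_token() is None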
scorebook/types.py ADDED
@@ -0,0 +1,165 @@
+ """Type definitions for the scorebook evaluation framework."""
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Union
+
+ from scorebook.eval_dataset import EvalDataset
+
+
+ @dataclass
+ class AdaptiveEvalDataset:
+     """Represents a dataset configured for adaptive evaluation."""
+
+     name: str
+
+
+ @dataclass
+ class EvalRunSpec:
+     """Specification for a single evaluation run with dataset and hyperparameters."""
+
+     dataset: EvalDataset
+     dataset_index: int
+     hyperparameter_config: Dict[str, Any]
+     hyperparameters_index: int
+     items: List[Dict[str, Any]]
+     labels: List[Any]
+
+     def __str__(self) -> str:
+         """Return the string representation of this EvalRunSpec."""
+         return (
+             f"EvalRunSpec(dataset={self.dataset.name}, "
+             f"dataset_index={self.dataset_index}, "
+             f"hyperparameter_config={self.hyperparameter_config}, "
+             f"hyperparameters_index={self.hyperparameters_index})"
+         )
+
+
+ @dataclass
+ class AdaptiveEvalRunSpec:
+     """Specification for an adaptive evaluation run."""
+
+     dataset: str
+     dataset_index: int
+     hyperparameter_config: Dict[str, Any]
+     hyperparameters_index: int
+     experiment_id: str
+     project_id: str
+     metadata: Optional[Dict[str, Any]] = None
+
+
+ @dataclass
+ class ClassicEvalRunResult:
+     """Results from executing a classic evaluation run."""
+
+     run_spec: EvalRunSpec
+     outputs: List[Any]
+     scores: Dict[str, Any]
+
+     @property
+     def item_scores(self) -> List[Dict[str, Any]]:
+         """Return a list of dictionaries containing scores for each evaluated item."""
+         results = []
+
+         for idx, output in enumerate(self.outputs):
+             if idx >= len(self.run_spec.items):
+                 break
+
+             result = {
+                 "item_id": idx,
+                 "dataset_name": self.run_spec.dataset.name,
+                 "inference_output": output,
+                 **self.run_spec.hyperparameter_config,
+             }
+
+             # Add individual item scores if available.
+             for metric_name, metric_data in self.scores.items():
+                 if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                     if idx < len(metric_data["item_scores"]):
+                         result[metric_name] = metric_data["item_scores"][idx]
+                 else:
+                     # If the score is a single value, replicate it for each item.
+                     result[metric_name] = metric_data
+
+             results.append(result)
+
+         return results
+
+     @property
+     def aggregate_scores(self) -> Dict[str, Any]:
+         """Return the aggregated scores for this run."""
+         result = {
+             "dataset": self.run_spec.dataset.name,
+             **self.run_spec.hyperparameter_config,
+         }
+
+         # Add aggregate scores from metrics.
+         for metric_name, metric_data in self.scores.items():
+             if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+                 # Flatten the aggregate scores from each metric.
+                 for key, value in metric_data["aggregate_scores"].items():
+                     score_key = key if key == metric_name else f"{metric_name}_{key}"
+                     result[score_key] = value
+             else:
+                 # If the score is a single value, use it as is.
+                 result[metric_name] = metric_data
+
+         return result
+
+
+ @dataclass
+ class AdaptiveEvalRunResult:
+     """Results from executing an adaptive evaluation run."""
+
+     run_spec: AdaptiveEvalRunSpec
+     scores: Dict[str, Any]
+
+     @property
+     def aggregate_scores(self) -> Dict[str, Any]:
+         """Return the aggregated scores for this adaptive run."""
+         result = {
+             "dataset": self.run_spec.dataset,
+             "experiment_id": self.run_spec.experiment_id,
+             "project_id": self.run_spec.project_id,
+         }
+
+         # Safely unpack hyperparameter_config if it's not None.
+         if self.run_spec.hyperparameter_config:
+             result.update(self.run_spec.hyperparameter_config)
+
+         # Safely unpack metadata if it's not None.
+         if self.run_spec.metadata:
+             result.update(self.run_spec.metadata)
+
+         # Safely unpack scores if they're not None.
+         if self.scores:
+             result.update(self.scores)
+
+         return result
+
+
+ @dataclass
+ class EvalResult:
+     """Container for evaluation results across multiple runs."""
+
+     run_results: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]]
+
+     @property
+     def item_scores(self) -> List[Dict[str, Any]]:
+         """Return a list of dictionaries containing scores for each evaluated item."""
+         results = []
+
+         for run_result in self.run_results:
+             if isinstance(run_result, ClassicEvalRunResult):
+                 results.extend(run_result.item_scores)
+
+         return results
+
+     @property
+     def aggregate_scores(self) -> List[Dict[str, Any]]:
+         """Return the aggregated scores across all evaluated runs."""
+         results = []
+
+         for run_result in self.run_results:
+             results.append(run_result.aggregate_scores)
+
+         return results
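
As a sketch of how AdaptiveEvalRunResult.aggregate_scores merges its pieces, with made-up identifiers and score values:

    spec = AdaptiveEvalRunSpec(
        dataset="example-test-id",      # hypothetical Trismik test id
        dataset_index=0,
        hyperparameter_config={"temperature": 0.0},
        hyperparameters_index=0,
        experiment_id="exp-123",
        project_id="proj-456",
    )
    result = AdaptiveEvalRunResult(run_spec=spec, scores={"theta": 1.2, "std_error": 0.3})

    print(result.aggregate_scores)
    # {'dataset': 'example-test-id', 'experiment_id': 'exp-123', 'project_id': 'proj-456',
    #  'temperature': 0.0, 'theta': 1.2, 'std_error': 0.3}

The same property on EvalResult simply collects one such dictionary per run.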
scorebook-0.0.4.dist-info/METADATA → scorebook-0.0.6.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: scorebook
- Version: 0.0.4
+ Version: 0.0.6
  Summary: A Python project for LLM evaluation.
  Author: Euan Campbell
  Author-email: euan@trismik.com
@@ -35,6 +35,7 @@ Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
  Requires-Dist: transformers ; extra == "examples"
+ Requires-Dist: trismik (>=0.9.3rc3)
  Description-Content-Type: text/markdown

  # Scorebook
scorebook-0.0.4.dist-info/RECORD → scorebook-0.0.6.dist-info/RECORD
@@ -1,31 +1,35 @@
- scorebook/__init__.py,sha256=7ac3KpXU3kKFekq8mZ3cVbF7oQ6Q9E-uqX7ijyte1Q0,406
- scorebook/evaluator.py,sha256=mS3G3PI26nHzqkYX4tqusQZJL5Q1xTxzqshAdwscl0s,14170
+ scorebook/__init__.py,sha256=yHhNIHeLeRwjdyfSg3jtCz-NbQXMlN9fLhHJ1QzPQGQ,548
+ scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+ scorebook/cli/auth.py,sha256=bv3imsgmY_t52wFoMJt9iu-cKPwvKYkVqZ7nE8EVc6E,2931
+ scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+ scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
+ scorebook/evaluate.py,sha256=0Begs5Py9rpapoMixpqjlS2ofigQaGofbjqucABRfuM,19088
  scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
  scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
  scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
- scorebook/inference/openai.py,sha256=FqXua4v4PTYSHrdTm_9fM0Us8Mo2n2LSN94CwRipRw4,7658
+ scorebook/inference/openai.py,sha256=iJVWp0HT9skyM4KXf21yaEjLafdETT5kK3HKl7MZ1hg,8292
  scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
  scorebook/inference/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
+ scorebook/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
  scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
  scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
  scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
  scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
  scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
- scorebook/types/__init__.py,sha256=dXY3Y-GiMipVExzVu7H5pbdFfg4HBMEKxqSTfENywSs,427
- scorebook/types/eval_dataset.py,sha256=dCqOHjGaEb7pGG1VF4aGFn6hngFvlxpxddqsDtM4nTs,13870
- scorebook/types/eval_result.py,sha256=R2zuWrx8p9_4A2W3Gmlp-xGgmelPdg8QB5PoV1hiqRc,4728
- scorebook/types/eval_run_spec.py,sha256=nf7LGa_dG60Qb385W6O6qiu7VlJ03-dpo2X1PgKGcRQ,845
- scorebook/types/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
+ scorebook/trismik/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
+ scorebook/trismik/adaptive_testing_service.py,sha256=S1yAVnrzqtVWprsiNS_l3q5FibQkMuAs7I7YaSFNtKM,5109
+ scorebook/trismik/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
+ scorebook/types.py,sha256=wQNFewn9Ji7nQJhXwRS-hVAL4XV6ePcLVdVQiMfWYzg,5149
  scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
  scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
  scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
  scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
  scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
- scorebook/utils/logging_utils.py,sha256=M4BXt369mJo037WYpvuWDoe3oGWVdHWaGo4Vbl0WDL0,60
  scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
  scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
  scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
- scorebook-0.0.4.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
- scorebook-0.0.4.dist-info/METADATA,sha256=7odU7Q8SHfuHru2oBBk1XlZ2tXLi2WaSShbUhfmX60A,11409
- scorebook-0.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- scorebook-0.0.4.dist-info/RECORD,,
+ scorebook-0.0.6.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+ scorebook-0.0.6.dist-info/METADATA,sha256=nAnw4oMdvTXlE79pXqhOmTSZ3ITmVJbFDjc7o_vwOD8,11445
+ scorebook-0.0.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ scorebook-0.0.6.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+ scorebook-0.0.6.dist-info/RECORD,,
scorebook-0.0.6.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ scorebook=scorebook.cli.main:main
+
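
The entry point above is what exposes a scorebook console command after installation; it dispatches to scorebook.cli.main:main. A sketch of the equivalent direct call (console-script targets are invoked with no arguments):

    from scorebook.cli.main import main

    main()  # same function the installed `scorebook` command runs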