scorebook 0.0.12-py3-none-any.whl → 0.0.14-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +10 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +24 -9
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +22 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +25 -9
- scorebook/evaluate/_sync/evaluate.py +25 -9
- scorebook/evaluate/evaluate_helpers.py +79 -5
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/__init__.py +1 -18
- scorebook/metrics/metric_registry.py +2 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +1 -1
- scorebook/types.py +3 -1
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook-0.0.14.dist-info/METADATA +292 -0
- scorebook-0.0.14.dist-info/RECORD +53 -0
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.12.dist-info/METADATA +0 -389
- scorebook-0.0.12.dist-info/RECORD +0 -50
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/WHEEL +0 -0
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/licenses/LICENSE +0 -0
scorebook/score/_sync/score.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+from scorebook.dashboard.upload_results import upload_result
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.score.score_helpers import (
     calculate_metric_scores,
@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
     resolve_metrics,
     validate_items,
 )
-from scorebook.trismik.upload_results import upload_result
 from scorebook.types import Metrics
-from scorebook.utils import resolve_show_progress, resolve_upload_results
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import scoring_progress_context
 
 logger = logging.getLogger(__name__)
 
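For code that imported the upload helper from its old location, the practical effect of this diff (together with the scorebook/{trismik → dashboard} rename in the file list above) is an import-path change; a hedged before/after sketch:

```python
# Before, on scorebook 0.0.12 (module removed in 0.0.14):
# from scorebook.trismik.upload_results import upload_result

# After, on scorebook 0.0.14:
from scorebook.dashboard.upload_results import upload_result
```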
scorebook/score/score_helpers.py
CHANGED
@@ -7,7 +7,7 @@ from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.metrics.metric_base import MetricBase
 from scorebook.metrics.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
-from scorebook.utils import is_awaitable
+from scorebook.utils.async_utils import is_awaitable
 
 logger = logging.getLogger(__name__)
 
scorebook/types.py
CHANGED
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.metrics.metric_base import MetricBase
 
 # Type alias for metrics parameter
@@ -17,6 +17,7 @@ class AdaptiveEvalDataset:
     """Represents a dataset configured for adaptive evaluation."""
 
     name: str
+    split: Optional[str] = None
 
 
 @dataclass
@@ -50,6 +51,7 @@ class AdaptiveEvalRunSpec:
     hyperparameters_index: int
     experiment_id: str
     project_id: str
+    split: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
 
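Both types gain an optional `split` field with a `None` default, so existing constructor calls should keep working; a minimal sketch, assuming `AdaptiveEvalDataset` is constructed by keyword like its fields suggest (the dataset name and split values below are illustrative):

```python
from scorebook.types import AdaptiveEvalDataset

# Existing call sites stay valid because split defaults to None.
ds = AdaptiveEvalDataset(name="my-dataset")

# New call sites can pin a specific split (illustrative value).
ds_test = AdaptiveEvalDataset(name="my-dataset", split="test")
```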
scorebook/utils/__init__.py
CHANGED
@@ -1,23 +1 @@
 """Utility functions and common helpers for the Scorebook framework."""
-
-from contextlib import nullcontext
-
-from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
-from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
-from scorebook.utils.render_template import render_template
-from scorebook.utils.transform_helpers import expand_dict
-
-__all__ = [
-    "async_nullcontext",
-    "nullcontext",
-    "is_awaitable",
-    "resolve_show_progress",
-    "resolve_upload_results",
-    "validate_path",
-    "expand_dict",
-    "evaluation_progress_context",
-    "scoring_progress_context",
-    "render_template",
-]
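With the re-exports gone from `scorebook.utils`, code that imported these helpers from the package root has to target the concrete submodules instead, mirroring the updated imports in score.py and score_helpers.py above; a hedged sketch of the migration:

```python
# Before, on scorebook 0.0.12:
# from scorebook.utils import is_awaitable, resolve_show_progress, resolve_upload_results

# After, on scorebook 0.0.14, import from the submodules directly:
from scorebook.utils.async_utils import is_awaitable
from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
from scorebook.utils.progress_bars import scoring_progress_context
```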
scorebook/utils/common_helpers.py
CHANGED
@@ -17,7 +17,7 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
         bool: Whether to upload results to Trismik
     """
     if upload_results == "auto":
-        from scorebook.trismik.credentials import get_token
+        from scorebook.dashboard.credentials import get_token
 
         upload_results = get_token() is not None
         logger.debug("Auto upload results resolved to: %s", upload_results)
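A small illustrative sketch of the resulting behaviour, assuming `resolve_upload_results` returns the resolved boolean (the rest of the function body is not shown in this diff):

```python
from scorebook.utils.common_helpers import resolve_upload_results

# "auto" now resolves via the dashboard credentials: True only when
# scorebook.dashboard.credentials.get_token() returns a token, False otherwise.
should_upload = resolve_upload_results("auto")
```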
scorebook/utils/mock_llm/__init__.py
ADDED
@@ -0,0 +1,41 @@
+"""Mock LLM utilities for testing and demonstrations."""
+
+import json
+import random
+from pathlib import Path
+from typing import Any, List
+
+# Load the mock data once at module initialization
+_DATA_PATH = Path(__file__).parent / "data" / "mock_llm_data.json"
+with open(_DATA_PATH, "r", encoding="utf-8") as f:
+    _MOCK_DATA = json.load(f)
+
+
+def mock_llm(inputs: List[Any], **hyperparameters: Any) -> List[str]:
+    """Mock LLM that returns answers based on pre-recorded accuracy data."""
+
+    results = []
+    all_choices = ["A", "B", "C", "D", "E"]
+
+    for item in inputs:
+        item_id = item["id"]
+
+        # Look up the item in our mock data
+        if item_id not in _MOCK_DATA:
+            # If item not found, return random answer
+            results.append(random.choice(all_choices))
+            continue
+
+        item_data = _MOCK_DATA[item_id]
+        correct_answer = item_data["answer"]
+        was_accurate = item_data["accuracy"]
+
+        if was_accurate:
+            # Return the correct answer
+            results.append(correct_answer)
+        else:
+            # Return a random incorrect answer
+            incorrect_choices = [choice for choice in all_choices if choice != correct_answer]
+            results.append(random.choice(incorrect_choices))
+
+    return results
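A minimal usage sketch of the new helper; the item ids below are hypothetical and would need to match keys in data/mock_llm_data.json, otherwise the function falls back to a uniformly random choice from A–E:

```python
from scorebook.utils.mock_llm import mock_llm

# Hypothetical items; each must be a mapping with an "id" key.
items = [{"id": "question-001"}, {"id": "question-002"}]

# Extra hyperparameters are accepted by the signature but not used by the mock.
predictions = mock_llm(items, temperature=0.0)
print(predictions)  # e.g. ["B", "D"]
```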