scorebook 0.0.12-py3-none-any.whl → 0.0.14-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +10 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +24 -9
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +22 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +25 -9
- scorebook/evaluate/_sync/evaluate.py +25 -9
- scorebook/evaluate/evaluate_helpers.py +79 -5
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/__init__.py +1 -18
- scorebook/metrics/metric_registry.py +2 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +1 -1
- scorebook/types.py +3 -1
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook-0.0.14.dist-info/METADATA +292 -0
- scorebook-0.0.14.dist-info/RECORD +53 -0
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.12.dist-info/METADATA +0 -389
- scorebook-0.0.12.dist-info/RECORD +0 -50
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/WHEEL +0 -0
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/licenses/LICENSE +0 -0
scorebook/score/_sync/score.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+from scorebook.dashboard.upload_results import upload_result
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.score.score_helpers import (
     calculate_metric_scores,
@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
     resolve_metrics,
     validate_items,
 )
-from scorebook.trismik.upload_results import upload_result
 from scorebook.types import Metrics
-from scorebook.utils import resolve_show_progress, resolve_upload_results
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import scoring_progress_context
 
 logger = logging.getLogger(__name__)
 
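For code that imported the upload helper from its old location, the practical effect of this diff (together with the scorebook/{trismik → dashboard} rename in the file list above) is an import-path change; a hedged before/after sketch:

```python
# Before, on scorebook 0.0.12 (module removed in 0.0.14):
# from scorebook.trismik.upload_results import upload_result

# After, on scorebook 0.0.14:
from scorebook.dashboard.upload_results import upload_result
```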
scorebook/score/score_helpers.py
CHANGED
@@ -7,7 +7,7 @@ from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.metrics.metric_base import MetricBase
 from scorebook.metrics.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
-from scorebook.utils import is_awaitable
+from scorebook.utils.async_utils import is_awaitable
 
 logger = logging.getLogger(__name__)
 
scorebook/types.py
CHANGED
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.metrics.metric_base import MetricBase
 
 # Type alias for metrics parameter
@@ -17,6 +17,7 @@ class AdaptiveEvalDataset:
     """Represents a dataset configured for adaptive evaluation."""
 
     name: str
+    split: Optional[str] = None
 
 
 @dataclass
@@ -50,6 +51,7 @@ class AdaptiveEvalRunSpec:
     hyperparameters_index: int
     experiment_id: str
     project_id: str
+    split: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
 
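Both types gain an optional `split` field with a `None` default, so existing constructor calls should keep working; a minimal sketch, assuming `AdaptiveEvalDataset` is constructed by keyword like its fields suggest (the dataset name and split values below are illustrative):

```python
from scorebook.types import AdaptiveEvalDataset

# Existing call sites stay valid because split defaults to None.
ds = AdaptiveEvalDataset(name="my-dataset")

# New call sites can pin a specific split (illustrative value).
ds_test = AdaptiveEvalDataset(name="my-dataset", split="test")
```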
scorebook/utils/__init__.py
CHANGED
@@ -1,23 +1 @@
 """Utility functions and common helpers for the Scorebook framework."""
-
-from contextlib import nullcontext
-
-from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
-from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
-from scorebook.utils.render_template import render_template
-from scorebook.utils.transform_helpers import expand_dict
-
-__all__ = [
-    "async_nullcontext",
-    "nullcontext",
-    "is_awaitable",
-    "resolve_show_progress",
-    "resolve_upload_results",
-    "validate_path",
-    "expand_dict",
-    "evaluation_progress_context",
-    "scoring_progress_context",
-    "render_template",
-]
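With the re-exports gone from `scorebook.utils`, code that imported these helpers from the package root has to target the concrete submodules instead, mirroring the updated imports in score.py and score_helpers.py above; a hedged sketch of the migration:

```python
# Before, on scorebook 0.0.12:
# from scorebook.utils import is_awaitable, resolve_show_progress, resolve_upload_results

# After, on scorebook 0.0.14, import from the submodules directly:
from scorebook.utils.async_utils import is_awaitable
from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
from scorebook.utils.progress_bars import scoring_progress_context
```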
scorebook/utils/common_helpers.py
CHANGED
@@ -17,7 +17,7 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
         bool: Whether to upload results to Trismik
     """
     if upload_results == "auto":
-        from scorebook.trismik.credentials import get_token
+        from scorebook.dashboard.credentials import get_token
 
         upload_results = get_token() is not None
         logger.debug("Auto upload results resolved to: %s", upload_results)
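A small illustrative sketch of the resulting behaviour, assuming `resolve_upload_results` returns the resolved boolean (the rest of the function body is not shown in this diff):

```python
from scorebook.utils.common_helpers import resolve_upload_results

# "auto" now resolves via the dashboard credentials: True only when
# scorebook.dashboard.credentials.get_token() returns a token, False otherwise.
should_upload = resolve_upload_results("auto")
```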
scorebook/utils/mock_llm/__init__.py
ADDED
@@ -0,0 +1,41 @@
+"""Mock LLM utilities for testing and demonstrations."""
+
+import json
+import random
+from pathlib import Path
+from typing import Any, List
+
+# Load the mock data once at module initialization
+_DATA_PATH = Path(__file__).parent / "data" / "mock_llm_data.json"
+with open(_DATA_PATH, "r", encoding="utf-8") as f:
+    _MOCK_DATA = json.load(f)
+
+
+def mock_llm(inputs: List[Any], **hyperparameters: Any) -> List[str]:
+    """Mock LLM that returns answers based on pre-recorded accuracy data."""
+
+    results = []
+    all_choices = ["A", "B", "C", "D", "E"]
+
+    for item in inputs:
+        item_id = item["id"]
+
+        # Look up the item in our mock data
+        if item_id not in _MOCK_DATA:
+            # If item not found, return random answer
+            results.append(random.choice(all_choices))
+            continue
+
+        item_data = _MOCK_DATA[item_id]
+        correct_answer = item_data["answer"]
+        was_accurate = item_data["accuracy"]
+
+        if was_accurate:
+            # Return the correct answer
+            results.append(correct_answer)
+        else:
+            # Return a random incorrect answer
+            incorrect_choices = [choice for choice in all_choices if choice != correct_answer]
+            results.append(random.choice(incorrect_choices))
+
+    return results
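A minimal usage sketch of the new helper; the item ids below are hypothetical and would need to match keys in data/mock_llm_data.json, otherwise the function falls back to a uniformly random choice from A–E:

```python
from scorebook.utils.mock_llm import mock_llm

# Hypothetical items; each must be a mapping with an "id" key.
items = [{"id": "question-001"}, {"id": "question-002"}]

# Extra hyperparameters are accepted by the signature but not used by the mock.
predictions = mock_llm(items, temperature=0.0)
print(predictions)  # e.g. ["B", "D"]
```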