scorebook-0.0.12-py3-none-any.whl → scorebook-0.0.14-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +10 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +24 -9
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +22 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +25 -9
- scorebook/evaluate/_sync/evaluate.py +25 -9
- scorebook/evaluate/evaluate_helpers.py +79 -5
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/__init__.py +1 -18
- scorebook/metrics/metric_registry.py +2 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +1 -1
- scorebook/types.py +3 -1
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook-0.0.14.dist-info/METADATA +292 -0
- scorebook-0.0.14.dist-info/RECORD +53 -0
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.12.dist-info/METADATA +0 -389
- scorebook-0.0.12.dist-info/RECORD +0 -50
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/WHEEL +0 -0
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py
CHANGED

@@ -9,12 +9,15 @@ import importlib.metadata
 # get version from pyproject.toml
 __version__ = importlib.metadata.version(__package__ or __name__)
 
-from scorebook.
-from scorebook.
+from scorebook.dashboard.create_project import create_project, create_project_async
+from scorebook.dashboard.credentials import login, logout, whoami
+from scorebook.dashboard.upload_results import upload_result, upload_result_async
+from scorebook.eval_datasets.eval_dataset import EvalDataset
+from scorebook.evaluate._async.evaluate_async import evaluate_async
+from scorebook.evaluate._sync.evaluate import evaluate
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.score import
-from scorebook.
-from scorebook.trismik.upload_results import upload_result, upload_result_async
+from scorebook.score._async.score_async import score_async
+from scorebook.score._sync.score import score
 from scorebook.utils.render_template import render_template
 
 __all__ = [

@@ -28,6 +31,8 @@ __all__ = [
     "logout",
     "whoami",
     "InferencePipeline",
+    "create_project",
+    "create_project_async",
     "upload_result",
     "upload_result_async",
 ]
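For orientation, a minimal import sketch of the reorganized public API, assuming only the top-level re-exports shown in the hunk above; nothing else here is taken from the package.

    # These names are re-exported from scorebook/__init__.py in 0.0.14,
    # so downstream imports stay flat even though the submodules moved.
    from scorebook import (
        EvalDataset,
        InferencePipeline,
        create_project,
        create_project_async,
        evaluate,
        evaluate_async,
        login,
        logout,
        score,
        score_async,
        upload_result,
        upload_result_async,
        whoami,
    )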
scorebook/cli/auth.py
CHANGED

@@ -4,7 +4,7 @@ import argparse
 import getpass
 import sys
 
-from scorebook.
+from scorebook.dashboard.credentials import get_stored_token, get_token_path, login, logout, whoami
 
 
 def auth_command(args: argparse.Namespace) -> int:
scorebook/dashboard/__init__.py
ADDED

@@ -0,0 +1 @@
+"""Trismik authentication and API integration."""
scorebook/dashboard/create_project.py
ADDED

@@ -0,0 +1,91 @@
+"""Create projects in Trismik's experimentation platform."""
+
+import logging
+from typing import Optional
+
+from trismik.types import TrismikProject
+
+from scorebook.evaluate.evaluate_helpers import (
+    create_trismik_async_client,
+    create_trismik_sync_client,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def create_project(
+    name: str,
+    team_id: Optional[str] = None,
+    description: Optional[str] = None,
+) -> TrismikProject:
+    """Create a new project in Trismik's experimentation platform (synchronous).
+
+    This function creates a new project that can be used to organize experiments
+    and evaluation runs in the Trismik platform.
+
+    Args:
+        name: Name of the project
+        team_id: Optional ID of the team to create the project in. If not provided,
+            the project will be created in the user's default team.
+        description: Optional description of the project
+
+    Returns:
+        TrismikProject: Created project object containing project details including
+            id, name, description, accountId, createdAt, and updatedAt fields
+
+    Raises:
+        TrismikValidationError: If the request fails validation
+        TrismikApiError: If the API request fails
+    """
+    # Create Trismik client
+    trismik_client = create_trismik_sync_client()
+
+    # Create project via Trismik API
+    project = trismik_client.create_project(
+        name=name,
+        team_id=team_id,
+        description=description,
+    )
+
+    logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+    return project
+
+
+async def create_project_async(
+    name: str,
+    team_id: Optional[str] = None,
+    description: Optional[str] = None,
+) -> TrismikProject:
+    """Create a new project in Trismik's experimentation platform (asynchronous).
+
+    This function creates a new project that can be used to organize experiments
+    and evaluation runs in the Trismik platform.
+
+    Args:
+        name: Name of the project
+        team_id: Optional ID of the team to create the project in. If not provided,
+            the project will be created in the user's default team.
+        description: Optional description of the project
+
+    Returns:
+        TrismikProject: Created project object containing project details including
+            id, name, description, accountId, createdAt, and updatedAt fields
+
+    Raises:
+        TrismikValidationError: If the request fails validation
+        TrismikApiError: If the API request fails
+    """
+    # Create Trismik async client
+    trismik_client = create_trismik_async_client()
+
+    # Create project via Trismik API (async)
+    project = await trismik_client.create_project(
+        name=name,
+        team_id=team_id,
+        description=description,
+    )
+
+    logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+    return project
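A hedged usage sketch for the new helpers above; it assumes Trismik credentials are already configured (for example via `login`) and only prints fields the docstring says the returned TrismikProject carries. The project names are placeholders.

    import asyncio

    from scorebook import create_project, create_project_async

    # Synchronous variant
    project = create_project(name="demo-project", description="Scorebook example")
    print(project.id, project.name)

    # Asynchronous variant takes the same arguments
    project = asyncio.run(create_project_async(name="demo-project-async", team_id=None))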
scorebook/{trismik → dashboard}/credentials.py
CHANGED

@@ -5,6 +5,10 @@ import os
 import pathlib
 from typing import Optional
 
+from trismik import TrismikClient
+
+from scorebook.settings import TRISMIK_SERVICE_URL
+
 logger = logging.getLogger(__name__)
 
 

@@ -66,15 +70,26 @@ def get_token() -> Optional[str]:
 
 
 def validate_token(token: str) -> bool:
-    """Validate the token by making a test API call to trismik.
-
-
-
-
-
-
-
-
+    """Validate the token by making a test API call to trismik.
+
+    Args:
+        token: The API token to validate.
+
+    Returns:
+        bool: True if the token is valid, False otherwise.
+    """
+    if not token or not token.strip():
+        return False
+
+    try:
+        # Create a client with the token and verify it works
+        client = TrismikClient(service_url=TRISMIK_SERVICE_URL, api_key=token)
+        client.me()
+        client.close()
+        return True
+    except Exception as e:
+        logger.debug(f"Token validation failed: {e}")
+        return False
 
 
 def login(trismik_api_key: str) -> None:
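A brief sketch of how the credential helpers fit together; the token value is a placeholder, and any behaviour of `login`/`whoami` beyond what the diff shows is assumed rather than confirmed.

    from scorebook import login, logout, whoami

    # validate_token() above checks a key by constructing a TrismikClient against
    # TRISMIK_SERVICE_URL and calling .me(); login() takes the raw API key string.
    login("placeholder-trismik-api-key")
    whoami()   # assumed to report the account tied to the stored token
    logout()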
scorebook/{trismik → dashboard}/upload_results.py
CHANGED

@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 
 # Known fields that are not metrics or hyperparameters
 KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
-KNOWN_ITEM_FIELDS = {"id", "
+KNOWN_ITEM_FIELDS = {"id", "dataset", "input", "output", "label", "run_id"}
 
 
 def upload_result(
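Per the comment above the constants, keys in KNOWN_ITEM_FIELDS are core item fields and anything else is treated as metric or hyperparameter data when item results are uploaded. A self-contained illustration of that split, with made-up values:

    KNOWN_ITEM_FIELDS = {"id", "dataset", "input", "output", "label", "run_id"}

    item = {
        "id": "item-1",
        "dataset": "demo",
        "input": "2 + 2 = ?",
        "output": "4",
        "label": "4",
        "run_id": "run-123",
        "accuracy": 1.0,     # not a known item field
        "temperature": 0.2,  # not a known item field
    }

    extras = {k: v for k, v in item.items() if k not in KNOWN_ITEM_FIELDS}
    print(extras)  # {'accuracy': 1.0, 'temperature': 0.2}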
scorebook/eval_datasets/eval_dataset.py
CHANGED

@@ -18,8 +18,10 @@ from scorebook.exceptions import (
     DatasetSampleError,
     MissingFieldError,
 )
-from scorebook.metrics import MetricBase
-from scorebook.
+from scorebook.metrics.metric_base import MetricBase
+from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.utils.io_helpers import validate_path
+from scorebook.utils.render_template import render_template
 
 
 class EvalDataset:

@@ -137,6 +139,24 @@ class EvalDataset:
             raise DatasetNotInitializedError("Dataset is not initialized")
         return list(map(str, self._hf_dataset.column_names))
 
+    @property
+    def split(self) -> Optional[str]:
+        """Return the split name of the underlying HuggingFace dataset, if available.
+
+        Returns:
+            The split name (e.g., "train", "test", "validation") if the dataset was loaded
+            from HuggingFace with a specific split. Returns None if the dataset was created
+            from a list, CSV, JSON, or loaded without a split specification.
+
+        Raises:
+            DatasetNotInitializedError: If the dataset is not initialized.
+        """
+        if self._hf_dataset is None:
+            raise DatasetNotInitializedError("Dataset is not initialized")
+
+        split = self._hf_dataset.split
+        return str(split) if split is not None else None
+
     def shuffle(self) -> None:
         """Randomly shuffle the dataset items."""
         if self._hf_dataset is None:
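The new `split` property simply surfaces the underlying `datasets.Dataset.split` value as a string (or None). A standalone illustration using the HuggingFace datasets library directly, with an arbitrary public dataset chosen only for the example:

    from datasets import load_dataset

    hf_dataset = load_dataset("glue", "mrpc", split="validation")

    # Mirrors the property body: str(split) when present, otherwise None.
    split = str(hf_dataset.split) if hf_dataset.split is not None else None
    print(split)  # "validation"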
scorebook/evaluate/__init__.py
CHANGED

@@ -1,15 +1 @@
-"""
-Evaluation module for Scorebook.
-
-This module provides both synchronous and asynchronous evaluation functions.
-The async version serves as the source of truth, with the sync version
-automatically generated using unasync.
-"""
-
-# Import from async module
-from ._async.evaluate_async import evaluate_async
-
-# Import from generated sync module
-from ._sync.evaluate import evaluate
-
-__all__ = ["evaluate", "evaluate_async"]
+"""Evaluation module for Scorebook."""
scorebook/evaluate/_async/evaluate_async.py
CHANGED

@@ -6,7 +6,7 @@ from trismik import TrismikAsyncClient, TrismikClient
 from trismik.settings import evaluation_settings
 from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,

@@ -15,6 +15,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError

@@ -27,12 +28,9 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import
-
-
-    resolve_show_progress,
-    resolve_upload_results,
-)
+from scorebook.utils.async_utils import async_nullcontext
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 

@@ -40,6 +38,7 @@ logger = logging.getLogger(__name__)
 async def evaluate_async(
     inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,

@@ -58,6 +57,7 @@ async def evaluate_async(
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier

@@ -82,7 +82,7 @@ async def evaluate_async(
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),

@@ -378,8 +378,24 @@ async def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
+    # Create inference function with bound hyperparameters
+    async def inference_with_hyperparams(items: Any) -> Any:
+        return await inference(items, **adaptive_run_spec.hyperparameter_config)
+
     trismik_results = await trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(

@@ -387,7 +403,7 @@ async def run_adaptive_evaluation(
             test_configuration={},
             inference_setup={},
         ),
-        item_processor=make_trismik_inference(
+        item_processor=make_trismik_inference(inference_with_hyperparams),
         return_dict=False,
     )
 
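A hedged sketch of the new `split` argument on `evaluate_async`; the inference function is a stand-in, and "my-test:adaptive" follows the `test_id:adaptive` convention enforced in `prepare_datasets` (shown further below).

    import asyncio

    from scorebook import evaluate_async

    async def my_inference(items, **hyperparameters):
        # Placeholder: one dummy prediction per item.
        return ["answer" for _ in items]

    results = asyncio.run(
        evaluate_async(
            my_inference,
            datasets="my-test:adaptive",  # adaptive dataset identifier (placeholder test id)
            split="validation",           # new parameter in 0.0.14
        )
    )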
scorebook/evaluate/_sync/evaluate.py
CHANGED

@@ -5,7 +5,7 @@ from trismik import TrismikAsyncClient, TrismikClient
 from trismik.settings import evaluation_settings
 from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,

@@ -14,6 +14,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError

@@ -26,12 +27,9 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from
-
-
-    resolve_show_progress,
-    resolve_upload_results,
-)
+from contextlib import nullcontext
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 

@@ -39,6 +37,7 @@ logger = logging.getLogger(__name__)
 def evaluate(
     inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,

@@ -57,6 +56,7 @@ def evaluate(
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
        hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier

@@ -81,7 +81,7 @@ def evaluate(
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),

@@ -377,8 +377,24 @@ def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
+    # Create inference function with bound hyperparameters
+    def inference_with_hyperparams(items: Any) -> Any:
+        return inference(items, **adaptive_run_spec.hyperparameter_config)
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(

@@ -386,7 +402,7 @@ def run_adaptive_evaluation(
             test_configuration={},
             inference_setup={},
         ),
-        item_processor=make_trismik_inference(
+        item_processor=make_trismik_inference(inference_with_hyperparams),
         return_dict=False,
     )
 
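The synchronous entry point mirrors the async one. A sketch under the same assumptions; note that when an EvalDataset carries its own split, `prepare_datasets` logs a warning and the dataset's split takes precedence over this argument (see the helpers below).

    from scorebook import evaluate

    def my_inference(items, **hyperparameters):
        return ["answer"] * len(items)

    results = evaluate(
        my_inference,
        datasets="my-test:adaptive",  # placeholder test id
        split="test",
    )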
scorebook/evaluate/evaluate_helpers.py
CHANGED

@@ -9,7 +9,8 @@ from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
 from trismik.types import TrismikMultipleChoiceTextItem
 
-from scorebook import
+from scorebook.dashboard.credentials import get_token
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.exceptions import (
     DataMismatchError,
     MetricComputationError,

@@ -17,9 +18,9 @@ from scorebook.exceptions import (
     ScoreBookError,
 )
 from scorebook.settings import TRISMIK_SERVICE_URL
-from scorebook.trismik.credentials import get_token
 from scorebook.types import AdaptiveEvalDataset, AdaptiveEvalRunSpec, EvalResult, EvalRunSpec
-from scorebook.utils import
+from scorebook.utils.async_utils import is_awaitable
+from scorebook.utils.transform_helpers import expand_dict
 
 logger = logging.getLogger(__name__)
 

@@ -91,6 +92,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> N
 
 def prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     sample_size: Optional[int] = None,
 ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""

@@ -104,6 +106,12 @@ def prepare_datasets(
 
         # Prepare classic datasets
         if isinstance(dataset, EvalDataset):
+            # Warn if dataset split differs from provided split parameter
+            if split is not None and dataset.split is not None and dataset.split != split:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                    f"parameter is '{split}'. The dataset split will be used."
+                )
 
             if sample_size is not None:
                 dataset = dataset.sample(sample_size)

@@ -111,8 +119,17 @@ def prepare_datasets(
             datasets_out.append(dataset)
 
         # Prepare adaptive datasets
-        elif isinstance(dataset, str) and
-
+        elif isinstance(dataset, str) and ":adaptive" in dataset:
+            # Parse adaptive dataset
+            parts = dataset.split(":")
+            if len(parts) != 2 or parts[1] != "adaptive":
+                raise ParameterValidationError(
+                    f"Invalid adaptive dataset format: '{dataset}'. "
+                    f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                )
+
+            # Use the split parameter for all adaptive datasets
+            datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):

@@ -174,6 +191,7 @@ def build_eval_run_specs(
                 hyperparameters_index,
                 experiment_id,
                 project_id,
+                dataset.split,
                 metadata,
             )
         )

@@ -220,6 +238,7 @@ def build_adaptive_eval_run_spec(
     hyperparameter_config_index: int,
     experiment_id: str,
     project_id: str,
+    split: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""

@@ -231,6 +250,7 @@ def build_adaptive_eval_run_spec(
         hyperparameter_config_index,
         experiment_id,
         project_id,
+        split,
         metadata,
     )
     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)

@@ -386,3 +406,57 @@ def make_trismik_inference(
         )
 
     return sync_trismik_inference_function
+
+
+def resolve_adaptive_split(
+    test_id: str,
+    user_specified_split: Optional[str],
+    available_splits: List[str],
+) -> str:
+    """Resolve the dataset split to use for adaptive evaluation.
+
+    Resolution order:
+    1. If user specified a split, validate it exists and use it
+    2. If not specified and exactly one split is available, use it
+    3. If not specified and multiple splits are available, raise an error
+    4. If no splits are available, raise an error
+
+    Args:
+        test_id: The test dataset ID (without ":adaptive" suffix)
+        user_specified_split: Optional split name specified by the user
+        available_splits: List of available split names for this dataset
+
+    Returns:
+        The resolved split name to use
+
+    Raises:
+        ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+            user specification, or no splits are available
+    """
+    logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+    # If user specified a split, validate and use it
+    if user_specified_split is not None:
+        if user_specified_split in available_splits:
+            logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+            return user_specified_split
+        else:
+            raise ScoreBookError(
+                f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                f"Available splits: {available_splits}"
+            )
+
+    # No split specified - check available splits
+    if len(available_splits) == 0:
+        raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+    elif len(available_splits) == 1:
+        # Exactly one split - auto-select it
+        selected_split = available_splits[0]
+        logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+        return selected_split
+    else:
+        # Multiple splits available - user must specify
+        raise ScoreBookError(
+            f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+            f"Please specify which split to use via evaluate's 'split' parameter."
+        )
scorebook/inference/__init__.py
CHANGED

@@ -1,11 +1 @@
-"""
-Inference module for model execution and predictions.
-
-This module provides functionality for running inference with various models
-and processing their responses. It includes utilities for both single and
-batch inference operations.
-"""
-
-from scorebook.inference.inference_pipeline import InferencePipeline
-
-__all__ = ["InferencePipeline"]
+"""Inference module for model execution and predictions."""
scorebook/inference/clients/__init__.py
CHANGED

@@ -1,8 +1 @@
-"""
-Inference clients for various LLM providers.
-
-This module provides client implementations for different LLM providers including
-OpenAI, AWS Bedrock, Google Vertex AI, and Portkey.
-"""
-
-__all__ = ["bedrock", "openai", "portkey", "vertex"]
+"""Inference clients for various LLM providers."""
scorebook/metrics/__init__.py
CHANGED

@@ -1,18 +1 @@
-"""
-Metrics for evaluating model predictions.
-
-This module provides a collection of evaluation metrics for comparing model outputs
-against ground truth labels. Available metrics include standard classification and
-generation metrics like accuracy, precision, recall, F1-score, etc.
-
-Metrics can be accessed by name through the `get_metrics()` function or used
-directly by instantiating specific metric classes. All metrics implement a
-common interface for scoring predictions against references.
-"""
-
-from scorebook.metrics.accuracy import Accuracy
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
-from scorebook.metrics.precision import Precision
-
-__all__ = ["MetricBase", "Precision", "Accuracy", "MetricRegistry"]
+"""Metrics for evaluating model predictions."""
scorebook/metrics/metric_registry.py
CHANGED

@@ -85,8 +85,10 @@ class MetricRegistry:
         # If input is a string, look up the class in the registry
         if isinstance(name_or_class, str):
             key = name_or_class.lower()
+
             if key not in cls._registry:
                 raise ValueError(f"Metric '{name_or_class}' not registered.")
+
             return cls._registry[key](**kwargs)
 
         raise ValueError(
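The two added blank lines are purely cosmetic, but the surrounding lookup logic is worth spelling out: metric names are lower-cased before the registry check, unknown names raise ValueError, and the registered class is instantiated with any keyword arguments. A standalone sketch of that pattern (class and method names here are illustrative, not Scorebook's actual API):

    class DemoMetricRegistry:
        _registry: dict = {}

        @classmethod
        def register(cls, name, metric_cls):
            cls._registry[name.lower()] = metric_cls

        @classmethod
        def build(cls, name_or_class, **kwargs):
            # Same shape as the registry lookup above: case-insensitive key,
            # ValueError for unknown names, instantiation with **kwargs.
            if isinstance(name_or_class, str):
                key = name_or_class.lower()

                if key not in cls._registry:
                    raise ValueError(f"Metric '{name_or_class}' not registered.")

                return cls._registry[key](**kwargs)
            raise ValueError("Expected a registered metric name.")


    class Accuracy:
        def __init__(self, normalize: bool = True):
            self.normalize = normalize


    DemoMetricRegistry.register("Accuracy", Accuracy)
    metric = DemoMetricRegistry.build("ACCURACY", normalize=False)  # case-insensitive lookup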
scorebook/score/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+from scorebook.dashboard.upload_results import upload_result_async
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
 from scorebook.score.score_helpers import (
     calculate_metric_scores_async,

@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
     resolve_metrics,
     validate_items,
 )
-from scorebook.trismik.upload_results import upload_result_async
 from scorebook.types import Metrics
-from scorebook.utils import resolve_show_progress, resolve_upload_results
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import scoring_progress_context
 
 logger = logging.getLogger(__name__)
 