scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff reflects the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py
CHANGED
```diff
@@ -9,12 +9,16 @@ import importlib.metadata
 # get version from pyproject.toml
 __version__ = importlib.metadata.version(__package__ or __name__)
 
-from scorebook.
-from scorebook.
+from scorebook.dashboard.create_project import create_project, create_project_async
+from scorebook.dashboard.credentials import login, logout, whoami
+from scorebook.dashboard.upload_results import upload_result, upload_result_async
+from scorebook.eval_datasets.eval_dataset import EvalDataset
+from scorebook.evaluate._async.evaluate_async import evaluate_async
+from scorebook.evaluate._sync.evaluate import evaluate
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.
-from scorebook.
-from scorebook.
+from scorebook.metrics.core.metric_registry import scorebook_metric
+from scorebook.score._async.score_async import score_async
+from scorebook.score._sync.score import score
 from scorebook.utils.render_template import render_template
 
 __all__ = [
@@ -28,6 +32,9 @@ __all__ = [
     "logout",
     "whoami",
     "InferencePipeline",
+    "create_project",
+    "create_project_async",
     "upload_result",
     "upload_result_async",
+    "scorebook_metric",
 ]
```
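The upshot of the `__init__.py` changes: project management (`create_project`, `create_project_async`) and the `scorebook_metric` decorator join the top-level exports, and every re-export now names its defining module explicitly. A minimal sketch of the new top-level surface, assuming a valid Trismik API key is configured (the project name is a placeholder):

```python
import scorebook

# With no argument, login() resolves TRISMIK_API_KEY from the environment or a .env file
scorebook.login()

# New in 0.0.15: create projects directly from the top-level package
project = scorebook.create_project(name="my-eval-project")
print(project.id, project.name)
```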
scorebook/cli/auth.py
CHANGED
```diff
@@ -4,7 +4,7 @@ import argparse
 import getpass
 import sys
 
-from scorebook.
+from scorebook.dashboard.credentials import get_stored_token, get_token_path, login, logout, whoami
 
 
 def auth_command(args: argparse.Namespace) -> int:
```
scorebook/dashboard/__init__.py
ADDED
```diff
@@ -0,0 +1 @@
+"""Trismik authentication and API integration."""
```
scorebook/dashboard/create_project.py
ADDED
```diff
@@ -0,0 +1,91 @@
+"""Create projects in Trismik's experimentation platform."""
+
+import logging
+from typing import Optional
+
+from trismik.types import TrismikProject
+
+from scorebook.evaluate.evaluate_helpers import (
+    create_trismik_async_client,
+    create_trismik_sync_client,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def create_project(
+    name: str,
+    team_id: Optional[str] = None,
+    description: Optional[str] = None,
+) -> TrismikProject:
+    """Create a new project in Trismik's experimentation platform (synchronous).
+
+    This function creates a new project that can be used to organize experiments
+    and evaluation runs in the Trismik platform.
+
+    Args:
+        name: Name of the project
+        team_id: Optional ID of the team to create the project in. If not provided,
+            the project will be created in the user's default team.
+        description: Optional description of the project
+
+    Returns:
+        TrismikProject: Created project object containing project details including
+            id, name, description, accountId, createdAt, and updatedAt fields
+
+    Raises:
+        TrismikValidationError: If the request fails validation
+        TrismikApiError: If the API request fails
+    """
+    # Create Trismik client
+    trismik_client = create_trismik_sync_client()
+
+    # Create project via Trismik API
+    project = trismik_client.create_project(
+        name=name,
+        team_id=team_id,
+        description=description,
+    )
+
+    logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+    return project
+
+
+async def create_project_async(
+    name: str,
+    team_id: Optional[str] = None,
+    description: Optional[str] = None,
+) -> TrismikProject:
+    """Create a new project in Trismik's experimentation platform (asynchronous).
+
+    This function creates a new project that can be used to organize experiments
+    and evaluation runs in the Trismik platform.
+
+    Args:
+        name: Name of the project
+        team_id: Optional ID of the team to create the project in. If not provided,
+            the project will be created in the user's default team.
+        description: Optional description of the project
+
+    Returns:
+        TrismikProject: Created project object containing project details including
+            id, name, description, accountId, createdAt, and updatedAt fields
+
+    Raises:
+        TrismikValidationError: If the request fails validation
+        TrismikApiError: If the API request fails
+    """
+    # Create Trismik async client
+    trismik_client = create_trismik_async_client()
+
+    # Create project via Trismik API (async)
+    project = await trismik_client.create_project(
+        name=name,
+        team_id=team_id,
+        description=description,
+    )
+
+    logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+    return project
```
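For async callers, `create_project_async` mirrors the sync helper. A brief sketch of how it might be called (the project name and description are placeholders; credentials must already be configured):

```python
import asyncio

from scorebook import create_project_async


async def main() -> None:
    # team_id is omitted, so the project lands in the caller's default team
    project = await create_project_async(name="demo-project", description="Example project")
    print(f"Created project {project.id}")


asyncio.run(main())
```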
scorebook/{trismik → dashboard}/credentials.py
RENAMED
```diff
@@ -3,8 +3,14 @@
 import logging
 import os
 import pathlib
+import warnings
 from typing import Optional
 
+from dotenv import load_dotenv
+from trismik import TrismikClient
+
+from scorebook.settings import TRISMIK_SERVICE_URL
+
 logger = logging.getLogger(__name__)
 
 
@@ -66,27 +72,66 @@ def get_token() -> Optional[str]:
 
 
 def validate_token(token: str) -> bool:
-    """Validate the token by making a test API call to trismik.
-
-
-
-
-
+    """Validate the token by making a test API call to trismik.
+
+    Args:
+        token: The API token to validate.
+
+    Returns:
+        bool: True if the token is valid, False otherwise.
+    """
+    if not token or not token.strip():
+        return False
 
-
-
+    try:
+        # Create a client with the token and verify it works
+        client = TrismikClient(service_url=TRISMIK_SERVICE_URL, api_key=token)
+        client.me()
+        client.close()
+        return True
+    except Exception as e:
+        logger.debug(f"Token validation failed: {e}")
+        return False
 
 
-def login(trismik_api_key: str) -> None:
+def login(trismik_api_key: Optional[str] = None) -> None:
     """Login to trismik by saving API key locally.
 
+    If no API key is provided, the function will attempt to read it from the
+    TRISMIK_API_KEY environment variable or .env file (using python-dotenv).
+    Environment variables take precedence over .env file values.
+
     Args:
-        trismik_api_key: The API key to use.
+        trismik_api_key: The API key to use. If not provided, reads from
+            environment or .env file.
     Raises:
-        ValueError: If API key is empty or invalid.
+        ValueError: If API key is empty, not found, or invalid.
+
+    Warns:
+        UserWarning: If an explicit API key is passed but TRISMIK_API_KEY
+            environment variable is also set.
     """
+    # Warn if user passes explicit key but env var is also set
+    if trismik_api_key is not None and os.environ.get("TRISMIK_API_KEY"):
+        warnings.warn(
+            "TRISMIK_API_KEY environment variable is set. The environment variable "
+            "takes precedence over the stored token when calling evaluate(). "
+            "To use the explicitly provided key, unset the TRISMIK_API_KEY "
+            "environment variable.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    if trismik_api_key is None:
+        # Load from .env file if TRISMIK_API_KEY is not already set in environment
+        load_dotenv()
+        trismik_api_key = os.environ.get("TRISMIK_API_KEY")
+
     if not trismik_api_key:
-        raise ValueError(
+        raise ValueError(
+            "API key cannot be empty. Either pass it as a parameter or "
+            "set the TRISMIK_API_KEY environment variable or .env file."
+        )
 
     # Validate token
     if not validate_token(trismik_api_key):
```
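With this change `login()` can resolve the key itself, with the environment variable taking precedence over a `.env` file, and an explicit argument triggering a warning when `TRISMIK_API_KEY` is also set. A sketch of the two call styles (the key value is a placeholder; `validate_token` makes a real API call, so a valid key is needed):

```python
from scorebook import login

# Explicit key (placeholder value); warns if TRISMIK_API_KEY is also set
login("tsk-0000-example")

# No argument: resolves TRISMIK_API_KEY from the environment, falling back to a .env file
login()
```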
scorebook/{trismik → dashboard}/upload_results.py
RENAMED
```diff
@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 
 # Known fields that are not metrics or hyperparameters
 KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
-KNOWN_ITEM_FIELDS = {"id", "
+KNOWN_ITEM_FIELDS = {"id", "dataset", "input", "output", "label", "run_id"}
 
 
 def upload_result(
```
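The widened `KNOWN_ITEM_FIELDS` set tells the uploader which item-level keys are structural rather than metric or hyperparameter values. A toy illustration of that partitioning (the item dict is invented, not taken from scorebook's upload format):

```python
KNOWN_ITEM_FIELDS = {"id", "dataset", "input", "output", "label", "run_id"}

item = {"id": 7, "input": "2 + 2 = ?", "output": "4", "label": "4", "accuracy": 1.0}

# Everything not in KNOWN_ITEM_FIELDS would be treated as a metric or hyperparameter
metric_fields = {k: v for k, v in item.items() if k not in KNOWN_ITEM_FIELDS}
print(metric_fields)  # {'accuracy': 1.0}
```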
scorebook/eval_datasets/eval_dataset.py
CHANGED
```diff
@@ -18,8 +18,10 @@ from scorebook.exceptions import (
     DatasetSampleError,
     MissingFieldError,
 )
-from scorebook.metrics import MetricBase
-from scorebook.
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
+from scorebook.utils.io_helpers import validate_path
+from scorebook.utils.render_template import render_template
 
 
 class EvalDataset:
```
scorebook/evaluate/__init__.py
CHANGED
```diff
@@ -1,15 +1 @@
-"""
-Evaluation module for Scorebook.
-
-This module provides both synchronous and asynchronous evaluation functions.
-The async version serves as the source of truth, with the sync version
-automatically generated using unasync.
-"""
-
-# Import from async module
-from ._async.evaluate_async import evaluate_async
-
-# Import from generated sync module
-from ._sync.evaluate import evaluate
-
-__all__ = ["evaluate", "evaluate_async"]
+"""Evaluation module for Scorebook."""
```
scorebook/evaluate/_async/evaluate_async.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from trismik import TrismikAsyncClient, TrismikClient
 from trismik.settings import evaluation_settings
 from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -28,12 +28,9 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import
-
-
-    resolve_show_progress,
-    resolve_upload_results,
-)
+from scorebook.utils.async_utils import async_nullcontext
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -116,8 +113,6 @@ async def evaluate_async(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -154,19 +149,31 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference,
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )
 
         # Update progress bars with items processed and success status
        if progress_bars is not None:
-            # Classic evals
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         # Update upload progress for classic evals
@@ -198,11 +205,12 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""
@@ -221,6 +229,7 @@ async def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )
 
     else:
@@ -341,6 +350,7 @@ async def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -350,7 +360,7 @@ async def execute_adaptive_eval_run(
         raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
     adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
@@ -368,6 +378,7 @@ async def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.
 
@@ -378,6 +389,7 @@ async def run_adaptive_evaluation(
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -392,6 +404,10 @@ async def run_adaptive_evaluation(
         available_splits=available_splits,
     )
 
+    # Create inference function with bound hyperparameters
+    async def inference_with_hyperparams(items: Any) -> Any:
+        return await inference(items, **adaptive_run_spec.hyperparameter_config)
+
     trismik_results = await trismik_client.run(
         test_id=adaptive_run_spec.dataset,
         split=resolved_split,
@@ -402,7 +418,8 @@ async def run_adaptive_evaluation(
             test_configuration={},
             inference_setup={},
         ),
-        item_processor=make_trismik_inference(
+        item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
 
```
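The recurring change in this file is a `(current, total)` progress callback threaded from `execute_runs` down to `trismik_client.run`, so adaptive runs report real per-item progress instead of assuming `max_iterations` items up front. A standalone sketch of the callback shape (the runner below is illustrative, not scorebook code):

```python
from typing import Callable, List, Optional

ProgressCallback = Callable[[int, int], None]


def run_items(items: List[str], on_progress: Optional[ProgressCallback] = None) -> None:
    total = len(items)
    for current, _item in enumerate(items, start=1):
        # ... process the item ...
        if on_progress is not None:
            on_progress(current, total)  # same (current, total) signature as in the diff


run_items(["q1", "q2", "q3"], on_progress=lambda cur, tot: print(f"{cur}/{tot}"))
```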
scorebook/evaluate/_sync/evaluate.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from trismik import TrismikAsyncClient, TrismikClient
 from trismik.settings import evaluation_settings
 from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_datasets import EvalDataset
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -27,12 +27,9 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from
-
-
-    resolve_show_progress,
-    resolve_upload_results,
-)
+from contextlib import nullcontext
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+from scorebook.utils.progress_bars import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -115,8 +112,6 @@ def evaluate(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -153,19 +148,31 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference,
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )
 
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         # Update upload progress for classic evals
@@ -197,11 +204,12 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""
@@ -220,6 +228,7 @@ def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )
 
     else:
@@ -340,6 +349,7 @@ def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -349,7 +359,7 @@ def execute_adaptive_eval_run(
         raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
     adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
@@ -367,6 +377,7 @@ def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.
 
@@ -377,6 +388,7 @@ def run_adaptive_evaluation(
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -391,6 +403,10 @@ def run_adaptive_evaluation(
         available_splits=available_splits,
     )
 
+    # Create inference function with bound hyperparameters
+    def inference_with_hyperparams(items: Any) -> Any:
+        return inference(items, **adaptive_run_spec.hyperparameter_config)
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
         split=resolved_split,
@@ -401,7 +417,8 @@ def run_adaptive_evaluation(
             test_configuration={},
             inference_setup={},
         ),
-        item_processor=make_trismik_inference(
+        item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
 
```
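Both the async and sync variants also bind the run's hyperparameters into the inference callable before handing it to `make_trismik_inference`. The same closure pattern in isolation (the inference function and config below are illustrative, not scorebook APIs):

```python
from typing import Any, Dict, List


def inference(items: List[str], **hyperparams: Any) -> List[str]:
    # Stand-in for a real model call that accepts hyperparameters as kwargs
    temperature = hyperparams.get("temperature", 0.0)
    return [f"answer(t={temperature})" for _ in items]


hyperparameter_config: Dict[str, Any] = {"temperature": 0.7}  # one sweep configuration


def inference_with_hyperparams(items: List[str]) -> List[str]:
    # The closure binds the config so downstream callers only pass items
    return inference(items, **hyperparameter_config)


print(inference_with_hyperparams(["q1", "q2"]))
```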
scorebook/evaluate/evaluate_helpers.py
CHANGED
```diff
@@ -9,7 +9,8 @@ from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
 from trismik.types import TrismikMultipleChoiceTextItem
 
-from scorebook import
+from scorebook.dashboard.credentials import get_token
+from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.exceptions import (
     DataMismatchError,
     MetricComputationError,
@@ -17,9 +18,9 @@ from scorebook.exceptions import (
     ScoreBookError,
 )
 from scorebook.settings import TRISMIK_SERVICE_URL
-from scorebook.trismik.credentials import get_token
 from scorebook.types import AdaptiveEvalDataset, AdaptiveEvalRunSpec, EvalResult, EvalRunSpec
-from scorebook.utils import
+from scorebook.utils.async_utils import is_awaitable
+from scorebook.utils.transform_helpers import expand_dict
 
 logger = logging.getLogger(__name__)
 
```
scorebook/inference/__init__.py
CHANGED
```diff
@@ -1,11 +1 @@
-"""
-Inference module for model execution and predictions.
-
-This module provides functionality for running inference with various models
-and processing their responses. It includes utilities for both single and
-batch inference operations.
-"""
-
-from scorebook.inference.inference_pipeline import InferencePipeline
-
-__all__ = ["InferencePipeline"]
+"""Inference module for model execution and predictions."""
```
scorebook/inference/clients/__init__.py
CHANGED
```diff
@@ -1,8 +1 @@
-"""
-Inference clients for various LLM providers.
-
-This module provides client implementations for different LLM providers including
-OpenAI, AWS Bedrock, Google Vertex AI, and Portkey.
-"""
-
-__all__ = ["bedrock", "openai", "portkey", "vertex"]
+"""Inference clients for various LLM providers."""
```