scorebook 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +8 -1
- scorebook/evaluate/_async/evaluate_async.py +100 -125
- scorebook/evaluate/_sync/evaluate.py +100 -126
- scorebook/evaluate/evaluate_helpers.py +24 -24
- scorebook/exceptions.py +6 -2
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +33 -54
- scorebook/utils/__init__.py +8 -1
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/METADATA +2 -2
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/RECORD +21 -13
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0

scorebook/trismik/upload_results.py
ADDED

@@ -0,0 +1,254 @@
+"""Upload evaluation and scoring results to Trismik's experimentation platform."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from trismik.types import (
+    TrismikClassicEvalItem,
+    TrismikClassicEvalMetric,
+    TrismikClassicEvalRequest,
+    TrismikClassicEvalResponse,
+)
+
+from scorebook.evaluate.evaluate_helpers import (
+    create_trismik_async_client,
+    create_trismik_sync_client,
+    get_model_name,
+    normalize_metric_value,
+)
+
+logger = logging.getLogger(__name__)
+
+# Known fields that are not metrics or hyperparameters
+KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
+KNOWN_ITEM_FIELDS = {"id", "dataset_name", "input", "output", "label", "run_id"}
+
+
+def upload_result(
+    run_result: Dict[str, List[Dict[str, Any]]],
+    experiment_id: str,
+    project_id: str,
+    dataset_name: Optional[str] = None,
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    model_name: Optional[str] = None,
+) -> str:
+    """Upload evaluation or scoring results to Trismik's platform (synchronous).
+
+    This function uploads results in the format returned by the evaluate or score
+    functions to the Trismik platform for tracking and analysis.
+
+    Args:
+        run_result: Dict with keys 'aggregate_results' and 'item_results' containing
+            evaluation/scoring results. Structure matches the output of evaluate()/score().
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        dataset_name: Optional dataset name. If not provided, extracted from metadata
+            or defaults to "Dataset"
+        hyperparameters: Optional dict of hyperparameters. If not provided, extracted
+            from run_result.
+        metadata: Optional metadata dict (can include 'model' and 'dataset' keys)
+        model_name: Optional model name. If not provided, extracted from metadata
+            or defaults to "Model"
+
+    Returns:
+        str: Run ID assigned by Trismik
+
+    Raises:
+        Exception: If upload fails (re-raises underlying exceptions)
+    """
+    # Create Trismik client
+    trismik_client = create_trismik_sync_client()
+
+    # Get model name - use provided model_name, or extract from metadata, or use default
+    if model_name is not None:
+        model = model_name
+    else:
+        model = get_model_name(metadata=metadata)
+
+    # Get dataset name - use provided dataset_name, or extract from metadata, or use default
+    if dataset_name is None:
+        if metadata and "dataset" in metadata:
+            dataset_name = str(metadata["dataset"])
+        else:
+            dataset_name = "Dataset"
+
+    # Extract aggregate and item results
+    aggregate_results = run_result.get("aggregate_results", [])
+    item_results = run_result.get("item_results", [])
+
+    # Use provided hyperparameters or default to empty dict
+    # Note: We don't extract hyperparameters from aggregate_results to avoid
+    # misclassifying metrics as hyperparameters
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    # Create eval items from item_results
+    trismik_items: List[TrismikClassicEvalItem] = []
+    for item in item_results:
+        # Extract inputs, outputs, labels
+        item_id = str(item.get("id", 0))
+        model_input = str(item.get("input", ""))
+        model_output = str(item.get("output", ""))
+        gold_output = str(item.get("label", ""))
+
+        # Extract item-level metrics (exclude known fields and hyperparameters)
+        item_metrics: Dict[str, Any] = {}
+        for key, value in item.items():
+            if key not in KNOWN_ITEM_FIELDS and key not in (hyperparameters or {}):
+                # Normalize metric value for API compatibility
+                item_metrics[key] = normalize_metric_value(value)
+
+        eval_item = TrismikClassicEvalItem(  # pragma: allowlist secret
+            datasetItemId=item_id,
+            modelInput=model_input,
+            modelOutput=model_output,
+            goldOutput=gold_output,
+            metrics=item_metrics,
+        )
+        trismik_items.append(eval_item)
+
+    # Extract aggregate metrics from aggregate_results
+    trismik_metrics: List[TrismikClassicEvalMetric] = []
+    if aggregate_results:
+        for key, value in aggregate_results[0].items():
+            if key not in KNOWN_AGGREGATE_FIELDS and key not in (hyperparameters or {}):
+                # This is a metric  # pragma: allowlist secret
+                metric = TrismikClassicEvalMetric(metricId=key, value=normalize_metric_value(value))
+                trismik_metrics.append(metric)  # pragma: allowlist secret
+
+    # Create classic eval request
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        dataset_name,
+        model,
+        hyperparameters,
+        trismik_items,
+        trismik_metrics,
+    )
+
+    # Submit to Trismik  # pragma: allowlist secret
+    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
+        classic_eval_request
+    )  # pragma: allowlist secret
+
+    run_id: str = response.id
+    logger.info(f"Run result uploaded successfully to Trismik with run_id: {run_id}")
+
+    return run_id
+
+
+async def upload_result_async(
+    run_result: Dict[str, List[Dict[str, Any]]],
+    experiment_id: str,
+    project_id: str,
+    dataset_name: Optional[str] = None,
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    model_name: Optional[str] = None,
+) -> str:
+    """Upload evaluation or scoring results to Trismik's platform (asynchronous).
+
+    This function uploads results in the format returned by the evaluate or
+    score functions to the Trismik platform for tracking and analysis.
+
+    Args:
+        run_result: Dict with keys 'aggregate_results' and 'item_results' containing
+            evaluation/scoring results. Structure matches the output of evaluate()/score().
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        dataset_name: Optional dataset name. If not provided, extracted from metadata
+            or defaults to "Dataset"
+        hyperparameters: Optional dict of hyperparameters. If not provided, extracted
+            from run_result.
+        metadata: Optional metadata dict (can include 'model' and 'dataset' keys)
+        model_name: Optional model name. If not provided, extracted from metadata
+            or defaults to "Model"
+
+    Returns:
+        str: Run ID assigned by Trismik
+
+    Raises:
+        Exception: If upload fails (re-raises underlying exceptions)
+    """
+    # Create Trismik async client
+    trismik_client = create_trismik_async_client()
+
+    # Get model name - use provided model_name, or extract from metadata, or use default
+    if model_name is not None:
+        model = model_name
+    else:
+        model = get_model_name(metadata=metadata)
+
+    # Get dataset name - use provided dataset_name, or extract from metadata, or use default
+    if dataset_name is None:
+        if metadata and "dataset" in metadata:
+            dataset_name = str(metadata["dataset"])
+        else:
+            dataset_name = "Dataset"
+
+    # Extract aggregate and item results
+    aggregate_results = run_result.get("aggregate_results", [])
+    item_results = run_result.get("item_results", [])
+
+    # Use provided hyperparameters or default to empty dict
+    # Note: We don't extract hyperparameters from aggregate_results to avoid
+    # misclassifying metrics as hyperparameters
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    # Create eval items from item_results
+    trismik_items: List[TrismikClassicEvalItem] = []
+    for item in item_results:
+        # Extract inputs, outputs, labels
+        item_id = str(item.get("id", 0))
+        model_input = str(item.get("input", ""))
+        model_output = str(item.get("output", ""))
+        gold_output = str(item.get("label", ""))
+
+        # Extract item-level metrics (exclude known fields and hyperparameters)
+        item_metrics: Dict[str, Any] = {}
+        for key, value in item.items():
+            if key not in KNOWN_ITEM_FIELDS and key not in (hyperparameters or {}):
+                # Normalize metric value for API compatibility
+                item_metrics[key] = normalize_metric_value(value)
+
+        eval_item = TrismikClassicEvalItem(  # pragma: allowlist secret
+            datasetItemId=item_id,
+            modelInput=model_input,
+            modelOutput=model_output,
+            goldOutput=gold_output,
+            metrics=item_metrics,
+        )
+        trismik_items.append(eval_item)
+
+    # Extract aggregate metrics from aggregate_results
+    trismik_metrics: List[TrismikClassicEvalMetric] = []
+    if aggregate_results:
+        for key, value in aggregate_results[0].items():
+            if key not in KNOWN_AGGREGATE_FIELDS and key not in (hyperparameters or {}):
+                # This is a metric  # pragma: allowlist secret
+                metric = TrismikClassicEvalMetric(metricId=key, value=normalize_metric_value(value))
+                trismik_metrics.append(metric)  # pragma: allowlist secret
+
+    # Create classic eval request
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        dataset_name,
+        model,
+        hyperparameters,
+        trismik_items,
+        trismik_metrics,
+    )
+
+    # Submit to Trismik (async)  # pragma: allowlist secret
+    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
+        classic_eval_request
+    )  # pragma: allowlist secret
+
+    run_id: str = response.id
+    logger.info(f"Run result uploaded successfully to Trismik with run_id: {run_id}")
+
+    return run_id
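
As a quick orientation for the new module, here is a minimal usage sketch. The experiment and project identifiers, the dataset and model names, the "accuracy" metric key, and the temperature hyperparameter are placeholders rather than values taken from this release; the run_result shape follows the docstring above. upload_result_async is the awaitable equivalent.

from scorebook.trismik.upload_results import upload_result

# Shape mirrors the output of evaluate()/score(): one aggregate row plus per-item rows.
run_result = {
    "aggregate_results": [
        {"dataset": "demo-dataset", "accuracy": 0.5, "run_completed": True},
    ],
    "item_results": [
        {"id": 0, "input": "2 + 2 = ?", "output": "4", "label": "4", "accuracy": 1},
        {"id": 1, "input": "3 + 5 = ?", "output": "7", "label": "8", "accuracy": 0},
    ],
}

run_id = upload_result(
    run_result,
    experiment_id="exp-placeholder",       # placeholder identifier
    project_id="proj-placeholder",         # placeholder identifier
    dataset_name="demo-dataset",
    hyperparameters={"temperature": 0.0},  # hypothetical hyperparameter
    model_name="demo-model",
)
print(run_id)  # run ID assigned by Trismik
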
scorebook/types.py
CHANGED

@@ -1,9 +1,15 @@
 """Type definitions for scorebook evaluation framework."""
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
 from scorebook.eval_datasets import EvalDataset
+from scorebook.metrics.metric_base import MetricBase
+
+# Type alias for metrics parameter
+Metrics = Union[
+    str, "MetricBase", Type["MetricBase"], Sequence[Union[str, "MetricBase", Type["MetricBase"]]]
+]
 
 
 @dataclass

@@ -54,73 +60,37 @@ class ClassicEvalRunResult:
     run_spec: EvalRunSpec
     run_completed: bool
     outputs: Optional[List[Any]]
-    scores: Optional[Dict[str, Any]]
+    scores: Optional[Dict[str, List[Dict[str, Any]]]]  # score_async format
     run_id: Optional[str] = None
 
     @property
     def item_scores(self) -> List[Dict[str, Any]]:
         """Return a list of dictionaries containing scores for each evaluated item."""
-
-
-
-
-            if idx >= len(self.run_spec.inputs):
-                break
-
-            result = {
-                "id": idx,
-                "dataset_name": self.run_spec.dataset.name,
-                "input": self.run_spec.inputs[idx],
-                "label": self.run_spec.labels[idx] if idx < len(self.run_spec.labels) else None,
-                "output": output,
-                **self.run_spec.hyperparameter_config,
-            }
-
-            # Add run_id if available
-            if self.run_id is not None:
-                result["run_id"] = self.run_id
-
-            # Add individual item scores if available
-            if self.scores is not None:
-                for metric_name, metric_data in self.scores.items():
-                    if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                        if idx < len(metric_data["item_scores"]):
-                            result[metric_name] = metric_data["item_scores"][idx]
-                    else:
-                        # If scores is just a single value, replicate it for each item
-                        result[metric_name] = metric_data
-
-            results.append(result)
-
-        return results
+        if self.scores and "item_results" in self.scores:
+            # score_async already built this in the exact format we need
+            return self.scores["item_results"]
+        return []
 
     @property
     def aggregate_scores(self) -> Dict[str, Any]:
         """Return the aggregated scores for this run."""
-
+        if (
+            self.scores
+            and "aggregate_results" in self.scores
+            and len(self.scores["aggregate_results"]) > 0
+        ):
+            result = self.scores["aggregate_results"][0].copy()
+            # Add run_completed (not included in score_async format)
+            result["run_completed"] = self.run_completed
+            return result
+
+        # Fallback if no scores available
+        return {
             "dataset": self.run_spec.dataset.name,
             "run_completed": self.run_completed,
             **self.run_spec.hyperparameter_config,
         }
 
-        # Add run_id if available
-        if self.run_id is not None:
-            result["run_id"] = self.run_id
-
-        # Add aggregate scores from metrics
-        if self.scores is not None:
-            for metric_name, metric_data in self.scores.items():
-                if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                    # Flatten the aggregate scores from each metric
-                    for key, value in metric_data["aggregate_scores"].items():
-                        score_key = key if key == metric_name else f"{metric_name}_{key}"
-                        result[score_key] = value
-                else:
-                    # If scores is just a single value, use it as is
-                    result[metric_name] = metric_data
-
-        return result
-
 
 @dataclass
 class AdaptiveEvalRunResult:

@@ -180,3 +150,12 @@ class EvalResult:
         results.append(run_result.aggregate_scores)
 
         return results
+
+
+@dataclass
+class MetricScore:
+    """Container for metric scores across multiple runs."""
+
+    metric_name: str
+    aggregate_scores: Dict[str, Any]
+    item_scores: List[Dict[str, Any]]
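
The net effect of these changes is that ClassicEvalRunResult.scores now holds the dict produced by score()/score_async() rather than per-metric dictionaries. A rough sketch of that expected shape, with illustrative metric names and values:

# Illustrative only: the "accuracy" key and the values are placeholders.
scores = {
    "aggregate_results": [
        {"dataset": "demo-dataset", "accuracy": 0.5},
    ],
    "item_results": [
        {"id": 0, "dataset_name": "demo-dataset", "input": "2 + 2 = ?", "output": "4", "label": "4", "accuracy": 1},
    ],
}
# item_scores returns scores["item_results"] as-is; aggregate_scores returns
# scores["aggregate_results"][0] with a "run_completed" flag added.
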
scorebook/utils/__init__.py
CHANGED

@@ -1,16 +1,23 @@
 """Utility functions and common helpers for the Scorebook framework."""
 
+from contextlib import nullcontext
+
 from scorebook.utils.async_utils import async_nullcontext, is_awaitable
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
 from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress_context
+from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
 from scorebook.utils.render_template import render_template
 from scorebook.utils.transform_helpers import expand_dict
 
 __all__ = [
     "async_nullcontext",
+    "nullcontext",
     "is_awaitable",
+    "resolve_show_progress",
+    "resolve_upload_results",
     "validate_path",
     "expand_dict",
     "evaluation_progress_context",
+    "scoring_progress_context",
     "render_template",
 ]

scorebook/utils/common_helpers.py
ADDED

@@ -0,0 +1,41 @@
+"""Common helper functions shared across scorebook modules."""
+
+import logging
+from typing import Literal, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
+    """Resolve the upload_results parameter based on trismik login status.
+
+    Args:
+        upload_results: Can be True, False, or "auto". When "auto", resolves to True
+            if user is logged in to Trismik, False otherwise.
+
+    Returns:
+        bool: Whether to upload results to Trismik
+    """
+    if upload_results == "auto":
+        from scorebook.trismik.credentials import get_token
+
+        upload_results = get_token() is not None
+        logger.debug("Auto upload results resolved to: %s", upload_results)
+
+    return upload_results
+
+
+def resolve_show_progress(show_progress: Optional[bool]) -> bool:
+    """Resolve whether to show progress bars.
+
+    Args:
+        show_progress: Explicit setting (None uses default from settings)
+
+    Returns:
+        bool: Whether to show progress bars
+    """
+    if show_progress is None:
+        from scorebook.settings import SHOW_PROGRESS_BARS
+
+        return bool(SHOW_PROGRESS_BARS)
+    return show_progress
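
A brief behaviour sketch of the two helpers, matching the code above; the import path relies on the re-exports added to scorebook/utils/__init__.py:

from scorebook.utils import resolve_show_progress, resolve_upload_results

resolve_upload_results(True)    # explicit booleans pass through unchanged
resolve_upload_results("auto")  # True only if credentials.get_token() finds a stored Trismik token

resolve_show_progress(None)     # falls back to scorebook.settings.SHOW_PROGRESS_BARS
resolve_show_progress(False)    # explicit booleans pass through unchanged
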
scorebook/utils/progress_bars.py
CHANGED

@@ -787,3 +787,70 @@ def evaluation_progress_context(
         yield progress_bars
     finally:
         progress_bars.close_progress_bars()
+
+
+@contextmanager
+def scoring_progress_context(
+    total_metrics: int,
+    enabled: bool = True,
+) -> Generator[Optional[tqdm], None, None]:
+    """Context manager for scoring progress display.
+
+    Args:
+        total_metrics: Total number of metrics to score
+        enabled: Whether to show progress bar (default: True)
+
+    Yields:
+        Optional[tqdm]: Progress bar instance (None if disabled)
+    """
+    if not enabled:
+        yield None
+        return
+
+    # Use appropriate spinner frames based on environment
+    spinner_frames = SPINNER_FRAMES if SPINNER_FRAMES else ["|"]
+    spinner_cycle_obj = cycle(spinner_frames)
+
+    # Get initial spinner frame
+    initial_frame = next(spinner_cycle_obj)
+
+    progress_bar = tqdm(
+        total=total_metrics,
+        desc=f"{initial_frame} Scoring metrics",
+        unit="metric",
+        leave=False,
+        bar_format="{desc} | {n}/{total} metrics {percentage:3.0f}%|{bar}|",
+    )
+
+    # Start spinner animation thread
+    stop_event = threading.Event()
+    current_metric_name = [""]  # List to allow mutation in closure
+
+    def animate_spinner() -> None:
+        """Update spinner and description in background thread."""
+        while not stop_event.is_set():
+            try:
+                frame = next(spinner_cycle_obj)
+                metric_suffix = f": {current_metric_name[0]}" if current_metric_name[0] else ""
+                progress_bar.set_description_str(
+                    f"{frame} Scoring metrics{metric_suffix}", refresh=True
+                )
+                time.sleep(SPINNER_INTERVAL_SECONDS)
+            except Exception:
+                break
+
+    spinner_thread = threading.Thread(target=animate_spinner, daemon=True)
+    spinner_thread.start()
+
+    # Attach helper method to update current metric name
+    def set_current_metric(metric_name: str) -> None:
+        current_metric_name[0] = metric_name
+
+    progress_bar.set_current_metric = set_current_metric
+
+    try:
+        yield progress_bar
+    finally:
+        stop_event.set()
+        spinner_thread.join(timeout=1.0)
+        progress_bar.close()
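
A usage sketch for the new scoring progress context; the metric names are placeholders. The context manager yields a tqdm bar (or None when disabled), and the attached set_current_metric() updates the spinner text while a metric is being scored:

from scorebook.utils import scoring_progress_context

metric_names = ["accuracy", "precision"]  # placeholder metric names

with scoring_progress_context(total_metrics=len(metric_names), enabled=True) as bar:
    for name in metric_names:
        if bar is not None:
            bar.set_current_metric(name)
        # ... compute the metric's scores here ...
        if bar is not None:
            bar.update(1)  # standard tqdm advance per completed metric
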
{scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.11
+Version: 0.0.12
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell

@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (
+Requires-Dist: trismik (==1.0.1)
 Description-Content-Type: text/markdown
 
 # Scorebook

{scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-scorebook/__init__.py,sha256=
+scorebook/__init__.py,sha256=dcaqd4-qxLHPCw6p-LS_0b8JumEpHDtEilgwP8qNKRY,868
 scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
 scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
 scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589

@@ -6,11 +6,11 @@ scorebook/eval_datasets/__init__.py,sha256=9YPjxjdaMaOrBUzJwvsUlFPl-KdYMgUGTV3WN
 scorebook/eval_datasets/eval_dataset.py,sha256=6GgrAaWelU5dK6I-x9zXHCxVSfvo41yyYNPF0ue4zbo,27200
 scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
 scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_async/evaluate_async.py,sha256=
+scorebook/evaluate/_async/evaluate_async.py,sha256=qZ2y7-uQRT1b4saBoNaPO9fv4G2LhcP_ZyvkSsIEgHg,15629
 scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_sync/evaluate.py,sha256=
-scorebook/evaluate/evaluate_helpers.py,sha256=
-scorebook/exceptions.py,sha256=
+scorebook/evaluate/_sync/evaluate.py,sha256=xanhHu-CaY_WarNM1V64W2sHttkM18j42K0MKrdtrvE,15438
+scorebook/evaluate/evaluate_helpers.py,sha256=swbgB1LurWdufeiVIZZ7ildsYO-ptC7uF3x6AVgptkU,13809
+scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
 scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
 scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
 scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071

@@ -23,20 +23,28 @@ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo
 scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
 scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
 scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+scorebook/score/__init__.py,sha256=pwjSEb8Tc1edQpYDuu49wnupazISpRX3DQGD2cfiJek,208
+scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_async/score_async.py,sha256=GM84UcuFvW1x6ZIePEshG2cwVNB9GvwhhjouOduUwTA,6097
+scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_sync/score.py,sha256=rbJhYEhu8auHG4AwpZIkmzw_0ZK1bzbDiIK7Q0ApxhY,6043
+scorebook/score/score_helpers.py,sha256=lq0t5UrOgxa_pDiwL3yHbBlT2BL5B-SkWw1nyaXVoZU,7074
 scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
 scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
 scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
-scorebook/
-scorebook/
+scorebook/trismik/upload_results.py,sha256=jgT9EVFpuv6OmrYgZVi032cbRrcCOyX4ulLDeWPFBWU,9743
+scorebook/types.py,sha256=x5bD2DU-Xafh7pXwmaQQ1i1zoZDsniHJjE-UEfXySAg,4827
+scorebook/utils/__init__.py,sha256=crefSaTUWkhFF-w4kotUzcz9_GGZukQDgRit4HxJRHY,805
 scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+scorebook/utils/common_helpers.py,sha256=jewPdQH4JqTWcYT31wn1WNucOPLtGbrGdViwwlYRhD4,1216
 scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
 scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
 scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
-scorebook/utils/progress_bars.py,sha256=
+scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
 scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
 scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
-scorebook-0.0.
-scorebook-0.0.
-scorebook-0.0.
-scorebook-0.0.
-scorebook-0.0.
+scorebook-0.0.12.dist-info/METADATA,sha256=bMjbT1e0GYExB1HcBkAfesaUcXK2-Pck5ox2oCUBXpE,11508
+scorebook-0.0.12.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+scorebook-0.0.12.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+scorebook-0.0.12.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.12.dist-info/RECORD,,

{scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/WHEEL: File without changes
{scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt: File without changes
{scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE: File without changes