scorebook-0.0.14-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/score/score_helpers.py
CHANGED
```diff
@@ -4,8 +4,8 @@ import logging
 from typing import Any, Dict, List, Mapping, Optional, Type, Union
 
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
 from scorebook.utils.async_utils import is_awaitable
 
@@ -89,7 +89,7 @@ async def calculate_metric_scores_async(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             aggregate_scores, item_scores = await metric.score(outputs, labels)
@@ -134,7 +134,7 @@ def calculate_metric_scores(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             raise ParameterValidationError(
@@ -164,18 +164,27 @@ def format_results(
     hyperparameters = hyperparameters or {}
     dataset_name = dataset_name or "scored_items"
 
+    # Detect key collisions across all metrics (for both aggregate and item scores)
+    all_keys: Dict[str, set] = {}
+    for metric_score in metric_scores:
+        for key in metric_score.aggregate_scores.keys():
+            all_keys.setdefault(key, set()).add(metric_score.metric_name)
+        # Also check item_scores keys if they are dicts
+        if metric_score.item_scores and isinstance(metric_score.item_scores[0], dict):
+            for key in metric_score.item_scores[0].keys():
+                all_keys.setdefault(key, set()).add(metric_score.metric_name)
+    colliding_keys = {k for k, metrics in all_keys.items() if len(metrics) > 1}
+
     # Build aggregate results
-    aggregate_result = {
+    aggregate_result: Dict[str, Any] = {
         "dataset": dataset_name,
         **hyperparameters,
     }
 
-    # Add aggregate scores from metrics
+    # Add aggregate scores from metrics (flat, with suffix on collision)
     for metric_score in metric_scores:
         for key, value in metric_score.aggregate_scores.items():
-            score_key = (
-                key if key == metric_score.metric_name else f"{metric_score.metric_name}_{key}"
-            )
+            score_key = f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
             aggregate_result[score_key] = value
 
     # Build item results
@@ -193,10 +202,18 @@ def format_results(
         if inputs is not None and inputs[idx] is not None:
             item_result["input"] = inputs[idx]
 
-        # Add item-level metric scores
+        # Add item-level metric scores (flat, with suffix on collision)
         for metric_score in metric_scores:
            if idx < len(metric_score.item_scores):
-
+                item_scores = metric_score.item_scores[idx]
+                if isinstance(item_scores, dict):
+                    for key, value in item_scores.items():
+                        score_key = (
+                            f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
+                        )
+                        item_result[score_key] = value
+                else:
+                    item_result[metric_score.metric_name] = item_scores
 
         item_results.append(item_result)
 
```
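To make the key-naming change above concrete, here is a minimal sketch of the flat-with-suffix behaviour that the new `format_results` logic implies. The metric names and score dictionaries are hypothetical, and the snippet only mirrors the rule shown in the hunk rather than calling Scorebook itself.

```python
# Hypothetical aggregate scores from two metrics that both report a "score" key.
# A key used by only one metric stays flat; a key reported by multiple metrics
# gains a "_<metric_name>" suffix so nothing is silently overwritten.
metric_scores = {
    "accuracy": {"accuracy": 0.91, "score": 0.91},
    "bleu": {"bleu": 0.34, "score": 0.34},
}

# Collision detection mirroring the diff: record which metrics use each key.
all_keys: dict[str, set] = {}
for metric_name, scores in metric_scores.items():
    for key in scores:
        all_keys.setdefault(key, set()).add(metric_name)
colliding_keys = {k for k, users in all_keys.items() if len(users) > 1}

aggregate_result: dict[str, float] = {}
for metric_name, scores in metric_scores.items():
    for key, value in scores.items():
        score_key = f"{key}_{metric_name}" if key in colliding_keys else key
        aggregate_result[score_key] = value

print(aggregate_result)
# {'accuracy': 0.91, 'score_accuracy': 0.91, 'bleu': 0.34, 'score_bleu': 0.34}
```

Under the previous rule (`f"{metric_name}_{key}"` whenever `key != metric_name`), shared keys were always prefixed; the new rule keeps keys flat and only disambiguates, with a suffix, when two metrics actually collide.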
scorebook/types.py
CHANGED
```diff
@@ -4,11 +4,11 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
 from scorebook.eval_datasets.eval_dataset import EvalDataset
-from scorebook.metrics.metric_base import MetricBase
+from scorebook.metrics.core.metric_base import MetricBase
 
 # Type alias for metrics parameter
 Metrics = Union[
-    str,
+    str, MetricBase, Type[MetricBase], Sequence[Union[str, MetricBase, Type[MetricBase]]]
 ]
 
 
```
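The widened `Metrics` alias accepts a metric name, a `MetricBase` class, an instance, or a sequence mixing all three. Below is a self-contained sketch of the accepted shapes, using a stand-in `MetricBase` and a hypothetical `score` signature rather than Scorebook's real API.

```python
from typing import Sequence, Type, Union


class MetricBase:
    """Stand-in for scorebook.metrics.core.metric_base.MetricBase (illustrative only)."""


# The widened alias from the diff: a metric can be referenced by registry name,
# by class, by instance, or by a sequence mixing any of these forms.
Metrics = Union[str, MetricBase, Type[MetricBase], Sequence[Union[str, MetricBase, Type[MetricBase]]]]


def score(metrics: Metrics) -> None:
    """Hypothetical signature; shows the call shapes the alias now allows."""


score("accuracy")                  # by registry name
score(MetricBase)                  # by class
score(MetricBase())                # by instance
score(["accuracy", MetricBase()])  # mixed sequence
```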