scorebook 0.0.14-py3-none-any.whl → 0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/metrics/core/metric_registry.py
ADDED
````diff
@@ -0,0 +1,195 @@
+"""
+Registry module for evaluation metrics.
+
+This module maintains a centralized registry of available evaluation metrics
+that can be used to assess model performance. It provides a single access point
+to retrieve all implemented metric classes.
+"""
+
+import importlib
+from typing import Any, Callable, Dict, List, Type, Union
+
+from scorebook.metrics.core.metric_base import MetricBase
+
+
+class MetricRegistry:
+    """A registry for evaluation metrics.
+
+    This class provides a central registry for all evaluation metrics in the system.
+    Metrics are lazily loaded on demand - when you request a metric by name, it will
+    be automatically imported from the metrics directory using a naming convention.
+
+    Naming Convention:
+        All metric names are normalized by:
+        - Converting to lowercase
+        - Removing all underscores and spaces
+
+        Module files must follow this normalized naming (lowercase, no underscores/spaces):
+        Examples:
+            Class "Accuracy" → module "accuracy.py"
+            Class "F1Score" → module "f1score.py"
+            Class "MeanSquaredError" → module "meansquarederror.py"
+
+        User input is also normalized, so all variations work:
+            "f1_score", "F1Score", "f1 score" → all resolve to "f1score"
+
+    Collision Detection:
+        Class names that normalize to the same key will raise an error:
+            "F1Score" and "F1_Score" both → "f1score" (COLLISION)
+            "MetricName" and "Metric_Name" both → "metricname" (COLLISION)
+
+    Security:
+        Lazy loading is restricted to modules in the "scorebook.metrics." namespace.
+        Python's import system validates module names.
+    """
+
+    _registry: Dict[str, Type[MetricBase]] = {}
+
+    @staticmethod
+    def _normalize_name(name: str) -> str:
+        """Normalize a metric name to a registry key."""
+        return name.lower().replace("_", "").replace(" ", "")
+
+    @classmethod
+    def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
+        """Register a metric class in the registry.
+
+        Returns:
+            A decorator that registers the class and returns it.
+
+        Raises:
+            ValueError: If a metric with the given name is already registered.
+        """
+
+        def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
+
+            # Normalize the class name to a registry key
+            key = cls._normalize_name(metric_cls.__name__)
+            if key in cls._registry:
+                raise ValueError(
+                    f"Metric '{key}' is already registered. "
+                    f"Class names '{metric_cls.__name__}' and "
+                    f"'{cls._registry[key].__name__}' both normalize to '{key}'."
+                )
+            cls._registry[key] = metric_cls
+            return metric_cls
+
+        return decorator
+
+    @classmethod
+    def _lazy_load_metric(cls, normalized_key: str) -> None:
+        """Attempt to lazily load a metric module using naming convention.
+
+        Module files must be named using the normalized key (lowercase, no underscores/spaces).
+
+        Args:
+            normalized_key: The normalized metric name (lowercase, no underscores/spaces): "f1score"
+
+        Raises:
+            ValueError: If the module doesn't exist or fails to register
+            ImportError: If the module exists but has import errors
+        """
+        # Check if already registered
+        if normalized_key in cls._registry:
+            return
+
+        # Try to import the module using the normalized key
+        try:
+            importlib.import_module(f"scorebook.metrics.{normalized_key}")
+        except ModuleNotFoundError:
+            # Module doesn't exist - provide helpful error
+            error_msg = (
+                f"Metric '{normalized_key}' could not be found. "
+                f"Attempted to import from 'scorebook.metrics.{normalized_key}'."
+            )
+            if cls._registry:
+                registered = ", ".join(sorted(cls._registry.keys()))
+                error_msg += f" Currently registered metrics: {registered}"
+            else:
+                error_msg += " No metrics are currently registered."
+            raise ValueError(error_msg)
+        except ImportError as e:
+            # Module exists but has import errors - re-raise with context
+            raise ImportError(
+                f"Failed to load metric '{normalized_key}' from module "
+                f"'scorebook.metrics.{normalized_key}': {e}"
+            ) from e
+
+    @classmethod
+    def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
+        """
+        Get an instance of a registered metric by name or class.
+
+        Args:
+            name_or_class: The metric name (string) or class (subclass of MetricBase).
+            **kwargs: Additional arguments to pass to the metric's constructor.
+
+        Returns:
+            An instance of the requested metric.
+
+        Raises:
+            ValueError: If the metric cannot be found or loaded.
+            ImportError: If lazy loading fails due to import errors.
+        """
+        # If input is a class that's a subclass of MetricBase, instantiate it directly
+        if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
+            return name_or_class(**kwargs)
+
+        # If input is a string, look up the class in the registry
+        if isinstance(name_or_class, str):
+            # Normalize the input to a registry key
+            normalized_key = cls._normalize_name(name_or_class)
+
+            # Try lazy loading if not already registered
+            if normalized_key not in cls._registry:
+                cls._lazy_load_metric(normalized_key)
+
+            # After lazy loading attempt, check registry
+            if normalized_key not in cls._registry:
+                raise ValueError(
+                    f"Metric '{name_or_class}' module was loaded but failed to register. "
+                    f"Ensure the metric class has the @scorebook_metric decorator."
+                )
+
+            return cls._registry[normalized_key](**kwargs)
+
+        raise ValueError(
+            f"Invalid metric type: {type(name_or_class)}. "
+            f"Must be string name or MetricBase subclass"
+        )
+
+    @classmethod
+    def list_metrics(cls) -> List[str]:
+        """
+        List all registered metrics.
+
+        Returns:
+            A list of metric names.
+        """
+        return list(cls._registry.keys())
+
+
+def scorebook_metric(cls: Type[MetricBase]) -> Type[MetricBase]:
+    """Register a custom metric with Scorebook.
+
+    Args:
+        cls: A metric class that inherits from MetricBase
+
+    Returns:
+        The same class, now registered with Scorebook
+
+    Example:
+        ```python
+        from scorebook import scorebook_metric
+        from scorebook.metrics import MetricBase
+
+        @scorebook_metric
+        class MyCustomMetric(MetricBase):
+            def score(self, outputs, labels):
+                # Your metric implementation
+                return aggregate_scores, item_scores
+        ```
+    Raises:
+        ValueError: If a metric with the same normalized name is already registered
+    """
+    return MetricRegistry.register()(cls)
````
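A minimal usage sketch of the new registry, based only on the code above; it assumes MetricBase requires nothing beyond a `score()` implementation (as the built-in metrics suggest), `AlwaysZero` is a hypothetical custom metric, and the printed list is illustrative:

```python
from scorebook.metrics import MetricBase, scorebook_metric
from scorebook.metrics.core.metric_registry import MetricRegistry

# String lookups are normalized (lowercase, underscores/spaces removed) and then
# lazily imported from scorebook.metrics.<normalizedname>, so "Exact_Match"
# resolves to scorebook/metrics/exactmatch.py.
exact_match = MetricRegistry.get("Exact_Match", strip_punctuation=True)

# Passing a MetricBase subclass bypasses the lookup and instantiates it directly;
# the decorator still registers it under its normalized class name ("alwayszero").
@scorebook_metric
class AlwaysZero(MetricBase):  # hypothetical custom metric for illustration
    def score(self, outputs, labels):
        return {"always_zero": 0.0}, [0.0] * len(outputs)

metric = MetricRegistry.get(AlwaysZero)
print(MetricRegistry.list_metrics())  # e.g. ['exactmatch', 'alwayszero']
```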
scorebook/metrics/exactmatch.py
ADDED
```diff
@@ -0,0 +1,95 @@
+"""Exact Match metric implementation for Scorebook."""
+
+import string
+from typing import Any, Dict, List, Tuple
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class ExactMatch(MetricBase):
+    """Exact Match metric for evaluating string predictions.
+
+    Compares strings for exact equality with optional preprocessing.
+
+    Args:
+        case_insensitive: If True, convert strings to lowercase before comparison.
+            Defaults to True.
+        strip: If True, strip leading and trailing whitespace before comparison.
+            Defaults to True.
+        strip_punctuation: If True, strip leading and trailing punctuation before
+            comparison. Defaults to False.
+    """
+
+    @property
+    def name(self) -> str:
+        """Return the metric name."""
+        return "exact_match"
+
+    def __init__(
+        self,
+        case_insensitive: bool = True,
+        strip: bool = True,
+        strip_punctuation: bool = False,
+    ) -> None:
+        """Initialize ExactMatch metric with preprocessing options.
+
+        Args:
+            case_insensitive: If True, convert strings to lowercase before comparison.
+                Defaults to True.
+            strip: If True, strip leading and trailing whitespace before comparison.
+                Defaults to True.
+            strip_punctuation: If True, strip leading and trailing punctuation before
+                comparison. Defaults to False.
+        """
+        self.case_insensitive = case_insensitive
+        self.strip = strip
+        self.strip_punctuation = strip_punctuation
+
+    def _preprocess(self, value: Any) -> Any:
+        """Apply preprocessing to a value if it's a string.
+
+        Args:
+            value: The value to preprocess.
+
+        Returns:
+            The preprocessed value (string) or original value (non-string).
+        """
+        if not isinstance(value, str):
+            return value
+
+        result = value
+        if self.strip:
+            result = result.strip()
+        if self.strip_punctuation:
+            result = result.strip(string.punctuation)
+        if self.case_insensitive:
+            result = result.lower()
+        return result
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate the exact match score between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            The aggregate exact match score for all items (matches / total).
+            The item scores for each output-label pair (true/false).
+        """
+        if not outputs:
+            return {"exact_match": 0.0}, []
+
+        # Calculate item scores with preprocessing
+        item_scores = [
+            self._preprocess(output) == self._preprocess(label)
+            for output, label in zip(outputs, labels)
+        ]
+
+        # Calculate aggregate score
+        matches = sum(item_scores)
+        total = len(outputs)
+        aggregate_scores = {"exact_match": matches / total}
+
+        return aggregate_scores, item_scores
```
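A short, self-contained sketch of the preprocessing options above; the results follow directly from the code shown:

```python
from scorebook.metrics.exactmatch import ExactMatch

# Defaults: strip whitespace and lowercase, keep punctuation.
aggregate, items = ExactMatch().score(["  Paris ", "42"], ["paris", "43"])
# items == [True, False]; aggregate == {"exact_match": 0.5}

# With strip_punctuation=True, leading/trailing punctuation is removed too,
# so "Paris!" matches "paris".
print(ExactMatch(strip_punctuation=True).score(["Paris!"], ["paris"]))
# ({'exact_match': 1.0}, [True])
```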
scorebook/metrics/f1.py
ADDED
```diff
@@ -0,0 +1,96 @@
+"""F1 metric implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple, Union
+
+from sklearn.metrics import f1_score
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class F1(MetricBase):
+    """F1 score metric for evaluating model predictions using scikit-learn.
+
+    F1 = 2 * (Precision * Recall) / (Precision + Recall)
+    where:
+        - Precision = TP / (TP + FP)
+        - Recall = TP / (TP + FN)
+
+    This metric can handle both binary and multi-class classification tasks.
+
+    Args:
+        average: The averaging method(s) for multi-class classification.
+            Can be a single string or list of strings:
+            - 'macro': Unweighted mean across labels
+            - 'micro': Global calculation counting total TP, FP, FN
+            - 'weighted': Weighted mean by support
+            - 'all': All three methods simultaneously
+            - List of methods: Calculate multiple methods
+            Defaults to 'macro'.
+    """
+
+    def __init__(self, average: Union[str, List[str]] = "macro", **kwargs: Any) -> None:
+        """Initialize F1 metric with specified averaging method(s).
+
+        Args:
+            average: Averaging method(s) - string or list of strings.
+                Options: 'macro', 'micro', 'weighted', 'all', or a list of methods.
+                Defaults to 'macro'.
+            **kwargs: Additional keyword arguments passed to scikit-learn's f1_score function.
+
+        Raises:
+            ValueError: If average contains invalid methods or combines 'all' with others.
+        """
+        # Normalize to list for validation
+        averages = [average] if isinstance(average, str) else average
+
+        # Validate
+        valid = {"macro", "micro", "weighted", "all"}
+        if not all(a in valid for a in averages):
+            raise ValueError(f"Invalid average method(s). Must be from {valid}.")
+        if len(averages) > 1 and "all" in averages:
+            raise ValueError("'all' cannot be combined with other methods.")
+
+        self.average = average
+        self.kwargs = kwargs
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate F1 score between predictions and references using scikit-learn.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+                - aggregate_scores (Dict[str, float]): Dictionary with F1 scores
+                  keyed by averaging method (e.g., {"F1 (macro)": 0.85} or
+                  {"F1 (macro)": 0.85, "F1 (micro)": 0.82}).
+                - item_scores (List[bool]): True/False list indicating correct
+                  predictions.
+
+        """
+        # Normalize to list of methods to calculate
+        if isinstance(self.average, str):
+            methods = ["macro", "micro", "weighted"] if self.average == "all" else [self.average]
+        else:
+            methods = self.average
+
+        # Handle empty lists
+        if not outputs:
+            return {f"F1 ({method})": 0.0 for method in methods}, []
+
+        # Calculate F1 score using scikit-learn with configured averaging method
+        # Default zero_division=0 unless overridden in kwargs
+        kwargs = {"zero_division": 0, **self.kwargs}
+
+        # Calculate item scores (correctness of each prediction)
+        item_scores = [output == label for output, label in zip(outputs, labels)]
+
+        # Calculate F1 for each method
+        aggregate_scores = {
+            f"F1 ({method})": f1_score(labels, outputs, average=method, **kwargs)
+            for method in methods
+        }
+
+        return aggregate_scores, item_scores
```
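A sketch of how the averaging options above shape the returned keys; the numeric values depend on scikit-learn and are elided:

```python
from scorebook.metrics.f1 import F1

outputs = ["cat", "dog", "dog", "bird"]
labels = ["cat", "dog", "bird", "bird"]

# A single averaging method yields one aggregate key.
aggregate, items = F1(average="macro").score(outputs, labels)
# aggregate -> {"F1 (macro)": ...}; items -> [True, True, False, True]

# average="all" expands to macro, micro and weighted in one call.
aggregate, _ = F1(average="all").score(outputs, labels)
# aggregate -> {"F1 (macro)": ..., "F1 (micro)": ..., "F1 (weighted)": ...}
```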
scorebook/metrics/precision.py
CHANGED
```diff
@@ -1,19 +1,94 @@
 """Precision metric implementation for Scorebook."""
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Union
 
-from
-from scorebook.metrics.metric_registry import MetricRegistry
+from sklearn.metrics import precision_score
 
+from scorebook.metrics import MetricBase, scorebook_metric
 
-
+
+@scorebook_metric
 class Precision(MetricBase):
-    """Precision metric for
+    """Precision score metric for evaluating model predictions using scikit-learn.
 
     Precision = TP / (TP + FP)
+
+    This metric can handle both binary and multi-class classification tasks.
+
+    Args:
+        average: The averaging method(s) for multi-class classification.
+            Can be a single string or list of strings:
+            - 'macro': Unweighted mean across labels
+            - 'micro': Global calculation counting total TP, FP
+            - 'weighted': Weighted mean by support
+            - 'all': All three methods simultaneously
+            - List of methods: Calculate multiple methods
+            Defaults to 'macro'.
     """
 
-
-
-
-
+    def __init__(self, average: Union[str, List[str]] = "macro", **kwargs: Any) -> None:
+        """Initialize Precision metric with specified averaging method(s).
+
+        Args:
+            average: Averaging method(s) - string or list of strings.
+                Options: 'macro', 'micro', 'weighted', 'all', or a list of methods.
+                Defaults to 'macro'.
+            **kwargs: Additional keyword arguments passed to sklearn's precision_score.
+
+        Raises:
+            ValueError: If average contains invalid methods or combines 'all' with others.
+        """
+        # Normalize to list for validation
+        averages = [average] if isinstance(average, str) else average
+
+        # Validate
+        valid = {"macro", "micro", "weighted", "all"}
+        if not all(a in valid for a in averages):
+            raise ValueError(f"Invalid average method(s). Must be from {valid}.")
+        if len(averages) > 1 and "all" in averages:
+            raise ValueError("'all' cannot be combined with other methods.")
+
+        self.average = average
+        self.kwargs = kwargs
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate Precision score between predictions and references using scikit-learn.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+                - aggregate_scores (Dict[str, float]): Dictionary with Precision scores
+                  keyed by averaging method (e.g., {"Precision (macro)": 0.85} or
+                  {"Precision (macro)": 0.85, "Precision (micro)": 0.82}).
+                - item_scores (List[bool]): True/False list indicating correct
+                  predictions.
+
+        """
+
+        # Normalize to list of methods to calculate
+        if isinstance(self.average, str):
+            methods = ["macro", "micro", "weighted"] if self.average == "all" else [self.average]
+        else:
+            methods = self.average
+
+        # Handle empty lists
+        if not outputs:
+            return {f"Precision ({method})": 0.0 for method in methods}, []
+
+        # Calculate Precision score using scikit-learn with configured averaging method
+        # Default zero_division=0 unless overridden in kwargs
+        kwargs = {"zero_division": 0, **self.kwargs}
+
+        # Calculate item scores (correctness of each prediction)
+        item_scores = [output == label for output, label in zip(outputs, labels)]
+
+        # Calculate Precision for each method
+        aggregate_scores = {
+            f"Precision ({method})": precision_score(labels, outputs, average=method, **kwargs)
+            for method in methods
+        }
+
+        return aggregate_scores, item_scores
```
scorebook/metrics/recall.py
ADDED
```diff
@@ -0,0 +1,94 @@
+"""Recall metric implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple, Union
+
+from sklearn.metrics import recall_score
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class Recall(MetricBase):
+    """Recall score metric for evaluating model predictions using scikit-learn.
+
+    Recall = TP / (TP + FN)
+
+    This metric can handle both binary and multi-class classification tasks.
+
+    Args:
+        average: The averaging method(s) for multi-class classification.
+            Can be a single string or list of strings:
+            - 'macro': Unweighted mean across labels
+            - 'micro': Global calculation counting total TP, FP, FN
+            - 'weighted': Weighted mean by support
+            - 'all': All three methods simultaneously
+            - List of methods: Calculate multiple methods
+            Defaults to 'macro'.
+    """
+
+    def __init__(self, average: Union[str, List[str]] = "macro", **kwargs: Any) -> None:
+        """Initialize Recall metric with specified averaging method(s).
+
+        Args:
+            average: Averaging method(s) - string or list of strings.
+                Options: 'macro', 'micro', 'weighted', 'all', or a list of methods.
+                Defaults to 'macro'.
+            **kwargs: Additional keyword arguments passed to sklearn's recall_score.
+
+        Raises:
+            ValueError: If average contains invalid methods or combines 'all' with others.
+        """
+        # Normalize to list for validation
+        averages = [average] if isinstance(average, str) else average
+
+        # Validate
+        valid = {"macro", "micro", "weighted", "all"}
+        if not all(a in valid for a in averages):
+            raise ValueError(f"Invalid average method(s). Must be from {valid}.")
+        if len(averages) > 1 and "all" in averages:
+            raise ValueError("'all' cannot be combined with other methods.")
+
+        self.average = average
+        self.kwargs = kwargs
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate Recall score between predictions and references using scikit-learn.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+                - aggregate_scores (Dict[str, float]): Dictionary with Recall scores
+                  keyed by averaging method (e.g., {"Recall (macro)": 0.85} or
+                  {"Recall (macro)": 0.85, "Recall (micro)": 0.82}).
+                - item_scores (List[bool]): True/False list indicating correct
+                  predictions.
+
+        """
+
+        # Normalize to list of methods to calculate
+        if isinstance(self.average, str):
+            methods = ["macro", "micro", "weighted"] if self.average == "all" else [self.average]
+        else:
+            methods = self.average
+
+        # Handle empty lists
+        if not outputs:
+            return {f"Recall ({method})": 0.0 for method in methods}, []
+
+        # Calculate Recall score using scikit-learn with configured averaging method
+        # Default zero_division=0 unless overridden in kwargs
+        kwargs = {"zero_division": 0, **self.kwargs}
+
+        # Calculate item scores (correctness of each prediction)
+        item_scores = [output == label for output, label in zip(outputs, labels)]
+
+        # Calculate Recall for each method
+        aggregate_scores = {
+            f"Recall ({method})": recall_score(labels, outputs, average=method, **kwargs)
+            for method in methods
+        }
+
+        return aggregate_scores, item_scores
```
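Precision and Recall mirror the F1 implementation; combined with the registry's name normalization they can also be resolved by string, as in this sketch with illustrative inputs:

```python
from scorebook.metrics.core.metric_registry import MetricRegistry

outputs = [1, 0, 1, 1]
labels = [1, 0, 0, 1]

# "Precision" and "recall" normalize to "precision" / "recall" and lazily import
# scorebook/metrics/precision.py and scorebook/metrics/recall.py.
precision = MetricRegistry.get("Precision", average="macro")
recall = MetricRegistry.get("recall", average=["macro", "micro"])

print(precision.score(outputs, labels)[0])  # {"Precision (macro)": ...}
print(recall.score(outputs, labels)[0])     # {"Recall (macro)": ..., "Recall (micro)": ...}
```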
scorebook/metrics/rouge.py
ADDED
```diff
@@ -0,0 +1,85 @@
+"""ROUGE metric implementation for Scorebook."""
+
+import warnings
+from typing import Any, Dict, List, Optional, Tuple
+
+from rouge_score import rouge_scorer
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class ROUGE(MetricBase):
+    """ROUGE metric for evaluating text generation quality.
+
+    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures
+    the overlap between generated text and reference text.
+    Returns ROUGE-1 and ROUGE-L F1 scores.
+    """
+
+    def __init__(self, rouge_types: Optional[List[str]] = None, **kwargs: Any) -> None:
+        """Initialize the ROUGE metric.
+
+        Args:
+            rouge_types: List of ROUGE types to calculate (e.g., ["rouge1", "rouge2", "rougeL"]).
+                Defaults to ["rouge1", "rougeL"].
+            **kwargs: Additional keyword arguments to pass to RougeScorer
+                (e.g., use_stemmer, split_summaries, tokenizer).
+                Defaults to use_stemmer=True if not provided.
+        """
+        if rouge_types is None:
+            warnings.warn(
+                "No rouge_types specified, defaulting to ['rouge1', 'rougeL']",
+                UserWarning,
+                stacklevel=2,
+            )
+            rouge_types = ["rouge1", "rougeL"]
+        if "use_stemmer" not in kwargs:
+            warnings.warn(
+                "use_stemmer not specified, defaulting to True",
+                UserWarning,
+                stacklevel=2,
+            )
+            kwargs["use_stemmer"] = True
+        self.rouge_types = rouge_types
+        self.scorer = rouge_scorer.RougeScorer(rouge_types, **kwargs)
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate ROUGE scores between predictions and references.
+
+        Args:
+            outputs: A list of generated text outputs.
+            labels: A list of reference text labels.
+
+        Returns:
+            A tuple containing:
+                - aggregate_scores: Dict with average F1 scores for each configured ROUGE type
+                - item_scores: List of dicts with F1 scores for each configured ROUGE type
+        """
+
+        if not outputs:  # Handle empty lists
+            return {rouge_type: 0.0 for rouge_type in self.rouge_types}, []
+
+        # Calculate item scores
+        item_scores = []
+        for output, label in zip(outputs, labels):
+            # Convert to strings if needed
+            output_str = str(output) if output is not None else ""
+            label_str = str(label) if label is not None else ""
+
+            # Calculate ROUGE scores
+            scores = self.scorer.score(output_str, label_str)
+
+            # Extract F1 scores (fmeasure) for all configured rouge types
+            item_score = {
+                rouge_type: scores[rouge_type].fmeasure for rouge_type in self.rouge_types
+            }
+            item_scores.append(item_score)
+
+        # Calculate aggregate scores (average of all items for each rouge type)
+        aggregate_scores = {
+            rouge_type: sum(item[rouge_type] for item in item_scores) / len(item_scores)
+            for rouge_type in self.rouge_types
+        }
+
+        return aggregate_scores, item_scores
```
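A sketch of the new ROUGE metric; the scores come from the rouge_score package, and passing rouge_types and use_stemmer explicitly avoids the default-value warnings above:

```python
from scorebook.metrics.rouge import ROUGE

metric = ROUGE(rouge_types=["rouge1", "rougeL"], use_stemmer=True)

aggregate, items = metric.score(
    ["the cat sat on the mat"],        # generated outputs
    ["a cat was sitting on the mat"],  # reference labels
)
# aggregate -> {"rouge1": ..., "rougeL": ...}
# items     -> [{"rouge1": ..., "rougeL": ...}]
```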