scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/metrics/README.md
ADDED
@@ -0,0 +1,121 @@
+# Adding Metrics to Scorebook
+
+This guide explains how to add new metrics to Scorebook.
+
+## Quick Start
+
+1. Create a metric file: `src/scorebook/metrics/yourmetric.py`
+2. Implement the metric class
+3. Add tests
+4. Submit PR for review
+
+### Where to Put Tests
+
+Tests go in one of two directories:
+
+- **`tests/unit/test_metrics/`** - For fast tests using mocked data. These run on every commit.
+- **`tests/extended/test_metrics/`** - For tests that require external dependencies, large datasets, or are computationally expensive.
+
+Most metrics only need unit tests. Use extended tests when your metric relies on external APIs, models, or takes significant time to run.
+
+See [CONTRIBUTING.md](../../../CONTRIBUTING.md) for instructions on running tests.
+
+---
+
+## Requirements
+
+Your metric must:
+
+- Use the `@scorebook_metric` decorator
+- Inherit from `MetricBase`
+- Implement the `score()` static method
+
+The `score()` method returns a tuple of `(aggregate_scores, item_scores)`:
+
+- **aggregate_scores**: A `Dict[str, float]` with overall metric values (e.g., `{"accuracy": 0.85}`)
+- **item_scores**: A `List` of per-item scores. For metrics that produce a single value per item, use `int`, `float`, `bool`, or `str`. For metrics that produce multiple values per item, use a `Dict[str, Union[int, float, bool, str]]` where keys are metric names.
+
+---
+
+## File Naming
+
+Metric files must use normalized names (lowercase, no underscores/spaces). This naming convention is required for the registry's lazy loading system to work.
+
+1. User requests a metric by name (e.g., `"f1_score"`, `"F1Score"`, or `"f1 score"`)
+2. The registry normalizes the input → `"f1score"`
+3. The registry imports `scorebook.metrics.f1score`
+4. The `@scorebook_metric` decorator registers the class
+
+**Examples:**
+- Class: `F1Score` → File: `f1score.py` → User can request: `"f1score"`, `"F1Score"`, `"f1_score"`, `"f1 score"`
+- Class: `MeanSquaredError` → File: `meansquarederror.py` → User can request: `"MeanSquaredError"`, `"mean_squared_error"`, etc.
+
+**Collision detection:** Class names that normalize to the same key will raise an error at registration time. For example, `F1Score` and `F1_Score` both normalize to `"f1score"` and cannot coexist.
+
+---
+
+## Implementation Template
+
+Create your metric file in `src/scorebook/metrics/yourmetric.py`:
+
+```python
+"""Brief description of the metric."""
+
+from typing import Any, Dict, List, Tuple
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class YourMetric(MetricBase):
+    """One-line description of what this metric measures.
+
+    Formula or explanation (e.g., Accuracy = correct / total).
+    """
+
+    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate metric score between outputs and labels.
+
+        Args:
+            outputs: A list of model inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+            - Aggregate scores dict (e.g., {"your_metric": 0.85})
+            - List of per-item scores
+
+        Raises:
+            ValueError: If outputs and labels have different lengths.
+        """
+        # Input validation
+        if len(outputs) != len(labels):
+            raise ValueError("Number of outputs must match number of labels")
+
+        if not outputs:
+            return {"your_metric": 0.0}, []
+
+        # Calculate per-item scores
+        item_scores = [calculate_score(out, lab) for out, lab in zip(outputs, labels)]
+
+        # Calculate aggregate score
+        aggregate_score = sum(item_scores) / len(item_scores)
+
+        return {"your_metric": aggregate_score}, item_scores
+```
+
+---
+
+## Documentation
+
+Each metric should have:
+
+1. **Module-level docstring**: Brief description at the top of the file
+2. **Class docstring**: What the metric measures, formula, and any limitations
+3. **Method docstring**: Args, Returns, and Raises sections
+
+---
+
+## Example
+
+See `src/scorebook/metrics/accuracy.py` for a complete reference implementation.
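The lookup flow this README describes can be sketched end to end against the registry and the `Accuracy` metric added elsewhere in this diff; the sample outputs, labels, and expected result below are illustrative assumptions, not taken from the package's own tests.

```python
# Sketch of the README's lookup flow, assuming the MetricRegistry and Accuracy
# metric shown later in this diff. Sample data and expected values are invented.
from scorebook.metrics.core.metric_registry import MetricRegistry

# "Accuracy", "accuracy", and "ACCURACY" all normalize to "accuracy",
# so the registry lazily imports scorebook.metrics.accuracy on first use.
metric = MetricRegistry.get("Accuracy")

aggregate_scores, item_scores = metric.score(
    ["Paris", "Berlin", "Madrid"],  # model outputs
    ["Paris", "Berlin", "Rome"],    # ground-truth labels
)
# Expected shape per the README contract: ({"accuracy": 0.66...}, [True, True, False])
```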
scorebook/metrics/__init__.py
CHANGED
@@ -1,18 +1,9 @@
-"""
-Metrics for evaluating model predictions.
+"""Metrics for evaluating model predictions."""
 
-
-
-generation metrics like accuracy, precision, recall, F1-score, etc.
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import scorebook_metric
 
-
-
-
-
-
-from scorebook.metrics.accuracy import Accuracy
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
-from scorebook.metrics.precision import Precision
-
-__all__ = ["MetricBase", "Precision", "Accuracy", "MetricRegistry"]
+__all__ = [
+    "MetricBase",
+    "scorebook_metric",
+]
scorebook/metrics/accuracy.py
CHANGED
@@ -2,11 +2,10 @@
 
 from typing import Any, Dict, List, Tuple
 
-from scorebook.metrics
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics import MetricBase, scorebook_metric
 
 
-@
+@scorebook_metric
 class Accuracy(MetricBase):
     """Accuracy metric for evaluating model predictions of any type.
 
@@ -25,9 +24,6 @@ class Accuracy(MetricBase):
             The aggregate accuracy score for all items (correct predictions / total predictions).
             The item scores for each output-label pair (true/false).
         """
-        if len(outputs) != len(labels):
-            raise ValueError("Number of outputs must match number of labels")
-
         if not outputs:  # Handle empty lists
             return {"accuracy": 0.0}, []
 
scorebook/metrics/bertscore.py
ADDED
@@ -0,0 +1,50 @@
+"""BertScore implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple
+
+import bert_score
+
+from scorebook.metrics import scorebook_metric
+from scorebook.metrics.core.metric_base import MetricBase
+
+
+@scorebook_metric
+class BertScore(MetricBase):
+    """BertScore metric for evaluating model predictions against reference text."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize BertScore metric."""
+        defaults = {"lang": "en", "verbose": False}
+        self.kwargs = {**defaults, **kwargs}  # User kwargs override defaults
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate bert score between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            A tuple containing:
+            - aggregate_scores (Dict[str, float]): Dictionary with average precision,
+              recall, and F1 scores for all items.
+            - item_scores (List[Dict[str, float]]): List of dictionaries with precision,
+              recall, and F1 scores for each output-label pair.
+        """
+        if not outputs:  # Handle empty lists
+            return {"precision": 0.0, "recall": 0.0, "F1": 0.0}, []
+
+        # Calculate item scores
+        p_scores, r_scores, f1_scores = bert_score.score(outputs, labels, **self.kwargs)
+
+        item_scores = [
+            {"precision": p, "recall": r, "F1": f1}
+            for p, r, f1 in zip(p_scores.tolist(), r_scores.tolist(), f1_scores.tolist())
+        ]
+        aggregate_scores = {
+            "precision": p_scores.mean().item(),
+            "recall": r_scores.mean().item(),
+            "F1": f1_scores.mean().item(),
+        }
+
+        return aggregate_scores, item_scores
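A hedged usage sketch for the new `BertScore` metric: it assumes the optional `bert_score` dependency is installed, and the example sentences are invented. Constructor keyword arguments override the `{"lang": "en", "verbose": False}` defaults and are forwarded to `bert_score.score`.

```python
# Usage sketch only; requires the bert_score package and downloads a model on first run.
from scorebook.metrics.bertscore import BertScore

metric = BertScore(lang="de")  # override the default lang="en"

aggregate_scores, item_scores = metric.score(
    ["Die Katze sitzt auf der Matte."],   # model output (invented)
    ["Eine Katze liegt auf der Matte."],  # reference (invented)
)
# aggregate_scores carries "precision", "recall", and "F1" averaged over all items;
# item_scores is a list of per-item dicts with the same keys.
```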
scorebook/metrics/bleu.py
ADDED
@@ -0,0 +1,82 @@
+"""BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+from typing import Any, Dict, List, Tuple
+
+import sacrebleu
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class BLEU(MetricBase):
+    """BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+    def __init__(self, compact: bool = True, **kwargs: Any) -> None:
+        """
+        Generate BLEU metric.
+
+        :param compact: if True, returns only the BLEU metric; if False,
+            returns the full signature of BLEU.
+        :param kwargs: additional arguments passed to BLEU.
+        """
+
+        self.compact = compact
+        self.corpus_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+        # Overwrite effective order for sentence level scores
+        kwargs["effective_order"] = True
+        self.sentence_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate accuracy score between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            The aggregate accuracy score for all items (correct predictions / total predictions).
+            The item scores for each output-label pair (true/false).
+        """
+
+        if not outputs:  # Handle empty lists
+            return {"BLEU": 0.0}, []
+
+        item_scores = []
+        # Calculate item scores
+        for output, label in zip(outputs, labels):
+            item_bleu: sacrebleu.metrics.BLEUScore = self.sentence_bleu.sentence_score(
+                output, [label]
+            )
+            item_score = {
+                "BLEU": item_bleu.score,
+            }
+
+            if not self.compact:
+                item_score["1-gram"] = item_bleu.precisions[0]
+                item_score["2-gram"] = item_bleu.precisions[1]
+                item_score["3-gram"] = item_bleu.precisions[2]
+                item_score["4-gram"] = item_bleu.precisions[3]
+                item_score["BP"] = item_bleu.bp
+                item_score["ratio"] = item_bleu.ratio
+                item_score["hyp_len"] = item_bleu.sys_len
+                item_score["ref_len"] = item_bleu.ref_len
+
+            item_scores.append(item_score)
+
+        # Calculate aggregate score
+
+        corpus_bleu: sacrebleu.metrics.BLEUScore = self.corpus_bleu.corpus_score(outputs, [labels])
+        aggregate_scores = {"BLEU": corpus_bleu.score}
+
+        if not self.compact:
+            aggregate_scores["1-gram"] = corpus_bleu.precisions[0]
+            aggregate_scores["2-gram"] = corpus_bleu.precisions[1]
+            aggregate_scores["3-gram"] = corpus_bleu.precisions[2]
+            aggregate_scores["4-gram"] = corpus_bleu.precisions[3]
+            aggregate_scores["BP"] = corpus_bleu.bp
+            aggregate_scores["ratio"] = corpus_bleu.ratio
+            aggregate_scores["hyp_len"] = corpus_bleu.sys_len
+            aggregate_scores["ref_len"] = corpus_bleu.ref_len
+
+        return aggregate_scores, item_scores
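A short sketch of how the `compact` flag changes what the new BLEU metric reports, assuming `sacrebleu` is installed; the example sentences are invented.

```python
# Sketch only: shows the keys returned in compact and full modes.
from scorebook.metrics.bleu import BLEU

outputs = ["the cat sat on the mat"]
labels = ["the cat is on the mat"]

# Default compact=True: only the BLEU value per item and for the corpus.
agg, items = BLEU().score(outputs, labels)            # agg == {"BLEU": ...}

# compact=False adds n-gram precisions, brevity penalty, and length statistics.
agg_full, items_full = BLEU(compact=False).score(outputs, labels)
# agg_full keys: "BLEU", "1-gram" .. "4-gram", "BP", "ratio", "hyp_len", "ref_len"
```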
scorebook/metrics/core/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Core metric framework components."""
scorebook/metrics/{metric_base.py → core/metric_base.py}
RENAMED
@@ -12,9 +12,8 @@ class MetricBase(ABC):
         """Return the metric name based on the class name."""
         return self.__class__.__name__.lower()
 
-    @staticmethod
     @abstractmethod
-    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
         """Calculate the metric score for a list of outputs and labels.
 
         Args:
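With the rename, `MetricBase.score` is now declared as an instance method: the `@staticmethod` decorator is dropped and `self` is added. A minimal hypothetical subclass written against the new signature might look like the sketch below; `WordCountDelta` and its scoring rule are invented for illustration and are not part of the package.

```python
# Hypothetical subclass illustrating the new instance-method signature.
from typing import Any, Dict, List, Tuple

from scorebook.metrics import MetricBase, scorebook_metric


@scorebook_metric
class WordCountDelta(MetricBase):
    """Toy metric: mean absolute difference in word count between output and label."""

    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        if not outputs:
            return {"wordcountdelta": 0.0}, []
        # Per-item absolute word-count differences.
        item_scores = [
            abs(len(str(out).split()) - len(str(lab).split()))
            for out, lab in zip(outputs, labels)
        ]
        # Aggregate is the mean of the per-item deltas.
        return {"wordcountdelta": sum(item_scores) / len(item_scores)}, item_scores
```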
scorebook/metrics/core/metric_registry.py
ADDED
@@ -0,0 +1,195 @@
+"""
+Registry module for evaluation metrics.
+
+This module maintains a centralized registry of available evaluation metrics
+that can be used to assess model performance. It provides a single access point
+to retrieve all implemented metric classes.
+"""
+
+import importlib
+from typing import Any, Callable, Dict, List, Type, Union
+
+from scorebook.metrics.core.metric_base import MetricBase
+
+
+class MetricRegistry:
+    """A registry for evaluation metrics.
+
+    This class provides a central registry for all evaluation metrics in the system.
+    Metrics are lazily loaded on demand - when you request a metric by name, it will
+    be automatically imported from the metrics directory using a naming convention.
+
+    Naming Convention:
+        All metric names are normalized by:
+        - Converting to lowercase
+        - Removing all underscores and spaces
+
+        Module files must follow this normalized naming (lowercase, no underscores/spaces):
+        Examples:
+            Class "Accuracy" → module "accuracy.py"
+            Class "F1Score" → module "f1score.py"
+            Class "MeanSquaredError" → module "meansquarederror.py"
+
+        User input is also normalized, so all variations work:
+            "f1_score", "F1Score", "f1 score" → all resolve to "f1score"
+
+    Collision Detection:
+        Class names that normalize to the same key will raise an error:
+            "F1Score" and "F1_Score" both → "f1score" (COLLISION)
+            "MetricName" and "Metric_Name" both → "metricname" (COLLISION)
+
+    Security:
+        Lazy loading is restricted to modules in the "scorebook.metrics." namespace.
+        Python's import system validates module names.
+    """
+
+    _registry: Dict[str, Type[MetricBase]] = {}
+
+    @staticmethod
+    def _normalize_name(name: str) -> str:
+        """Normalize a metric name to a registry key."""
+        return name.lower().replace("_", "").replace(" ", "")
+
+    @classmethod
+    def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
+        """Register a metric class in the registry.
+
+        Returns:
+            A decorator that registers the class and returns it.
+
+        Raises:
+            ValueError: If a metric with the given name is already registered.
+        """
+
+        def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
+
+            # Normalize the class name to a registry key
+            key = cls._normalize_name(metric_cls.__name__)
+            if key in cls._registry:
+                raise ValueError(
+                    f"Metric '{key}' is already registered. "
+                    f"Class names '{metric_cls.__name__}' and "
+                    f"'{cls._registry[key].__name__}' both normalize to '{key}'."
+                )
+            cls._registry[key] = metric_cls
+            return metric_cls
+
+        return decorator
+
+    @classmethod
+    def _lazy_load_metric(cls, normalized_key: str) -> None:
+        """Attempt to lazily load a metric module using naming convention.
+
+        Module files must be named using the normalized key (lowercase, no underscores/spaces).
+
+        Args:
+            normalized_key: The normalized metric name (lowercase, no underscores/spaces): "f1score"
+
+        Raises:
+            ValueError: If the module doesn't exist or fails to register
+            ImportError: If the module exists but has import errors
+        """
+        # Check if already registered
+        if normalized_key in cls._registry:
+            return
+
+        # Try to import the module using the normalized key
+        try:
+            importlib.import_module(f"scorebook.metrics.{normalized_key}")
+        except ModuleNotFoundError:
+            # Module doesn't exist - provide helpful error
+            error_msg = (
+                f"Metric '{normalized_key}' could not be found. "
+                f"Attempted to import from 'scorebook.metrics.{normalized_key}'."
+            )
+            if cls._registry:
+                registered = ", ".join(sorted(cls._registry.keys()))
+                error_msg += f" Currently registered metrics: {registered}"
+            else:
+                error_msg += " No metrics are currently registered."
+            raise ValueError(error_msg)
+        except ImportError as e:
+            # Module exists but has import errors - re-raise with context
+            raise ImportError(
+                f"Failed to load metric '{normalized_key}' from module "
+                f"'scorebook.metrics.{normalized_key}': {e}"
+            ) from e
+
+    @classmethod
+    def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
+        """
+        Get an instance of a registered metric by name or class.
+
+        Args:
+            name_or_class: The metric name (string) or class (subclass of MetricBase).
+            **kwargs: Additional arguments to pass to the metric's constructor.
+
+        Returns:
+            An instance of the requested metric.
+
+        Raises:
+            ValueError: If the metric cannot be found or loaded.
+            ImportError: If lazy loading fails due to import errors.
+        """
+        # If input is a class that's a subclass of MetricBase, instantiate it directly
+        if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
+            return name_or_class(**kwargs)
+
+        # If input is a string, look up the class in the registry
+        if isinstance(name_or_class, str):
+            # Normalize the input to a registry key
+            normalized_key = cls._normalize_name(name_or_class)
+
+            # Try lazy loading if not already registered
+            if normalized_key not in cls._registry:
+                cls._lazy_load_metric(normalized_key)
+
+            # After lazy loading attempt, check registry
+            if normalized_key not in cls._registry:
+                raise ValueError(
+                    f"Metric '{name_or_class}' module was loaded but failed to register. "
+                    f"Ensure the metric class has the @scorebook_metric decorator."
+                )
+
+            return cls._registry[normalized_key](**kwargs)
+
+        raise ValueError(
+            f"Invalid metric type: {type(name_or_class)}. "
+            f"Must be string name or MetricBase subclass"
+        )
+
+    @classmethod
+    def list_metrics(cls) -> List[str]:
+        """
+        List all registered metrics.
+
+        Returns:
+            A list of metric names.
+        """
+        return list(cls._registry.keys())
+
+
+def scorebook_metric(cls: Type[MetricBase]) -> Type[MetricBase]:
+    """Register a custom metric with Scorebook.
+
+    Args:
+        cls: A metric class that inherits from MetricBase
+
+    Returns:
+        The same class, now registered with Scorebook
+
+    Example:
+        ```python
+        from scorebook import scorebook_metric
+        from scorebook.metrics import MetricBase
+
+        @scorebook_metric
+        class MyCustomMetric(MetricBase):
+            def score(self, outputs, labels):
+                # Your metric implementation
+                return aggregate_scores, item_scores
+        ```
+    Raises:
+        ValueError: If a metric with the same normalized name is already registered
+    """
+    return MetricRegistry.register()(cls)
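A quick sketch of the registry surface defined above, relying only on the `Accuracy` and `ExactMatch` metrics that ship in this release; the printed list is indicative and depends on which metric modules have actually been imported.

```python
# Sketch of MetricRegistry usage; output of list_metrics() depends on load order.
from scorebook.metrics.accuracy import Accuracy
from scorebook.metrics.core.metric_registry import MetricRegistry

# String lookups are normalized: "Exact_Match", "exact match", and "ExactMatch"
# all resolve to "exactmatch", which lazily imports scorebook.metrics.exactmatch.
exact = MetricRegistry.get("Exact_Match", strip_punctuation=True)

# Passing a MetricBase subclass skips the lookup and instantiates it directly.
acc = MetricRegistry.get(Accuracy)

print(MetricRegistry.list_metrics())  # e.g. ["accuracy", "exactmatch"] once both are loaded
```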
scorebook/metrics/exactmatch.py
ADDED
@@ -0,0 +1,95 @@
+"""Exact Match metric implementation for Scorebook."""
+
+import string
+from typing import Any, Dict, List, Tuple
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class ExactMatch(MetricBase):
+    """Exact Match metric for evaluating string predictions.
+
+    Compares strings for exact equality with optional preprocessing.
+
+    Args:
+        case_insensitive: If True, convert strings to lowercase before comparison.
+            Defaults to True.
+        strip: If True, strip leading and trailing whitespace before comparison.
+            Defaults to True.
+        strip_punctuation: If True, strip leading and trailing punctuation before
+            comparison. Defaults to False.
+    """
+
+    @property
+    def name(self) -> str:
+        """Return the metric name."""
+        return "exact_match"
+
+    def __init__(
+        self,
+        case_insensitive: bool = True,
+        strip: bool = True,
+        strip_punctuation: bool = False,
+    ) -> None:
+        """Initialize ExactMatch metric with preprocessing options.
+
+        Args:
+            case_insensitive: If True, convert strings to lowercase before comparison.
+                Defaults to True.
+            strip: If True, strip leading and trailing whitespace before comparison.
+                Defaults to True.
+            strip_punctuation: If True, strip leading and trailing punctuation before
+                comparison. Defaults to False.
+        """
+        self.case_insensitive = case_insensitive
+        self.strip = strip
+        self.strip_punctuation = strip_punctuation
+
+    def _preprocess(self, value: Any) -> Any:
+        """Apply preprocessing to a value if it's a string.
+
+        Args:
+            value: The value to preprocess.
+
+        Returns:
+            The preprocessed value (string) or original value (non-string).
+        """
+        if not isinstance(value, str):
+            return value
+
+        result = value
+        if self.strip:
+            result = result.strip()
+        if self.strip_punctuation:
+            result = result.strip(string.punctuation)
+        if self.case_insensitive:
+            result = result.lower()
+        return result
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate the exact match score between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            The aggregate exact match score for all items (matches / total).
+            The item scores for each output-label pair (true/false).
+        """
+        if not outputs:
+            return {"exact_match": 0.0}, []
+
+        # Calculate item scores with preprocessing
+        item_scores = [
+            self._preprocess(output) == self._preprocess(label)
+            for output, label in zip(outputs, labels)
+        ]
+
+        # Calculate aggregate score
+        matches = sum(item_scores)
+        total = len(outputs)
+        aggregate_scores = {"exact_match": matches / total}
+
+        return aggregate_scores, item_scores
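A worked example of the `ExactMatch` preprocessing flags, with invented strings; the results follow directly from `_preprocess` as defined above.

```python
# Sketch of how the preprocessing options affect matching.
from scorebook.metrics.exactmatch import ExactMatch

# Defaults: case_insensitive=True, strip=True, strip_punctuation=False.
agg, items = ExactMatch().score(["  Paris "], ["paris"])
# -> ({"exact_match": 1.0}, [True]): whitespace stripped, case folded.

agg, items = ExactMatch(strip_punctuation=True).score(["Paris."], ["paris"])
# -> ({"exact_match": 1.0}, [True]): trailing punctuation stripped as well.

agg, items = ExactMatch(case_insensitive=False).score(["Paris"], ["paris"])
# -> ({"exact_match": 0.0}, [False]): comparison is now case-sensitive.
```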