scorebook-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +15 -0
- scorebook/evaluator.py +228 -0
- scorebook/inference/__init__.py +11 -0
- scorebook/inference/openai.py +185 -0
- scorebook/inference/portkey.py +186 -0
- scorebook/metrics/__init__.py +18 -0
- scorebook/metrics/accuracy.py +42 -0
- scorebook/metrics/metric_base.py +28 -0
- scorebook/metrics/metric_registry.py +105 -0
- scorebook/metrics/precision.py +19 -0
- scorebook/types/__init__.py +11 -0
- scorebook/types/eval_dataset.py +310 -0
- scorebook/types/eval_result.py +129 -0
- scorebook/types/inference_pipeline.py +84 -0
- scorebook/utils/__init__.py +8 -0
- scorebook/utils/async_utils.py +27 -0
- scorebook/utils/io_helpers.py +28 -0
- scorebook/utils/mappers.py +36 -0
- scorebook/utils/progress_bars.py +89 -0
- scorebook/utils/transform_helpers.py +25 -0
- scorebook-0.0.1.dist-info/LICENSE +21 -0
- scorebook-0.0.1.dist-info/METADATA +376 -0
- scorebook-0.0.1.dist-info/RECORD +24 -0
- scorebook-0.0.1.dist-info/WHEEL +4 -0

scorebook/metrics/metric_base.py
@@ -0,0 +1,28 @@
+"""Base class for evaluation metrics."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Tuple
+
+
+class MetricBase(ABC):
+    """Base class for all evaluation metrics."""
+
+    @property
+    def name(self) -> str:
+        """Return the metric name based on the class name."""
+        return self.__class__.__name__.lower()
+
+    @staticmethod
+    @abstractmethod
+    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate the metric score for a list of outputs and labels.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Aggregate metric scores for all items.
+            Individual scores for each item.
+        """
+        raise NotImplementedError("MetricBase is an abstract class")
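
For reference, a concrete metric only has to fill in score(). The sketch below is not part of the package (the ExactMatch class and its scoring rule are illustrative assumptions); it simply returns a value matching the declared (aggregate dict, per-item list) shape.

from typing import Any, Dict, List, Tuple

from scorebook.metrics import MetricBase


class ExactMatch(MetricBase):
    """Hypothetical metric: fraction of outputs that exactly equal their label."""

    @staticmethod
    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        # One score per item: 1 for an exact match, 0 otherwise.
        item_scores = [int(output == label) for output, label in zip(outputs, labels)]
        aggregate = {"exactmatch": sum(item_scores) / len(item_scores) if item_scores else 0.0}
        return aggregate, item_scores


# ExactMatch.score(["4", "Rome"], ["4", "Paris"]) -> ({"exactmatch": 0.5}, [1, 0])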

scorebook/metrics/metric_registry.py
@@ -0,0 +1,105 @@
+"""
+Registry module for evaluation metrics.
+
+This module maintains a centralized registry of available evaluation metrics
+that can be used to assess model performance. It provides a single access point
+to retrieve all implemented metric classes.
+"""
+
+from typing import Any, Callable, Dict, List, Type, Union
+
+from scorebook.metrics.metric_base import MetricBase
+
+
+class MetricRegistry:
+    """A registry for evaluation metrics.
+
+    This class provides a central registry for all evaluation metrics in the system.
+    It allows metrics to be registered with unique names and retrieved either by
+    name or by class. The registry ensures that metrics are properly initialized
+    and accessible throughout the application.
+
+    The registry supports:
+    - Registering new metric classes via a decorator
+    - Retrieving metric instances by name or class
+    - Listing all available metrics
+
+    Usage:
+        @MetricRegistry.register()
+        class MyMetric(MetricBase):
+            ...
+
+        # Get by name (the lowercased class name)
+        metric = MetricRegistry.get("mymetric")
+
+        # Get by class
+        metric = MetricRegistry.get(MyMetric)
+
+        # List available metrics
+        metrics = MetricRegistry.list_metrics()
+    """
+
+    _registry: Dict[str, Type[MetricBase]] = {}
+
+    @classmethod
+    def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
+        """
+        Register a metric class in the registry.
+
+        Returns:
+            A decorator that registers the class and returns it.
+
+        Raises:
+            ValueError: If a metric with the same name is already registered.
+        """
+
+        def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
+
+            key = metric_cls.__name__.lower()
+            if key in cls._registry:
+                raise ValueError(f"Metric '{key}' is already registered")
+            cls._registry[key] = metric_cls
+            return metric_cls
+
+        return decorator
+
+    @classmethod
+    def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
+        """
+        Get an instance of a registered metric by name or class.
+
+        Args:
+            name_or_class: The metric name (string) or class (subclass of MetricBase).
+            **kwargs: Additional arguments to pass to the metric's constructor.
+
+        Returns:
+            An instance of the requested metric.
+
+        Raises:
+            ValueError: If the metric name is not registered.
+        """
+        # If input is a class that's a subclass of MetricBase, instantiate it directly
+        if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
+            return name_or_class(**kwargs)
+
+        # If input is a string, look up the class in the registry
+        if isinstance(name_or_class, str):
+            key = name_or_class.lower()
+            if key not in cls._registry:
+                raise ValueError(f"Metric '{name_or_class}' not registered.")
+            return cls._registry[key](**kwargs)
+
+        raise ValueError(
+            f"Invalid metric type: {type(name_or_class)}. "
+            f"Must be a string name or MetricBase subclass"
+        )
+
+    @classmethod
+    def list_metrics(cls) -> List[str]:
+        """
+        List all registered metrics.
+
+        Returns:
+            A list of metric names.
+        """
+        return list(cls._registry.keys())
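
A short usage sketch against the registry API above (the Recall class here is hypothetical, not part of the package): register() keys the class by its lowercased name, so it becomes retrievable as "recall".

from typing import Any, Dict, List, Tuple

from scorebook.metrics import MetricBase, MetricRegistry


@MetricRegistry.register()
class Recall(MetricBase):
    """Hypothetical placeholder metric used only to demonstrate registration."""

    @staticmethod
    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        raise NotImplementedError


by_name = MetricRegistry.get("recall")   # resolved via the lowercased class name
by_class = MetricRegistry.get(Recall)    # or instantiated directly from the class
print(MetricRegistry.list_metrics())     # includes 'recall' plus any metrics registered on import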

scorebook/metrics/precision.py
@@ -0,0 +1,19 @@
+"""Precision metric implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple
+
+from scorebook.metrics.metric_base import MetricBase
+from scorebook.metrics.metric_registry import MetricRegistry
+
+
+@MetricRegistry.register()
+class Precision(MetricBase):
+    """Precision metric for binary classification.
+
+    Precision = TP / (TP + FP)
+    """
+
+    @staticmethod
+    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Not implemented."""
+        raise NotImplementedError("Precision not implemented")
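
The released Precision.score is a stub that raises NotImplementedError. As a hedged sketch only (not the package's implementation), a scoring function consistent with the TP / (TP + FP) formula in the docstring could look like the following, assuming binary 0/1 outputs and labels:

from typing import Any, Dict, List, Optional, Tuple


def precision_score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Optional[float]]]:
    # Per item: 1.0 for a true positive, 0.0 for a false positive,
    # None for items not predicted positive (they do not affect precision).
    item_scores = [
        ((1.0 if label == 1 else 0.0) if output == 1 else None)
        for output, label in zip(outputs, labels)
    ]
    true_positives = sum(1 for s in item_scores if s == 1.0)
    predicted_positives = sum(1 for s in item_scores if s is not None)
    aggregate = {"precision": true_positives / predicted_positives if predicted_positives else 0.0}
    return aggregate, item_scores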

scorebook/types/__init__.py
@@ -0,0 +1,11 @@
+"""
+Types package containing data structures and type definitions for the Scorebook framework.
+
+This module provides core data types used throughout the framework for dataset handling
+and evaluation results.
+"""
+
+from scorebook.types.eval_dataset import EvalDataset
+from scorebook.types.eval_result import EvalResult
+
+__all__ = ["EvalDataset", "EvalResult"]

scorebook/types/eval_dataset.py
@@ -0,0 +1,310 @@
+"""Eval Dataset implementation for scorebook."""
+
+import csv
+import json
+from typing import Any, Dict, Iterator, List, Optional, Type, Union
+
+from datasets import Dataset as HuggingFaceDataset
+from datasets import DatasetDict as HuggingFaceDatasetDict
+from datasets import load_dataset
+
+from scorebook.metrics import MetricBase, MetricRegistry
+from scorebook.utils import validate_path
+
+
+class EvalDataset:
+    """Eval Dataset implementation for scorebook."""
+
+    def __init__(
+        self,
+        name: str,
+        label: str,
+        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
+        hf_dataset: HuggingFaceDataset,
+    ):
+        """
+        Create a new scorebook evaluation dataset instance.
+
+        :param name: The name of the evaluation dataset.
+        :param label: The label field of the dataset.
+        :param metrics: The specified metrics associated with the dataset.
+        :param hf_dataset: The dataset as a Hugging Face dataset object.
+        """
+        self.name: str = name
+        self.label: str = label
+        self.metrics: List[MetricBase] = self._resolve_metrics(metrics)
+        self._hf_dataset: Optional[HuggingFaceDataset] = hf_dataset
+
+    def __len__(self) -> int:
+        """Return the number of items in the dataset."""
+        if self._hf_dataset is None:
+            raise ValueError("Dataset is not initialized")
+        return len(self._hf_dataset)
+
+    def __getitem__(self, key: Union[int, str]) -> Union[Dict[str, Any], List[Any]]:
+        """
+        Allow item access by index (int) or by column name (str).
+
+        - eval_dataset[i] returns the i-th example (dict).
+        - eval_dataset["feature"] returns a list of values for that feature.
+        """
+        if self._hf_dataset is None:
+            raise ValueError("Dataset is not initialized")
+        if isinstance(key, int):
+            return dict(self._hf_dataset[key])  # Ensure we return a Dict[str, Any]
+        elif isinstance(key, str):
+            return list(self._hf_dataset[key])  # Ensure we return a List[Any]
+        else:
+            raise TypeError(f"Invalid key type: {type(key)}. Must be int or str.")
+
+    def __str__(self) -> str:
+        """Return a formatted string summary of the evaluation dataset."""
+        if self._hf_dataset is None:
+            return f"EvalDataset(name='{self.name}', status='uninitialized')"
+
+        num_rows = len(self._hf_dataset)
+        fields = ", ".join(self.column_names)
+        metrics = ", ".join([metric.name for metric in self.metrics])
+
+        return (
+            f"EvalDataset(\n"
+            f"  name='{self.name}',\n"
+            f"  rows={num_rows},\n"
+            f"  label='{self.label}',\n"
+            f"  fields=[{fields}],\n"
+            f"  metrics=[{metrics}]\n"
+            f")"
+        )
+
+    def __iter__(self) -> Iterator[Dict[str, Any]]:
+        """Return an iterator over all examples in the dataset."""
+        if self._hf_dataset is None:
+            raise ValueError("Dataset is not initialized")
+        return iter(self._hf_dataset)
+
+    @property
+    def items(self) -> List[Any]:
+        """Return a list of all examples in the dataset."""
+        if self._hf_dataset is None:
+            raise ValueError("Dataset is not initialized")
+        return list(self._hf_dataset)
+
+    @property
+    def column_names(self) -> List[str]:
+        """Return a list of column/feature names available in the dataset."""
+        if self._hf_dataset is None:
+            raise ValueError("Dataset is not initialized")
+        return list(map(str, self._hf_dataset.column_names))
+
+    @classmethod
+    def from_list(
+        cls,
+        name: str,
+        label: str,
+        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
+        data: List[Dict[str, Any]],
+    ) -> "EvalDataset":
+        """Instantiate an EvalDataset from a list of dictionaries.
+
+        Args:
+            name: The name of the evaluation dataset.
+            label: The field used as the evaluation label (ground truth).
+            metrics: The specified metrics associated with the dataset.
+            data: List of dictionaries containing the dataset examples.
+
+        Returns:
+            A scorebook EvalDataset wrapping a Hugging Face dataset.
+        """
+        return cls(
+            name=name, label=label, metrics=metrics, hf_dataset=HuggingFaceDataset.from_list(data)
+        )
+
+    @classmethod
+    def from_csv(
+        cls,
+        file_path: str,
+        label: str,
+        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
+        name: Optional[str] = None,
+        encoding: str = "utf-8",
+        newline: str = "",
+        **reader_kwargs: Any,
+    ) -> "EvalDataset":
+        """Instantiate a scorebook dataset from a CSV file.
+
+        Args:
+            file_path: Path to the CSV file.
+            label: The field used as the evaluation label (ground truth).
+            metrics: The specified metrics associated with the dataset.
+            name: Optional name for the eval dataset; if not provided, the file stem is used.
+            encoding: Encoding of the CSV file.
+            newline: Newline character of the CSV file.
+            reader_kwargs: Dict of kwargs passed to `csv.DictReader`.
+
+        Returns:
+            A scorebook EvalDataset.
+
+        Raises:
+            FileNotFoundError: If the file does not exist at the given path.
+            ValueError: If the CSV file cannot be parsed or is empty.
+        """
+        reader_kwargs = reader_kwargs or {}
+        path = validate_path(file_path, expected_suffix=".csv")
+
+        try:
+            with open(path, encoding=encoding, newline=newline) as csvfile:
+                reader = csv.DictReader(csvfile, **reader_kwargs)
+                data = [row for row in reader]
+        except csv.Error as e:
+            raise ValueError(f"Failed to parse CSV file {file_path}: {e}") from e
+
+        if not data:
+            raise ValueError(f"CSV file {file_path} is empty or contains only headers.")
+
+        name = name if name else path.stem
+        return cls(
+            name=name,
+            label=label,
+            metrics=metrics,
+            hf_dataset=HuggingFaceDataset.from_list(data),
+        )
+
+    @classmethod
+    def from_json(
+        cls,
+        file_path: str,
+        label: str,
+        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
+        name: Optional[str] = None,
+        split: Optional[str] = None,
+    ) -> "EvalDataset":
+        """Instantiate an EvalDataset from a JSON file.
+
+        The JSON file must follow one of two supported formats:
+
+        1. **Flat format** – a list of dictionaries:
+            [
+                {"input": "What is 2+2?", "label": "4"},
+                {"input": "Capital of France?", "label": "Paris"}
+            ]
+
+        2. **Split format** – a dictionary of named splits:
+            {
+                "train": [{"input": ..., "label": ...}],
+                "test": [{"input": ..., "label": ...}]
+            }
+
+        Args:
+            file_path: Path to the JSON file on disk.
+            label: The field used as the evaluation label (ground truth).
+            metrics: The specified metrics associated with the dataset.
+            name: Optional name for the eval dataset; if not provided, the file stem is used.
+            split: If the JSON uses a split structure, this is the split name to load.
+
+        Returns:
+            A scorebook EvalDataset wrapping a Hugging Face dataset.
+
+        Raises:
+            FileNotFoundError: If the file does not exist.
+            ValueError: If the JSON is invalid or the structure is unsupported.
+        """
+        path = validate_path(file_path, expected_suffix=".json")
+
+        try:
+            with path.open("r", encoding="utf-8") as f:
+                data = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in {file_path}: {e}") from e
+
+        if isinstance(data, dict):
+            if split is None:
+                raise ValueError(f"Split name must be provided for split-style JSON: {file_path}")
+            split_data = data.get(split)
+            if split_data is None:
+                raise ValueError(f"Split '{split}' not found in JSON file: {file_path}")
+            if not isinstance(split_data, list):
+                raise ValueError(f"Split '{split}' is not a list of examples in: {file_path}")
+            hf_dataset = HuggingFaceDataset.from_list(split_data)
+        elif isinstance(data, list):
+            hf_dataset = HuggingFaceDataset.from_list(data)
+        else:
+            raise ValueError(f"Unsupported JSON structure in {file_path}. Expected list or dict.")
+
+        name = name if name else path.stem
+        return cls(name=name, label=label, metrics=metrics, hf_dataset=hf_dataset)
+
+    @classmethod
+    def from_huggingface(
+        cls,
+        path: str,
+        label: str,
+        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
+        split: Optional[str] = None,
+        name: Optional[str] = None,
+    ) -> "EvalDataset":
+        """Instantiate an EvalDataset from a dataset available on Hugging Face Hub.
+
+        If a specific split is provided (e.g., "train" or "test"), it will be loaded directly.
+        If no split is specified, the method attempts to load the full dataset. If the dataset
+        is split into multiple subsets (i.e., a DatasetDict), it defaults to loading the "test"
+        split.
+
+        Args:
+            path: The path of the dataset on the Hugging Face Hub.
+            label: The field used as the evaluation label (ground truth).
+            metrics: The specified metrics associated with the dataset.
+            split: Optional name of the split to load.
+            name: Optional dataset configuration name.
+
+        Returns:
+            An EvalDataset wrapping the selected Hugging Face dataset.
+
+        Raises:
+            ValueError: If the dataset cannot be loaded, or the expected split is missing.
+        """
+        try:
+            kwargs = {}
+            if split is not None:
+                kwargs["split"] = split
+            if name is not None:
+                kwargs["name"] = name
+            ds = load_dataset(path, **kwargs)
+        except Exception as e:
+            raise ValueError(f"Failed to load dataset '{path}' from Hugging Face: {e}") from e
+
+        if isinstance(ds, HuggingFaceDataset):
+            hf_dataset = ds
+        elif isinstance(ds, HuggingFaceDatasetDict):
+            if "test" in ds:
+                hf_dataset = ds["test"]
+            else:
+                raise ValueError(
+                    f"Split not specified and no 'test' split found in dataset '{path}'."
+                )
+        else:
+            raise ValueError(f"Unexpected dataset type for '{path}': {type(ds)}")
+
+        return cls(name=path, label=label, metrics=metrics, hf_dataset=hf_dataset)
+
+    @staticmethod
+    def _resolve_metrics(
+        metrics: Union[
+            str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]
+        ]
+    ) -> List[MetricBase]:
+        """
+        Convert metric names/classes into a list of MetricBase instances using MetricRegistry.
+
+        Used to normalize metrics to a metric type.
+        """
+        if not isinstance(metrics, list):
+            metrics = [metrics]
+
+        resolved: List[MetricBase] = []
+        for m in metrics:
+            if isinstance(m, MetricBase):
+                resolved.append(m)  # Already an instance
+            else:
+                resolved.append(MetricRegistry.get(m))  # Use registry for str or class
+
+        return resolved
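
A usage sketch (not part of the package) that builds a small EvalDataset in memory. Passing metrics="accuracy" assumes the bundled Accuracy metric registers itself when scorebook.metrics is imported; that package __init__ is not shown in this diff.

from scorebook.types import EvalDataset

dataset = EvalDataset.from_list(
    name="toy-qa",
    label="label",
    metrics="accuracy",  # assumed to be registered by the package on import
    data=[
        {"input": "What is 2+2?", "label": "4"},
        {"input": "Capital of France?", "label": "Paris"},
    ],
)

print(len(dataset))      # 2
print(dataset["label"])  # ['4', 'Paris']
print(dataset[0])        # {'input': 'What is 2+2?', 'label': '4'}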

scorebook/types/eval_result.py
@@ -0,0 +1,129 @@
+"""
+Data structures used to represent evaluation results.
+
+Includes individual prediction outcomes and aggregated dataset metrics.
+"""
+
+import csv
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List
+
+from scorebook.types.eval_dataset import EvalDataset
+
+
+@dataclass
+class EvalResult:
+    """
+    Container for evaluation results from an entire dataset.
+
+    Attributes:
+        eval_dataset: The dataset used for evaluation.
+        inference_outputs: A list of model predictions or outputs.
+        metric_scores: A dictionary mapping metric names to their scores.
+        hyperparams: The hyperparameter configuration used for this evaluation run.
+    """
+    eval_dataset: EvalDataset
+    inference_outputs: List[Any]
+    metric_scores: Dict[str, Dict[str, Any]]
+    hyperparams: Dict[str, Any]
+
+    @property
+    def item_scores(self) -> List[Dict[str, Any]]:
+        """Return a list of dictionaries containing scores for each evaluated item."""
+        results = []
+        metric_names = list(self.metric_scores.keys()) if self.metric_scores else []
+
+        for idx, item in enumerate(self.eval_dataset.items):
+            if idx >= len(self.inference_outputs):
+                break
+
+            result = {
+                "item_id": idx,
+                "dataset_name": self.eval_dataset.name,
+                **{
+                    metric: self.metric_scores[metric]["item_scores"][idx]
+                    for metric in metric_names
+                },
+                **self.hyperparams,
+            }
+            results.append(result)
+
+        return results
+
+    @property
+    def aggregate_scores(self) -> Dict[str, Any]:
+        """Return the aggregated scores across all evaluated items."""
+        result: Dict[str, Any] = {"dataset_name": self.eval_dataset.name}
+        if not self.metric_scores:
+            return result
+
+        for metric, scores in self.metric_scores.items():
+            # Flatten the aggregate scores from each metric into the result
+            result.update(
+                {
+                    key if key == metric else f"{metric}_{key}": value
+                    for key, value in scores["aggregate_scores"].items()
+                }
+            )
+        for hyperparam, value in self.hyperparams.items():
+            result[hyperparam] = value
+        return result
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a dictionary representing the evaluation results."""
+        return {
+            "aggregate": [
+                {
+                    **getattr(self.eval_dataset, "hyperparams", {}),
+                    **self.aggregate_scores,
+                }
+            ],
+            "per_sample": [item for item in self.item_scores],
+        }
+
+    def to_csv(self, file_path: str) -> None:
+        """Save evaluation results to a CSV file.
+
+        The CSV will contain item-level results.
+        """
+        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
+
+        with open(file_path, "w", newline="") as f:
+            writer = csv.writer(f)
+
+            # Write a header with all possible metric names
+            item_fields = list(self.eval_dataset.items[0].keys()) if self.eval_dataset.items else []
+            metric_names = list(self.metric_scores.keys()) if self.metric_scores else []
+            header = ["item_id"] + item_fields + ["inference_output"] + metric_names
+            writer.writerow(header)
+
+            # Write item data
+            for idx, item in enumerate(self.eval_dataset.items):
+                if idx >= len(self.inference_outputs):
+                    break
+
+                row = (
+                    [idx]
+                    + list(item.values())
+                    + [self.inference_outputs[idx]]
+                    + [self.metric_scores[metric]["item_scores"][idx] for metric in metric_names]
+                )
+                writer.writerow(row)
+
+    def to_json(self, file_path: str) -> None:
+        """Save evaluation results to a JSON file in structured format."""
+        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(file_path, "w") as f:
+            json.dump(self.to_dict(), f, indent=2)
+
+    def __str__(self) -> str:
+        """Return a formatted string representation of the evaluation results."""
+        result = [
+            f"Eval Dataset: {self.eval_dataset.name}",
+            "\nAggregate Scores:",
+        ]
+        for metric_name, score in self.aggregate_scores.items():
+            result.append(f"\n  {metric_name}: {score:.4f}" if isinstance(score, float) else f"\n  {metric_name}: {score}")
+        return "".join(result)
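
A sketch (not part of the package) showing the metric_scores shape EvalResult consumes: each metric maps to a dict with "aggregate_scores" and "item_scores". In normal use this structure comes from the evaluator rather than being built by hand; the values below are made up for illustration.

from scorebook.types import EvalDataset, EvalResult

dataset = EvalDataset.from_list(
    name="toy-qa",
    label="label",
    metrics="accuracy",  # assumed to be registered by the package on import
    data=[
        {"input": "What is 2+2?", "label": "4"},
        {"input": "Capital of France?", "label": "Paris"},
    ],
)

result = EvalResult(
    eval_dataset=dataset,
    inference_outputs=["4", "Rome"],
    metric_scores={"accuracy": {"aggregate_scores": {"accuracy": 0.5}, "item_scores": [1, 0]}},
    hyperparams={"temperature": 0.0},
)

print(result.aggregate_scores)  # {'dataset_name': 'toy-qa', 'accuracy': 0.5, 'temperature': 0.0}
result.to_json("results/toy-qa.json")
result.to_csv("results/toy-qa.csv")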

scorebook/types/inference_pipeline.py
@@ -0,0 +1,84 @@
+"""
+Inference pipeline implementation for processing items through model inference.
+
+This module provides a pipeline structure for handling model inference tasks,
+supporting preprocessing, model inference, and postprocessing steps in a
+configurable way.
+"""
+
+import asyncio
+from typing import Any, Callable, Dict, List, Optional, cast
+
+
+class InferencePipeline:
+    """A pipeline for processing items through model inference.
+
+    This class implements a three-stage pipeline that handles:
+    1. Preprocessing of input items
+    2. Model inference
+    3. Postprocessing of model outputs
+
+
+    Attributes:
+        model: Name or identifier of the model being used
+        preprocessor: Function to prepare items for model inference
+        inference_function: Function that performs the actual model inference
+        postprocessor: Function to process the model outputs
+    """
+
+    def __init__(
+        self,
+        model: str,
+        inference_function: Callable,
+        preprocessor: Optional[Callable] = None,
+        postprocessor: Optional[Callable] = None,
+    ) -> None:
+        """Initialize the inference pipeline.
+
+        Args:
+            model: Name or identifier of the model to use
+            inference_function: Function that performs model inference
+            preprocessor: Optional function to prepare items for inference.
+            postprocessor: Optional function to process model outputs.
+        """
+        self.model: str = model
+        self.inference_function = inference_function
+        self.preprocessor: Optional[Callable] = preprocessor
+        self.postprocessor: Optional[Callable] = postprocessor
+
+    async def run(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
+        """Execute the complete inference pipeline on a list of items.
+
+        Args:
+            items: List of items to process through the pipeline
+            **hyperparameters: Model-specific parameters for inference
+
+        Returns:
+            List of processed outputs after running through the complete pipeline
+        """
+        if self.preprocessor:
+            input_items = [self.preprocessor(item) for item in items]
+        else:
+            input_items = items
+
+        if asyncio.iscoroutinefunction(self.inference_function):
+            inference_outputs = await self.inference_function(input_items, **hyperparameters)
+        else:
+            inference_outputs = self.inference_function(input_items, **hyperparameters)
+
+        if self.postprocessor:
+            return [self.postprocessor(inference_output) for inference_output in inference_outputs]
+        else:
+            return cast(List[Any], inference_outputs)
+
+    async def __call__(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
+        """Make the pipeline instance callable by wrapping the run method.
+
+        Args:
+            items: List of items to process through the pipeline
+            **hyperparameters: Model-specific parameters for inference
+
+        Returns:
+            List of processed outputs after running through the complete pipeline
+        """
+        return await self.run(items, **hyperparameters)
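
A runnable sketch (not part of the package) wiring InferencePipeline to a dummy inference function; the function and model names here are placeholders for a real model client.

import asyncio
from typing import Any, Dict, List

from scorebook.types.inference_pipeline import InferencePipeline


def build_prompt(item: Dict[str, Any]) -> str:
    # Preprocessor: turn a dataset item into a prompt string.
    return f"Q: {item['input']}\nA:"


async def fake_inference(prompts: List[str], **hyperparameters: Any) -> List[str]:
    # Stand-in for a model call; hyperparameters such as temperature are simply echoed.
    return [f"dummy answer (temperature={hyperparameters.get('temperature')})" for _ in prompts]


def strip_whitespace(output: str) -> str:
    # Postprocessor: clean up each raw model output.
    return output.strip()


pipeline = InferencePipeline(
    model="dummy-model",
    inference_function=fake_inference,
    preprocessor=build_prompt,
    postprocessor=strip_whitespace,
)

items = [{"input": "What is 2+2?"}, {"input": "Capital of France?"}]
outputs = asyncio.run(pipeline(items, temperature=0.0))
print(outputs)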