dataeval 0.86.2__tar.gz → 0.86.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.86.2 → dataeval-0.86.4}/PKG-INFO +1 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/pyproject.toml +1 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/__init__.py +1 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/__init__.py +0 -2
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_images.py +3 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_metadata.py +40 -63
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_classfilter.py +2 -2
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/_distance.py +1 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/_utils.py +4 -2
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_balance.py +6 -5
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_parity.py +2 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_labelstats.py +24 -32
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_base.py +1 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_bias.py +21 -18
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_estimators.py +2 -1
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_linters.py +18 -18
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_stats.py +20 -20
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_utils.py +3 -2
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_workflows.py +9 -7
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/typing.py +4 -4
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_plot.py +10 -10
- dataeval-0.86.2/src/dataeval/data/_targets.py +0 -89
- {dataeval-0.86.2 → dataeval-0.86.4}/LICENSE.txt +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/README.md +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/_log.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/config.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_embeddings.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_selection.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_split.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_classbalance.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_indices.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_limit.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_prioritize.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_reverse.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_shuffle.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_cvm.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_ks.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_mmd.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_mvdc.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/updates.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/linters/duplicates.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/linters/outliers.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/ae.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/mixin.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/_ood.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_completeness.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_coverage.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_diversity.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_ber.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_divergence.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_uap.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_hashstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_imagestats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_visualstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_drift.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_metadata.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_ood.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/py.typed +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_array.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_bin.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_clusterer.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_fast_mst.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_image.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_method.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_mst.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/_dataset.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/collate.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/metadata.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_antiuav.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_cifar10.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_fileio.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_milco.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_mixin.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_mnist.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_ships.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_types.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_voc.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/_blocks.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/_gmm.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/_internal.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/workflows/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/workflows/sufficiency.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.86.2
|
3
|
+
Version: 0.86.4
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "dataeval"
|
3
|
-
version = "0.86.2" # dynamic
|
3
|
+
version = "0.86.4" # dynamic
|
4
4
|
description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
|
5
5
|
license = "MIT"
|
6
6
|
readme = "README.md"
|
@@ -6,7 +6,6 @@ __all__ = [
|
|
6
6
|
"Metadata",
|
7
7
|
"Select",
|
8
8
|
"SplitDatasetOutput",
|
9
|
-
"Targets",
|
10
9
|
"split_dataset",
|
11
10
|
]
|
12
11
|
|
@@ -15,5 +14,4 @@ from dataeval.data._images import Images
|
|
15
14
|
from dataeval.data._metadata import Metadata
|
16
15
|
from dataeval.data._selection import Select
|
17
16
|
from dataeval.data._split import split_dataset
|
18
|
-
from dataeval.data._targets import Targets
|
19
17
|
from dataeval.outputs._utils import SplitDatasetOutput
|
@@ -4,6 +4,8 @@ __all__ = []
|
|
4
4
|
|
5
5
|
from typing import TYPE_CHECKING, Any, Generic, Iterator, Sequence, TypeVar, cast, overload
|
6
6
|
|
7
|
+
import numpy as np
|
8
|
+
|
7
9
|
from dataeval.typing import Array, ArrayLike, Dataset
|
8
10
|
from dataeval.utils._array import as_numpy, channels_first_to_last
|
9
11
|
|
@@ -58,7 +60,7 @@ class Images(Generic[T]):
|
|
58
60
|
num_images = len(indices)
|
59
61
|
num_rows = (num_images + images_per_row - 1) // images_per_row
|
60
62
|
fig, axes = plt.subplots(num_rows, images_per_row, figsize=figsize)
|
61
|
-
for i, ax in enumerate(axes.flatten()):
|
63
|
+
for i, ax in enumerate(np.asarray(axes).flatten()):
|
62
64
|
image = channels_first_to_last(as_numpy(self[i]))
|
63
65
|
ax.imshow(image)
|
64
66
|
ax.axis("off")
|
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import warnings
|
6
6
|
from dataclasses import dataclass
|
7
|
-
from typing import
|
7
|
+
from typing import Any, Iterable, Literal, Mapping, Sequence
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import polars as pl
|
@@ -19,10 +19,9 @@ from dataeval.utils._array import as_numpy
|
|
19
19
|
from dataeval.utils._bin import bin_data, digitize_data
|
20
20
|
from dataeval.utils.data.metadata import merge
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
from dataeval.data._targets import Targets
|
22
|
+
|
23
|
+
def _binned(name: str) -> str:
|
24
|
+
return f"{name}[]"
|
26
25
|
|
27
26
|
|
28
27
|
@dataclass
|
@@ -51,20 +50,20 @@ class Metadata:
|
|
51
50
|
|
52
51
|
def __init__(
|
53
52
|
self,
|
54
|
-
dataset: AnnotatedDataset[tuple[Any, Any,
|
53
|
+
dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
|
55
54
|
*,
|
56
55
|
continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
|
57
56
|
auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
|
58
57
|
exclude: Sequence[str] | None = None,
|
59
58
|
include: Sequence[str] | None = None,
|
60
59
|
) -> None:
|
61
|
-
self._targets: Targets
|
62
60
|
self._class_labels: NDArray[np.intp]
|
63
61
|
self._class_names: list[str]
|
64
62
|
self._image_indices: NDArray[np.intp]
|
65
63
|
self._factors: dict[str, FactorInfo]
|
66
64
|
self._dropped_factors: dict[str, list[str]]
|
67
65
|
self._dataframe: pl.DataFrame
|
66
|
+
self._raw: Sequence[Mapping[str, Any]]
|
68
67
|
|
69
68
|
self._is_structured = False
|
70
69
|
self._is_binned = False
|
@@ -80,13 +79,7 @@ class Metadata:
|
|
80
79
|
self._include = set(include or ())
|
81
80
|
|
82
81
|
@property
|
83
|
-
def
|
84
|
-
"""Target information for the dataset."""
|
85
|
-
self._structure()
|
86
|
-
return self._targets
|
87
|
-
|
88
|
-
@property
|
89
|
-
def raw(self) -> list[dict[str, Any]]:
|
82
|
+
def raw(self) -> Sequence[Mapping[str, Any]]:
|
90
83
|
"""The raw list of metadata dictionaries for the dataset."""
|
91
84
|
self._structure()
|
92
85
|
return self._raw
|
@@ -146,7 +139,7 @@ class Metadata:
|
|
146
139
|
return self._dataframe
|
147
140
|
|
148
141
|
@property
|
149
|
-
def dropped_factors(self) ->
|
142
|
+
def dropped_factors(self) -> Mapping[str, Sequence[str]]:
|
150
143
|
"""Factors that were dropped during preprocessing and the reasons why they were dropped."""
|
151
144
|
self._structure()
|
152
145
|
return self._dropped_factors
|
@@ -165,16 +158,16 @@ class Metadata:
|
|
165
158
|
)
|
166
159
|
|
167
160
|
@property
|
168
|
-
def factor_names(self) ->
|
161
|
+
def factor_names(self) -> Sequence[str]:
|
169
162
|
"""Factor names of the metadata."""
|
170
163
|
self._structure()
|
171
|
-
return list(self._factors)
|
164
|
+
return list(filter(self._filter, self._factors))
|
172
165
|
|
173
166
|
@property
|
174
|
-
def factor_info(self) ->
|
167
|
+
def factor_info(self) -> Mapping[str, FactorInfo]:
|
175
168
|
"""Factor types of the metadata."""
|
176
169
|
self._bin()
|
177
|
-
return self._factors
|
170
|
+
return dict(filter(self._filter, self._factors.items()))
|
178
171
|
|
179
172
|
@property
|
180
173
|
def factor_data(self) -> NDArray[Any]:
|
@@ -192,7 +185,7 @@ class Metadata:
|
|
192
185
|
return self._class_labels
|
193
186
|
|
194
187
|
@property
|
195
|
-
def class_names(self) ->
|
188
|
+
def class_names(self) -> Sequence[str]:
|
196
189
|
"""Class names as a list of strings."""
|
197
190
|
self._structure()
|
198
191
|
return self._class_names
|
@@ -206,13 +199,17 @@ class Metadata:
|
|
206
199
|
@property
|
207
200
|
def image_count(self) -> int:
|
208
201
|
self._bin()
|
209
|
-
return int(self._image_indices.max() + 1)
|
202
|
+
return 0 if self._image_indices.size == 0 else int(self._image_indices.max() + 1)
|
203
|
+
|
204
|
+
def _filter(self, factor: str | tuple[str, Any]) -> bool:
|
205
|
+
factor = factor[0] if isinstance(factor, tuple) else factor
|
206
|
+
return factor in self.include if self.include else factor not in self.exclude
|
210
207
|
|
211
208
|
def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
|
212
209
|
if self._is_binned:
|
213
210
|
columns = self._dataframe.columns
|
214
|
-
for col in (col for col in cols or columns if
|
215
|
-
self._dataframe.drop_in_place(
|
211
|
+
for col in (col for col in cols or columns if _binned(col) in columns):
|
212
|
+
self._dataframe.drop_in_place(_binned(col))
|
216
213
|
self._factors[col] = FactorInfo()
|
217
214
|
self._is_binned = False
|
218
215
|
|
@@ -220,7 +217,7 @@ class Metadata:
|
|
220
217
|
if self._is_structured:
|
221
218
|
return
|
222
219
|
|
223
|
-
raw:
|
220
|
+
raw: Sequence[Mapping[str, Any]] = []
|
224
221
|
|
225
222
|
labels = []
|
226
223
|
bboxes = []
|
@@ -255,6 +252,14 @@ class Metadata:
|
|
255
252
|
bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
|
256
253
|
srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
|
257
254
|
|
255
|
+
index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
|
256
|
+
|
257
|
+
targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
|
258
|
+
merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
|
259
|
+
|
260
|
+
reserved = ["image_index", "class_label", "score", "box"]
|
261
|
+
factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
|
262
|
+
|
258
263
|
target_dict = {
|
259
264
|
"image_index": srcidx if srcidx is not None else np.arange(len(labels)),
|
260
265
|
"class_label": labels,
|
@@ -262,20 +267,11 @@ class Metadata:
|
|
262
267
|
"box": bboxes if bboxes is not None else [None] * len(labels),
|
263
268
|
}
|
264
269
|
|
265
|
-
self._targets = Targets(labels, scores, bboxes, srcidx)
|
266
270
|
self._raw = raw
|
267
|
-
|
268
|
-
index2label = self._dataset.metadata.get("index2label", {})
|
271
|
+
self._index2label = index2label
|
269
272
|
self._class_labels = labels
|
270
|
-
self._class_names =
|
273
|
+
self._class_names = list(index2label.values())
|
271
274
|
self._image_indices = target_dict["image_index"]
|
272
|
-
|
273
|
-
targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
|
274
|
-
merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
|
275
|
-
|
276
|
-
reserved = ["image_index", "class_label", "score", "box"]
|
277
|
-
factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
|
278
|
-
|
279
275
|
self._factors = dict.fromkeys(factor_dict, FactorInfo())
|
280
276
|
self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
|
281
277
|
self._dropped_factors = merged[1]
|
@@ -302,10 +298,10 @@ class Metadata:
|
|
302
298
|
)
|
303
299
|
|
304
300
|
column_set = set(df.columns)
|
305
|
-
for col in (col for col in self.factor_names if
|
301
|
+
for col in (col for col in self.factor_names if _binned(col) not in column_set):
|
306
302
|
# Get data as numpy array for processing
|
307
303
|
data = df[col].to_numpy()
|
308
|
-
col_dz =
|
304
|
+
col_dz = _binned(col)
|
309
305
|
if col in factor_bins:
|
310
306
|
# User provided binning
|
311
307
|
bins = factor_bins[col]
|
@@ -332,31 +328,14 @@ class Metadata:
|
|
332
328
|
df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
|
333
329
|
factor_info[col] = FactorInfo("continuous", col_dz)
|
334
330
|
else:
|
335
|
-
factor_info[col] = FactorInfo("discrete",
|
331
|
+
factor_info[col] = FactorInfo("discrete", col)
|
336
332
|
|
337
333
|
# Store the results
|
338
334
|
self._dataframe = df
|
339
335
|
self._factors.update(factor_info)
|
340
336
|
self._is_binned = True
|
341
337
|
|
342
|
-
def
|
343
|
-
"""
|
344
|
-
Get the names of factors of a specific type.
|
345
|
-
|
346
|
-
Parameters
|
347
|
-
----------
|
348
|
-
factor_type : Literal["categorical", "continuous", "discrete"]
|
349
|
-
The type of factors to retrieve.
|
350
|
-
|
351
|
-
Returns
|
352
|
-
-------
|
353
|
-
list[str]
|
354
|
-
List of factor names of the specified type.
|
355
|
-
"""
|
356
|
-
self._bin()
|
357
|
-
return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
|
358
|
-
|
359
|
-
def add_factors(self, factors: Mapping[str, Any]) -> None:
|
338
|
+
def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
|
360
339
|
"""
|
361
340
|
Add additional factors to the metadata.
|
362
341
|
|
@@ -365,16 +344,15 @@ class Metadata:
|
|
365
344
|
|
366
345
|
Parameters
|
367
346
|
----------
|
368
|
-
factors : Mapping[str,
|
347
|
+
factors : Mapping[str, Array | Sequence[Any]]
|
369
348
|
Dictionary of factors to add to the metadata.
|
370
349
|
"""
|
371
350
|
self._structure()
|
372
351
|
|
373
|
-
targets = len(self.
|
352
|
+
targets = len(self.dataframe)
|
374
353
|
images = self.image_count
|
375
|
-
|
376
|
-
|
377
|
-
images_match = targets_match if images == targets else all(f == images for f in lengths.values())
|
354
|
+
targets_match = all(len(v) == targets for v in factors.values())
|
355
|
+
images_match = targets_match if images == targets else all(len(v) == images for v in factors.values())
|
378
356
|
if not targets_match and not images_match:
|
379
357
|
raise ValueError(
|
380
358
|
"The lists/arrays in the provided factors have a different length than the current metadata factors."
|
@@ -382,8 +360,7 @@ class Metadata:
|
|
382
360
|
|
383
361
|
new_columns = []
|
384
362
|
for k, v in factors.items():
|
385
|
-
|
386
|
-
data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
|
363
|
+
data = as_numpy(v)[self.image_indices]
|
387
364
|
new_columns.append(pl.Series(name=k, values=data))
|
388
365
|
self._factors[k] = FactorInfo()
|
389
366
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
|
5
|
+
from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
|
6
6
|
|
7
7
|
import numpy as np
|
8
8
|
from numpy.typing import NDArray
|
@@ -92,7 +92,7 @@ class ClassFilterSubSelection(Subselection[Any]):
|
|
92
92
|
def __init__(self, classes: Sequence[int]) -> None:
|
93
93
|
self.classes = classes
|
94
94
|
|
95
|
-
def _filter(self, d:
|
95
|
+
def _filter(self, d: Mapping[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
|
96
96
|
return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
|
97
97
|
|
98
98
|
def __call__(self, datum: _TDatum) -> _TDatum:
|
@@ -81,7 +81,7 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDista
|
|
81
81
|
"""
|
82
82
|
|
83
83
|
_compare_keys(metadata1.factor_names, metadata2.factor_names)
|
84
|
-
cont_fnames = metadata1.
|
84
|
+
cont_fnames = [name for name, info in metadata1.factor_info.items() if info.factor_type == "continuous"]
|
85
85
|
|
86
86
|
if not cont_fnames:
|
87
87
|
return MetadataDistanceOutput({})
|
@@ -1,9 +1,11 @@
|
|
1
1
|
__all__ = []
|
2
2
|
|
3
|
+
from typing import Sequence
|
4
|
+
|
3
5
|
from numpy.typing import NDArray
|
4
6
|
|
5
7
|
|
6
|
-
def _compare_keys(keys1:
|
8
|
+
def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
|
7
9
|
"""
|
8
10
|
Raises error when two lists are not equivalent including ordering
|
9
11
|
|
@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
|
|
24
26
|
raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")
|
25
27
|
|
26
28
|
|
27
|
-
def _validate_factors_and_data(factors:
|
29
|
+
def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
|
28
30
|
"""
|
29
31
|
Raises error when the number of factors and number of rows do not match
|
30
32
|
|
@@ -99,9 +99,10 @@ def balance(
|
|
99
99
|
factor_types = {"class_label": "categorical"} | {k: v.factor_type for k, v in metadata.factor_info.items()}
|
100
100
|
is_discrete = [factor_type != "continuous" for factor_type in factor_types.values()]
|
101
101
|
num_factors = len(factor_types)
|
102
|
+
class_labels = metadata.class_labels
|
102
103
|
|
103
104
|
mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
|
104
|
-
data = np.hstack((
|
105
|
+
data = np.hstack((class_labels[:, np.newaxis], data))
|
105
106
|
|
106
107
|
for idx, factor_type in enumerate(factor_types.values()):
|
107
108
|
if factor_type != "continuous":
|
@@ -132,12 +133,12 @@ def balance(
|
|
132
133
|
factors = nmi[1:, 1:]
|
133
134
|
|
134
135
|
# assume class is a factor
|
135
|
-
|
136
|
+
u_classes = np.unique(class_labels)
|
137
|
+
num_classes = len(u_classes)
|
136
138
|
classwise_mi = np.full((num_classes, num_factors), np.nan, dtype=np.float32)
|
137
139
|
|
138
140
|
# classwise targets
|
139
|
-
|
140
|
-
tgt_bin = data[:, 0][:, None] == classes
|
141
|
+
tgt_bin = data[:, 0][:, None] == u_classes
|
141
142
|
|
142
143
|
# classification MI for discrete/categorical features
|
143
144
|
for idx in range(num_classes):
|
@@ -157,6 +158,6 @@ def balance(
|
|
157
158
|
classwise = classwise_mi / norm_factor
|
158
159
|
|
159
160
|
# Grabbing factor names for plotting function
|
160
|
-
factor_names = ["class_label"] + metadata.factor_names
|
161
|
+
factor_names = ["class_label"] + list(metadata.factor_names)
|
161
162
|
|
162
163
|
return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
|
@@ -259,7 +259,8 @@ def parity(metadata: Metadata) -> ParityOutput:
|
|
259
259
|
counts = np.nonzero(contingency_matrix < 5)
|
260
260
|
unique_factor_values = np.unique(col_data)
|
261
261
|
current_factor_name = metadata.factor_names[i]
|
262
|
-
for
|
262
|
+
for _factor, _class in zip(counts[0], counts[1]):
|
263
|
+
int_factor, int_class = int(_factor), int(_class)
|
263
264
|
if contingency_matrix[int_factor, int_class] > 0:
|
264
265
|
factor_category = unique_factor_values[int_factor].item()
|
265
266
|
class_name = metadata.class_names[int_class]
|
@@ -2,8 +2,9 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from
|
6
|
-
|
5
|
+
from typing import Any, TypeVar
|
6
|
+
|
7
|
+
import polars as pl
|
7
8
|
|
8
9
|
from dataeval.data._metadata import Metadata
|
9
10
|
from dataeval.outputs import LabelStatsOutput
|
@@ -13,10 +14,6 @@ from dataeval.typing import AnnotatedDataset
|
|
13
14
|
TValue = TypeVar("TValue")
|
14
15
|
|
15
16
|
|
16
|
-
def _sort_to_list(d: Mapping[int, TValue]) -> list[TValue]:
|
17
|
-
return [t[1] for t in sorted(d.items())]
|
18
|
-
|
19
|
-
|
20
17
|
@set_metadata
|
21
18
|
def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
|
22
19
|
"""
|
@@ -52,39 +49,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
|
|
52
49
|
pig: 2 - 2
|
53
50
|
chicken: 5 - 5
|
54
51
|
"""
|
55
|
-
|
56
|
-
|
57
|
-
label_counts: Counter[int] = Counter()
|
58
|
-
image_counts: Counter[int] = Counter()
|
59
|
-
index_location = defaultdict(list[int])
|
60
|
-
label_per_image: list[int] = []
|
61
|
-
|
62
|
-
index2label = dict(enumerate(dataset.class_names))
|
63
|
-
|
64
|
-
for i, target in enumerate(dataset.targets):
|
65
|
-
group = target.labels.tolist()
|
52
|
+
metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
|
53
|
+
metadata_df = metadata.dataframe
|
66
54
|
|
67
|
-
|
68
|
-
|
55
|
+
# Count occurrences of each label across all images
|
56
|
+
label_counts_df = metadata_df.group_by("class_label").len()
|
57
|
+
label_counts = dict(zip(label_counts_df["class_label"], label_counts_df["len"]))
|
69
58
|
|
70
|
-
|
71
|
-
|
59
|
+
# Count unique images per label (how many images contain each label)
|
60
|
+
image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
|
61
|
+
image_counts = dict(zip(image_counts_df["class_label"], image_counts_df["len"]))
|
72
62
|
|
73
|
-
|
74
|
-
|
63
|
+
# Create index_location mapping (which images contain each label)
|
64
|
+
index_location: dict[int, list[int]] = {}
|
65
|
+
for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
|
66
|
+
indices = row["image_index"]
|
67
|
+
index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]
|
75
68
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
index_location[item].append(i)
|
69
|
+
# Count labels per image
|
70
|
+
label_per_image_df = metadata_df.group_by("image_index").agg(pl.len().alias("label_count"))
|
71
|
+
label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()
|
80
72
|
|
81
73
|
return LabelStatsOutput(
|
82
|
-
label_counts_per_class=
|
74
|
+
label_counts_per_class=label_counts,
|
83
75
|
label_counts_per_image=label_per_image,
|
84
|
-
image_counts_per_class=
|
85
|
-
image_indices_per_class=
|
76
|
+
image_counts_per_class=image_counts,
|
77
|
+
image_indices_per_class=index_location,
|
86
78
|
image_count=len(label_per_image),
|
87
|
-
class_count=len(
|
79
|
+
class_count=len(metadata.class_names),
|
88
80
|
label_count=sum(label_counts.values()),
|
89
|
-
class_names=
|
81
|
+
class_names=metadata.class_names,
|
90
82
|
)
|
@@ -147,7 +147,7 @@ P = ParamSpec("P")
|
|
147
147
|
R = TypeVar("R", bound=GenericOutput)
|
148
148
|
|
149
149
|
|
150
|
-
def set_metadata(fn: Callable[P, R] | None = None, *, state:
|
150
|
+
def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
|
151
151
|
"""Decorator to stamp Output classes with runtime metadata"""
|
152
152
|
|
153
153
|
if fn is None:
|
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
from dataclasses import asdict, dataclass
|
7
|
-
from typing import Any, TypeVar
|
7
|
+
from typing import Any, Mapping, Sequence, TypeVar
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import pandas as pd
|
@@ -39,7 +39,7 @@ class ToDataFrameMixin:
|
|
39
39
|
This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
|
40
40
|
"""
|
41
41
|
return pd.DataFrame(
|
42
|
-
index=self.factor_names, # type: ignore -
|
42
|
+
index=self.factor_names, # type: ignore - Sequence[str] is documented as acceptable index type
|
43
43
|
data={
|
44
44
|
"score": self.score.round(2),
|
45
45
|
"p-value": self.p_value.round(2),
|
@@ -58,7 +58,7 @@ class ParityOutput(ToDataFrameMixin, Output):
|
|
58
58
|
chi-squared score(s) of the test
|
59
59
|
p_value : NDArray[np.float64]
|
60
60
|
p-value(s) of the test
|
61
|
-
factor_names :
|
61
|
+
factor_names : Sequence[str]
|
62
62
|
Names of each metadata factor
|
63
63
|
insufficient_data: dict
|
64
64
|
Dictionary of metadata factors with less than 5 class occurrences per value
|
@@ -66,8 +66,8 @@ class ParityOutput(ToDataFrameMixin, Output):
|
|
66
66
|
|
67
67
|
score: NDArray[np.float64]
|
68
68
|
p_value: NDArray[np.float64]
|
69
|
-
factor_names:
|
70
|
-
insufficient_data:
|
69
|
+
factor_names: Sequence[str]
|
70
|
+
insufficient_data: Mapping[str, Mapping[int, Mapping[str, int]]]
|
71
71
|
|
72
72
|
|
73
73
|
@dataclass(frozen=True)
|
@@ -145,12 +145,15 @@ class CoverageOutput(Output):
|
|
145
145
|
cols = min(3, num_images)
|
146
146
|
fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
|
147
147
|
|
148
|
-
|
148
|
+
# Flatten axes using numpy array explicitly for compatibility
|
149
|
+
axs_flat = np.asarray(axs).flatten()
|
150
|
+
|
151
|
+
for image, ax in zip(images[:num_images], axs_flat):
|
149
152
|
image = channels_first_to_last(as_numpy(image))
|
150
153
|
ax.imshow(image)
|
151
154
|
ax.axis("off")
|
152
155
|
|
153
|
-
for ax in
|
156
|
+
for ax in axs_flat[num_images:]:
|
154
157
|
ax.axis("off")
|
155
158
|
|
156
159
|
fig.tight_layout()
|
@@ -187,22 +190,22 @@ class BalanceOutput(Output):
|
|
187
190
|
Estimate of inter/intra-factor mutual information
|
188
191
|
classwise : NDArray[np.float64]
|
189
192
|
Estimate of mutual information between metadata factors and individual class labels
|
190
|
-
factor_names :
|
193
|
+
factor_names : Sequence[str]
|
191
194
|
Names of each metadata factor
|
192
|
-
class_names :
|
195
|
+
class_names : Sequence[str]
|
193
196
|
List of the class labels present in the dataset
|
194
197
|
"""
|
195
198
|
|
196
199
|
balance: NDArray[np.float64]
|
197
200
|
factors: NDArray[np.float64]
|
198
201
|
classwise: NDArray[np.float64]
|
199
|
-
factor_names:
|
200
|
-
class_names:
|
202
|
+
factor_names: Sequence[str]
|
203
|
+
class_names: Sequence[str]
|
201
204
|
|
202
205
|
def plot(
|
203
206
|
self,
|
204
|
-
row_labels:
|
205
|
-
col_labels:
|
207
|
+
row_labels: Sequence[Any] | NDArray[Any] | None = None,
|
208
|
+
col_labels: Sequence[Any] | NDArray[Any] | None = None,
|
206
209
|
plot_classwise: bool = False,
|
207
210
|
) -> Figure:
|
208
211
|
"""
|
@@ -276,16 +279,16 @@ class DiversityOutput(Output):
|
|
276
279
|
:term:`Diversity` index for classes and factors
|
277
280
|
classwise : NDArray[np.double]
|
278
281
|
Classwise diversity index [n_class x n_factor]
|
279
|
-
factor_names :
|
282
|
+
factor_names : Sequence[str]
|
280
283
|
Names of each metadata factor
|
281
|
-
class_names :
|
284
|
+
class_names : Sequence[str]
|
282
285
|
Class labels for each value in the dataset
|
283
286
|
"""
|
284
287
|
|
285
288
|
diversity_index: NDArray[np.double]
|
286
289
|
classwise: NDArray[np.double]
|
287
|
-
factor_names:
|
288
|
-
class_names:
|
290
|
+
factor_names: Sequence[str]
|
291
|
+
class_names: Sequence[str]
|
289
292
|
|
290
293
|
def plot(
|
291
294
|
self,
|
@@ -333,7 +336,7 @@ class DiversityOutput(Output):
|
|
333
336
|
import matplotlib.pyplot as plt
|
334
337
|
|
335
338
|
fig, ax = plt.subplots(figsize=(8, 8))
|
336
|
-
heat_labels = ["class_labels"] + self.factor_names
|
339
|
+
heat_labels = ["class_labels"] + list(self.factor_names)
|
337
340
|
ax.bar(heat_labels, self.diversity_index)
|
338
341
|
ax.set_xlabel("Factors")
|
339
342
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
+
from typing import Sequence
|
6
7
|
|
7
8
|
import numpy as np
|
8
9
|
from numpy.typing import NDArray
|
@@ -64,7 +65,7 @@ class ClustererOutput(Output):
|
|
64
65
|
"""
|
65
66
|
return np.nonzero(self.clusters == -1)[0]
|
66
67
|
|
67
|
-
def find_duplicates(self) -> tuple[
|
68
|
+
def find_duplicates(self) -> tuple[Sequence[Sequence[int]], Sequence[Sequence[int]]]:
|
68
69
|
"""
|
69
70
|
Finds duplicate and near duplicate data based on cluster average distance
|
70
71
|
|