dataeval 0.86.2__tar.gz → 0.86.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.86.2 → dataeval-0.86.3}/PKG-INFO +1 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/pyproject.toml +1 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/__init__.py +1 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/__init__.py +0 -2
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_metadata.py +26 -41
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_classfilter.py +2 -2
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/_utils.py +4 -2
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_balance.py +1 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_parity.py +2 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_labelstats.py +24 -28
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_base.py +1 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_bias.py +21 -18
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_estimators.py +2 -1
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_linters.py +17 -17
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_stats.py +20 -20
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_utils.py +3 -2
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_workflows.py +9 -7
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/typing.py +4 -4
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_plot.py +4 -4
- dataeval-0.86.2/src/dataeval/data/_targets.py +0 -89
- {dataeval-0.86.2 → dataeval-0.86.3}/LICENSE.txt +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/README.md +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/_log.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/config.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_embeddings.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_images.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_selection.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_split.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_classbalance.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_indices.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_limit.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_prioritize.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_reverse.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_shuffle.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_cvm.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_ks.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_mmd.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_mvdc.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/updates.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/linters/duplicates.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/linters/outliers.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/ae.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/mixin.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/_distance.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/_ood.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_completeness.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_coverage.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_diversity.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_ber.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_divergence.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_uap.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_hashstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_imagestats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_visualstats.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_drift.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_metadata.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_ood.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/py.typed +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_array.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_bin.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_clusterer.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_fast_mst.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_image.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_method.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_mst.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/_dataset.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/collate.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/metadata.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_antiuav.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_base.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_cifar10.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_fileio.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_milco.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_mixin.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_mnist.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_ships.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_types.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_voc.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/_blocks.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/_gmm.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/_internal.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/workflows/__init__.py +0 -0
- {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/workflows/sufficiency.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.86.2
|
3
|
+
Version: 0.86.3
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "dataeval"
|
3
|
-
version = "0.86.2" # dynamic
|
3
|
+
version = "0.86.3" # dynamic
|
4
4
|
description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
|
5
5
|
license = "MIT"
|
6
6
|
readme = "README.md"
|
@@ -6,7 +6,6 @@ __all__ = [
|
|
6
6
|
"Metadata",
|
7
7
|
"Select",
|
8
8
|
"SplitDatasetOutput",
|
9
|
-
"Targets",
|
10
9
|
"split_dataset",
|
11
10
|
]
|
12
11
|
|
@@ -15,5 +14,4 @@ from dataeval.data._images import Images
|
|
15
14
|
from dataeval.data._metadata import Metadata
|
16
15
|
from dataeval.data._selection import Select
|
17
16
|
from dataeval.data._split import split_dataset
|
18
|
-
from dataeval.data._targets import Targets
|
19
17
|
from dataeval.outputs._utils import SplitDatasetOutput
|
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import warnings
|
6
6
|
from dataclasses import dataclass
|
7
|
-
from typing import
|
7
|
+
from typing import Any, Iterable, Literal, Mapping, Sequence
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import polars as pl
|
@@ -19,11 +19,6 @@ from dataeval.utils._array import as_numpy
|
|
19
19
|
from dataeval.utils._bin import bin_data, digitize_data
|
20
20
|
from dataeval.utils.data.metadata import merge
|
21
21
|
|
22
|
-
if TYPE_CHECKING:
|
23
|
-
from dataeval.data import Targets
|
24
|
-
else:
|
25
|
-
from dataeval.data._targets import Targets
|
26
|
-
|
27
22
|
|
28
23
|
@dataclass
|
29
24
|
class FactorInfo:
|
@@ -51,20 +46,20 @@ class Metadata:
|
|
51
46
|
|
52
47
|
def __init__(
|
53
48
|
self,
|
54
|
-
dataset: AnnotatedDataset[tuple[Any, Any,
|
49
|
+
dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
|
55
50
|
*,
|
56
51
|
continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
|
57
52
|
auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
|
58
53
|
exclude: Sequence[str] | None = None,
|
59
54
|
include: Sequence[str] | None = None,
|
60
55
|
) -> None:
|
61
|
-
self._targets: Targets
|
62
56
|
self._class_labels: NDArray[np.intp]
|
63
57
|
self._class_names: list[str]
|
64
58
|
self._image_indices: NDArray[np.intp]
|
65
59
|
self._factors: dict[str, FactorInfo]
|
66
60
|
self._dropped_factors: dict[str, list[str]]
|
67
61
|
self._dataframe: pl.DataFrame
|
62
|
+
self._raw: Sequence[Mapping[str, Any]]
|
68
63
|
|
69
64
|
self._is_structured = False
|
70
65
|
self._is_binned = False
|
@@ -80,13 +75,7 @@ class Metadata:
|
|
80
75
|
self._include = set(include or ())
|
81
76
|
|
82
77
|
@property
|
83
|
-
def
|
84
|
-
"""Target information for the dataset."""
|
85
|
-
self._structure()
|
86
|
-
return self._targets
|
87
|
-
|
88
|
-
@property
|
89
|
-
def raw(self) -> list[dict[str, Any]]:
|
78
|
+
def raw(self) -> Sequence[Mapping[str, Any]]:
|
90
79
|
"""The raw list of metadata dictionaries for the dataset."""
|
91
80
|
self._structure()
|
92
81
|
return self._raw
|
@@ -146,7 +135,7 @@ class Metadata:
|
|
146
135
|
return self._dataframe
|
147
136
|
|
148
137
|
@property
|
149
|
-
def dropped_factors(self) ->
|
138
|
+
def dropped_factors(self) -> Mapping[str, Sequence[str]]:
|
150
139
|
"""Factors that were dropped during preprocessing and the reasons why they were dropped."""
|
151
140
|
self._structure()
|
152
141
|
return self._dropped_factors
|
@@ -165,13 +154,13 @@ class Metadata:
|
|
165
154
|
)
|
166
155
|
|
167
156
|
@property
|
168
|
-
def factor_names(self) ->
|
157
|
+
def factor_names(self) -> Sequence[str]:
|
169
158
|
"""Factor names of the metadata."""
|
170
159
|
self._structure()
|
171
160
|
return list(self._factors)
|
172
161
|
|
173
162
|
@property
|
174
|
-
def factor_info(self) ->
|
163
|
+
def factor_info(self) -> Mapping[str, FactorInfo]:
|
175
164
|
"""Factor types of the metadata."""
|
176
165
|
self._bin()
|
177
166
|
return self._factors
|
@@ -192,7 +181,7 @@ class Metadata:
|
|
192
181
|
return self._class_labels
|
193
182
|
|
194
183
|
@property
|
195
|
-
def class_names(self) ->
|
184
|
+
def class_names(self) -> Sequence[str]:
|
196
185
|
"""Class names as a list of strings."""
|
197
186
|
self._structure()
|
198
187
|
return self._class_names
|
@@ -220,7 +209,7 @@ class Metadata:
|
|
220
209
|
if self._is_structured:
|
221
210
|
return
|
222
211
|
|
223
|
-
raw:
|
212
|
+
raw: Sequence[Mapping[str, Any]] = []
|
224
213
|
|
225
214
|
labels = []
|
226
215
|
bboxes = []
|
@@ -255,6 +244,14 @@ class Metadata:
|
|
255
244
|
bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
|
256
245
|
srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
|
257
246
|
|
247
|
+
index2label = self._dataset.metadata.get("index2label", {})
|
248
|
+
|
249
|
+
targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
|
250
|
+
merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
|
251
|
+
|
252
|
+
reserved = ["image_index", "class_label", "score", "box"]
|
253
|
+
factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
|
254
|
+
|
258
255
|
target_dict = {
|
259
256
|
"image_index": srcidx if srcidx is not None else np.arange(len(labels)),
|
260
257
|
"class_label": labels,
|
@@ -262,20 +259,10 @@ class Metadata:
|
|
262
259
|
"box": bboxes if bboxes is not None else [None] * len(labels),
|
263
260
|
}
|
264
261
|
|
265
|
-
self._targets = Targets(labels, scores, bboxes, srcidx)
|
266
262
|
self._raw = raw
|
267
|
-
|
268
|
-
index2label = self._dataset.metadata.get("index2label", {})
|
269
263
|
self._class_labels = labels
|
270
|
-
self._class_names = [index2label.get(i, str(i)) for i in np.unique(
|
264
|
+
self._class_names = [index2label.get(i, str(i)) for i in np.unique(labels)]
|
271
265
|
self._image_indices = target_dict["image_index"]
|
272
|
-
|
273
|
-
targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
|
274
|
-
merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
|
275
|
-
|
276
|
-
reserved = ["image_index", "class_label", "score", "box"]
|
277
|
-
factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
|
278
|
-
|
279
266
|
self._factors = dict.fromkeys(factor_dict, FactorInfo())
|
280
267
|
self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
|
281
268
|
self._dropped_factors = merged[1]
|
@@ -332,14 +319,14 @@ class Metadata:
|
|
332
319
|
df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
|
333
320
|
factor_info[col] = FactorInfo("continuous", col_dz)
|
334
321
|
else:
|
335
|
-
factor_info[col] = FactorInfo("discrete",
|
322
|
+
factor_info[col] = FactorInfo("discrete", col)
|
336
323
|
|
337
324
|
# Store the results
|
338
325
|
self._dataframe = df
|
339
326
|
self._factors.update(factor_info)
|
340
327
|
self._is_binned = True
|
341
328
|
|
342
|
-
def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) ->
|
329
|
+
def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> Sequence[str]:
|
343
330
|
"""
|
344
331
|
Get the names of factors of a specific type.
|
345
332
|
|
@@ -356,7 +343,7 @@ class Metadata:
|
|
356
343
|
self._bin()
|
357
344
|
return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
|
358
345
|
|
359
|
-
def add_factors(self, factors: Mapping[str, Any]) -> None:
|
346
|
+
def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
|
360
347
|
"""
|
361
348
|
Add additional factors to the metadata.
|
362
349
|
|
@@ -365,16 +352,15 @@ class Metadata:
|
|
365
352
|
|
366
353
|
Parameters
|
367
354
|
----------
|
368
|
-
factors : Mapping[str,
|
355
|
+
factors : Mapping[str, Array | Sequence[Any]]
|
369
356
|
Dictionary of factors to add to the metadata.
|
370
357
|
"""
|
371
358
|
self._structure()
|
372
359
|
|
373
|
-
targets = len(self.
|
360
|
+
targets = len(self.dataframe)
|
374
361
|
images = self.image_count
|
375
|
-
|
376
|
-
|
377
|
-
images_match = targets_match if images == targets else all(f == images for f in lengths.values())
|
362
|
+
targets_match = all(len(v) == targets for v in factors.values())
|
363
|
+
images_match = targets_match if images == targets else all(len(v) == images for v in factors.values())
|
378
364
|
if not targets_match and not images_match:
|
379
365
|
raise ValueError(
|
380
366
|
"The lists/arrays in the provided factors have a different length than the current metadata factors."
|
@@ -382,8 +368,7 @@ class Metadata:
|
|
382
368
|
|
383
369
|
new_columns = []
|
384
370
|
for k, v in factors.items():
|
385
|
-
|
386
|
-
data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
|
371
|
+
data = as_numpy(v)[self.image_indices]
|
387
372
|
new_columns.append(pl.Series(name=k, values=data))
|
388
373
|
self._factors[k] = FactorInfo()
|
389
374
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
|
5
|
+
from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
|
6
6
|
|
7
7
|
import numpy as np
|
8
8
|
from numpy.typing import NDArray
|
@@ -92,7 +92,7 @@ class ClassFilterSubSelection(Subselection[Any]):
|
|
92
92
|
def __init__(self, classes: Sequence[int]) -> None:
|
93
93
|
self.classes = classes
|
94
94
|
|
95
|
-
def _filter(self, d:
|
95
|
+
def _filter(self, d: Mapping[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
|
96
96
|
return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
|
97
97
|
|
98
98
|
def __call__(self, datum: _TDatum) -> _TDatum:
|
@@ -1,9 +1,11 @@
|
|
1
1
|
__all__ = []
|
2
2
|
|
3
|
+
from typing import Sequence
|
4
|
+
|
3
5
|
from numpy.typing import NDArray
|
4
6
|
|
5
7
|
|
6
|
-
def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
|
8
|
+
def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
|
7
9
|
"""
|
8
10
|
Raises error when two lists are not equivalent including ordering
|
9
11
|
|
@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
|
|
24
26
|
raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")
|
25
27
|
|
26
28
|
|
27
|
-
def _validate_factors_and_data(factors:
|
29
|
+
def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
|
28
30
|
"""
|
29
31
|
Raises error when the number of factors and number of rows do not match
|
30
32
|
|
@@ -157,6 +157,6 @@ def balance(
|
|
157
157
|
classwise = classwise_mi / norm_factor
|
158
158
|
|
159
159
|
# Grabbing factor names for plotting function
|
160
|
-
factor_names = ["class_label"] + metadata.factor_names
|
160
|
+
factor_names = ["class_label"] + list(metadata.factor_names)
|
161
161
|
|
162
162
|
return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
|
@@ -259,7 +259,8 @@ def parity(metadata: Metadata) -> ParityOutput:
|
|
259
259
|
counts = np.nonzero(contingency_matrix < 5)
|
260
260
|
unique_factor_values = np.unique(col_data)
|
261
261
|
current_factor_name = metadata.factor_names[i]
|
262
|
-
for int_factor, int_class in zip(counts[0], counts[1]):
|
262
|
+
for _factor, _class in zip(counts[0], counts[1]):
|
263
|
+
int_factor, int_class = int(_factor), int(_class)
|
263
264
|
if contingency_matrix[int_factor, int_class] > 0:
|
264
265
|
factor_category = unique_factor_values[int_factor].item()
|
265
266
|
class_name = metadata.class_names[int_class]
|
@@ -2,9 +2,10 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from collections import Counter, defaultdict
|
6
5
|
from typing import Any, Mapping, TypeVar
|
7
6
|
|
7
|
+
import polars as pl
|
8
|
+
|
8
9
|
from dataeval.data._metadata import Metadata
|
9
10
|
from dataeval.outputs import LabelStatsOutput
|
10
11
|
from dataeval.outputs._base import set_metadata
|
@@ -52,39 +53,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
|
|
52
53
|
pig: 2 - 2
|
53
54
|
chicken: 5 - 5
|
54
55
|
"""
|
55
|
-
|
56
|
-
|
57
|
-
label_counts: Counter[int] = Counter()
|
58
|
-
image_counts: Counter[int] = Counter()
|
59
|
-
index_location = defaultdict(list[int])
|
60
|
-
label_per_image: list[int] = []
|
61
|
-
|
62
|
-
index2label = dict(enumerate(dataset.class_names))
|
63
|
-
|
64
|
-
for i, target in enumerate(dataset.targets):
|
65
|
-
group = target.labels.tolist()
|
56
|
+
metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
|
57
|
+
metadata_df = metadata.dataframe
|
66
58
|
|
67
|
-
|
68
|
-
|
59
|
+
# Count occurrences of each label across all images
|
60
|
+
label_counts_df = metadata_df.group_by("class_label").len()
|
61
|
+
label_counts = label_counts_df.sort("class_label")["len"].to_list()
|
69
62
|
|
70
|
-
|
71
|
-
|
63
|
+
# Count unique images per label (how many images contain each label)
|
64
|
+
image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
|
65
|
+
image_counts = image_counts_df.sort("class_label")["len"].to_list()
|
72
66
|
|
73
|
-
|
74
|
-
|
67
|
+
# Create index_location mapping (which images contain each label)
|
68
|
+
index_location: list[list[int]] = [[] for _ in range(len(metadata.class_names))]
|
69
|
+
for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
|
70
|
+
indices = row["image_index"]
|
71
|
+
index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]
|
75
72
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
index_location[item].append(i)
|
73
|
+
# Count labels per image
|
74
|
+
label_per_image_df = metadata_df.group_by("image_index").agg(pl.count().alias("label_count"))
|
75
|
+
label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()
|
80
76
|
|
81
77
|
return LabelStatsOutput(
|
82
|
-
label_counts_per_class=
|
78
|
+
label_counts_per_class=label_counts,
|
83
79
|
label_counts_per_image=label_per_image,
|
84
|
-
image_counts_per_class=
|
85
|
-
image_indices_per_class=
|
80
|
+
image_counts_per_class=image_counts,
|
81
|
+
image_indices_per_class=index_location,
|
86
82
|
image_count=len(label_per_image),
|
87
|
-
class_count=len(
|
88
|
-
label_count=sum(label_counts
|
89
|
-
class_names=
|
83
|
+
class_count=len(metadata.class_names),
|
84
|
+
label_count=sum(label_counts),
|
85
|
+
class_names=metadata.class_names,
|
90
86
|
)
|
@@ -147,7 +147,7 @@ P = ParamSpec("P")
|
|
147
147
|
R = TypeVar("R", bound=GenericOutput)
|
148
148
|
|
149
149
|
|
150
|
-
def set_metadata(fn: Callable[P, R] | None = None, *, state:
|
150
|
+
def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
|
151
151
|
"""Decorator to stamp Output classes with runtime metadata"""
|
152
152
|
|
153
153
|
if fn is None:
|
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
from dataclasses import asdict, dataclass
|
7
|
-
from typing import Any, TypeVar
|
7
|
+
from typing import Any, Mapping, Sequence, TypeVar
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import pandas as pd
|
@@ -39,7 +39,7 @@ class ToDataFrameMixin:
|
|
39
39
|
This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
|
40
40
|
"""
|
41
41
|
return pd.DataFrame(
|
42
|
-
index=self.factor_names, # type: ignore -
|
42
|
+
index=self.factor_names, # type: ignore - Sequence[str] is documented as acceptable index type
|
43
43
|
data={
|
44
44
|
"score": self.score.round(2),
|
45
45
|
"p-value": self.p_value.round(2),
|
@@ -58,7 +58,7 @@ class ParityOutput(ToDataFrameMixin, Output):
|
|
58
58
|
chi-squared score(s) of the test
|
59
59
|
p_value : NDArray[np.float64]
|
60
60
|
p-value(s) of the test
|
61
|
-
factor_names :
|
61
|
+
factor_names : Sequence[str]
|
62
62
|
Names of each metadata factor
|
63
63
|
insufficient_data: dict
|
64
64
|
Dictionary of metadata factors with less than 5 class occurrences per value
|
@@ -66,8 +66,8 @@ class ParityOutput(ToDataFrameMixin, Output):
|
|
66
66
|
|
67
67
|
score: NDArray[np.float64]
|
68
68
|
p_value: NDArray[np.float64]
|
69
|
-
factor_names:
|
70
|
-
insufficient_data:
|
69
|
+
factor_names: Sequence[str]
|
70
|
+
insufficient_data: Mapping[str, Mapping[int, Mapping[str, int]]]
|
71
71
|
|
72
72
|
|
73
73
|
@dataclass(frozen=True)
|
@@ -145,12 +145,15 @@ class CoverageOutput(Output):
|
|
145
145
|
cols = min(3, num_images)
|
146
146
|
fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
|
147
147
|
|
148
|
-
|
148
|
+
# Flatten axes using numpy array explicitly for compatibility
|
149
|
+
axs_flat = np.asarray(axs).flatten()
|
150
|
+
|
151
|
+
for image, ax in zip(images[:num_images], axs_flat):
|
149
152
|
image = channels_first_to_last(as_numpy(image))
|
150
153
|
ax.imshow(image)
|
151
154
|
ax.axis("off")
|
152
155
|
|
153
|
-
for ax in
|
156
|
+
for ax in axs_flat[num_images:]:
|
154
157
|
ax.axis("off")
|
155
158
|
|
156
159
|
fig.tight_layout()
|
@@ -187,22 +190,22 @@ class BalanceOutput(Output):
|
|
187
190
|
Estimate of inter/intra-factor mutual information
|
188
191
|
classwise : NDArray[np.float64]
|
189
192
|
Estimate of mutual information between metadata factors and individual class labels
|
190
|
-
factor_names :
|
193
|
+
factor_names : Sequence[str]
|
191
194
|
Names of each metadata factor
|
192
|
-
class_names :
|
195
|
+
class_names : Sequence[str]
|
193
196
|
List of the class labels present in the dataset
|
194
197
|
"""
|
195
198
|
|
196
199
|
balance: NDArray[np.float64]
|
197
200
|
factors: NDArray[np.float64]
|
198
201
|
classwise: NDArray[np.float64]
|
199
|
-
factor_names:
|
200
|
-
class_names:
|
202
|
+
factor_names: Sequence[str]
|
203
|
+
class_names: Sequence[str]
|
201
204
|
|
202
205
|
def plot(
|
203
206
|
self,
|
204
|
-
row_labels:
|
205
|
-
col_labels:
|
207
|
+
row_labels: Sequence[Any] | NDArray[Any] | None = None,
|
208
|
+
col_labels: Sequence[Any] | NDArray[Any] | None = None,
|
206
209
|
plot_classwise: bool = False,
|
207
210
|
) -> Figure:
|
208
211
|
"""
|
@@ -276,16 +279,16 @@ class DiversityOutput(Output):
|
|
276
279
|
:term:`Diversity` index for classes and factors
|
277
280
|
classwise : NDArray[np.double]
|
278
281
|
Classwise diversity index [n_class x n_factor]
|
279
|
-
factor_names :
|
282
|
+
factor_names : Sequence[str]
|
280
283
|
Names of each metadata factor
|
281
|
-
class_names :
|
284
|
+
class_names : Sequence[str]
|
282
285
|
Class labels for each value in the dataset
|
283
286
|
"""
|
284
287
|
|
285
288
|
diversity_index: NDArray[np.double]
|
286
289
|
classwise: NDArray[np.double]
|
287
|
-
factor_names:
|
288
|
-
class_names:
|
290
|
+
factor_names: Sequence[str]
|
291
|
+
class_names: Sequence[str]
|
289
292
|
|
290
293
|
def plot(
|
291
294
|
self,
|
@@ -333,7 +336,7 @@ class DiversityOutput(Output):
|
|
333
336
|
import matplotlib.pyplot as plt
|
334
337
|
|
335
338
|
fig, ax = plt.subplots(figsize=(8, 8))
|
336
|
-
heat_labels = ["class_labels"] + self.factor_names
|
339
|
+
heat_labels = ["class_labels"] + list(self.factor_names)
|
337
340
|
ax.bar(heat_labels, self.diversity_index)
|
338
341
|
ax.set_xlabel("Factors")
|
339
342
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
+
from typing import Sequence
|
6
7
|
|
7
8
|
import numpy as np
|
8
9
|
from numpy.typing import NDArray
|
@@ -64,7 +65,7 @@ class ClustererOutput(Output):
|
|
64
65
|
"""
|
65
66
|
return np.nonzero(self.clusters == -1)[0]
|
66
67
|
|
67
|
-
def find_duplicates(self) -> tuple[
|
68
|
+
def find_duplicates(self) -> tuple[Sequence[Sequence[int]], Sequence[Sequence[int]]]:
|
68
69
|
"""
|
69
70
|
Finds duplicate and near duplicate data based on cluster average distance
|
70
71
|
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
-
from typing import Generic, TypeVar, Union
|
6
|
+
from typing import Generic, Mapping, Sequence, TypeVar, Union
|
7
7
|
|
8
8
|
import pandas as pd
|
9
9
|
from typing_extensions import TypeAlias
|
@@ -11,13 +11,13 @@ from typing_extensions import TypeAlias
|
|
11
11
|
from dataeval.outputs._base import Output
|
12
12
|
from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
|
13
13
|
|
14
|
-
DuplicateGroup: TypeAlias =
|
15
|
-
DatasetDuplicateGroupMap: TypeAlias =
|
14
|
+
DuplicateGroup: TypeAlias = Sequence[int]
|
15
|
+
DatasetDuplicateGroupMap: TypeAlias = Mapping[int, DuplicateGroup]
|
16
16
|
TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
|
17
17
|
|
18
|
-
IndexIssueMap: TypeAlias =
|
18
|
+
IndexIssueMap: TypeAlias = Mapping[int, Mapping[str, float]]
|
19
19
|
OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
|
20
|
-
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap,
|
20
|
+
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, Sequence[IndexIssueMap])
|
21
21
|
|
22
22
|
|
23
23
|
@dataclass(frozen=True)
|
@@ -27,9 +27,9 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
|
|
27
27
|
|
28
28
|
Attributes
|
29
29
|
----------
|
30
|
-
exact :
|
30
|
+
exact : Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
|
31
31
|
Indices of images that are exact matches
|
32
|
-
near:
|
32
|
+
near: Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
|
33
33
|
Indices of images that are near matches
|
34
34
|
|
35
35
|
Notes
|
@@ -39,13 +39,13 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
|
|
39
39
|
index of the dataset, and the value is the list index groups from that dataset.
|
40
40
|
"""
|
41
41
|
|
42
|
-
exact:
|
43
|
-
near:
|
42
|
+
exact: Sequence[TIndexCollection]
|
43
|
+
near: Sequence[TIndexCollection]
|
44
44
|
|
45
45
|
|
46
46
|
def _reorganize_by_class_and_metric(
|
47
47
|
result: IndexIssueMap, lstats: LabelStatsOutput
|
48
|
-
) -> tuple[
|
48
|
+
) -> tuple[Mapping[str, Sequence[int]], Mapping[str, Mapping[str, int]]]:
|
49
49
|
"""Flip result from grouping by image to grouping by class and metric"""
|
50
50
|
metrics: dict[str, list[int]] = {}
|
51
51
|
class_wise: dict[str, dict[str, int]] = {label: {} for label in lstats.class_names}
|
@@ -61,7 +61,7 @@ def _reorganize_by_class_and_metric(
|
|
61
61
|
return metrics, class_wise
|
62
62
|
|
63
63
|
|
64
|
-
def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str, int]]) -> list[str]:
|
64
|
+
def _create_table(metrics: Mapping[str, Sequence[int]], class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[str]:
|
65
65
|
"""Create table for displaying the results"""
|
66
66
|
max_class_length = max(len(str(label)) for label in class_wise) + 2
|
67
67
|
max_total = max(len(metrics[group]) for group in metrics) + 2
|
@@ -71,7 +71,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
|
|
71
71
|
+ [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
|
72
72
|
+ [f"{'Total':<{max_total}}"]
|
73
73
|
)
|
74
|
-
table_rows:
|
74
|
+
table_rows: Sequence[str] = []
|
75
75
|
|
76
76
|
for class_cat, results in class_wise.items():
|
77
77
|
table_value = [f"{class_cat:>{max_class_length}}"]
|
@@ -86,7 +86,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
|
|
86
86
|
return [table_header] + table_rows
|
87
87
|
|
88
88
|
|
89
|
-
def _create_pandas_dataframe(class_wise:
|
89
|
+
def _create_pandas_dataframe(class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[Mapping[str, str | int]]:
|
90
90
|
"""Create data for pandas dataframe"""
|
91
91
|
data = []
|
92
92
|
for label, metrics_dict in class_wise.items():
|
@@ -105,7 +105,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
105
105
|
|
106
106
|
Attributes
|
107
107
|
----------
|
108
|
-
issues :
|
108
|
+
issues : Mapping[int, Mapping[str, float]] | Sequence[Mapping[int, Mapping[str, float]]]
|
109
109
|
Indices of image Outliers with their associated issue type and calculated values.
|
110
110
|
|
111
111
|
- For a single dataset, a dictionary containing the indices of outliers and
|
@@ -117,7 +117,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
117
117
|
issues: TIndexIssueMap
|
118
118
|
|
119
119
|
def __len__(self) -> int:
|
120
|
-
if isinstance(self.issues,
|
120
|
+
if isinstance(self.issues, Mapping):
|
121
121
|
return len(self.issues)
|
122
122
|
return sum(len(d) for d in self.issues)
|
123
123
|
|
@@ -134,7 +134,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
134
134
|
-------
|
135
135
|
str
|
136
136
|
"""
|
137
|
-
if isinstance(self.issues,
|
137
|
+
if isinstance(self.issues, Mapping):
|
138
138
|
metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
|
139
139
|
listed_table = _create_table(metrics, classwise)
|
140
140
|
table = "\n".join(listed_table)
|
@@ -165,7 +165,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
165
165
|
-----
|
166
166
|
This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
|
167
167
|
"""
|
168
|
-
if isinstance(self.issues,
|
168
|
+
if isinstance(self.issues, Mapping):
|
169
169
|
_, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
|
170
170
|
data = _create_pandas_dataframe(classwise)
|
171
171
|
df = pd.DataFrame(data)
|