dataeval 0.86.2__py3-none-any.whl → 0.86.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/data/__init__.py +0 -2
- dataeval/data/_metadata.py +26 -41
- dataeval/data/selections/_classfilter.py +2 -2
- dataeval/metadata/_utils.py +4 -2
- dataeval/metrics/bias/_balance.py +1 -1
- dataeval/metrics/bias/_parity.py +2 -1
- dataeval/metrics/stats/_labelstats.py +24 -28
- dataeval/outputs/_base.py +1 -1
- dataeval/outputs/_bias.py +21 -18
- dataeval/outputs/_estimators.py +2 -1
- dataeval/outputs/_linters.py +17 -17
- dataeval/outputs/_stats.py +20 -20
- dataeval/outputs/_utils.py +3 -2
- dataeval/outputs/_workflows.py +9 -7
- dataeval/typing.py +4 -4
- dataeval/utils/_plot.py +4 -4
- {dataeval-0.86.2.dist-info → dataeval-0.86.3.dist-info}/METADATA +1 -1
- {dataeval-0.86.2.dist-info → dataeval-0.86.3.dist-info}/RECORD +21 -22
- dataeval/data/_targets.py +0 -89
- {dataeval-0.86.2.dist-info → dataeval-0.86.3.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.86.2.dist-info → dataeval-0.86.3.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
dataeval/data/__init__.py
CHANGED
@@ -6,7 +6,6 @@ __all__ = [
|
|
6
6
|
"Metadata",
|
7
7
|
"Select",
|
8
8
|
"SplitDatasetOutput",
|
9
|
-
"Targets",
|
10
9
|
"split_dataset",
|
11
10
|
]
|
12
11
|
|
@@ -15,5 +14,4 @@ from dataeval.data._images import Images
|
|
15
14
|
from dataeval.data._metadata import Metadata
|
16
15
|
from dataeval.data._selection import Select
|
17
16
|
from dataeval.data._split import split_dataset
|
18
|
-
from dataeval.data._targets import Targets
|
19
17
|
from dataeval.outputs._utils import SplitDatasetOutput
|
dataeval/data/_metadata.py
CHANGED
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import warnings
|
6
6
|
from dataclasses import dataclass
|
7
|
-
from typing import
|
7
|
+
from typing import Any, Iterable, Literal, Mapping, Sequence
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import polars as pl
|
@@ -19,11 +19,6 @@ from dataeval.utils._array import as_numpy
|
|
19
19
|
from dataeval.utils._bin import bin_data, digitize_data
|
20
20
|
from dataeval.utils.data.metadata import merge
|
21
21
|
|
22
|
-
if TYPE_CHECKING:
|
23
|
-
from dataeval.data import Targets
|
24
|
-
else:
|
25
|
-
from dataeval.data._targets import Targets
|
26
|
-
|
27
22
|
|
28
23
|
@dataclass
|
29
24
|
class FactorInfo:
|
@@ -51,20 +46,20 @@ class Metadata:
|
|
51
46
|
|
52
47
|
def __init__(
|
53
48
|
self,
|
54
|
-
dataset: AnnotatedDataset[tuple[Any, Any,
|
49
|
+
dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
|
55
50
|
*,
|
56
51
|
continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
|
57
52
|
auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
|
58
53
|
exclude: Sequence[str] | None = None,
|
59
54
|
include: Sequence[str] | None = None,
|
60
55
|
) -> None:
|
61
|
-
self._targets: Targets
|
62
56
|
self._class_labels: NDArray[np.intp]
|
63
57
|
self._class_names: list[str]
|
64
58
|
self._image_indices: NDArray[np.intp]
|
65
59
|
self._factors: dict[str, FactorInfo]
|
66
60
|
self._dropped_factors: dict[str, list[str]]
|
67
61
|
self._dataframe: pl.DataFrame
|
62
|
+
self._raw: Sequence[Mapping[str, Any]]
|
68
63
|
|
69
64
|
self._is_structured = False
|
70
65
|
self._is_binned = False
|
@@ -80,13 +75,7 @@ class Metadata:
|
|
80
75
|
self._include = set(include or ())
|
81
76
|
|
82
77
|
@property
|
83
|
-
def
|
84
|
-
"""Target information for the dataset."""
|
85
|
-
self._structure()
|
86
|
-
return self._targets
|
87
|
-
|
88
|
-
@property
|
89
|
-
def raw(self) -> list[dict[str, Any]]:
|
78
|
+
def raw(self) -> Sequence[Mapping[str, Any]]:
|
90
79
|
"""The raw list of metadata dictionaries for the dataset."""
|
91
80
|
self._structure()
|
92
81
|
return self._raw
|
@@ -146,7 +135,7 @@ class Metadata:
|
|
146
135
|
return self._dataframe
|
147
136
|
|
148
137
|
@property
|
149
|
-
def dropped_factors(self) ->
|
138
|
+
def dropped_factors(self) -> Mapping[str, Sequence[str]]:
|
150
139
|
"""Factors that were dropped during preprocessing and the reasons why they were dropped."""
|
151
140
|
self._structure()
|
152
141
|
return self._dropped_factors
|
@@ -165,13 +154,13 @@ class Metadata:
|
|
165
154
|
)
|
166
155
|
|
167
156
|
@property
|
168
|
-
def factor_names(self) ->
|
157
|
+
def factor_names(self) -> Sequence[str]:
|
169
158
|
"""Factor names of the metadata."""
|
170
159
|
self._structure()
|
171
160
|
return list(self._factors)
|
172
161
|
|
173
162
|
@property
|
174
|
-
def factor_info(self) ->
|
163
|
+
def factor_info(self) -> Mapping[str, FactorInfo]:
|
175
164
|
"""Factor types of the metadata."""
|
176
165
|
self._bin()
|
177
166
|
return self._factors
|
@@ -192,7 +181,7 @@ class Metadata:
|
|
192
181
|
return self._class_labels
|
193
182
|
|
194
183
|
@property
|
195
|
-
def class_names(self) ->
|
184
|
+
def class_names(self) -> Sequence[str]:
|
196
185
|
"""Class names as a list of strings."""
|
197
186
|
self._structure()
|
198
187
|
return self._class_names
|
@@ -220,7 +209,7 @@ class Metadata:
|
|
220
209
|
if self._is_structured:
|
221
210
|
return
|
222
211
|
|
223
|
-
raw:
|
212
|
+
raw: Sequence[Mapping[str, Any]] = []
|
224
213
|
|
225
214
|
labels = []
|
226
215
|
bboxes = []
|
@@ -255,6 +244,14 @@ class Metadata:
|
|
255
244
|
bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
|
256
245
|
srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
|
257
246
|
|
247
|
+
index2label = self._dataset.metadata.get("index2label", {})
|
248
|
+
|
249
|
+
targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
|
250
|
+
merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
|
251
|
+
|
252
|
+
reserved = ["image_index", "class_label", "score", "box"]
|
253
|
+
factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
|
254
|
+
|
258
255
|
target_dict = {
|
259
256
|
"image_index": srcidx if srcidx is not None else np.arange(len(labels)),
|
260
257
|
"class_label": labels,
|
@@ -262,20 +259,10 @@ class Metadata:
|
|
262
259
|
"box": bboxes if bboxes is not None else [None] * len(labels),
|
263
260
|
}
|
264
261
|
|
265
|
-
self._targets = Targets(labels, scores, bboxes, srcidx)
|
266
262
|
self._raw = raw
|
267
|
-
|
268
|
-
index2label = self._dataset.metadata.get("index2label", {})
|
269
263
|
self._class_labels = labels
|
270
|
-
self._class_names = [index2label.get(i, str(i)) for i in np.unique(
|
264
|
+
self._class_names = [index2label.get(i, str(i)) for i in np.unique(labels)]
|
271
265
|
self._image_indices = target_dict["image_index"]
|
272
|
-
|
273
|
-
targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
|
274
|
-
merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
|
275
|
-
|
276
|
-
reserved = ["image_index", "class_label", "score", "box"]
|
277
|
-
factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
|
278
|
-
|
279
266
|
self._factors = dict.fromkeys(factor_dict, FactorInfo())
|
280
267
|
self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
|
281
268
|
self._dropped_factors = merged[1]
|
@@ -332,14 +319,14 @@ class Metadata:
|
|
332
319
|
df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
|
333
320
|
factor_info[col] = FactorInfo("continuous", col_dz)
|
334
321
|
else:
|
335
|
-
factor_info[col] = FactorInfo("discrete",
|
322
|
+
factor_info[col] = FactorInfo("discrete", col)
|
336
323
|
|
337
324
|
# Store the results
|
338
325
|
self._dataframe = df
|
339
326
|
self._factors.update(factor_info)
|
340
327
|
self._is_binned = True
|
341
328
|
|
342
|
-
def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) ->
|
329
|
+
def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> Sequence[str]:
|
343
330
|
"""
|
344
331
|
Get the names of factors of a specific type.
|
345
332
|
|
@@ -356,7 +343,7 @@ class Metadata:
|
|
356
343
|
self._bin()
|
357
344
|
return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
|
358
345
|
|
359
|
-
def add_factors(self, factors: Mapping[str, Any]) -> None:
|
346
|
+
def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
|
360
347
|
"""
|
361
348
|
Add additional factors to the metadata.
|
362
349
|
|
@@ -365,16 +352,15 @@ class Metadata:
|
|
365
352
|
|
366
353
|
Parameters
|
367
354
|
----------
|
368
|
-
factors : Mapping[str,
|
355
|
+
factors : Mapping[str, Array | Sequence[Any]]
|
369
356
|
Dictionary of factors to add to the metadata.
|
370
357
|
"""
|
371
358
|
self._structure()
|
372
359
|
|
373
|
-
targets = len(self.
|
360
|
+
targets = len(self.dataframe)
|
374
361
|
images = self.image_count
|
375
|
-
|
376
|
-
|
377
|
-
images_match = targets_match if images == targets else all(f == images for f in lengths.values())
|
362
|
+
targets_match = all(len(v) == targets for v in factors.values())
|
363
|
+
images_match = targets_match if images == targets else all(len(v) == images for v in factors.values())
|
378
364
|
if not targets_match and not images_match:
|
379
365
|
raise ValueError(
|
380
366
|
"The lists/arrays in the provided factors have a different length than the current metadata factors."
|
@@ -382,8 +368,7 @@ class Metadata:
|
|
382
368
|
|
383
369
|
new_columns = []
|
384
370
|
for k, v in factors.items():
|
385
|
-
|
386
|
-
data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
|
371
|
+
data = as_numpy(v)[self.image_indices]
|
387
372
|
new_columns.append(pl.Series(name=k, values=data))
|
388
373
|
self._factors[k] = FactorInfo()
|
389
374
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
|
5
|
+
from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
|
6
6
|
|
7
7
|
import numpy as np
|
8
8
|
from numpy.typing import NDArray
|
@@ -92,7 +92,7 @@ class ClassFilterSubSelection(Subselection[Any]):
|
|
92
92
|
def __init__(self, classes: Sequence[int]) -> None:
|
93
93
|
self.classes = classes
|
94
94
|
|
95
|
-
def _filter(self, d:
|
95
|
+
def _filter(self, d: Mapping[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
|
96
96
|
return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
|
97
97
|
|
98
98
|
def __call__(self, datum: _TDatum) -> _TDatum:
|
dataeval/metadata/_utils.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
__all__ = []
|
2
2
|
|
3
|
+
from typing import Sequence
|
4
|
+
|
3
5
|
from numpy.typing import NDArray
|
4
6
|
|
5
7
|
|
6
|
-
def _compare_keys(keys1:
|
8
|
+
def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
|
7
9
|
"""
|
8
10
|
Raises error when two lists are not equivalent including ordering
|
9
11
|
|
@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
|
|
24
26
|
raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")
|
25
27
|
|
26
28
|
|
27
|
-
def _validate_factors_and_data(factors:
|
29
|
+
def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
|
28
30
|
"""
|
29
31
|
Raises error when the number of factors and number of rows do not match
|
30
32
|
|
@@ -157,6 +157,6 @@ def balance(
|
|
157
157
|
classwise = classwise_mi / norm_factor
|
158
158
|
|
159
159
|
# Grabbing factor names for plotting function
|
160
|
-
factor_names = ["class_label"] + metadata.factor_names
|
160
|
+
factor_names = ["class_label"] + list(metadata.factor_names)
|
161
161
|
|
162
162
|
return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
|
dataeval/metrics/bias/_parity.py
CHANGED
@@ -259,7 +259,8 @@ def parity(metadata: Metadata) -> ParityOutput:
|
|
259
259
|
counts = np.nonzero(contingency_matrix < 5)
|
260
260
|
unique_factor_values = np.unique(col_data)
|
261
261
|
current_factor_name = metadata.factor_names[i]
|
262
|
-
for
|
262
|
+
for _factor, _class in zip(counts[0], counts[1]):
|
263
|
+
int_factor, int_class = int(_factor), int(_class)
|
263
264
|
if contingency_matrix[int_factor, int_class] > 0:
|
264
265
|
factor_category = unique_factor_values[int_factor].item()
|
265
266
|
class_name = metadata.class_names[int_class]
|
@@ -2,9 +2,10 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from collections import Counter, defaultdict
|
6
5
|
from typing import Any, Mapping, TypeVar
|
7
6
|
|
7
|
+
import polars as pl
|
8
|
+
|
8
9
|
from dataeval.data._metadata import Metadata
|
9
10
|
from dataeval.outputs import LabelStatsOutput
|
10
11
|
from dataeval.outputs._base import set_metadata
|
@@ -52,39 +53,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
|
|
52
53
|
pig: 2 - 2
|
53
54
|
chicken: 5 - 5
|
54
55
|
"""
|
55
|
-
|
56
|
-
|
57
|
-
label_counts: Counter[int] = Counter()
|
58
|
-
image_counts: Counter[int] = Counter()
|
59
|
-
index_location = defaultdict(list[int])
|
60
|
-
label_per_image: list[int] = []
|
61
|
-
|
62
|
-
index2label = dict(enumerate(dataset.class_names))
|
63
|
-
|
64
|
-
for i, target in enumerate(dataset.targets):
|
65
|
-
group = target.labels.tolist()
|
56
|
+
metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
|
57
|
+
metadata_df = metadata.dataframe
|
66
58
|
|
67
|
-
|
68
|
-
|
59
|
+
# Count occurrences of each label across all images
|
60
|
+
label_counts_df = metadata_df.group_by("class_label").len()
|
61
|
+
label_counts = label_counts_df.sort("class_label")["len"].to_list()
|
69
62
|
|
70
|
-
|
71
|
-
|
63
|
+
# Count unique images per label (how many images contain each label)
|
64
|
+
image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
|
65
|
+
image_counts = image_counts_df.sort("class_label")["len"].to_list()
|
72
66
|
|
73
|
-
|
74
|
-
|
67
|
+
# Create index_location mapping (which images contain each label)
|
68
|
+
index_location: list[list[int]] = [[] for _ in range(len(metadata.class_names))]
|
69
|
+
for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
|
70
|
+
indices = row["image_index"]
|
71
|
+
index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]
|
75
72
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
index_location[item].append(i)
|
73
|
+
# Count labels per image
|
74
|
+
label_per_image_df = metadata_df.group_by("image_index").agg(pl.count().alias("label_count"))
|
75
|
+
label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()
|
80
76
|
|
81
77
|
return LabelStatsOutput(
|
82
|
-
label_counts_per_class=
|
78
|
+
label_counts_per_class=label_counts,
|
83
79
|
label_counts_per_image=label_per_image,
|
84
|
-
image_counts_per_class=
|
85
|
-
image_indices_per_class=
|
80
|
+
image_counts_per_class=image_counts,
|
81
|
+
image_indices_per_class=index_location,
|
86
82
|
image_count=len(label_per_image),
|
87
|
-
class_count=len(
|
88
|
-
label_count=sum(label_counts
|
89
|
-
class_names=
|
83
|
+
class_count=len(metadata.class_names),
|
84
|
+
label_count=sum(label_counts),
|
85
|
+
class_names=metadata.class_names,
|
90
86
|
)
|
dataeval/outputs/_base.py
CHANGED
@@ -147,7 +147,7 @@ P = ParamSpec("P")
|
|
147
147
|
R = TypeVar("R", bound=GenericOutput)
|
148
148
|
|
149
149
|
|
150
|
-
def set_metadata(fn: Callable[P, R] | None = None, *, state:
|
150
|
+
def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
|
151
151
|
"""Decorator to stamp Output classes with runtime metadata"""
|
152
152
|
|
153
153
|
if fn is None:
|
dataeval/outputs/_bias.py
CHANGED
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
from dataclasses import asdict, dataclass
|
7
|
-
from typing import Any, TypeVar
|
7
|
+
from typing import Any, Mapping, Sequence, TypeVar
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import pandas as pd
|
@@ -39,7 +39,7 @@ class ToDataFrameMixin:
|
|
39
39
|
This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
|
40
40
|
"""
|
41
41
|
return pd.DataFrame(
|
42
|
-
index=self.factor_names, # type: ignore -
|
42
|
+
index=self.factor_names, # type: ignore - Sequence[str] is documented as acceptable index type
|
43
43
|
data={
|
44
44
|
"score": self.score.round(2),
|
45
45
|
"p-value": self.p_value.round(2),
|
@@ -58,7 +58,7 @@ class ParityOutput(ToDataFrameMixin, Output):
|
|
58
58
|
chi-squared score(s) of the test
|
59
59
|
p_value : NDArray[np.float64]
|
60
60
|
p-value(s) of the test
|
61
|
-
factor_names :
|
61
|
+
factor_names : Sequence[str]
|
62
62
|
Names of each metadata factor
|
63
63
|
insufficient_data: dict
|
64
64
|
Dictionary of metadata factors with less than 5 class occurrences per value
|
@@ -66,8 +66,8 @@ class ParityOutput(ToDataFrameMixin, Output):
|
|
66
66
|
|
67
67
|
score: NDArray[np.float64]
|
68
68
|
p_value: NDArray[np.float64]
|
69
|
-
factor_names:
|
70
|
-
insufficient_data:
|
69
|
+
factor_names: Sequence[str]
|
70
|
+
insufficient_data: Mapping[str, Mapping[int, Mapping[str, int]]]
|
71
71
|
|
72
72
|
|
73
73
|
@dataclass(frozen=True)
|
@@ -145,12 +145,15 @@ class CoverageOutput(Output):
|
|
145
145
|
cols = min(3, num_images)
|
146
146
|
fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
|
147
147
|
|
148
|
-
|
148
|
+
# Flatten axes using numpy array explicitly for compatibility
|
149
|
+
axs_flat = np.asarray(axs).flatten()
|
150
|
+
|
151
|
+
for image, ax in zip(images[:num_images], axs_flat):
|
149
152
|
image = channels_first_to_last(as_numpy(image))
|
150
153
|
ax.imshow(image)
|
151
154
|
ax.axis("off")
|
152
155
|
|
153
|
-
for ax in
|
156
|
+
for ax in axs_flat[num_images:]:
|
154
157
|
ax.axis("off")
|
155
158
|
|
156
159
|
fig.tight_layout()
|
@@ -187,22 +190,22 @@ class BalanceOutput(Output):
|
|
187
190
|
Estimate of inter/intra-factor mutual information
|
188
191
|
classwise : NDArray[np.float64]
|
189
192
|
Estimate of mutual information between metadata factors and individual class labels
|
190
|
-
factor_names :
|
193
|
+
factor_names : Sequence[str]
|
191
194
|
Names of each metadata factor
|
192
|
-
class_names :
|
195
|
+
class_names : Sequence[str]
|
193
196
|
List of the class labels present in the dataset
|
194
197
|
"""
|
195
198
|
|
196
199
|
balance: NDArray[np.float64]
|
197
200
|
factors: NDArray[np.float64]
|
198
201
|
classwise: NDArray[np.float64]
|
199
|
-
factor_names:
|
200
|
-
class_names:
|
202
|
+
factor_names: Sequence[str]
|
203
|
+
class_names: Sequence[str]
|
201
204
|
|
202
205
|
def plot(
|
203
206
|
self,
|
204
|
-
row_labels:
|
205
|
-
col_labels:
|
207
|
+
row_labels: Sequence[Any] | NDArray[Any] | None = None,
|
208
|
+
col_labels: Sequence[Any] | NDArray[Any] | None = None,
|
206
209
|
plot_classwise: bool = False,
|
207
210
|
) -> Figure:
|
208
211
|
"""
|
@@ -276,16 +279,16 @@ class DiversityOutput(Output):
|
|
276
279
|
:term:`Diversity` index for classes and factors
|
277
280
|
classwise : NDArray[np.double]
|
278
281
|
Classwise diversity index [n_class x n_factor]
|
279
|
-
factor_names :
|
282
|
+
factor_names : Sequence[str]
|
280
283
|
Names of each metadata factor
|
281
|
-
class_names :
|
284
|
+
class_names : Sequence[str]
|
282
285
|
Class labels for each value in the dataset
|
283
286
|
"""
|
284
287
|
|
285
288
|
diversity_index: NDArray[np.double]
|
286
289
|
classwise: NDArray[np.double]
|
287
|
-
factor_names:
|
288
|
-
class_names:
|
290
|
+
factor_names: Sequence[str]
|
291
|
+
class_names: Sequence[str]
|
289
292
|
|
290
293
|
def plot(
|
291
294
|
self,
|
@@ -333,7 +336,7 @@ class DiversityOutput(Output):
|
|
333
336
|
import matplotlib.pyplot as plt
|
334
337
|
|
335
338
|
fig, ax = plt.subplots(figsize=(8, 8))
|
336
|
-
heat_labels = ["class_labels"] + self.factor_names
|
339
|
+
heat_labels = ["class_labels"] + list(self.factor_names)
|
337
340
|
ax.bar(heat_labels, self.diversity_index)
|
338
341
|
ax.set_xlabel("Factors")
|
339
342
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
|
dataeval/outputs/_estimators.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
+
from typing import Sequence
|
6
7
|
|
7
8
|
import numpy as np
|
8
9
|
from numpy.typing import NDArray
|
@@ -64,7 +65,7 @@ class ClustererOutput(Output):
|
|
64
65
|
"""
|
65
66
|
return np.nonzero(self.clusters == -1)[0]
|
66
67
|
|
67
|
-
def find_duplicates(self) -> tuple[
|
68
|
+
def find_duplicates(self) -> tuple[Sequence[Sequence[int]], Sequence[Sequence[int]]]:
|
68
69
|
"""
|
69
70
|
Finds duplicate and near duplicate data based on cluster average distance
|
70
71
|
|
dataeval/outputs/_linters.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
-
from typing import Generic, TypeVar, Union
|
6
|
+
from typing import Generic, Mapping, Sequence, TypeVar, Union
|
7
7
|
|
8
8
|
import pandas as pd
|
9
9
|
from typing_extensions import TypeAlias
|
@@ -11,13 +11,13 @@ from typing_extensions import TypeAlias
|
|
11
11
|
from dataeval.outputs._base import Output
|
12
12
|
from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
|
13
13
|
|
14
|
-
DuplicateGroup: TypeAlias =
|
15
|
-
DatasetDuplicateGroupMap: TypeAlias =
|
14
|
+
DuplicateGroup: TypeAlias = Sequence[int]
|
15
|
+
DatasetDuplicateGroupMap: TypeAlias = Mapping[int, DuplicateGroup]
|
16
16
|
TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
|
17
17
|
|
18
|
-
IndexIssueMap: TypeAlias =
|
18
|
+
IndexIssueMap: TypeAlias = Mapping[int, Mapping[str, float]]
|
19
19
|
OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
|
20
|
-
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap,
|
20
|
+
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, Sequence[IndexIssueMap])
|
21
21
|
|
22
22
|
|
23
23
|
@dataclass(frozen=True)
|
@@ -27,9 +27,9 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
|
|
27
27
|
|
28
28
|
Attributes
|
29
29
|
----------
|
30
|
-
exact :
|
30
|
+
exact : Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
|
31
31
|
Indices of images that are exact matches
|
32
|
-
near:
|
32
|
+
near: Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
|
33
33
|
Indices of images that are near matches
|
34
34
|
|
35
35
|
Notes
|
@@ -39,13 +39,13 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
|
|
39
39
|
index of the dataset, and the value is the list index groups from that dataset.
|
40
40
|
"""
|
41
41
|
|
42
|
-
exact:
|
43
|
-
near:
|
42
|
+
exact: Sequence[TIndexCollection]
|
43
|
+
near: Sequence[TIndexCollection]
|
44
44
|
|
45
45
|
|
46
46
|
def _reorganize_by_class_and_metric(
|
47
47
|
result: IndexIssueMap, lstats: LabelStatsOutput
|
48
|
-
) -> tuple[
|
48
|
+
) -> tuple[Mapping[str, Sequence[int]], Mapping[str, Mapping[str, int]]]:
|
49
49
|
"""Flip result from grouping by image to grouping by class and metric"""
|
50
50
|
metrics: dict[str, list[int]] = {}
|
51
51
|
class_wise: dict[str, dict[str, int]] = {label: {} for label in lstats.class_names}
|
@@ -61,7 +61,7 @@ def _reorganize_by_class_and_metric(
|
|
61
61
|
return metrics, class_wise
|
62
62
|
|
63
63
|
|
64
|
-
def _create_table(metrics:
|
64
|
+
def _create_table(metrics: Mapping[str, Sequence[int]], class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[str]:
|
65
65
|
"""Create table for displaying the results"""
|
66
66
|
max_class_length = max(len(str(label)) for label in class_wise) + 2
|
67
67
|
max_total = max(len(metrics[group]) for group in metrics) + 2
|
@@ -71,7 +71,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
|
|
71
71
|
+ [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
|
72
72
|
+ [f"{'Total':<{max_total}}"]
|
73
73
|
)
|
74
|
-
table_rows:
|
74
|
+
table_rows: Sequence[str] = []
|
75
75
|
|
76
76
|
for class_cat, results in class_wise.items():
|
77
77
|
table_value = [f"{class_cat:>{max_class_length}}"]
|
@@ -86,7 +86,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
|
|
86
86
|
return [table_header] + table_rows
|
87
87
|
|
88
88
|
|
89
|
-
def _create_pandas_dataframe(class_wise:
|
89
|
+
def _create_pandas_dataframe(class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[Mapping[str, str | int]]:
|
90
90
|
"""Create data for pandas dataframe"""
|
91
91
|
data = []
|
92
92
|
for label, metrics_dict in class_wise.items():
|
@@ -105,7 +105,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
105
105
|
|
106
106
|
Attributes
|
107
107
|
----------
|
108
|
-
issues :
|
108
|
+
issues : Mapping[int, Mapping[str, float]] | Sequence[Mapping[int, Mapping[str, float]]]
|
109
109
|
Indices of image Outliers with their associated issue type and calculated values.
|
110
110
|
|
111
111
|
- For a single dataset, a dictionary containing the indices of outliers and
|
@@ -117,7 +117,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
117
117
|
issues: TIndexIssueMap
|
118
118
|
|
119
119
|
def __len__(self) -> int:
|
120
|
-
if isinstance(self.issues,
|
120
|
+
if isinstance(self.issues, Mapping):
|
121
121
|
return len(self.issues)
|
122
122
|
return sum(len(d) for d in self.issues)
|
123
123
|
|
@@ -134,7 +134,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
134
134
|
-------
|
135
135
|
str
|
136
136
|
"""
|
137
|
-
if isinstance(self.issues,
|
137
|
+
if isinstance(self.issues, Mapping):
|
138
138
|
metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
|
139
139
|
listed_table = _create_table(metrics, classwise)
|
140
140
|
table = "\n".join(listed_table)
|
@@ -165,7 +165,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
|
|
165
165
|
-----
|
166
166
|
This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
|
167
167
|
"""
|
168
|
-
if isinstance(self.issues,
|
168
|
+
if isinstance(self.issues, Mapping):
|
169
169
|
_, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
|
170
170
|
data = _create_pandas_dataframe(classwise)
|
171
171
|
df = pd.DataFrame(data)
|
dataeval/outputs/_stats.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
-
from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Optional, Sequence, Union
|
6
|
+
from typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional, Sequence, Union
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
import pandas as pd
|
@@ -61,7 +61,7 @@ class BaseStatsOutput(Output):
|
|
61
61
|
The number of detected objects in each image
|
62
62
|
"""
|
63
63
|
|
64
|
-
source_index:
|
64
|
+
source_index: Sequence[SourceIndex]
|
65
65
|
object_count: NDArray[np.uint16]
|
66
66
|
image_count: int
|
67
67
|
|
@@ -80,7 +80,7 @@ class BaseStatsOutput(Output):
|
|
80
80
|
self,
|
81
81
|
channel_index: OptionalRange,
|
82
82
|
channel_count: OptionalRange = None,
|
83
|
-
) ->
|
83
|
+
) -> Sequence[bool]:
|
84
84
|
"""
|
85
85
|
Boolean mask for results filtered to specified channel index and optionally the count
|
86
86
|
of the channels per image.
|
@@ -92,8 +92,8 @@ class BaseStatsOutput(Output):
|
|
92
92
|
channel_count : int | Iterable[int] | None
|
93
93
|
Optional count(s) of channels to filter for
|
94
94
|
"""
|
95
|
-
mask:
|
96
|
-
cur_mask:
|
95
|
+
mask: Sequence[bool] = []
|
96
|
+
cur_mask: Sequence[bool] = []
|
97
97
|
cur_image = 0
|
98
98
|
cur_max_channel = 0
|
99
99
|
for source_index in list(self.source_index) + [None]:
|
@@ -113,7 +113,7 @@ class BaseStatsOutput(Output):
|
|
113
113
|
|
114
114
|
def _get_channels(
|
115
115
|
self, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
|
116
|
-
) -> tuple[int,
|
116
|
+
) -> tuple[int, Sequence[bool] | None]:
|
117
117
|
source_index = self.data()[SOURCE_INDEX]
|
118
118
|
raw_channels = int(max([si.channel or 0 for si in source_index])) + 1
|
119
119
|
if isinstance(channel_index, int):
|
@@ -140,7 +140,7 @@ class BaseStatsOutput(Output):
|
|
140
140
|
self,
|
141
141
|
filter: str | Sequence[str] | None = None, # noqa: A002
|
142
142
|
exclude_constant: bool = False,
|
143
|
-
) ->
|
143
|
+
) -> Mapping[str, NDArray[Any]]:
|
144
144
|
"""
|
145
145
|
Returns all 1-dimensional data as a dictionary of numpy arrays.
|
146
146
|
|
@@ -153,7 +153,7 @@ class BaseStatsOutput(Output):
|
|
153
153
|
|
154
154
|
Returns
|
155
155
|
-------
|
156
|
-
|
156
|
+
Mapping[str, NDArray[Any]]
|
157
157
|
"""
|
158
158
|
filter_ = [filter] if isinstance(filter, str) else filter
|
159
159
|
return {
|
@@ -253,8 +253,8 @@ class HashStatsOutput(BaseStatsOutput):
|
|
253
253
|
:term:`Perception-based Hash` of the images as a hex string
|
254
254
|
"""
|
255
255
|
|
256
|
-
xxhash:
|
257
|
-
pchash:
|
256
|
+
xxhash: Sequence[str]
|
257
|
+
pchash: Sequence[str]
|
258
258
|
|
259
259
|
|
260
260
|
@dataclass(frozen=True)
|
@@ -264,15 +264,15 @@ class LabelStatsOutput(Output):
|
|
264
264
|
|
265
265
|
Attributes
|
266
266
|
----------
|
267
|
-
label_counts_per_class :
|
267
|
+
label_counts_per_class : Mapping[int, int]
|
268
268
|
Dictionary whose keys are the different label classes and
|
269
269
|
values are total counts of each class
|
270
|
-
label_counts_per_image :
|
270
|
+
label_counts_per_image : Sequence[int]
|
271
271
|
Number of labels per image
|
272
|
-
image_counts_per_class :
|
272
|
+
image_counts_per_class : Mapping[int, int]
|
273
273
|
Dictionary whose keys are the different label classes and
|
274
274
|
values are total counts of each image the class is present in
|
275
|
-
image_indices_per_class :
|
275
|
+
image_indices_per_class : Mapping[int, list]
|
276
276
|
Dictionary whose keys are the different label classes and
|
277
277
|
values are lists containing the images that have that label
|
278
278
|
image_count : int
|
@@ -281,17 +281,17 @@ class LabelStatsOutput(Output):
|
|
281
281
|
Total number of classes present
|
282
282
|
label_count : int
|
283
283
|
Total number of labels present
|
284
|
-
class_names :
|
284
|
+
class_names : Sequence[str]
|
285
285
|
"""
|
286
286
|
|
287
|
-
label_counts_per_class:
|
288
|
-
label_counts_per_image:
|
289
|
-
image_counts_per_class:
|
290
|
-
image_indices_per_class:
|
287
|
+
label_counts_per_class: Sequence[int]
|
288
|
+
label_counts_per_image: Sequence[int]
|
289
|
+
image_counts_per_class: Sequence[int]
|
290
|
+
image_indices_per_class: Sequence[Sequence[int]]
|
291
291
|
image_count: int
|
292
292
|
class_count: int
|
293
293
|
label_count: int
|
294
|
-
class_names:
|
294
|
+
class_names: Sequence[str]
|
295
295
|
|
296
296
|
def to_table(self) -> str:
|
297
297
|
"""
|
dataeval/outputs/_utils.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
|
+
from typing import Sequence
|
6
7
|
|
7
8
|
import numpy as np
|
8
9
|
from numpy.typing import NDArray
|
@@ -36,9 +37,9 @@ class SplitDatasetOutput(Output):
|
|
36
37
|
----------
|
37
38
|
test: NDArray[np.intp]
|
38
39
|
Indices for the test set
|
39
|
-
folds:
|
40
|
+
folds: Sequence[TrainValSplit]
|
40
41
|
List of train and validation split indices
|
41
42
|
"""
|
42
43
|
|
43
44
|
test: NDArray[np.intp]
|
44
|
-
folds:
|
45
|
+
folds: Sequence[TrainValSplit]
|
dataeval/outputs/_workflows.py
CHANGED
@@ -177,7 +177,9 @@ def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int) -> NDArray[Any
|
|
177
177
|
return res.x
|
178
178
|
|
179
179
|
|
180
|
-
def get_curve_params(
|
180
|
+
def get_curve_params(
|
181
|
+
measures: Mapping[str, NDArray[Any]], ranges: NDArray[Any], niter: int
|
182
|
+
) -> Mapping[str, NDArray[Any]]:
|
181
183
|
"""Calculates and aggregates parameters for both single and multi-class metrics"""
|
182
184
|
output = {}
|
183
185
|
for name, measure in measures.items():
|
@@ -208,7 +210,7 @@ class SufficiencyOutput(Output):
|
|
208
210
|
"""
|
209
211
|
|
210
212
|
steps: NDArray[np.uint32]
|
211
|
-
measures:
|
213
|
+
measures: Mapping[str, NDArray[np.float64]]
|
212
214
|
n_iter: int = 1000
|
213
215
|
|
214
216
|
def __post_init__(self) -> None:
|
@@ -220,7 +222,7 @@ class SufficiencyOutput(Output):
|
|
220
222
|
self._params = None
|
221
223
|
|
222
224
|
@property
|
223
|
-
def params(self) ->
|
225
|
+
def params(self) -> Mapping[str, NDArray[Any]]:
|
224
226
|
if self._params is None:
|
225
227
|
self._params = {}
|
226
228
|
if self.n_iter not in self._params:
|
@@ -270,7 +272,7 @@ class SufficiencyOutput(Output):
|
|
270
272
|
proj._params = self._params
|
271
273
|
return proj
|
272
274
|
|
273
|
-
def plot(self, class_names: Sequence[str] | None = None) ->
|
275
|
+
def plot(self, class_names: Sequence[str] | None = None) -> Sequence[Figure]:
|
274
276
|
"""
|
275
277
|
Plotting function for data :term:`sufficience<Sufficiency>` tasks.
|
276
278
|
|
@@ -281,7 +283,7 @@ class SufficiencyOutput(Output):
|
|
281
283
|
|
282
284
|
Returns
|
283
285
|
-------
|
284
|
-
|
286
|
+
Sequence[Figure]
|
285
287
|
List of Figures for each measure
|
286
288
|
|
287
289
|
Raises
|
@@ -325,7 +327,7 @@ class SufficiencyOutput(Output):
|
|
325
327
|
|
326
328
|
def inv_project(
|
327
329
|
self, targets: Mapping[str, ArrayLike], n_iter: int | None = None
|
328
|
-
) ->
|
330
|
+
) -> Mapping[str, NDArray[np.float64]]:
|
329
331
|
"""
|
330
332
|
Calculate training samples needed to achieve target model metric values.
|
331
333
|
|
@@ -339,7 +341,7 @@ class SufficiencyOutput(Output):
|
|
339
341
|
|
340
342
|
Returns
|
341
343
|
-------
|
342
|
-
|
344
|
+
Mapping[str, NDArray]
|
343
345
|
List of the number of training samples needed to achieve each
|
344
346
|
corresponding entry in targets
|
345
347
|
"""
|
dataeval/typing.py
CHANGED
@@ -21,7 +21,7 @@ __all__ = [
|
|
21
21
|
|
22
22
|
|
23
23
|
import sys
|
24
|
-
from typing import Any, Generic, Iterator, Protocol, TypedDict, TypeVar, runtime_checkable
|
24
|
+
from typing import Any, Generic, Iterator, Mapping, Protocol, TypedDict, TypeVar, runtime_checkable
|
25
25
|
|
26
26
|
import numpy.typing
|
27
27
|
from typing_extensions import NotRequired, ReadOnly, Required
|
@@ -159,7 +159,7 @@ class AnnotatedDataset(Dataset[_T_co], Generic[_T_co], Protocol):
|
|
159
159
|
# ========== IMAGE CLASSIFICATION DATASETS ==========
|
160
160
|
|
161
161
|
|
162
|
-
ImageClassificationDatum: TypeAlias = tuple[ArrayLike, ArrayLike,
|
162
|
+
ImageClassificationDatum: TypeAlias = tuple[ArrayLike, ArrayLike, Mapping[str, Any]]
|
163
163
|
"""
|
164
164
|
Type alias for an image classification datum tuple.
|
165
165
|
|
@@ -199,7 +199,7 @@ class ObjectDetectionTarget(Protocol):
|
|
199
199
|
def scores(self) -> ArrayLike: ...
|
200
200
|
|
201
201
|
|
202
|
-
ObjectDetectionDatum: TypeAlias = tuple[ArrayLike, ObjectDetectionTarget,
|
202
|
+
ObjectDetectionDatum: TypeAlias = tuple[ArrayLike, ObjectDetectionTarget, Mapping[str, Any]]
|
203
203
|
"""
|
204
204
|
Type alias for an object detection datum tuple.
|
205
205
|
|
@@ -240,7 +240,7 @@ class SegmentationTarget(Protocol):
|
|
240
240
|
def scores(self) -> ArrayLike: ...
|
241
241
|
|
242
242
|
|
243
|
-
SegmentationDatum: TypeAlias = tuple[ArrayLike, SegmentationTarget,
|
243
|
+
SegmentationDatum: TypeAlias = tuple[ArrayLike, SegmentationTarget, Mapping[str, Any]]
|
244
244
|
"""
|
245
245
|
Type alias for an image classification datum tuple.
|
246
246
|
|
dataeval/utils/_plot.py
CHANGED
@@ -4,7 +4,7 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
import math
|
7
|
-
from typing import Any
|
7
|
+
from typing import Any, Mapping, Sequence
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
|
@@ -134,7 +134,7 @@ def format_text(*args: str) -> str:
|
|
134
134
|
|
135
135
|
|
136
136
|
def histogram_plot(
|
137
|
-
data_dict:
|
137
|
+
data_dict: Mapping[str, Any],
|
138
138
|
log: bool = True,
|
139
139
|
xlabel: str = "values",
|
140
140
|
ylabel: str = "counts",
|
@@ -186,10 +186,10 @@ def histogram_plot(
|
|
186
186
|
|
187
187
|
|
188
188
|
def channel_histogram_plot(
|
189
|
-
data_dict:
|
189
|
+
data_dict: Mapping[str, Any],
|
190
190
|
log: bool = True,
|
191
191
|
max_channels: int = 3,
|
192
|
-
ch_mask:
|
192
|
+
ch_mask: Sequence[bool] | None = None,
|
193
193
|
xlabel: str = "values",
|
194
194
|
ylabel: str = "counts",
|
195
195
|
) -> Figure:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.86.
|
3
|
+
Version: 0.86.3
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -1,16 +1,15 @@
|
|
1
|
-
dataeval/__init__.py,sha256=
|
1
|
+
dataeval/__init__.py,sha256=Z_VUOb2gf--uAtqeXyzIPUm11noNeEj16OSfkc6H6-Y,1636
|
2
2
|
dataeval/_log.py,sha256=C7AGkIRzymvYJ0LQXtnShiy3i5Xrp8T58JzIHHguk_Q,365
|
3
3
|
dataeval/config.py,sha256=hjad0TK1UmaKQlUuxqxt64_OAUqZkHjicBf06cvTyrQ,4082
|
4
|
-
dataeval/data/__init__.py,sha256=
|
4
|
+
dataeval/data/__init__.py,sha256=wzQ6uUFLNB3VJR0a2QnRBYwEmwXT93q0WpHu7FmFW1E,486
|
5
5
|
dataeval/data/_embeddings.py,sha256=PFjpdV9bfusCB4taTIYSzx1hP8nJb_KCkZTN8kMw-Hs,12885
|
6
6
|
dataeval/data/_images.py,sha256=3d4Cv-xg5z6_LVtw1eL_QdFwzbDI1cwvPNQblkrMEMk,2622
|
7
|
-
dataeval/data/_metadata.py,sha256=
|
7
|
+
dataeval/data/_metadata.py,sha256=jEfGZhhvry7qtjU47VifL8ZO1hqXg1jntR3CztkaoWU,14462
|
8
8
|
dataeval/data/_selection.py,sha256=r06xeiyK8nTWPLyItkoPQRWZI1i6LATSue_cuEbCdc4,4463
|
9
9
|
dataeval/data/_split.py,sha256=nQABR05vxil2Qx7-uX4Fm0_DWpibskBGDJOYj_b1u3I,16737
|
10
|
-
dataeval/data/_targets.py,sha256=pXrHBwT4Pi8DauaOxDVnIMwowWWlXuvSb07ShW7O2zk,3119
|
11
10
|
dataeval/data/selections/__init__.py,sha256=2m8ZB53wXzqLcqmc6p5atO6graB6ZyiRSNJFxf11X_g,613
|
12
11
|
dataeval/data/selections/_classbalance.py,sha256=7v8ApoL3X8eCZ6fGDNTehE_bZ1loaP3TlhsJLaICVWg,1458
|
13
|
-
dataeval/data/selections/_classfilter.py,sha256=
|
12
|
+
dataeval/data/selections/_classfilter.py,sha256=bXfoYnWnAfUGsAQSlLufJeF2PfgRKekFHfBx8hv1r3w,4351
|
14
13
|
dataeval/data/selections/_indices.py,sha256=RFsR9z10aM3N0gJSfKrukFpi-LkiQGXoOwXhmOQ5cpg,630
|
15
14
|
dataeval/data/selections/_limit.py,sha256=JG4GmEiNKt3sk4PbOUbBnGGzNlyz72H-kQrt8COMm4Y,512
|
16
15
|
dataeval/data/selections/_prioritize.py,sha256=4dGUvgR7m6NGzzPU0N_bw0Xhujo8b72Wo8L4PGHbvBo,11233
|
@@ -41,14 +40,14 @@ dataeval/detectors/ood/mixin.py,sha256=0_o-1HPvgf3-Lf1MSOIfjj5UB8LTLEBGYtJJfyCCz
|
|
41
40
|
dataeval/metadata/__init__.py,sha256=XDDmJbOZBNM6pL0r6Nbu6oMRoyAh22IDkPYGndNlkZU,316
|
42
41
|
dataeval/metadata/_distance.py,sha256=AABrGoQyD13z9Fqlz3NyfX0Iow_vjBwAugIv6OSRTTE,4187
|
43
42
|
dataeval/metadata/_ood.py,sha256=lNPHouj_9WfM_uTtsaiRaPn46RcVy3YebD1c32vDj-c,8981
|
44
|
-
dataeval/metadata/_utils.py,sha256=
|
43
|
+
dataeval/metadata/_utils.py,sha256=BcGoYVfA4AkAWpInY5txOc3QBpsGf6cnnUAsHOQTJAE,1210
|
45
44
|
dataeval/metrics/__init__.py,sha256=8VC8q3HuJN3o_WN51Ae2_wXznl3RMXIvA5GYVcy7vr8,225
|
46
45
|
dataeval/metrics/bias/__init__.py,sha256=329S1_3WnWqeU4-qVcbe0fMy4lDrj9uKslWHIQf93yg,839
|
47
|
-
dataeval/metrics/bias/_balance.py,sha256=
|
46
|
+
dataeval/metrics/bias/_balance.py,sha256=L5TR8Twwodulk8xkhE-L7PR-isPGw4LusIjL3ZHIH8c,5525
|
48
47
|
dataeval/metrics/bias/_completeness.py,sha256=BysXU2Jpw33n5dl3acJFEqF3mFGiJLsfG4n5Q2fkTaY,4608
|
49
48
|
dataeval/metrics/bias/_coverage.py,sha256=PeUoOiaghUEdn6Ov8z2-am7-fnBVIPcFbJK7Ty5JObA,3647
|
50
49
|
dataeval/metrics/bias/_diversity.py,sha256=25udDKmel9IjeVT5nM4dOa1apda66QdRxBc922yuUvI,5830
|
51
|
-
dataeval/metrics/bias/_parity.py,sha256=
|
50
|
+
dataeval/metrics/bias/_parity.py,sha256=Kmzr9-NXxGzGtj6A-qUa88FTGaRyJU2xQj7tsplXJH4,11427
|
52
51
|
dataeval/metrics/estimators/__init__.py,sha256=Pnds8uIyAovt2fKqZjiHCIP_kVoBWlVllekYuK5UmmU,568
|
53
52
|
dataeval/metrics/estimators/_ber.py,sha256=C30E5LiGGTAfo31zWFYDptDg0R7CTJGJ-a60YgzSkYY,5382
|
54
53
|
dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1RxKmoKT5MOgt0,1434
|
@@ -60,22 +59,22 @@ dataeval/metrics/stats/_boxratiostats.py,sha256=ROZrlqgbowkGfCR5PJ5TL7Og40iMOdUq
|
|
60
59
|
dataeval/metrics/stats/_dimensionstats.py,sha256=EVO-BlxrZl8qrP09lwPbyWdrG1ZeDtgj4LiswDwEZ1I,2896
|
61
60
|
dataeval/metrics/stats/_hashstats.py,sha256=qa1CYRgOebkxqkALfffaPM-kJ074ZbyfpWbfOfuObSs,4758
|
62
61
|
dataeval/metrics/stats/_imagestats.py,sha256=gUPNgN5Zwzdr7WnSwbve1NXNsyxd5dy3cSnlR_7guCg,3007
|
63
|
-
dataeval/metrics/stats/_labelstats.py,sha256=
|
62
|
+
dataeval/metrics/stats/_labelstats.py,sha256=bOLH4FEBN4JZ5njdRVjEK7GUb47XBMq9eqYUgXSLmCY,3071
|
64
63
|
dataeval/metrics/stats/_pixelstats.py,sha256=5RCQh0OQkHiCkn3DgCPVxKoFfifX_FOtwsnotADSZ0I,3265
|
65
64
|
dataeval/metrics/stats/_visualstats.py,sha256=0k6bvAL_d66nQMfG7bydCOFJb7B0dhgG7fqCjVTp1sg,3707
|
66
65
|
dataeval/outputs/__init__.py,sha256=geHB5M3QOiFFaQGV4ZwDTTKpqZPvPePbqG7lzaPhaXQ,1741
|
67
|
-
dataeval/outputs/_base.py,sha256
|
68
|
-
dataeval/outputs/_bias.py,sha256=
|
66
|
+
dataeval/outputs/_base.py,sha256=-Wa0gFcBVLbfWPMZyCql7x4vGsnkLP4pecsQIeUZ2_Y,5904
|
67
|
+
dataeval/outputs/_bias.py,sha256=1OZpKncYTryjPLRHb4d6NlhE27uPT57gCob_5jtjKDI,10456
|
69
68
|
dataeval/outputs/_drift.py,sha256=rKn5vqMR6XNujgSqfHsH76oFkoGsUusquZL2Qy4Ae6Y,4581
|
70
|
-
dataeval/outputs/_estimators.py,sha256=
|
71
|
-
dataeval/outputs/_linters.py,sha256=
|
69
|
+
dataeval/outputs/_estimators.py,sha256=mh-R08CgYtmq9ffANDMYR-V4vrZnSjOjEyOMiMDZ2Ic,3091
|
70
|
+
dataeval/outputs/_linters.py,sha256=ZClITD4XY99TunS_9ABTl7eauppoUdpCZU1pCVvD0cI,6700
|
72
71
|
dataeval/outputs/_metadata.py,sha256=ffZgpX8KWURPHXpOWjbvJ2KRqWQkS2nWuIjKUzoHhMI,1710
|
73
72
|
dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
|
74
|
-
dataeval/outputs/_stats.py,sha256=
|
75
|
-
dataeval/outputs/_utils.py,sha256=
|
76
|
-
dataeval/outputs/_workflows.py,sha256=
|
73
|
+
dataeval/outputs/_stats.py,sha256=KIghl-glm9A_h1eVQDKqdTQg8o2zedltWyX4NkCsv2U,15226
|
74
|
+
dataeval/outputs/_utils.py,sha256=NfhYaGT2PZlhIs8ICKUsPWHZXjhWYDkEJqBDdqMeaOM,929
|
75
|
+
dataeval/outputs/_workflows.py,sha256=K786mOgegxVi81diUA-qpbwGEkwa8YA7Fk4ttgjJeaY,10831
|
77
76
|
dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
78
|
-
dataeval/typing.py,sha256=
|
77
|
+
dataeval/typing.py,sha256=W8rqFFkAqE5a5ar3MmB-O5gcMJqvoDKXC8Y0ggBqAKo,7216
|
79
78
|
dataeval/utils/__init__.py,sha256=hRvyUK7b3d6JBEV5u47rFcOHEcmDYqAvZQw_T5pDAWw,264
|
80
79
|
dataeval/utils/_array.py,sha256=ftX8S6HKAIUOuc1xd30VC3Pz5yUzRglDpCLisWY_tHs,5888
|
81
80
|
dataeval/utils/_bin.py,sha256=w3eJ2Szw5eapqQ0cGv731rhNgLFGW0cCz2pXo9I6CuY,7296
|
@@ -84,7 +83,7 @@ dataeval/utils/_fast_mst.py,sha256=pv42flr1Uf5RBa9qDG0YLDXWH7Mr7a9zpauO1HqZXaY,8
|
|
84
83
|
dataeval/utils/_image.py,sha256=4uxTIOYZZlRJOfNmdA3ek3no3FrLWCK5un48kStMDt8,3578
|
85
84
|
dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
|
86
85
|
dataeval/utils/_mst.py,sha256=bLmJmu_1Dtj3hC5gQp3oAiJ_7TKtEjahTqusVRRU4eI,2168
|
87
|
-
dataeval/utils/_plot.py,sha256=
|
86
|
+
dataeval/utils/_plot.py,sha256=3yn5UGL2xUeayPNws2bkvxm9ZCURsVUkpvSrwOqUE7g,7145
|
88
87
|
dataeval/utils/data/__init__.py,sha256=xGzrjrOxOP2DP1tU84AWMKPnSxFvSjM81CTlDg4rNM8,331
|
89
88
|
dataeval/utils/data/_dataset.py,sha256=CFK9h-XPN7J-iF2nXol6keMDbGm6VIweFAMAjXRUlhg,9527
|
90
89
|
dataeval/utils/data/collate.py,sha256=5egEEKhNNCGeNLChO1p6dZ4Wg6x51VEaMNHz7hEZUxI,3936
|
@@ -108,7 +107,7 @@ dataeval/utils/torch/models.py,sha256=1idpXyjrYcCBSsbxxRUOto8xr4MJNjDEqQHiIXVU5Z
|
|
108
107
|
dataeval/utils/torch/trainer.py,sha256=Oc2lK13uPGhmLYbmAqlPWyKxgG4YJFlnSXCqFHUZbdA,5528
|
109
108
|
dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
|
110
109
|
dataeval/workflows/sufficiency.py,sha256=j-R8dg4XE6a66p_oTXG2GNzgg3vGk85CTblxhFXaxog,8513
|
111
|
-
dataeval-0.86.
|
112
|
-
dataeval-0.86.
|
113
|
-
dataeval-0.86.
|
114
|
-
dataeval-0.86.
|
110
|
+
dataeval-0.86.3.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
|
111
|
+
dataeval-0.86.3.dist-info/METADATA,sha256=1zOfOabm9w57nxAWZw5InEzmqyWRRko10btPqT0h64o,5353
|
112
|
+
dataeval-0.86.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
113
|
+
dataeval-0.86.3.dist-info/RECORD,,
|
dataeval/data/_targets.py
DELETED
@@ -1,89 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from typing import Iterator
|
4
|
-
|
5
|
-
__all__ = []
|
6
|
-
|
7
|
-
from dataclasses import dataclass
|
8
|
-
|
9
|
-
import numpy as np
|
10
|
-
from numpy.typing import NDArray
|
11
|
-
|
12
|
-
|
13
|
-
def _len(arr: NDArray, dim: int) -> int:
|
14
|
-
return 0 if len(arr) == 0 else len(np.atleast_1d(arr) if dim == 1 else np.atleast_2d(arr))
|
15
|
-
|
16
|
-
|
17
|
-
@dataclass(frozen=True)
|
18
|
-
class Targets:
|
19
|
-
"""
|
20
|
-
Dataclass defining targets for image classification or object detection.
|
21
|
-
|
22
|
-
Attributes
|
23
|
-
----------
|
24
|
-
labels : NDArray[np.intp]
|
25
|
-
Labels (N,) for N images or objects
|
26
|
-
scores : NDArray[np.float32]
|
27
|
-
Probability scores (N, M) for N images of M classes or confidence score (N,) of objects
|
28
|
-
bboxes : NDArray[np.float32] | None
|
29
|
-
Bounding boxes (N, 4) for N objects in (x0, y0, x1, y1) format
|
30
|
-
source : NDArray[np.intp] | None
|
31
|
-
Source image index (N,) for N objects
|
32
|
-
size : int
|
33
|
-
Count of objects
|
34
|
-
"""
|
35
|
-
|
36
|
-
labels: NDArray[np.intp]
|
37
|
-
scores: NDArray[np.float32]
|
38
|
-
bboxes: NDArray[np.float32] | None
|
39
|
-
source: NDArray[np.intp] | None
|
40
|
-
|
41
|
-
def __post_init__(self) -> None:
|
42
|
-
if (self.bboxes is None) != (self.source is None):
|
43
|
-
raise ValueError("Either both bboxes and source must be provided or neither.")
|
44
|
-
|
45
|
-
labels = _len(self.labels, 1)
|
46
|
-
scores = _len(self.scores, 2) if self.bboxes is None else _len(self.scores, 1)
|
47
|
-
bboxes = labels if self.bboxes is None else _len(self.bboxes, 2)
|
48
|
-
source = labels if self.source is None else _len(self.source, 1)
|
49
|
-
|
50
|
-
if labels != scores or labels != bboxes or labels != source:
|
51
|
-
raise ValueError(
|
52
|
-
"Labels, scores, bboxes and source must be the same length (if provided).\n"
|
53
|
-
+ f" labels: {self.labels.shape}\n"
|
54
|
-
+ f" scores: {self.scores.shape}\n"
|
55
|
-
+ f" bboxes: {None if self.bboxes is None else self.bboxes.shape}\n"
|
56
|
-
+ f" source: {None if self.source is None else self.source.shape}\n"
|
57
|
-
)
|
58
|
-
|
59
|
-
if self.bboxes is not None and len(self.bboxes) > 0 and self.bboxes.shape[-1] != 4:
|
60
|
-
raise ValueError("Bounding boxes must be in (x0, y0, x1, y1) format.")
|
61
|
-
|
62
|
-
@property
|
63
|
-
def size(self) -> int:
|
64
|
-
return len(self.labels)
|
65
|
-
|
66
|
-
def __len__(self) -> int:
|
67
|
-
if self.source is None:
|
68
|
-
return len(self.labels)
|
69
|
-
return len(np.unique(self.source))
|
70
|
-
|
71
|
-
def __getitem__(self, idx: int, /) -> Targets:
|
72
|
-
if self.source is None or self.bboxes is None:
|
73
|
-
return Targets(
|
74
|
-
np.atleast_1d(self.labels[idx]),
|
75
|
-
np.atleast_2d(self.scores[idx]),
|
76
|
-
None,
|
77
|
-
None,
|
78
|
-
)
|
79
|
-
mask = np.where(self.source == idx, True, False)
|
80
|
-
return Targets(
|
81
|
-
np.atleast_1d(self.labels[mask]),
|
82
|
-
np.atleast_1d(self.scores[mask]),
|
83
|
-
np.atleast_2d(self.bboxes[mask]),
|
84
|
-
np.atleast_1d(self.source[mask]),
|
85
|
-
)
|
86
|
-
|
87
|
-
def __iter__(self) -> Iterator[Targets]:
|
88
|
-
for i in range(len(self.labels)) if self.source is None else np.unique(self.source):
|
89
|
-
yield self[i]
|
File without changes
|
File without changes
|