dataeval 0.86.2__py3-none-any.whl → 0.86.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/__init__.py CHANGED
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
8
8
  from __future__ import annotations
9
9
 
10
10
  __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
11
- __version__ = "0.86.2"
11
+ __version__ = "0.86.4"
12
12
 
13
13
  import logging
14
14
 
dataeval/data/__init__.py CHANGED
@@ -6,7 +6,6 @@ __all__ = [
6
6
  "Metadata",
7
7
  "Select",
8
8
  "SplitDatasetOutput",
9
- "Targets",
10
9
  "split_dataset",
11
10
  ]
12
11
 
@@ -15,5 +14,4 @@ from dataeval.data._images import Images
15
14
  from dataeval.data._metadata import Metadata
16
15
  from dataeval.data._selection import Select
17
16
  from dataeval.data._split import split_dataset
18
- from dataeval.data._targets import Targets
19
17
  from dataeval.outputs._utils import SplitDatasetOutput
dataeval/data/_images.py CHANGED
@@ -4,6 +4,8 @@ __all__ = []
4
4
 
5
5
  from typing import TYPE_CHECKING, Any, Generic, Iterator, Sequence, TypeVar, cast, overload
6
6
 
7
+ import numpy as np
8
+
7
9
  from dataeval.typing import Array, ArrayLike, Dataset
8
10
  from dataeval.utils._array import as_numpy, channels_first_to_last
9
11
 
@@ -58,7 +60,7 @@ class Images(Generic[T]):
58
60
  num_images = len(indices)
59
61
  num_rows = (num_images + images_per_row - 1) // images_per_row
60
62
  fig, axes = plt.subplots(num_rows, images_per_row, figsize=figsize)
61
- for i, ax in enumerate(axes.flatten()):
63
+ for i, ax in enumerate(np.asarray(axes).flatten()):
62
64
  image = channels_first_to_last(as_numpy(self[i]))
63
65
  ax.imshow(image)
64
66
  ax.axis("off")
@@ -4,7 +4,7 @@ __all__ = []
4
4
 
5
5
  import warnings
6
6
  from dataclasses import dataclass
7
- from typing import TYPE_CHECKING, Any, Iterable, Literal, Mapping, Sequence, Sized
7
+ from typing import Any, Iterable, Literal, Mapping, Sequence
8
8
 
9
9
  import numpy as np
10
10
  import polars as pl
@@ -19,10 +19,9 @@ from dataeval.utils._array import as_numpy
19
19
  from dataeval.utils._bin import bin_data, digitize_data
20
20
  from dataeval.utils.data.metadata import merge
21
21
 
22
- if TYPE_CHECKING:
23
- from dataeval.data import Targets
24
- else:
25
- from dataeval.data._targets import Targets
22
+
23
+ def _binned(name: str) -> str:
24
+ return f"{name}[]"
26
25
 
27
26
 
28
27
  @dataclass
@@ -51,20 +50,20 @@ class Metadata:
51
50
 
52
51
  def __init__(
53
52
  self,
54
- dataset: AnnotatedDataset[tuple[Any, Any, dict[str, Any]]],
53
+ dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
55
54
  *,
56
55
  continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
57
56
  auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
58
57
  exclude: Sequence[str] | None = None,
59
58
  include: Sequence[str] | None = None,
60
59
  ) -> None:
61
- self._targets: Targets
62
60
  self._class_labels: NDArray[np.intp]
63
61
  self._class_names: list[str]
64
62
  self._image_indices: NDArray[np.intp]
65
63
  self._factors: dict[str, FactorInfo]
66
64
  self._dropped_factors: dict[str, list[str]]
67
65
  self._dataframe: pl.DataFrame
66
+ self._raw: Sequence[Mapping[str, Any]]
68
67
 
69
68
  self._is_structured = False
70
69
  self._is_binned = False
@@ -80,13 +79,7 @@ class Metadata:
80
79
  self._include = set(include or ())
81
80
 
82
81
  @property
83
- def targets(self) -> Targets:
84
- """Target information for the dataset."""
85
- self._structure()
86
- return self._targets
87
-
88
- @property
89
- def raw(self) -> list[dict[str, Any]]:
82
+ def raw(self) -> Sequence[Mapping[str, Any]]:
90
83
  """The raw list of metadata dictionaries for the dataset."""
91
84
  self._structure()
92
85
  return self._raw
@@ -146,7 +139,7 @@ class Metadata:
146
139
  return self._dataframe
147
140
 
148
141
  @property
149
- def dropped_factors(self) -> dict[str, list[str]]:
142
+ def dropped_factors(self) -> Mapping[str, Sequence[str]]:
150
143
  """Factors that were dropped during preprocessing and the reasons why they were dropped."""
151
144
  self._structure()
152
145
  return self._dropped_factors
@@ -165,16 +158,16 @@ class Metadata:
165
158
  )
166
159
 
167
160
  @property
168
- def factor_names(self) -> list[str]:
161
+ def factor_names(self) -> Sequence[str]:
169
162
  """Factor names of the metadata."""
170
163
  self._structure()
171
- return list(self._factors)
164
+ return list(filter(self._filter, self._factors))
172
165
 
173
166
  @property
174
- def factor_info(self) -> dict[str, FactorInfo]:
167
+ def factor_info(self) -> Mapping[str, FactorInfo]:
175
168
  """Factor types of the metadata."""
176
169
  self._bin()
177
- return self._factors
170
+ return dict(filter(self._filter, self._factors.items()))
178
171
 
179
172
  @property
180
173
  def factor_data(self) -> NDArray[Any]:
@@ -192,7 +185,7 @@ class Metadata:
192
185
  return self._class_labels
193
186
 
194
187
  @property
195
- def class_names(self) -> list[str]:
188
+ def class_names(self) -> Sequence[str]:
196
189
  """Class names as a list of strings."""
197
190
  self._structure()
198
191
  return self._class_names
@@ -206,13 +199,17 @@ class Metadata:
206
199
  @property
207
200
  def image_count(self) -> int:
208
201
  self._bin()
209
- return int(self._image_indices.max() + 1)
202
+ return 0 if self._image_indices.size == 0 else int(self._image_indices.max() + 1)
203
+
204
+ def _filter(self, factor: str | tuple[str, Any]) -> bool:
205
+ factor = factor[0] if isinstance(factor, tuple) else factor
206
+ return factor in self.include if self.include else factor not in self.exclude
210
207
 
211
208
  def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
212
209
  if self._is_binned:
213
210
  columns = self._dataframe.columns
214
- for col in (col for col in cols or columns if f"{col}[|]" in columns):
215
- self._dataframe.drop_in_place(f"{col}[|]")
211
+ for col in (col for col in cols or columns if _binned(col) in columns):
212
+ self._dataframe.drop_in_place(_binned(col))
216
213
  self._factors[col] = FactorInfo()
217
214
  self._is_binned = False
218
215
 
@@ -220,7 +217,7 @@ class Metadata:
220
217
  if self._is_structured:
221
218
  return
222
219
 
223
- raw: list[dict[str, Any]] = []
220
+ raw: Sequence[Mapping[str, Any]] = []
224
221
 
225
222
  labels = []
226
223
  bboxes = []
@@ -255,6 +252,14 @@ class Metadata:
255
252
  bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
256
253
  srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
257
254
 
255
+ index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
256
+
257
+ targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
258
+ merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
259
+
260
+ reserved = ["image_index", "class_label", "score", "box"]
261
+ factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
262
+
258
263
  target_dict = {
259
264
  "image_index": srcidx if srcidx is not None else np.arange(len(labels)),
260
265
  "class_label": labels,
@@ -262,20 +267,11 @@ class Metadata:
262
267
  "box": bboxes if bboxes is not None else [None] * len(labels),
263
268
  }
264
269
 
265
- self._targets = Targets(labels, scores, bboxes, srcidx)
266
270
  self._raw = raw
267
-
268
- index2label = self._dataset.metadata.get("index2label", {})
271
+ self._index2label = index2label
269
272
  self._class_labels = labels
270
- self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
273
+ self._class_names = list(index2label.values())
271
274
  self._image_indices = target_dict["image_index"]
272
-
273
- targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
274
- merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
275
-
276
- reserved = ["image_index", "class_label", "score", "box"]
277
- factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
278
-
279
275
  self._factors = dict.fromkeys(factor_dict, FactorInfo())
280
276
  self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
281
277
  self._dropped_factors = merged[1]
@@ -302,10 +298,10 @@ class Metadata:
302
298
  )
303
299
 
304
300
  column_set = set(df.columns)
305
- for col in (col for col in self.factor_names if f"{col}[|]" not in column_set):
301
+ for col in (col for col in self.factor_names if _binned(col) not in column_set):
306
302
  # Get data as numpy array for processing
307
303
  data = df[col].to_numpy()
308
- col_dz = f"{col}[|]"
304
+ col_dz = _binned(col)
309
305
  if col in factor_bins:
310
306
  # User provided binning
311
307
  bins = factor_bins[col]
@@ -332,31 +328,14 @@ class Metadata:
332
328
  df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
333
329
  factor_info[col] = FactorInfo("continuous", col_dz)
334
330
  else:
335
- factor_info[col] = FactorInfo("discrete", col_dz)
331
+ factor_info[col] = FactorInfo("discrete", col)
336
332
 
337
333
  # Store the results
338
334
  self._dataframe = df
339
335
  self._factors.update(factor_info)
340
336
  self._is_binned = True
341
337
 
342
- def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> list[str]:
343
- """
344
- Get the names of factors of a specific type.
345
-
346
- Parameters
347
- ----------
348
- factor_type : Literal["categorical", "continuous", "discrete"]
349
- The type of factors to retrieve.
350
-
351
- Returns
352
- -------
353
- list[str]
354
- List of factor names of the specified type.
355
- """
356
- self._bin()
357
- return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
358
-
359
- def add_factors(self, factors: Mapping[str, Any]) -> None:
338
+ def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
360
339
  """
361
340
  Add additional factors to the metadata.
362
341
 
@@ -365,16 +344,15 @@ class Metadata:
365
344
 
366
345
  Parameters
367
346
  ----------
368
- factors : Mapping[str, ArrayLike]
347
+ factors : Mapping[str, Array | Sequence[Any]]
369
348
  Dictionary of factors to add to the metadata.
370
349
  """
371
350
  self._structure()
372
351
 
373
- targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
352
+ targets = len(self.dataframe)
374
353
  images = self.image_count
375
- lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
376
- targets_match = all(f == targets for f in lengths.values())
377
- images_match = targets_match if images == targets else all(f == images for f in lengths.values())
354
+ targets_match = all(len(v) == targets for v in factors.values())
355
+ images_match = targets_match if images == targets else all(len(v) == images for v in factors.values())
378
356
  if not targets_match and not images_match:
379
357
  raise ValueError(
380
358
  "The lists/arrays in the provided factors have a different length than the current metadata factors."
@@ -382,8 +360,7 @@ class Metadata:
382
360
 
383
361
  new_columns = []
384
362
  for k, v in factors.items():
385
- v = as_numpy(v)
386
- data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
363
+ data = as_numpy(v)[self.image_indices]
387
364
  new_columns.append(pl.Series(name=k, values=data))
388
365
  self._factors[k] = FactorInfo()
389
366
 
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
5
+ from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
6
6
 
7
7
  import numpy as np
8
8
  from numpy.typing import NDArray
@@ -92,7 +92,7 @@ class ClassFilterSubSelection(Subselection[Any]):
92
92
  def __init__(self, classes: Sequence[int]) -> None:
93
93
  self.classes = classes
94
94
 
95
- def _filter(self, d: dict[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
95
+ def _filter(self, d: Mapping[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
96
96
  return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
97
97
 
98
98
  def __call__(self, datum: _TDatum) -> _TDatum:
@@ -81,7 +81,7 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDista
81
81
  """
82
82
 
83
83
  _compare_keys(metadata1.factor_names, metadata2.factor_names)
84
- cont_fnames = metadata1.get_factors_by_type("continuous")
84
+ cont_fnames = [name for name, info in metadata1.factor_info.items() if info.factor_type == "continuous"]
85
85
 
86
86
  if not cont_fnames:
87
87
  return MetadataDistanceOutput({})
@@ -1,9 +1,11 @@
1
1
  __all__ = []
2
2
 
3
+ from typing import Sequence
4
+
3
5
  from numpy.typing import NDArray
4
6
 
5
7
 
6
- def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
8
+ def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
7
9
  """
8
10
  Raises error when two lists are not equivalent including ordering
9
11
 
@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
24
26
  raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")
25
27
 
26
28
 
27
- def _validate_factors_and_data(factors: list[str], data: NDArray) -> None:
29
+ def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
28
30
  """
29
31
  Raises error when the number of factors and number of rows do not match
30
32
 
@@ -99,9 +99,10 @@ def balance(
99
99
  factor_types = {"class_label": "categorical"} | {k: v.factor_type for k, v in metadata.factor_info.items()}
100
100
  is_discrete = [factor_type != "continuous" for factor_type in factor_types.values()]
101
101
  num_factors = len(factor_types)
102
+ class_labels = metadata.class_labels
102
103
 
103
104
  mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
104
- data = np.hstack((metadata.class_labels[:, np.newaxis], data))
105
+ data = np.hstack((class_labels[:, np.newaxis], data))
105
106
 
106
107
  for idx, factor_type in enumerate(factor_types.values()):
107
108
  if factor_type != "continuous":
@@ -132,12 +133,12 @@ def balance(
132
133
  factors = nmi[1:, 1:]
133
134
 
134
135
  # assume class is a factor
135
- num_classes = len(metadata.class_names)
136
+ u_classes = np.unique(class_labels)
137
+ num_classes = len(u_classes)
136
138
  classwise_mi = np.full((num_classes, num_factors), np.nan, dtype=np.float32)
137
139
 
138
140
  # classwise targets
139
- classes = np.unique(metadata.class_labels)
140
- tgt_bin = data[:, 0][:, None] == classes
141
+ tgt_bin = data[:, 0][:, None] == u_classes
141
142
 
142
143
  # classification MI for discrete/categorical features
143
144
  for idx in range(num_classes):
@@ -157,6 +158,6 @@ def balance(
157
158
  classwise = classwise_mi / norm_factor
158
159
 
159
160
  # Grabbing factor names for plotting function
160
- factor_names = ["class_label"] + metadata.factor_names
161
+ factor_names = ["class_label"] + list(metadata.factor_names)
161
162
 
162
163
  return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
@@ -259,7 +259,8 @@ def parity(metadata: Metadata) -> ParityOutput:
259
259
  counts = np.nonzero(contingency_matrix < 5)
260
260
  unique_factor_values = np.unique(col_data)
261
261
  current_factor_name = metadata.factor_names[i]
262
- for int_factor, int_class in zip(counts[0], counts[1]):
262
+ for _factor, _class in zip(counts[0], counts[1]):
263
+ int_factor, int_class = int(_factor), int(_class)
263
264
  if contingency_matrix[int_factor, int_class] > 0:
264
265
  factor_category = unique_factor_values[int_factor].item()
265
266
  class_name = metadata.class_names[int_class]
@@ -2,8 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from collections import Counter, defaultdict
6
- from typing import Any, Mapping, TypeVar
5
+ from typing import Any, TypeVar
6
+
7
+ import polars as pl
7
8
 
8
9
  from dataeval.data._metadata import Metadata
9
10
  from dataeval.outputs import LabelStatsOutput
@@ -13,10 +14,6 @@ from dataeval.typing import AnnotatedDataset
13
14
  TValue = TypeVar("TValue")
14
15
 
15
16
 
16
- def _sort_to_list(d: Mapping[int, TValue]) -> list[TValue]:
17
- return [t[1] for t in sorted(d.items())]
18
-
19
-
20
17
  @set_metadata
21
18
  def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
22
19
  """
@@ -52,39 +49,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
52
49
  pig: 2 - 2
53
50
  chicken: 5 - 5
54
51
  """
55
- dataset = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
56
-
57
- label_counts: Counter[int] = Counter()
58
- image_counts: Counter[int] = Counter()
59
- index_location = defaultdict(list[int])
60
- label_per_image: list[int] = []
61
-
62
- index2label = dict(enumerate(dataset.class_names))
63
-
64
- for i, target in enumerate(dataset.targets):
65
- group = target.labels.tolist()
52
+ metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
53
+ metadata_df = metadata.dataframe
66
54
 
67
- # Count occurrences of each label in all sublists
68
- label_counts.update(group)
55
+ # Count occurrences of each label across all images
56
+ label_counts_df = metadata_df.group_by("class_label").len()
57
+ label_counts = dict(zip(label_counts_df["class_label"], label_counts_df["len"]))
69
58
 
70
- # Get the number of labels per image
71
- label_per_image.append(len(group))
59
+ # Count unique images per label (how many images contain each label)
60
+ image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
61
+ image_counts = dict(zip(image_counts_df["class_label"], image_counts_df["len"]))
72
62
 
73
- # Create a set of unique items in the current sublist
74
- unique_items: set[int] = set(group)
63
+ # Create index_location mapping (which images contain each label)
64
+ index_location: dict[int, list[int]] = {}
65
+ for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
66
+ indices = row["image_index"]
67
+ index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]
75
68
 
76
- # Update image counts and index locations
77
- image_counts.update(unique_items)
78
- for item in unique_items:
79
- index_location[item].append(i)
69
+ # Count labels per image
70
+ label_per_image_df = metadata_df.group_by("image_index").agg(pl.len().alias("label_count"))
71
+ label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()
80
72
 
81
73
  return LabelStatsOutput(
82
- label_counts_per_class=_sort_to_list(label_counts),
74
+ label_counts_per_class=label_counts,
83
75
  label_counts_per_image=label_per_image,
84
- image_counts_per_class=_sort_to_list(image_counts),
85
- image_indices_per_class=_sort_to_list(index_location),
76
+ image_counts_per_class=image_counts,
77
+ image_indices_per_class=index_location,
86
78
  image_count=len(label_per_image),
87
- class_count=len(label_counts),
79
+ class_count=len(metadata.class_names),
88
80
  label_count=sum(label_counts.values()),
89
- class_names=list(index2label.values()),
81
+ class_names=metadata.class_names,
90
82
  )
dataeval/outputs/_base.py CHANGED
@@ -147,7 +147,7 @@ P = ParamSpec("P")
147
147
  R = TypeVar("R", bound=GenericOutput)
148
148
 
149
149
 
150
- def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:
150
+ def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
151
151
  """Decorator to stamp Output classes with runtime metadata"""
152
152
 
153
153
  if fn is None:
dataeval/outputs/_bias.py CHANGED
@@ -4,7 +4,7 @@ __all__ = []
4
4
 
5
5
  import contextlib
6
6
  from dataclasses import asdict, dataclass
7
- from typing import Any, TypeVar
7
+ from typing import Any, Mapping, Sequence, TypeVar
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
@@ -39,7 +39,7 @@ class ToDataFrameMixin:
39
39
  This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
40
40
  """
41
41
  return pd.DataFrame(
42
- index=self.factor_names, # type: ignore - list[str] is documented as acceptable index type
42
+ index=self.factor_names, # type: ignore - Sequence[str] is documented as acceptable index type
43
43
  data={
44
44
  "score": self.score.round(2),
45
45
  "p-value": self.p_value.round(2),
@@ -58,7 +58,7 @@ class ParityOutput(ToDataFrameMixin, Output):
58
58
  chi-squared score(s) of the test
59
59
  p_value : NDArray[np.float64]
60
60
  p-value(s) of the test
61
- factor_names : list[str]
61
+ factor_names : Sequence[str]
62
62
  Names of each metadata factor
63
63
  insufficient_data: dict
64
64
  Dictionary of metadata factors with less than 5 class occurrences per value
@@ -66,8 +66,8 @@ class ParityOutput(ToDataFrameMixin, Output):
66
66
 
67
67
  score: NDArray[np.float64]
68
68
  p_value: NDArray[np.float64]
69
- factor_names: list[str]
70
- insufficient_data: dict[str, dict[int, dict[str, int]]]
69
+ factor_names: Sequence[str]
70
+ insufficient_data: Mapping[str, Mapping[int, Mapping[str, int]]]
71
71
 
72
72
 
73
73
  @dataclass(frozen=True)
@@ -145,12 +145,15 @@ class CoverageOutput(Output):
145
145
  cols = min(3, num_images)
146
146
  fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
147
147
 
148
- for image, ax in zip(images[:num_images], axs.flat):
148
+ # Flatten axes using numpy array explicitly for compatibility
149
+ axs_flat = np.asarray(axs).flatten()
150
+
151
+ for image, ax in zip(images[:num_images], axs_flat):
149
152
  image = channels_first_to_last(as_numpy(image))
150
153
  ax.imshow(image)
151
154
  ax.axis("off")
152
155
 
153
- for ax in axs.flat[num_images:]:
156
+ for ax in axs_flat[num_images:]:
154
157
  ax.axis("off")
155
158
 
156
159
  fig.tight_layout()
@@ -187,22 +190,22 @@ class BalanceOutput(Output):
187
190
  Estimate of inter/intra-factor mutual information
188
191
  classwise : NDArray[np.float64]
189
192
  Estimate of mutual information between metadata factors and individual class labels
190
- factor_names : list[str]
193
+ factor_names : Sequence[str]
191
194
  Names of each metadata factor
192
- class_names : list[str]
195
+ class_names : Sequence[str]
193
196
  List of the class labels present in the dataset
194
197
  """
195
198
 
196
199
  balance: NDArray[np.float64]
197
200
  factors: NDArray[np.float64]
198
201
  classwise: NDArray[np.float64]
199
- factor_names: list[str]
200
- class_names: list[str]
202
+ factor_names: Sequence[str]
203
+ class_names: Sequence[str]
201
204
 
202
205
  def plot(
203
206
  self,
204
- row_labels: list[Any] | NDArray[Any] | None = None,
205
- col_labels: list[Any] | NDArray[Any] | None = None,
207
+ row_labels: Sequence[Any] | NDArray[Any] | None = None,
208
+ col_labels: Sequence[Any] | NDArray[Any] | None = None,
206
209
  plot_classwise: bool = False,
207
210
  ) -> Figure:
208
211
  """
@@ -276,16 +279,16 @@ class DiversityOutput(Output):
276
279
  :term:`Diversity` index for classes and factors
277
280
  classwise : NDArray[np.double]
278
281
  Classwise diversity index [n_class x n_factor]
279
- factor_names : list[str]
282
+ factor_names : Sequence[str]
280
283
  Names of each metadata factor
281
- class_names : list[str]
284
+ class_names : Sequence[str]
282
285
  Class labels for each value in the dataset
283
286
  """
284
287
 
285
288
  diversity_index: NDArray[np.double]
286
289
  classwise: NDArray[np.double]
287
- factor_names: list[str]
288
- class_names: list[str]
290
+ factor_names: Sequence[str]
291
+ class_names: Sequence[str]
289
292
 
290
293
  def plot(
291
294
  self,
@@ -333,7 +336,7 @@ class DiversityOutput(Output):
333
336
  import matplotlib.pyplot as plt
334
337
 
335
338
  fig, ax = plt.subplots(figsize=(8, 8))
336
- heat_labels = ["class_labels"] + self.factor_names
339
+ heat_labels = ["class_labels"] + list(self.factor_names)
337
340
  ax.bar(heat_labels, self.diversity_index)
338
341
  ax.set_xlabel("Factors")
339
342
  plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
+ from typing import Sequence
6
7
 
7
8
  import numpy as np
8
9
  from numpy.typing import NDArray
@@ -64,7 +65,7 @@ class ClustererOutput(Output):
64
65
  """
65
66
  return np.nonzero(self.clusters == -1)[0]
66
67
 
67
- def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
68
+ def find_duplicates(self) -> tuple[Sequence[Sequence[int]], Sequence[Sequence[int]]]:
68
69
  """
69
70
  Finds duplicate and near duplicate data based on cluster average distance
70
71
 
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
- from typing import Generic, TypeVar, Union
6
+ from typing import Generic, Mapping, Sequence, TypeVar, Union
7
7
 
8
8
  import pandas as pd
9
9
  from typing_extensions import TypeAlias
@@ -11,13 +11,13 @@ from typing_extensions import TypeAlias
11
11
  from dataeval.outputs._base import Output
12
12
  from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
13
13
 
14
- DuplicateGroup: TypeAlias = list[int]
15
- DatasetDuplicateGroupMap: TypeAlias = dict[int, DuplicateGroup]
14
+ DuplicateGroup: TypeAlias = Sequence[int]
15
+ DatasetDuplicateGroupMap: TypeAlias = Mapping[int, DuplicateGroup]
16
16
  TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
17
17
 
18
- IndexIssueMap: TypeAlias = dict[int, dict[str, float]]
18
+ IndexIssueMap: TypeAlias = Mapping[int, Mapping[str, float]]
19
19
  OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
20
- TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
20
+ TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, Sequence[IndexIssueMap])
21
21
 
22
22
 
23
23
  @dataclass(frozen=True)
@@ -27,9 +27,9 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
27
27
 
28
28
  Attributes
29
29
  ----------
30
- exact : list[list[int] | dict[int, list[int]]]
30
+ exact : Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
31
31
  Indices of images that are exact matches
32
- near: list[list[int] | dict[int, list[int]]]
32
+ near: Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
33
33
  Indices of images that are near matches
34
34
 
35
35
  Notes
@@ -39,13 +39,13 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
39
39
  index of the dataset, and the value is the list index groups from that dataset.
40
40
  """
41
41
 
42
- exact: list[TIndexCollection]
43
- near: list[TIndexCollection]
42
+ exact: Sequence[TIndexCollection]
43
+ near: Sequence[TIndexCollection]
44
44
 
45
45
 
46
46
  def _reorganize_by_class_and_metric(
47
47
  result: IndexIssueMap, lstats: LabelStatsOutput
48
- ) -> tuple[dict[str, list[int]], dict[str, dict[str, int]]]:
48
+ ) -> tuple[Mapping[str, Sequence[int]], Mapping[str, Mapping[str, int]]]:
49
49
  """Flip result from grouping by image to grouping by class and metric"""
50
50
  metrics: dict[str, list[int]] = {}
51
51
  class_wise: dict[str, dict[str, int]] = {label: {} for label in lstats.class_names}
@@ -54,14 +54,14 @@ def _reorganize_by_class_and_metric(
54
54
  for img, group in result.items():
55
55
  for extreme in group:
56
56
  metrics.setdefault(extreme, []).append(img)
57
- for i, images in enumerate(lstats.image_indices_per_class):
57
+ for i, images in lstats.image_indices_per_class.items():
58
58
  if img in images:
59
59
  class_wise[lstats.class_names[i]][extreme] = class_wise[lstats.class_names[i]].get(extreme, 0) + 1
60
60
 
61
61
  return metrics, class_wise
62
62
 
63
63
 
64
- def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str, int]]) -> list[str]:
64
+ def _create_table(metrics: Mapping[str, Sequence[int]], class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[str]:
65
65
  """Create table for displaying the results"""
66
66
  max_class_length = max(len(str(label)) for label in class_wise) + 2
67
67
  max_total = max(len(metrics[group]) for group in metrics) + 2
@@ -71,7 +71,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
71
71
  + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
72
72
  + [f"{'Total':<{max_total}}"]
73
73
  )
74
- table_rows: list[str] = []
74
+ table_rows: Sequence[str] = []
75
75
 
76
76
  for class_cat, results in class_wise.items():
77
77
  table_value = [f"{class_cat:>{max_class_length}}"]
@@ -86,7 +86,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
86
86
  return [table_header] + table_rows
87
87
 
88
88
 
89
- def _create_pandas_dataframe(class_wise: dict[str, dict[str, int]]) -> list[dict[str, str | int]]:
89
+ def _create_pandas_dataframe(class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[Mapping[str, str | int]]:
90
90
  """Create data for pandas dataframe"""
91
91
  data = []
92
92
  for label, metrics_dict in class_wise.items():
@@ -105,7 +105,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
105
105
 
106
106
  Attributes
107
107
  ----------
108
- issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
108
+ issues : Mapping[int, Mapping[str, float]] | Sequence[Mapping[int, Mapping[str, float]]]
109
109
  Indices of image Outliers with their associated issue type and calculated values.
110
110
 
111
111
  - For a single dataset, a dictionary containing the indices of outliers and
@@ -117,7 +117,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
117
117
  issues: TIndexIssueMap
118
118
 
119
119
  def __len__(self) -> int:
120
- if isinstance(self.issues, dict):
120
+ if isinstance(self.issues, Mapping):
121
121
  return len(self.issues)
122
122
  return sum(len(d) for d in self.issues)
123
123
 
@@ -134,7 +134,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
134
134
  -------
135
135
  str
136
136
  """
137
- if isinstance(self.issues, dict):
137
+ if isinstance(self.issues, Mapping):
138
138
  metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
139
139
  listed_table = _create_table(metrics, classwise)
140
140
  table = "\n".join(listed_table)
@@ -165,7 +165,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
165
165
  -----
166
166
  This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
167
167
  """
168
- if isinstance(self.issues, dict):
168
+ if isinstance(self.issues, Mapping):
169
169
  _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
170
170
  data = _create_pandas_dataframe(classwise)
171
171
  df = pd.DataFrame(data)
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
- from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Optional, Sequence, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional, Sequence, Union
7
7
 
8
8
  import numpy as np
9
9
  import pandas as pd
@@ -61,7 +61,7 @@ class BaseStatsOutput(Output):
61
61
  The number of detected objects in each image
62
62
  """
63
63
 
64
- source_index: list[SourceIndex]
64
+ source_index: Sequence[SourceIndex]
65
65
  object_count: NDArray[np.uint16]
66
66
  image_count: int
67
67
 
@@ -80,7 +80,7 @@ class BaseStatsOutput(Output):
80
80
  self,
81
81
  channel_index: OptionalRange,
82
82
  channel_count: OptionalRange = None,
83
- ) -> list[bool]:
83
+ ) -> Sequence[bool]:
84
84
  """
85
85
  Boolean mask for results filtered to specified channel index and optionally the count
86
86
  of the channels per image.
@@ -92,8 +92,8 @@ class BaseStatsOutput(Output):
92
92
  channel_count : int | Iterable[int] | None
93
93
  Optional count(s) of channels to filter for
94
94
  """
95
- mask: list[bool] = []
96
- cur_mask: list[bool] = []
95
+ mask: Sequence[bool] = []
96
+ cur_mask: Sequence[bool] = []
97
97
  cur_image = 0
98
98
  cur_max_channel = 0
99
99
  for source_index in list(self.source_index) + [None]:
@@ -113,7 +113,7 @@ class BaseStatsOutput(Output):
113
113
 
114
114
  def _get_channels(
115
115
  self, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
116
- ) -> tuple[int, list[bool] | None]:
116
+ ) -> tuple[int, Sequence[bool] | None]:
117
117
  source_index = self.data()[SOURCE_INDEX]
118
118
  raw_channels = int(max([si.channel or 0 for si in source_index])) + 1
119
119
  if isinstance(channel_index, int):
@@ -140,7 +140,7 @@ class BaseStatsOutput(Output):
140
140
  self,
141
141
  filter: str | Sequence[str] | None = None, # noqa: A002
142
142
  exclude_constant: bool = False,
143
- ) -> dict[str, NDArray[Any]]:
143
+ ) -> Mapping[str, NDArray[Any]]:
144
144
  """
145
145
  Returns all 1-dimensional data as a dictionary of numpy arrays.
146
146
 
@@ -153,7 +153,7 @@ class BaseStatsOutput(Output):
153
153
 
154
154
  Returns
155
155
  -------
156
- dict[str, NDArray[Any]]
156
+ Mapping[str, NDArray[Any]]
157
157
  """
158
158
  filter_ = [filter] if isinstance(filter, str) else filter
159
159
  return {
@@ -253,8 +253,8 @@ class HashStatsOutput(BaseStatsOutput):
253
253
  :term:`Perception-based Hash` of the images as a hex string
254
254
  """
255
255
 
256
- xxhash: list[str]
257
- pchash: list[str]
256
+ xxhash: Sequence[str]
257
+ pchash: Sequence[str]
258
258
 
259
259
 
260
260
  @dataclass(frozen=True)
@@ -264,15 +264,15 @@ class LabelStatsOutput(Output):
264
264
 
265
265
  Attributes
266
266
  ----------
267
- label_counts_per_class : dict[int, int]
267
+ label_counts_per_class : Mapping[int, int]
268
268
  Dictionary whose keys are the different label classes and
269
269
  values are total counts of each class
270
- label_counts_per_image : list[int]
270
+ label_counts_per_image : Sequence[int]
271
271
  Number of labels per image
272
- image_counts_per_class : dict[int, int]
272
+ image_counts_per_class : Mapping[int, int]
273
273
  Dictionary whose keys are the different label classes and
274
274
  values are total counts of each image the class is present in
275
- image_indices_per_class : dict[int, list]
275
+ image_indices_per_class : Mapping[int, Sequence[int]]
276
276
  Dictionary whose keys are the different label classes and
277
277
  values are lists containing the images that have that label
278
278
  image_count : int
@@ -281,17 +281,17 @@ class LabelStatsOutput(Output):
281
281
  Total number of classes present
282
282
  label_count : int
283
283
  Total number of labels present
284
- class_names : list[str]
284
+ class_names : Sequence[str]
285
285
  """
286
286
 
287
- label_counts_per_class: list[int]
288
- label_counts_per_image: list[int]
289
- image_counts_per_class: list[int]
290
- image_indices_per_class: list[list[int]]
287
+ label_counts_per_class: Mapping[int, int]
288
+ label_counts_per_image: Sequence[int]
289
+ image_counts_per_class: Mapping[int, int]
290
+ image_indices_per_class: Mapping[int, Sequence[int]]
291
291
  image_count: int
292
292
  class_count: int
293
293
  label_count: int
294
- class_names: list[str]
294
+ class_names: Sequence[str]
295
295
 
296
296
  def to_table(self) -> str:
297
297
  """
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
+ from typing import Sequence
6
7
 
7
8
  import numpy as np
8
9
  from numpy.typing import NDArray
@@ -36,9 +37,9 @@ class SplitDatasetOutput(Output):
36
37
  ----------
37
38
  test: NDArray[np.intp]
38
39
  Indices for the test set
39
- folds: list[TrainValSplit]
40
+ folds: Sequence[TrainValSplit]
40
41
  List of train and validation split indices
41
42
  """
42
43
 
43
44
  test: NDArray[np.intp]
44
- folds: list[TrainValSplit]
45
+ folds: Sequence[TrainValSplit]
@@ -177,7 +177,9 @@ def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int) -> NDArray[Any
177
177
  return res.x
178
178
 
179
179
 
180
- def get_curve_params(measures: dict[str, NDArray[Any]], ranges: NDArray[Any], niter: int) -> dict[str, NDArray[Any]]:
180
+ def get_curve_params(
181
+ measures: Mapping[str, NDArray[Any]], ranges: NDArray[Any], niter: int
182
+ ) -> Mapping[str, NDArray[Any]]:
181
183
  """Calculates and aggregates parameters for both single and multi-class metrics"""
182
184
  output = {}
183
185
  for name, measure in measures.items():
@@ -208,7 +210,7 @@ class SufficiencyOutput(Output):
208
210
  """
209
211
 
210
212
  steps: NDArray[np.uint32]
211
- measures: dict[str, NDArray[np.float64]]
213
+ measures: Mapping[str, NDArray[np.float64]]
212
214
  n_iter: int = 1000
213
215
 
214
216
  def __post_init__(self) -> None:
@@ -220,7 +222,7 @@ class SufficiencyOutput(Output):
220
222
  self._params = None
221
223
 
222
224
  @property
223
- def params(self) -> dict[str, NDArray[Any]]:
225
+ def params(self) -> Mapping[str, NDArray[Any]]:
224
226
  if self._params is None:
225
227
  self._params = {}
226
228
  if self.n_iter not in self._params:
@@ -270,7 +272,7 @@ class SufficiencyOutput(Output):
270
272
  proj._params = self._params
271
273
  return proj
272
274
 
273
- def plot(self, class_names: Sequence[str] | None = None) -> list[Figure]:
275
+ def plot(self, class_names: Sequence[str] | None = None) -> Sequence[Figure]:
274
276
  """
275
277
  Plotting function for data :term:`sufficience<Sufficiency>` tasks.
276
278
 
@@ -281,7 +283,7 @@ class SufficiencyOutput(Output):
281
283
 
282
284
  Returns
283
285
  -------
284
- list[Figure]
286
+ Sequence[Figure]
285
287
  List of Figures for each measure
286
288
 
287
289
  Raises
@@ -325,7 +327,7 @@ class SufficiencyOutput(Output):
325
327
 
326
328
  def inv_project(
327
329
  self, targets: Mapping[str, ArrayLike], n_iter: int | None = None
328
- ) -> dict[str, NDArray[np.float64]]:
330
+ ) -> Mapping[str, NDArray[np.float64]]:
329
331
  """
330
332
  Calculate training samples needed to achieve target model metric values.
331
333
 
@@ -339,7 +341,7 @@ class SufficiencyOutput(Output):
339
341
 
340
342
  Returns
341
343
  -------
342
- dict[str, NDArray]
344
+ Mapping[str, NDArray]
343
345
  List of the number of training samples needed to achieve each
344
346
  corresponding entry in targets
345
347
  """
dataeval/typing.py CHANGED
@@ -21,7 +21,7 @@ __all__ = [
21
21
 
22
22
 
23
23
  import sys
24
- from typing import Any, Generic, Iterator, Protocol, TypedDict, TypeVar, runtime_checkable
24
+ from typing import Any, Generic, Iterator, Mapping, Protocol, TypedDict, TypeVar, runtime_checkable
25
25
 
26
26
  import numpy.typing
27
27
  from typing_extensions import NotRequired, ReadOnly, Required
@@ -159,7 +159,7 @@ class AnnotatedDataset(Dataset[_T_co], Generic[_T_co], Protocol):
159
159
  # ========== IMAGE CLASSIFICATION DATASETS ==========
160
160
 
161
161
 
162
- ImageClassificationDatum: TypeAlias = tuple[ArrayLike, ArrayLike, dict[str, Any]]
162
+ ImageClassificationDatum: TypeAlias = tuple[ArrayLike, ArrayLike, Mapping[str, Any]]
163
163
  """
164
164
  Type alias for an image classification datum tuple.
165
165
 
@@ -199,7 +199,7 @@ class ObjectDetectionTarget(Protocol):
199
199
  def scores(self) -> ArrayLike: ...
200
200
 
201
201
 
202
- ObjectDetectionDatum: TypeAlias = tuple[ArrayLike, ObjectDetectionTarget, dict[str, Any]]
202
+ ObjectDetectionDatum: TypeAlias = tuple[ArrayLike, ObjectDetectionTarget, Mapping[str, Any]]
203
203
  """
204
204
  Type alias for an object detection datum tuple.
205
205
 
@@ -240,7 +240,7 @@ class SegmentationTarget(Protocol):
240
240
  def scores(self) -> ArrayLike: ...
241
241
 
242
242
 
243
- SegmentationDatum: TypeAlias = tuple[ArrayLike, SegmentationTarget, dict[str, Any]]
243
+ SegmentationDatum: TypeAlias = tuple[ArrayLike, SegmentationTarget, Mapping[str, Any]]
244
244
  """
245
245
  Type alias for an image classification datum tuple.
246
246
 
dataeval/utils/_plot.py CHANGED
@@ -4,7 +4,7 @@ __all__ = []
4
4
 
5
5
  import contextlib
6
6
  import math
7
- from typing import Any
7
+ from typing import Any, Mapping, Sequence
8
8
 
9
9
  import numpy as np
10
10
 
@@ -134,7 +134,7 @@ def format_text(*args: str) -> str:
134
134
 
135
135
 
136
136
  def histogram_plot(
137
- data_dict: dict[str, Any],
137
+ data_dict: Mapping[str, Any],
138
138
  log: bool = True,
139
139
  xlabel: str = "values",
140
140
  ylabel: str = "counts",
@@ -164,9 +164,9 @@ def histogram_plot(
164
164
  rows = math.ceil(num_metrics / 3)
165
165
  cols = min(num_metrics, 3)
166
166
  fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
167
-
167
+ axs_flat = np.asarray(axs).flatten()
168
168
  for ax, metric in zip(
169
- axs.flat,
169
+ axs_flat,
170
170
  data_dict,
171
171
  ):
172
172
  # Plot the histogram for the chosen metric
@@ -177,7 +177,7 @@ def histogram_plot(
177
177
  ax.set_ylabel(ylabel)
178
178
  ax.set_xlabel(xlabel)
179
179
 
180
- for ax in axs.flat[num_metrics:]:
180
+ for ax in axs_flat[num_metrics:]:
181
181
  ax.axis("off")
182
182
  ax.set_visible(False)
183
183
 
@@ -186,10 +186,10 @@ def histogram_plot(
186
186
 
187
187
 
188
188
  def channel_histogram_plot(
189
- data_dict: dict[str, Any],
189
+ data_dict: Mapping[str, Any],
190
190
  log: bool = True,
191
191
  max_channels: int = 3,
192
- ch_mask: list[bool] | None = None,
192
+ ch_mask: Sequence[bool] | None = None,
193
193
  xlabel: str = "values",
194
194
  ylabel: str = "counts",
195
195
  ) -> Figure:
@@ -222,9 +222,9 @@ def channel_histogram_plot(
222
222
  rows = math.ceil(num_metrics / 3)
223
223
  cols = min(num_metrics, 3)
224
224
  fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
225
-
225
+ axs_flat = np.asarray(axs).flatten()
226
226
  for ax, metric in zip(
227
- axs.flat,
227
+ axs_flat,
228
228
  data_keys,
229
229
  ):
230
230
  # Plot the histogram for the chosen metric
@@ -246,7 +246,7 @@ def channel_histogram_plot(
246
246
  ax.set_ylabel(ylabel)
247
247
  ax.set_xlabel(xlabel)
248
248
 
249
- for ax in axs.flat[num_metrics:]:
249
+ for ax in axs_flat[num_metrics:]:
250
250
  ax.axis("off")
251
251
  ax.set_visible(False)
252
252
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.86.2
3
+ Version: 0.86.4
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -1,16 +1,15 @@
1
- dataeval/__init__.py,sha256=7Q_nGiQN6g8Le7VtOsemNgn5mC_6gR3NhazolD_arSQ,1636
1
+ dataeval/__init__.py,sha256=6gfYCGo82QKKO58jQSma27Mr-R316vmCDbTjXRh5B7o,1636
2
2
  dataeval/_log.py,sha256=C7AGkIRzymvYJ0LQXtnShiy3i5Xrp8T58JzIHHguk_Q,365
3
3
  dataeval/config.py,sha256=hjad0TK1UmaKQlUuxqxt64_OAUqZkHjicBf06cvTyrQ,4082
4
- dataeval/data/__init__.py,sha256=qNnRRiVP_sLthkkHpUrMgI_r8dQK-cC-xoGrrjQeRKc,544
4
+ dataeval/data/__init__.py,sha256=wzQ6uUFLNB3VJR0a2QnRBYwEmwXT93q0WpHu7FmFW1E,486
5
5
  dataeval/data/_embeddings.py,sha256=PFjpdV9bfusCB4taTIYSzx1hP8nJb_KCkZTN8kMw-Hs,12885
6
- dataeval/data/_images.py,sha256=3d4Cv-xg5z6_LVtw1eL_QdFwzbDI1cwvPNQblkrMEMk,2622
7
- dataeval/data/_metadata.py,sha256=GzXtecy7EvrB3ZJJbaCQjmpsdHXRL5788ckKbzeI54w,14994
6
+ dataeval/data/_images.py,sha256=Rc_59CuU4zfN7Xm7an1XUx8ZghQg6a56VJWMZD9edRw,2654
7
+ dataeval/data/_metadata.py,sha256=5pND6IZ5KeEGrhCDiBVxhU_BXWU0okBxt8oNkZ9a2_M,14309
8
8
  dataeval/data/_selection.py,sha256=r06xeiyK8nTWPLyItkoPQRWZI1i6LATSue_cuEbCdc4,4463
9
9
  dataeval/data/_split.py,sha256=nQABR05vxil2Qx7-uX4Fm0_DWpibskBGDJOYj_b1u3I,16737
10
- dataeval/data/_targets.py,sha256=pXrHBwT4Pi8DauaOxDVnIMwowWWlXuvSb07ShW7O2zk,3119
11
10
  dataeval/data/selections/__init__.py,sha256=2m8ZB53wXzqLcqmc6p5atO6graB6ZyiRSNJFxf11X_g,613
12
11
  dataeval/data/selections/_classbalance.py,sha256=7v8ApoL3X8eCZ6fGDNTehE_bZ1loaP3TlhsJLaICVWg,1458
13
- dataeval/data/selections/_classfilter.py,sha256=KQOmcTIcV3ZPWuiwqOmwX0SB5I2qlbxLSlwINUZWOjU,4339
12
+ dataeval/data/selections/_classfilter.py,sha256=bXfoYnWnAfUGsAQSlLufJeF2PfgRKekFHfBx8hv1r3w,4351
14
13
  dataeval/data/selections/_indices.py,sha256=RFsR9z10aM3N0gJSfKrukFpi-LkiQGXoOwXhmOQ5cpg,630
15
14
  dataeval/data/selections/_limit.py,sha256=JG4GmEiNKt3sk4PbOUbBnGGzNlyz72H-kQrt8COMm4Y,512
16
15
  dataeval/data/selections/_prioritize.py,sha256=4dGUvgR7m6NGzzPU0N_bw0Xhujo8b72Wo8L4PGHbvBo,11233
@@ -39,16 +38,16 @@ dataeval/detectors/ood/ae.py,sha256=fTrUfFxv6xUqzKpwMC8rW3JrizA16M_bgzqLuBKMrS0,
39
38
  dataeval/detectors/ood/base.py,sha256=9b-Ljznf0lB1SXF4F_Aj3eJ4Y3ijGEDPMjucUsWOGJM,3051
40
39
  dataeval/detectors/ood/mixin.py,sha256=0_o-1HPvgf3-Lf1MSOIfjj5UB8LTLEBGYtJJfyCCzwc,5431
41
40
  dataeval/metadata/__init__.py,sha256=XDDmJbOZBNM6pL0r6Nbu6oMRoyAh22IDkPYGndNlkZU,316
42
- dataeval/metadata/_distance.py,sha256=AABrGoQyD13z9Fqlz3NyfX0Iow_vjBwAugIv6OSRTTE,4187
41
+ dataeval/metadata/_distance.py,sha256=MbXM9idsooNWnGLaTKg8j4ZqavUeJUjuW7EPW3-UQyg,4234
43
42
  dataeval/metadata/_ood.py,sha256=lNPHouj_9WfM_uTtsaiRaPn46RcVy3YebD1c32vDj-c,8981
44
- dataeval/metadata/_utils.py,sha256=r8qBJT83RblobD5W5zyTVi6vYi51Dwkqswizdbzss-M,1169
43
+ dataeval/metadata/_utils.py,sha256=BcGoYVfA4AkAWpInY5txOc3QBpsGf6cnnUAsHOQTJAE,1210
45
44
  dataeval/metrics/__init__.py,sha256=8VC8q3HuJN3o_WN51Ae2_wXznl3RMXIvA5GYVcy7vr8,225
46
45
  dataeval/metrics/bias/__init__.py,sha256=329S1_3WnWqeU4-qVcbe0fMy4lDrj9uKslWHIQf93yg,839
47
- dataeval/metrics/bias/_balance.py,sha256=FcMOA3ge-sQ-0Id2E0K_6hTjNAV3ejJhlB5r4lxlJWI,5519
46
+ dataeval/metrics/bias/_balance.py,sha256=fREtoMLUZPOf_ivqNKwij6oPiKMTk02ECO5rWURf3KY,5541
48
47
  dataeval/metrics/bias/_completeness.py,sha256=BysXU2Jpw33n5dl3acJFEqF3mFGiJLsfG4n5Q2fkTaY,4608
49
48
  dataeval/metrics/bias/_coverage.py,sha256=PeUoOiaghUEdn6Ov8z2-am7-fnBVIPcFbJK7Ty5JObA,3647
50
49
  dataeval/metrics/bias/_diversity.py,sha256=25udDKmel9IjeVT5nM4dOa1apda66QdRxBc922yuUvI,5830
51
- dataeval/metrics/bias/_parity.py,sha256=OHUSHPOeC8e1I3acALHbQv5bK4V7SqAT7ds9gNVNzSU,11371
50
+ dataeval/metrics/bias/_parity.py,sha256=Kmzr9-NXxGzGtj6A-qUa88FTGaRyJU2xQj7tsplXJH4,11427
52
51
  dataeval/metrics/estimators/__init__.py,sha256=Pnds8uIyAovt2fKqZjiHCIP_kVoBWlVllekYuK5UmmU,568
53
52
  dataeval/metrics/estimators/_ber.py,sha256=C30E5LiGGTAfo31zWFYDptDg0R7CTJGJ-a60YgzSkYY,5382
54
53
  dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1RxKmoKT5MOgt0,1434
@@ -60,22 +59,22 @@ dataeval/metrics/stats/_boxratiostats.py,sha256=ROZrlqgbowkGfCR5PJ5TL7Og40iMOdUq
60
59
  dataeval/metrics/stats/_dimensionstats.py,sha256=EVO-BlxrZl8qrP09lwPbyWdrG1ZeDtgj4LiswDwEZ1I,2896
61
60
  dataeval/metrics/stats/_hashstats.py,sha256=qa1CYRgOebkxqkALfffaPM-kJ074ZbyfpWbfOfuObSs,4758
62
61
  dataeval/metrics/stats/_imagestats.py,sha256=gUPNgN5Zwzdr7WnSwbve1NXNsyxd5dy3cSnlR_7guCg,3007
63
- dataeval/metrics/stats/_labelstats.py,sha256=lz8I6eSd8tFkmQqy5cOG8hn9yxs0mP-Ic9ratFHiuoU,2813
62
+ dataeval/metrics/stats/_labelstats.py,sha256=UG7aKpFctLJvca3rC9sPT_25sCes77KpgZguJYMXfU0,2949
64
63
  dataeval/metrics/stats/_pixelstats.py,sha256=5RCQh0OQkHiCkn3DgCPVxKoFfifX_FOtwsnotADSZ0I,3265
65
64
  dataeval/metrics/stats/_visualstats.py,sha256=0k6bvAL_d66nQMfG7bydCOFJb7B0dhgG7fqCjVTp1sg,3707
66
65
  dataeval/outputs/__init__.py,sha256=geHB5M3QOiFFaQGV4ZwDTTKpqZPvPePbqG7lzaPhaXQ,1741
67
- dataeval/outputs/_base.py,sha256=7KRWFIEw0UHdhb1em92bPE1YqbMYumAW1QD0QfPwVLc,5900
68
- dataeval/outputs/_bias.py,sha256=W5QWjtZzMfCaztw6lf0VTZsuSDrNgCcdAvNx6P4fIAo,10254
66
+ dataeval/outputs/_base.py,sha256=-Wa0gFcBVLbfWPMZyCql7x4vGsnkLP4pecsQIeUZ2_Y,5904
67
+ dataeval/outputs/_bias.py,sha256=1OZpKncYTryjPLRHb4d6NlhE27uPT57gCob_5jtjKDI,10456
69
68
  dataeval/outputs/_drift.py,sha256=rKn5vqMR6XNujgSqfHsH76oFkoGsUusquZL2Qy4Ae6Y,4581
70
- dataeval/outputs/_estimators.py,sha256=a2oAIxxEDZ9WLGfMWH8KD-BVUS_SnULRPR-iI9hFPoQ,3047
71
- dataeval/outputs/_linters.py,sha256=3vI8zsSF-JecQut500A629sICidQLWqhEZcj7o7_cfs,6554
69
+ dataeval/outputs/_estimators.py,sha256=mh-R08CgYtmq9ffANDMYR-V4vrZnSjOjEyOMiMDZ2Ic,3091
70
+ dataeval/outputs/_linters.py,sha256=k8lkd8EZ23q0m-HOD-FgqMcLQFy1UH7vws2ucLPyn08,6697
72
71
  dataeval/outputs/_metadata.py,sha256=ffZgpX8KWURPHXpOWjbvJ2KRqWQkS2nWuIjKUzoHhMI,1710
73
72
  dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
74
- dataeval/outputs/_stats.py,sha256=YDdVQmFcOvb4_NYc_d2a2JCA0Zkuh1o6_qupQkc_X1w,15142
75
- dataeval/outputs/_utils.py,sha256=HHlGC7sk416m_3Bgn075Qdblz_aPup_UOafJpB0RuXY,893
76
- dataeval/outputs/_workflows.py,sha256=0xSwPxBATa29tvwJtpovjYrq4la9fkbamHM_qsw-Llc,10799
73
+ dataeval/outputs/_stats.py,sha256=F-515PGBNB69DXM-YaCkGHAyaXkCD-yYvKfj4-q7R4w,15247
74
+ dataeval/outputs/_utils.py,sha256=NfhYaGT2PZlhIs8ICKUsPWHZXjhWYDkEJqBDdqMeaOM,929
75
+ dataeval/outputs/_workflows.py,sha256=K786mOgegxVi81diUA-qpbwGEkwa8YA7Fk4ttgjJeaY,10831
77
76
  dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
- dataeval/typing.py,sha256=GDMuef-oFFukNtsiKFmsExHdNvYR_j-tQcsCwZ9reow,7198
77
+ dataeval/typing.py,sha256=W8rqFFkAqE5a5ar3MmB-O5gcMJqvoDKXC8Y0ggBqAKo,7216
79
78
  dataeval/utils/__init__.py,sha256=hRvyUK7b3d6JBEV5u47rFcOHEcmDYqAvZQw_T5pDAWw,264
80
79
  dataeval/utils/_array.py,sha256=ftX8S6HKAIUOuc1xd30VC3Pz5yUzRglDpCLisWY_tHs,5888
81
80
  dataeval/utils/_bin.py,sha256=w3eJ2Szw5eapqQ0cGv731rhNgLFGW0cCz2pXo9I6CuY,7296
@@ -84,7 +83,7 @@ dataeval/utils/_fast_mst.py,sha256=pv42flr1Uf5RBa9qDG0YLDXWH7Mr7a9zpauO1HqZXaY,8
84
83
  dataeval/utils/_image.py,sha256=4uxTIOYZZlRJOfNmdA3ek3no3FrLWCK5un48kStMDt8,3578
85
84
  dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
86
85
  dataeval/utils/_mst.py,sha256=bLmJmu_1Dtj3hC5gQp3oAiJ_7TKtEjahTqusVRRU4eI,2168
87
- dataeval/utils/_plot.py,sha256=zP0bEvtrLdws7r1Jte8Camq-q5K5F6T8iuv3bStnEJc,7116
86
+ dataeval/utils/_plot.py,sha256=1rnMkBRvTFLoTAHqXwF7c7GJ5_5iqlgarZKAzmYciLk,7225
88
87
  dataeval/utils/data/__init__.py,sha256=xGzrjrOxOP2DP1tU84AWMKPnSxFvSjM81CTlDg4rNM8,331
89
88
  dataeval/utils/data/_dataset.py,sha256=CFK9h-XPN7J-iF2nXol6keMDbGm6VIweFAMAjXRUlhg,9527
90
89
  dataeval/utils/data/collate.py,sha256=5egEEKhNNCGeNLChO1p6dZ4Wg6x51VEaMNHz7hEZUxI,3936
@@ -108,7 +107,7 @@ dataeval/utils/torch/models.py,sha256=1idpXyjrYcCBSsbxxRUOto8xr4MJNjDEqQHiIXVU5Z
108
107
  dataeval/utils/torch/trainer.py,sha256=Oc2lK13uPGhmLYbmAqlPWyKxgG4YJFlnSXCqFHUZbdA,5528
109
108
  dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
110
109
  dataeval/workflows/sufficiency.py,sha256=j-R8dg4XE6a66p_oTXG2GNzgg3vGk85CTblxhFXaxog,8513
111
- dataeval-0.86.2.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
112
- dataeval-0.86.2.dist-info/METADATA,sha256=6y6bI8GBv_VjBs1mpjAZJ9R5UBTKT7RHQRRUGJdyPCk,5353
113
- dataeval-0.86.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
114
- dataeval-0.86.2.dist-info/RECORD,,
110
+ dataeval-0.86.4.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
111
+ dataeval-0.86.4.dist-info/METADATA,sha256=qdxTuVh3WxpHvsdRZhAvQIYxiATJLDixoF97xMFYrXM,5353
112
+ dataeval-0.86.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
113
+ dataeval-0.86.4.dist-info/RECORD,,
dataeval/data/_targets.py DELETED
@@ -1,89 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Iterator
4
-
5
- __all__ = []
6
-
7
- from dataclasses import dataclass
8
-
9
- import numpy as np
10
- from numpy.typing import NDArray
11
-
12
-
13
- def _len(arr: NDArray, dim: int) -> int:
14
- return 0 if len(arr) == 0 else len(np.atleast_1d(arr) if dim == 1 else np.atleast_2d(arr))
15
-
16
-
17
- @dataclass(frozen=True)
18
- class Targets:
19
- """
20
- Dataclass defining targets for image classification or object detection.
21
-
22
- Attributes
23
- ----------
24
- labels : NDArray[np.intp]
25
- Labels (N,) for N images or objects
26
- scores : NDArray[np.float32]
27
- Probability scores (N, M) for N images of M classes or confidence score (N,) of objects
28
- bboxes : NDArray[np.float32] | None
29
- Bounding boxes (N, 4) for N objects in (x0, y0, x1, y1) format
30
- source : NDArray[np.intp] | None
31
- Source image index (N,) for N objects
32
- size : int
33
- Count of objects
34
- """
35
-
36
- labels: NDArray[np.intp]
37
- scores: NDArray[np.float32]
38
- bboxes: NDArray[np.float32] | None
39
- source: NDArray[np.intp] | None
40
-
41
- def __post_init__(self) -> None:
42
- if (self.bboxes is None) != (self.source is None):
43
- raise ValueError("Either both bboxes and source must be provided or neither.")
44
-
45
- labels = _len(self.labels, 1)
46
- scores = _len(self.scores, 2) if self.bboxes is None else _len(self.scores, 1)
47
- bboxes = labels if self.bboxes is None else _len(self.bboxes, 2)
48
- source = labels if self.source is None else _len(self.source, 1)
49
-
50
- if labels != scores or labels != bboxes or labels != source:
51
- raise ValueError(
52
- "Labels, scores, bboxes and source must be the same length (if provided).\n"
53
- + f" labels: {self.labels.shape}\n"
54
- + f" scores: {self.scores.shape}\n"
55
- + f" bboxes: {None if self.bboxes is None else self.bboxes.shape}\n"
56
- + f" source: {None if self.source is None else self.source.shape}\n"
57
- )
58
-
59
- if self.bboxes is not None and len(self.bboxes) > 0 and self.bboxes.shape[-1] != 4:
60
- raise ValueError("Bounding boxes must be in (x0, y0, x1, y1) format.")
61
-
62
- @property
63
- def size(self) -> int:
64
- return len(self.labels)
65
-
66
- def __len__(self) -> int:
67
- if self.source is None:
68
- return len(self.labels)
69
- return len(np.unique(self.source))
70
-
71
- def __getitem__(self, idx: int, /) -> Targets:
72
- if self.source is None or self.bboxes is None:
73
- return Targets(
74
- np.atleast_1d(self.labels[idx]),
75
- np.atleast_2d(self.scores[idx]),
76
- None,
77
- None,
78
- )
79
- mask = np.where(self.source == idx, True, False)
80
- return Targets(
81
- np.atleast_1d(self.labels[mask]),
82
- np.atleast_1d(self.scores[mask]),
83
- np.atleast_2d(self.bboxes[mask]),
84
- np.atleast_1d(self.source[mask]),
85
- )
86
-
87
- def __iter__(self) -> Iterator[Targets]:
88
- for i in range(len(self.labels)) if self.source is None else np.unique(self.source):
89
- yield self[i]