dataeval 0.86.2__tar.gz → 0.86.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. {dataeval-0.86.2 → dataeval-0.86.4}/PKG-INFO +1 -1
  2. {dataeval-0.86.2 → dataeval-0.86.4}/pyproject.toml +1 -1
  3. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/__init__.py +1 -1
  4. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/__init__.py +0 -2
  5. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_images.py +3 -1
  6. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_metadata.py +40 -63
  7. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_classfilter.py +2 -2
  8. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/_distance.py +1 -1
  9. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/_utils.py +4 -2
  10. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_balance.py +6 -5
  11. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_parity.py +2 -1
  12. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_labelstats.py +24 -32
  13. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_base.py +1 -1
  14. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_bias.py +21 -18
  15. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_estimators.py +2 -1
  16. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_linters.py +18 -18
  17. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_stats.py +20 -20
  18. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_utils.py +3 -2
  19. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_workflows.py +9 -7
  20. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/typing.py +4 -4
  21. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_plot.py +10 -10
  22. dataeval-0.86.2/src/dataeval/data/_targets.py +0 -89
  23. {dataeval-0.86.2 → dataeval-0.86.4}/LICENSE.txt +0 -0
  24. {dataeval-0.86.2 → dataeval-0.86.4}/README.md +0 -0
  25. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/_log.py +0 -0
  26. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/config.py +0 -0
  27. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_embeddings.py +0 -0
  28. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_selection.py +0 -0
  29. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/_split.py +0 -0
  30. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/__init__.py +0 -0
  31. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_classbalance.py +0 -0
  32. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_indices.py +0 -0
  33. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_limit.py +0 -0
  34. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_prioritize.py +0 -0
  35. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_reverse.py +0 -0
  36. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/data/selections/_shuffle.py +0 -0
  37. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/__init__.py +0 -0
  38. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/__init__.py +0 -0
  39. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_base.py +0 -0
  40. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_cvm.py +0 -0
  41. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_ks.py +0 -0
  42. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_mmd.py +0 -0
  43. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_mvdc.py +0 -0
  44. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
  45. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
  46. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
  47. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
  48. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
  49. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
  50. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
  51. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/drift/updates.py +0 -0
  52. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/linters/__init__.py +0 -0
  53. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/linters/duplicates.py +0 -0
  54. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/linters/outliers.py +0 -0
  55. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/__init__.py +0 -0
  56. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/ae.py +0 -0
  57. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/base.py +0 -0
  58. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/detectors/ood/mixin.py +0 -0
  59. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/__init__.py +0 -0
  60. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metadata/_ood.py +0 -0
  61. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/__init__.py +0 -0
  62. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/__init__.py +0 -0
  63. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_completeness.py +0 -0
  64. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_coverage.py +0 -0
  65. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/bias/_diversity.py +0 -0
  66. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/__init__.py +0 -0
  67. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_ber.py +0 -0
  68. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
  69. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_divergence.py +0 -0
  70. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/estimators/_uap.py +0 -0
  71. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/__init__.py +0 -0
  72. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_base.py +0 -0
  73. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
  74. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
  75. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_hashstats.py +0 -0
  76. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_imagestats.py +0 -0
  77. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
  78. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/metrics/stats/_visualstats.py +0 -0
  79. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/__init__.py +0 -0
  80. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_drift.py +0 -0
  81. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_metadata.py +0 -0
  82. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/outputs/_ood.py +0 -0
  83. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/py.typed +0 -0
  84. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/__init__.py +0 -0
  85. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_array.py +0 -0
  86. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_bin.py +0 -0
  87. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_clusterer.py +0 -0
  88. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_fast_mst.py +0 -0
  89. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_image.py +0 -0
  90. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_method.py +0 -0
  91. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/_mst.py +0 -0
  92. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/__init__.py +0 -0
  93. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/_dataset.py +0 -0
  94. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/collate.py +0 -0
  95. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/data/metadata.py +0 -0
  96. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/__init__.py +0 -0
  97. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_antiuav.py +0 -0
  98. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_base.py +0 -0
  99. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_cifar10.py +0 -0
  100. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_fileio.py +0 -0
  101. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_milco.py +0 -0
  102. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_mixin.py +0 -0
  103. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_mnist.py +0 -0
  104. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_ships.py +0 -0
  105. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_types.py +0 -0
  106. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/datasets/_voc.py +0 -0
  107. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/__init__.py +0 -0
  108. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/_blocks.py +0 -0
  109. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/_gmm.py +0 -0
  110. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/_internal.py +0 -0
  111. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/models.py +0 -0
  112. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/utils/torch/trainer.py +0 -0
  113. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/workflows/__init__.py +0 -0
  114. {dataeval-0.86.2 → dataeval-0.86.4}/src/dataeval/workflows/sufficiency.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.86.2
3
+ Version: 0.86.4
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dataeval"
3
- version = "0.86.2" # dynamic
3
+ version = "0.86.4" # dynamic
4
4
  description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
5
5
  license = "MIT"
6
6
  readme = "README.md"
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
8
8
  from __future__ import annotations
9
9
 
10
10
  __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
11
- __version__ = "0.86.2"
11
+ __version__ = "0.86.4"
12
12
 
13
13
  import logging
14
14
 
@@ -6,7 +6,6 @@ __all__ = [
6
6
  "Metadata",
7
7
  "Select",
8
8
  "SplitDatasetOutput",
9
- "Targets",
10
9
  "split_dataset",
11
10
  ]
12
11
 
@@ -15,5 +14,4 @@ from dataeval.data._images import Images
15
14
  from dataeval.data._metadata import Metadata
16
15
  from dataeval.data._selection import Select
17
16
  from dataeval.data._split import split_dataset
18
- from dataeval.data._targets import Targets
19
17
  from dataeval.outputs._utils import SplitDatasetOutput
@@ -4,6 +4,8 @@ __all__ = []
4
4
 
5
5
  from typing import TYPE_CHECKING, Any, Generic, Iterator, Sequence, TypeVar, cast, overload
6
6
 
7
+ import numpy as np
8
+
7
9
  from dataeval.typing import Array, ArrayLike, Dataset
8
10
  from dataeval.utils._array import as_numpy, channels_first_to_last
9
11
 
@@ -58,7 +60,7 @@ class Images(Generic[T]):
58
60
  num_images = len(indices)
59
61
  num_rows = (num_images + images_per_row - 1) // images_per_row
60
62
  fig, axes = plt.subplots(num_rows, images_per_row, figsize=figsize)
61
- for i, ax in enumerate(axes.flatten()):
63
+ for i, ax in enumerate(np.asarray(axes).flatten()):
62
64
  image = channels_first_to_last(as_numpy(self[i]))
63
65
  ax.imshow(image)
64
66
  ax.axis("off")
@@ -4,7 +4,7 @@ __all__ = []
4
4
 
5
5
  import warnings
6
6
  from dataclasses import dataclass
7
- from typing import TYPE_CHECKING, Any, Iterable, Literal, Mapping, Sequence, Sized
7
+ from typing import Any, Iterable, Literal, Mapping, Sequence
8
8
 
9
9
  import numpy as np
10
10
  import polars as pl
@@ -19,10 +19,9 @@ from dataeval.utils._array import as_numpy
19
19
  from dataeval.utils._bin import bin_data, digitize_data
20
20
  from dataeval.utils.data.metadata import merge
21
21
 
22
- if TYPE_CHECKING:
23
- from dataeval.data import Targets
24
- else:
25
- from dataeval.data._targets import Targets
22
+
23
+ def _binned(name: str) -> str:
24
+ return f"{name}[]"
26
25
 
27
26
 
28
27
  @dataclass
@@ -51,20 +50,20 @@ class Metadata:
51
50
 
52
51
  def __init__(
53
52
  self,
54
- dataset: AnnotatedDataset[tuple[Any, Any, dict[str, Any]]],
53
+ dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
55
54
  *,
56
55
  continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
57
56
  auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
58
57
  exclude: Sequence[str] | None = None,
59
58
  include: Sequence[str] | None = None,
60
59
  ) -> None:
61
- self._targets: Targets
62
60
  self._class_labels: NDArray[np.intp]
63
61
  self._class_names: list[str]
64
62
  self._image_indices: NDArray[np.intp]
65
63
  self._factors: dict[str, FactorInfo]
66
64
  self._dropped_factors: dict[str, list[str]]
67
65
  self._dataframe: pl.DataFrame
66
+ self._raw: Sequence[Mapping[str, Any]]
68
67
 
69
68
  self._is_structured = False
70
69
  self._is_binned = False
@@ -80,13 +79,7 @@ class Metadata:
80
79
  self._include = set(include or ())
81
80
 
82
81
  @property
83
- def targets(self) -> Targets:
84
- """Target information for the dataset."""
85
- self._structure()
86
- return self._targets
87
-
88
- @property
89
- def raw(self) -> list[dict[str, Any]]:
82
+ def raw(self) -> Sequence[Mapping[str, Any]]:
90
83
  """The raw list of metadata dictionaries for the dataset."""
91
84
  self._structure()
92
85
  return self._raw
@@ -146,7 +139,7 @@ class Metadata:
146
139
  return self._dataframe
147
140
 
148
141
  @property
149
- def dropped_factors(self) -> dict[str, list[str]]:
142
+ def dropped_factors(self) -> Mapping[str, Sequence[str]]:
150
143
  """Factors that were dropped during preprocessing and the reasons why they were dropped."""
151
144
  self._structure()
152
145
  return self._dropped_factors
@@ -165,16 +158,16 @@ class Metadata:
165
158
  )
166
159
 
167
160
  @property
168
- def factor_names(self) -> list[str]:
161
+ def factor_names(self) -> Sequence[str]:
169
162
  """Factor names of the metadata."""
170
163
  self._structure()
171
- return list(self._factors)
164
+ return list(filter(self._filter, self._factors))
172
165
 
173
166
  @property
174
- def factor_info(self) -> dict[str, FactorInfo]:
167
+ def factor_info(self) -> Mapping[str, FactorInfo]:
175
168
  """Factor types of the metadata."""
176
169
  self._bin()
177
- return self._factors
170
+ return dict(filter(self._filter, self._factors.items()))
178
171
 
179
172
  @property
180
173
  def factor_data(self) -> NDArray[Any]:
@@ -192,7 +185,7 @@ class Metadata:
192
185
  return self._class_labels
193
186
 
194
187
  @property
195
- def class_names(self) -> list[str]:
188
+ def class_names(self) -> Sequence[str]:
196
189
  """Class names as a list of strings."""
197
190
  self._structure()
198
191
  return self._class_names
@@ -206,13 +199,17 @@ class Metadata:
206
199
  @property
207
200
  def image_count(self) -> int:
208
201
  self._bin()
209
- return int(self._image_indices.max() + 1)
202
+ return 0 if self._image_indices.size == 0 else int(self._image_indices.max() + 1)
203
+
204
+ def _filter(self, factor: str | tuple[str, Any]) -> bool:
205
+ factor = factor[0] if isinstance(factor, tuple) else factor
206
+ return factor in self.include if self.include else factor not in self.exclude
210
207
 
211
208
  def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
212
209
  if self._is_binned:
213
210
  columns = self._dataframe.columns
214
- for col in (col for col in cols or columns if f"{col}[|]" in columns):
215
- self._dataframe.drop_in_place(f"{col}[|]")
211
+ for col in (col for col in cols or columns if _binned(col) in columns):
212
+ self._dataframe.drop_in_place(_binned(col))
216
213
  self._factors[col] = FactorInfo()
217
214
  self._is_binned = False
218
215
 
@@ -220,7 +217,7 @@ class Metadata:
220
217
  if self._is_structured:
221
218
  return
222
219
 
223
- raw: list[dict[str, Any]] = []
220
+ raw: Sequence[Mapping[str, Any]] = []
224
221
 
225
222
  labels = []
226
223
  bboxes = []
@@ -255,6 +252,14 @@ class Metadata:
255
252
  bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
256
253
  srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
257
254
 
255
+ index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
256
+
257
+ targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
258
+ merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
259
+
260
+ reserved = ["image_index", "class_label", "score", "box"]
261
+ factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
262
+
258
263
  target_dict = {
259
264
  "image_index": srcidx if srcidx is not None else np.arange(len(labels)),
260
265
  "class_label": labels,
@@ -262,20 +267,11 @@ class Metadata:
262
267
  "box": bboxes if bboxes is not None else [None] * len(labels),
263
268
  }
264
269
 
265
- self._targets = Targets(labels, scores, bboxes, srcidx)
266
270
  self._raw = raw
267
-
268
- index2label = self._dataset.metadata.get("index2label", {})
271
+ self._index2label = index2label
269
272
  self._class_labels = labels
270
- self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
273
+ self._class_names = list(index2label.values())
271
274
  self._image_indices = target_dict["image_index"]
272
-
273
- targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
274
- merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
275
-
276
- reserved = ["image_index", "class_label", "score", "box"]
277
- factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
278
-
279
275
  self._factors = dict.fromkeys(factor_dict, FactorInfo())
280
276
  self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
281
277
  self._dropped_factors = merged[1]
@@ -302,10 +298,10 @@ class Metadata:
302
298
  )
303
299
 
304
300
  column_set = set(df.columns)
305
- for col in (col for col in self.factor_names if f"{col}[|]" not in column_set):
301
+ for col in (col for col in self.factor_names if _binned(col) not in column_set):
306
302
  # Get data as numpy array for processing
307
303
  data = df[col].to_numpy()
308
- col_dz = f"{col}[|]"
304
+ col_dz = _binned(col)
309
305
  if col in factor_bins:
310
306
  # User provided binning
311
307
  bins = factor_bins[col]
@@ -332,31 +328,14 @@ class Metadata:
332
328
  df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
333
329
  factor_info[col] = FactorInfo("continuous", col_dz)
334
330
  else:
335
- factor_info[col] = FactorInfo("discrete", col_dz)
331
+ factor_info[col] = FactorInfo("discrete", col)
336
332
 
337
333
  # Store the results
338
334
  self._dataframe = df
339
335
  self._factors.update(factor_info)
340
336
  self._is_binned = True
341
337
 
342
- def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> list[str]:
343
- """
344
- Get the names of factors of a specific type.
345
-
346
- Parameters
347
- ----------
348
- factor_type : Literal["categorical", "continuous", "discrete"]
349
- The type of factors to retrieve.
350
-
351
- Returns
352
- -------
353
- list[str]
354
- List of factor names of the specified type.
355
- """
356
- self._bin()
357
- return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
358
-
359
- def add_factors(self, factors: Mapping[str, Any]) -> None:
338
+ def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
360
339
  """
361
340
  Add additional factors to the metadata.
362
341
 
@@ -365,16 +344,15 @@ class Metadata:
365
344
 
366
345
  Parameters
367
346
  ----------
368
- factors : Mapping[str, ArrayLike]
347
+ factors : Mapping[str, Array | Sequence[Any]]
369
348
  Dictionary of factors to add to the metadata.
370
349
  """
371
350
  self._structure()
372
351
 
373
- targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
352
+ targets = len(self.dataframe)
374
353
  images = self.image_count
375
- lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
376
- targets_match = all(f == targets for f in lengths.values())
377
- images_match = targets_match if images == targets else all(f == images for f in lengths.values())
354
+ targets_match = all(len(v) == targets for v in factors.values())
355
+ images_match = targets_match if images == targets else all(len(v) == images for v in factors.values())
378
356
  if not targets_match and not images_match:
379
357
  raise ValueError(
380
358
  "The lists/arrays in the provided factors have a different length than the current metadata factors."
@@ -382,8 +360,7 @@ class Metadata:
382
360
 
383
361
  new_columns = []
384
362
  for k, v in factors.items():
385
- v = as_numpy(v)
386
- data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
363
+ data = as_numpy(v)[self.image_indices]
387
364
  new_columns.append(pl.Series(name=k, values=data))
388
365
  self._factors[k] = FactorInfo()
389
366
 
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
5
+ from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
6
6
 
7
7
  import numpy as np
8
8
  from numpy.typing import NDArray
@@ -92,7 +92,7 @@ class ClassFilterSubSelection(Subselection[Any]):
92
92
  def __init__(self, classes: Sequence[int]) -> None:
93
93
  self.classes = classes
94
94
 
95
- def _filter(self, d: dict[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
95
+ def _filter(self, d: Mapping[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
96
96
  return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
97
97
 
98
98
  def __call__(self, datum: _TDatum) -> _TDatum:
@@ -81,7 +81,7 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDista
81
81
  """
82
82
 
83
83
  _compare_keys(metadata1.factor_names, metadata2.factor_names)
84
- cont_fnames = metadata1.get_factors_by_type("continuous")
84
+ cont_fnames = [name for name, info in metadata1.factor_info.items() if info.factor_type == "continuous"]
85
85
 
86
86
  if not cont_fnames:
87
87
  return MetadataDistanceOutput({})
@@ -1,9 +1,11 @@
1
1
  __all__ = []
2
2
 
3
+ from typing import Sequence
4
+
3
5
  from numpy.typing import NDArray
4
6
 
5
7
 
6
- def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
8
+ def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
7
9
  """
8
10
  Raises error when two lists are not equivalent including ordering
9
11
 
@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
24
26
  raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")
25
27
 
26
28
 
27
- def _validate_factors_and_data(factors: list[str], data: NDArray) -> None:
29
+ def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
28
30
  """
29
31
  Raises error when the number of factors and number of rows do not match
30
32
 
@@ -99,9 +99,10 @@ def balance(
99
99
  factor_types = {"class_label": "categorical"} | {k: v.factor_type for k, v in metadata.factor_info.items()}
100
100
  is_discrete = [factor_type != "continuous" for factor_type in factor_types.values()]
101
101
  num_factors = len(factor_types)
102
+ class_labels = metadata.class_labels
102
103
 
103
104
  mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
104
- data = np.hstack((metadata.class_labels[:, np.newaxis], data))
105
+ data = np.hstack((class_labels[:, np.newaxis], data))
105
106
 
106
107
  for idx, factor_type in enumerate(factor_types.values()):
107
108
  if factor_type != "continuous":
@@ -132,12 +133,12 @@ def balance(
132
133
  factors = nmi[1:, 1:]
133
134
 
134
135
  # assume class is a factor
135
- num_classes = len(metadata.class_names)
136
+ u_classes = np.unique(class_labels)
137
+ num_classes = len(u_classes)
136
138
  classwise_mi = np.full((num_classes, num_factors), np.nan, dtype=np.float32)
137
139
 
138
140
  # classwise targets
139
- classes = np.unique(metadata.class_labels)
140
- tgt_bin = data[:, 0][:, None] == classes
141
+ tgt_bin = data[:, 0][:, None] == u_classes
141
142
 
142
143
  # classification MI for discrete/categorical features
143
144
  for idx in range(num_classes):
@@ -157,6 +158,6 @@ def balance(
157
158
  classwise = classwise_mi / norm_factor
158
159
 
159
160
  # Grabbing factor names for plotting function
160
- factor_names = ["class_label"] + metadata.factor_names
161
+ factor_names = ["class_label"] + list(metadata.factor_names)
161
162
 
162
163
  return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
@@ -259,7 +259,8 @@ def parity(metadata: Metadata) -> ParityOutput:
259
259
  counts = np.nonzero(contingency_matrix < 5)
260
260
  unique_factor_values = np.unique(col_data)
261
261
  current_factor_name = metadata.factor_names[i]
262
- for int_factor, int_class in zip(counts[0], counts[1]):
262
+ for _factor, _class in zip(counts[0], counts[1]):
263
+ int_factor, int_class = int(_factor), int(_class)
263
264
  if contingency_matrix[int_factor, int_class] > 0:
264
265
  factor_category = unique_factor_values[int_factor].item()
265
266
  class_name = metadata.class_names[int_class]
@@ -2,8 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from collections import Counter, defaultdict
6
- from typing import Any, Mapping, TypeVar
5
+ from typing import Any, TypeVar
6
+
7
+ import polars as pl
7
8
 
8
9
  from dataeval.data._metadata import Metadata
9
10
  from dataeval.outputs import LabelStatsOutput
@@ -13,10 +14,6 @@ from dataeval.typing import AnnotatedDataset
13
14
  TValue = TypeVar("TValue")
14
15
 
15
16
 
16
- def _sort_to_list(d: Mapping[int, TValue]) -> list[TValue]:
17
- return [t[1] for t in sorted(d.items())]
18
-
19
-
20
17
  @set_metadata
21
18
  def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
22
19
  """
@@ -52,39 +49,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
52
49
  pig: 2 - 2
53
50
  chicken: 5 - 5
54
51
  """
55
- dataset = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
56
-
57
- label_counts: Counter[int] = Counter()
58
- image_counts: Counter[int] = Counter()
59
- index_location = defaultdict(list[int])
60
- label_per_image: list[int] = []
61
-
62
- index2label = dict(enumerate(dataset.class_names))
63
-
64
- for i, target in enumerate(dataset.targets):
65
- group = target.labels.tolist()
52
+ metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
53
+ metadata_df = metadata.dataframe
66
54
 
67
- # Count occurrences of each label in all sublists
68
- label_counts.update(group)
55
+ # Count occurrences of each label across all images
56
+ label_counts_df = metadata_df.group_by("class_label").len()
57
+ label_counts = dict(zip(label_counts_df["class_label"], label_counts_df["len"]))
69
58
 
70
- # Get the number of labels per image
71
- label_per_image.append(len(group))
59
+ # Count unique images per label (how many images contain each label)
60
+ image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
61
+ image_counts = dict(zip(image_counts_df["class_label"], image_counts_df["len"]))
72
62
 
73
- # Create a set of unique items in the current sublist
74
- unique_items: set[int] = set(group)
63
+ # Create index_location mapping (which images contain each label)
64
+ index_location: dict[int, list[int]] = {}
65
+ for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
66
+ indices = row["image_index"]
67
+ index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]
75
68
 
76
- # Update image counts and index locations
77
- image_counts.update(unique_items)
78
- for item in unique_items:
79
- index_location[item].append(i)
69
+ # Count labels per image
70
+ label_per_image_df = metadata_df.group_by("image_index").agg(pl.len().alias("label_count"))
71
+ label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()
80
72
 
81
73
  return LabelStatsOutput(
82
- label_counts_per_class=_sort_to_list(label_counts),
74
+ label_counts_per_class=label_counts,
83
75
  label_counts_per_image=label_per_image,
84
- image_counts_per_class=_sort_to_list(image_counts),
85
- image_indices_per_class=_sort_to_list(index_location),
76
+ image_counts_per_class=image_counts,
77
+ image_indices_per_class=index_location,
86
78
  image_count=len(label_per_image),
87
- class_count=len(label_counts),
79
+ class_count=len(metadata.class_names),
88
80
  label_count=sum(label_counts.values()),
89
- class_names=list(index2label.values()),
81
+ class_names=metadata.class_names,
90
82
  )
@@ -147,7 +147,7 @@ P = ParamSpec("P")
147
147
  R = TypeVar("R", bound=GenericOutput)
148
148
 
149
149
 
150
- def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:
150
+ def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
151
151
  """Decorator to stamp Output classes with runtime metadata"""
152
152
 
153
153
  if fn is None:
@@ -4,7 +4,7 @@ __all__ = []
4
4
 
5
5
  import contextlib
6
6
  from dataclasses import asdict, dataclass
7
- from typing import Any, TypeVar
7
+ from typing import Any, Mapping, Sequence, TypeVar
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
@@ -39,7 +39,7 @@ class ToDataFrameMixin:
39
39
  This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
40
40
  """
41
41
  return pd.DataFrame(
42
- index=self.factor_names, # type: ignore - list[str] is documented as acceptable index type
42
+ index=self.factor_names, # type: ignore - Sequence[str] is documented as acceptable index type
43
43
  data={
44
44
  "score": self.score.round(2),
45
45
  "p-value": self.p_value.round(2),
@@ -58,7 +58,7 @@ class ParityOutput(ToDataFrameMixin, Output):
58
58
  chi-squared score(s) of the test
59
59
  p_value : NDArray[np.float64]
60
60
  p-value(s) of the test
61
- factor_names : list[str]
61
+ factor_names : Sequence[str]
62
62
  Names of each metadata factor
63
63
  insufficient_data: dict
64
64
  Dictionary of metadata factors with less than 5 class occurrences per value
@@ -66,8 +66,8 @@ class ParityOutput(ToDataFrameMixin, Output):
66
66
 
67
67
  score: NDArray[np.float64]
68
68
  p_value: NDArray[np.float64]
69
- factor_names: list[str]
70
- insufficient_data: dict[str, dict[int, dict[str, int]]]
69
+ factor_names: Sequence[str]
70
+ insufficient_data: Mapping[str, Mapping[int, Mapping[str, int]]]
71
71
 
72
72
 
73
73
  @dataclass(frozen=True)
@@ -145,12 +145,15 @@ class CoverageOutput(Output):
145
145
  cols = min(3, num_images)
146
146
  fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
147
147
 
148
- for image, ax in zip(images[:num_images], axs.flat):
148
+ # Flatten axes using numpy array explicitly for compatibility
149
+ axs_flat = np.asarray(axs).flatten()
150
+
151
+ for image, ax in zip(images[:num_images], axs_flat):
149
152
  image = channels_first_to_last(as_numpy(image))
150
153
  ax.imshow(image)
151
154
  ax.axis("off")
152
155
 
153
- for ax in axs.flat[num_images:]:
156
+ for ax in axs_flat[num_images:]:
154
157
  ax.axis("off")
155
158
 
156
159
  fig.tight_layout()
@@ -187,22 +190,22 @@ class BalanceOutput(Output):
187
190
  Estimate of inter/intra-factor mutual information
188
191
  classwise : NDArray[np.float64]
189
192
  Estimate of mutual information between metadata factors and individual class labels
190
- factor_names : list[str]
193
+ factor_names : Sequence[str]
191
194
  Names of each metadata factor
192
- class_names : list[str]
195
+ class_names : Sequence[str]
193
196
  List of the class labels present in the dataset
194
197
  """
195
198
 
196
199
  balance: NDArray[np.float64]
197
200
  factors: NDArray[np.float64]
198
201
  classwise: NDArray[np.float64]
199
- factor_names: list[str]
200
- class_names: list[str]
202
+ factor_names: Sequence[str]
203
+ class_names: Sequence[str]
201
204
 
202
205
  def plot(
203
206
  self,
204
- row_labels: list[Any] | NDArray[Any] | None = None,
205
- col_labels: list[Any] | NDArray[Any] | None = None,
207
+ row_labels: Sequence[Any] | NDArray[Any] | None = None,
208
+ col_labels: Sequence[Any] | NDArray[Any] | None = None,
206
209
  plot_classwise: bool = False,
207
210
  ) -> Figure:
208
211
  """
@@ -276,16 +279,16 @@ class DiversityOutput(Output):
276
279
  :term:`Diversity` index for classes and factors
277
280
  classwise : NDArray[np.double]
278
281
  Classwise diversity index [n_class x n_factor]
279
- factor_names : list[str]
282
+ factor_names : Sequence[str]
280
283
  Names of each metadata factor
281
- class_names : list[str]
284
+ class_names : Sequence[str]
282
285
  Class labels for each value in the dataset
283
286
  """
284
287
 
285
288
  diversity_index: NDArray[np.double]
286
289
  classwise: NDArray[np.double]
287
- factor_names: list[str]
288
- class_names: list[str]
290
+ factor_names: Sequence[str]
291
+ class_names: Sequence[str]
289
292
 
290
293
  def plot(
291
294
  self,
@@ -333,7 +336,7 @@ class DiversityOutput(Output):
333
336
  import matplotlib.pyplot as plt
334
337
 
335
338
  fig, ax = plt.subplots(figsize=(8, 8))
336
- heat_labels = ["class_labels"] + self.factor_names
339
+ heat_labels = ["class_labels"] + list(self.factor_names)
337
340
  ax.bar(heat_labels, self.diversity_index)
338
341
  ax.set_xlabel("Factors")
339
342
  plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
+ from typing import Sequence
6
7
 
7
8
  import numpy as np
8
9
  from numpy.typing import NDArray
@@ -64,7 +65,7 @@ class ClustererOutput(Output):
64
65
  """
65
66
  return np.nonzero(self.clusters == -1)[0]
66
67
 
67
- def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
68
+ def find_duplicates(self) -> tuple[Sequence[Sequence[int]], Sequence[Sequence[int]]]:
68
69
  """
69
70
  Finds duplicate and near duplicate data based on cluster average distance
70
71