dataeval 0.86.2.tar.gz → 0.86.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. {dataeval-0.86.2 → dataeval-0.86.3}/PKG-INFO +1 -1
  2. {dataeval-0.86.2 → dataeval-0.86.3}/pyproject.toml +1 -1
  3. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/__init__.py +1 -1
  4. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/__init__.py +0 -2
  5. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_metadata.py +26 -41
  6. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_classfilter.py +2 -2
  7. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/_utils.py +4 -2
  8. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_balance.py +1 -1
  9. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_parity.py +2 -1
  10. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_labelstats.py +24 -28
  11. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_base.py +1 -1
  12. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_bias.py +21 -18
  13. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_estimators.py +2 -1
  14. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_linters.py +17 -17
  15. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_stats.py +20 -20
  16. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_utils.py +3 -2
  17. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_workflows.py +9 -7
  18. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/typing.py +4 -4
  19. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_plot.py +4 -4
  20. dataeval-0.86.2/src/dataeval/data/_targets.py +0 -89
  21. {dataeval-0.86.2 → dataeval-0.86.3}/LICENSE.txt +0 -0
  22. {dataeval-0.86.2 → dataeval-0.86.3}/README.md +0 -0
  23. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/_log.py +0 -0
  24. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/config.py +0 -0
  25. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_embeddings.py +0 -0
  26. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_images.py +0 -0
  27. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_selection.py +0 -0
  28. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/_split.py +0 -0
  29. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/__init__.py +0 -0
  30. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_classbalance.py +0 -0
  31. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_indices.py +0 -0
  32. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_limit.py +0 -0
  33. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_prioritize.py +0 -0
  34. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_reverse.py +0 -0
  35. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/data/selections/_shuffle.py +0 -0
  36. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/__init__.py +0 -0
  37. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/__init__.py +0 -0
  38. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_base.py +0 -0
  39. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_cvm.py +0 -0
  40. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_ks.py +0 -0
  41. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_mmd.py +0 -0
  42. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_mvdc.py +0 -0
  43. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
  44. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
  45. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
  46. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
  47. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
  48. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
  49. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
  50. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/drift/updates.py +0 -0
  51. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/linters/__init__.py +0 -0
  52. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/linters/duplicates.py +0 -0
  53. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/linters/outliers.py +0 -0
  54. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/__init__.py +0 -0
  55. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/ae.py +0 -0
  56. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/base.py +0 -0
  57. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/detectors/ood/mixin.py +0 -0
  58. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/__init__.py +0 -0
  59. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/_distance.py +0 -0
  60. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metadata/_ood.py +0 -0
  61. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/__init__.py +0 -0
  62. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/__init__.py +0 -0
  63. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_completeness.py +0 -0
  64. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_coverage.py +0 -0
  65. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/bias/_diversity.py +0 -0
  66. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/__init__.py +0 -0
  67. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_ber.py +0 -0
  68. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
  69. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_divergence.py +0 -0
  70. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/estimators/_uap.py +0 -0
  71. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/__init__.py +0 -0
  72. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_base.py +0 -0
  73. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
  74. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
  75. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_hashstats.py +0 -0
  76. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_imagestats.py +0 -0
  77. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
  78. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/metrics/stats/_visualstats.py +0 -0
  79. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/__init__.py +0 -0
  80. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_drift.py +0 -0
  81. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_metadata.py +0 -0
  82. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/outputs/_ood.py +0 -0
  83. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/py.typed +0 -0
  84. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/__init__.py +0 -0
  85. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_array.py +0 -0
  86. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_bin.py +0 -0
  87. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_clusterer.py +0 -0
  88. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_fast_mst.py +0 -0
  89. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_image.py +0 -0
  90. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_method.py +0 -0
  91. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/_mst.py +0 -0
  92. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/__init__.py +0 -0
  93. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/_dataset.py +0 -0
  94. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/collate.py +0 -0
  95. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/data/metadata.py +0 -0
  96. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/__init__.py +0 -0
  97. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_antiuav.py +0 -0
  98. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_base.py +0 -0
  99. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_cifar10.py +0 -0
  100. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_fileio.py +0 -0
  101. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_milco.py +0 -0
  102. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_mixin.py +0 -0
  103. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_mnist.py +0 -0
  104. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_ships.py +0 -0
  105. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_types.py +0 -0
  106. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/datasets/_voc.py +0 -0
  107. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/__init__.py +0 -0
  108. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/_blocks.py +0 -0
  109. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/_gmm.py +0 -0
  110. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/_internal.py +0 -0
  111. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/models.py +0 -0
  112. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/utils/torch/trainer.py +0 -0
  113. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/workflows/__init__.py +0 -0
  114. {dataeval-0.86.2 → dataeval-0.86.3}/src/dataeval/workflows/sufficiency.py +0 -0
--- dataeval-0.86.2/PKG-INFO
+++ dataeval-0.86.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.86.2
+Version: 0.86.3
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
--- dataeval-0.86.2/pyproject.toml
+++ dataeval-0.86.3/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.86.2" # dynamic
+version = "0.86.3" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
--- dataeval-0.86.2/src/dataeval/__init__.py
+++ dataeval-0.86.3/src/dataeval/__init__.py
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations

 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.86.2"
+__version__ = "0.86.3"

 import logging

--- dataeval-0.86.2/src/dataeval/data/__init__.py
+++ dataeval-0.86.3/src/dataeval/data/__init__.py
@@ -6,7 +6,6 @@ __all__ = [
     "Metadata",
     "Select",
     "SplitDatasetOutput",
-    "Targets",
     "split_dataset",
 ]

@@ -15,5 +14,4 @@ from dataeval.data._images import Images
 from dataeval.data._metadata import Metadata
 from dataeval.data._selection import Select
 from dataeval.data._split import split_dataset
-from dataeval.data._targets import Targets
 from dataeval.outputs._utils import SplitDatasetOutput
--- dataeval-0.86.2/src/dataeval/data/_metadata.py
+++ dataeval-0.86.3/src/dataeval/data/_metadata.py
@@ -4,7 +4,7 @@ __all__ = []

 import warnings
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Iterable, Literal, Mapping, Sequence, Sized
+from typing import Any, Iterable, Literal, Mapping, Sequence

 import numpy as np
 import polars as pl
@@ -19,11 +19,6 @@ from dataeval.utils._array import as_numpy
 from dataeval.utils._bin import bin_data, digitize_data
 from dataeval.utils.data.metadata import merge

-if TYPE_CHECKING:
-    from dataeval.data import Targets
-else:
-    from dataeval.data._targets import Targets
-

 @dataclass
 class FactorInfo:
@@ -51,20 +46,20 @@ class Metadata:

     def __init__(
         self,
-        dataset: AnnotatedDataset[tuple[Any, Any, dict[str, Any]]],
+        dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
         *,
         continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
         auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
         exclude: Sequence[str] | None = None,
         include: Sequence[str] | None = None,
     ) -> None:
-        self._targets: Targets
         self._class_labels: NDArray[np.intp]
         self._class_names: list[str]
         self._image_indices: NDArray[np.intp]
         self._factors: dict[str, FactorInfo]
         self._dropped_factors: dict[str, list[str]]
         self._dataframe: pl.DataFrame
+        self._raw: Sequence[Mapping[str, Any]]

         self._is_structured = False
         self._is_binned = False
@@ -80,13 +75,7 @@ class Metadata:
         self._include = set(include or ())

     @property
-    def targets(self) -> Targets:
-        """Target information for the dataset."""
-        self._structure()
-        return self._targets
-
-    @property
-    def raw(self) -> list[dict[str, Any]]:
+    def raw(self) -> Sequence[Mapping[str, Any]]:
         """The raw list of metadata dictionaries for the dataset."""
         self._structure()
         return self._raw
@@ -146,7 +135,7 @@ class Metadata:
         return self._dataframe

     @property
-    def dropped_factors(self) -> dict[str, list[str]]:
+    def dropped_factors(self) -> Mapping[str, Sequence[str]]:
         """Factors that were dropped during preprocessing and the reasons why they were dropped."""
         self._structure()
         return self._dropped_factors
@@ -165,13 +154,13 @@ class Metadata:
         )

     @property
-    def factor_names(self) -> list[str]:
+    def factor_names(self) -> Sequence[str]:
         """Factor names of the metadata."""
         self._structure()
         return list(self._factors)

     @property
-    def factor_info(self) -> dict[str, FactorInfo]:
+    def factor_info(self) -> Mapping[str, FactorInfo]:
         """Factor types of the metadata."""
         self._bin()
         return self._factors
@@ -192,7 +181,7 @@ class Metadata:
         return self._class_labels

     @property
-    def class_names(self) -> list[str]:
+    def class_names(self) -> Sequence[str]:
         """Class names as a list of strings."""
         self._structure()
         return self._class_names
@@ -220,7 +209,7 @@ class Metadata:
         if self._is_structured:
             return

-        raw: list[dict[str, Any]] = []
+        raw: Sequence[Mapping[str, Any]] = []

         labels = []
         bboxes = []
@@ -255,6 +244,14 @@ class Metadata:
         bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
         srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None

+        index2label = self._dataset.metadata.get("index2label", {})
+
+        targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
+        merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
+
+        reserved = ["image_index", "class_label", "score", "box"]
+        factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
+
         target_dict = {
             "image_index": srcidx if srcidx is not None else np.arange(len(labels)),
             "class_label": labels,
@@ -262,20 +259,10 @@ class Metadata:
             "box": bboxes if bboxes is not None else [None] * len(labels),
         }

-        self._targets = Targets(labels, scores, bboxes, srcidx)
         self._raw = raw
-
-        index2label = self._dataset.metadata.get("index2label", {})
         self._class_labels = labels
-        self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
+        self._class_names = [index2label.get(i, str(i)) for i in np.unique(labels)]
         self._image_indices = target_dict["image_index"]
-
-        targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
-        merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
-
-        reserved = ["image_index", "class_label", "score", "box"]
-        factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
-
         self._factors = dict.fromkeys(factor_dict, FactorInfo())
         self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
         self._dropped_factors = merged[1]
@@ -332,14 +319,14 @@ class Metadata:
                 df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
                 factor_info[col] = FactorInfo("continuous", col_dz)
             else:
-                factor_info[col] = FactorInfo("discrete", col_dz)
+                factor_info[col] = FactorInfo("discrete", col)

         # Store the results
         self._dataframe = df
         self._factors.update(factor_info)
         self._is_binned = True

-    def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> list[str]:
+    def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> Sequence[str]:
         """
         Get the names of factors of a specific type.

@@ -356,7 +343,7 @@ class Metadata:
         self._bin()
         return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]

-    def add_factors(self, factors: Mapping[str, Any]) -> None:
+    def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
         """
         Add additional factors to the metadata.

@@ -365,16 +352,15 @@ class Metadata:

         Parameters
         ----------
-        factors : Mapping[str, ArrayLike]
+        factors : Mapping[str, Array | Sequence[Any]]
             Dictionary of factors to add to the metadata.
         """
         self._structure()

-        targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
+        targets = len(self.dataframe)
         images = self.image_count
-        lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
-        targets_match = all(f == targets for f in lengths.values())
-        images_match = targets_match if images == targets else all(f == images for f in lengths.values())
+        targets_match = all(len(v) == targets for v in factors.values())
+        images_match = targets_match if images == targets else all(len(v) == images for v in factors.values())
         if not targets_match and not images_match:
             raise ValueError(
                 "The lists/arrays in the provided factors have a different length than the current metadata factors."
@@ -382,8 +368,7 @@ class Metadata:

         new_columns = []
         for k, v in factors.items():
-            v = as_numpy(v)
-            data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
+            data = as_numpy(v)[self.image_indices]
             new_columns.append(pl.Series(name=k, values=data))
             self._factors[k] = FactorInfo()

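For illustration, a minimal numpy sketch of the per-image broadcast that the new `add_factors` body performs via `image_indices` (the array values below are hypothetical, not from the package):

import numpy as np

# Hypothetical object-detection layout: 3 images with 5 detections total.
image_indices = np.array([0, 0, 1, 2, 2])  # one entry per detection (target)
per_image = np.array([0.1, 0.7, 0.3])      # one factor value per image

# Mirrors `data = as_numpy(v)[self.image_indices]` from the hunk above:
# a per-image factor is expanded to one row per target.
per_target = per_image[image_indices]
print(per_target)  # [0.1 0.1 0.7 0.3 0.3]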
--- dataeval-0.86.2/src/dataeval/data/selections/_classfilter.py
+++ dataeval-0.86.3/src/dataeval/data/selections/_classfilter.py
@@ -2,7 +2,7 @@ from __future__ import annotations

 __all__ = []

-from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
+from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast

 import numpy as np
 from numpy.typing import NDArray
@@ -92,7 +92,7 @@ class ClassFilterSubSelection(Subselection[Any]):
     def __init__(self, classes: Sequence[int]) -> None:
        self.classes = classes

-    def _filter(self, d: dict[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
+    def _filter(self, d: Mapping[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
         return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}

     def __call__(self, datum: _TDatum) -> _TDatum:
--- dataeval-0.86.2/src/dataeval/metadata/_utils.py
+++ dataeval-0.86.3/src/dataeval/metadata/_utils.py
@@ -1,9 +1,11 @@
 __all__ = []

+from typing import Sequence
+
 from numpy.typing import NDArray


-def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
+def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
     """
     Raises error when two lists are not equivalent including ordering

@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
         raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")


-def _validate_factors_and_data(factors: list[str], data: NDArray) -> None:
+def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
     """
     Raises error when the number of factors and number of rows do not match

--- dataeval-0.86.2/src/dataeval/metrics/bias/_balance.py
+++ dataeval-0.86.3/src/dataeval/metrics/bias/_balance.py
@@ -157,6 +157,6 @@ def balance(
     classwise = classwise_mi / norm_factor

     # Grabbing factor names for plotting function
-    factor_names = ["class_label"] + metadata.factor_names
+    factor_names = ["class_label"] + list(metadata.factor_names)

     return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
--- dataeval-0.86.2/src/dataeval/metrics/bias/_parity.py
+++ dataeval-0.86.3/src/dataeval/metrics/bias/_parity.py
@@ -259,7 +259,8 @@ def parity(metadata: Metadata) -> ParityOutput:
         counts = np.nonzero(contingency_matrix < 5)
         unique_factor_values = np.unique(col_data)
         current_factor_name = metadata.factor_names[i]
-        for int_factor, int_class in zip(counts[0], counts[1]):
+        for _factor, _class in zip(counts[0], counts[1]):
+            int_factor, int_class = int(_factor), int(_class)
             if contingency_matrix[int_factor, int_class] > 0:
                 factor_category = unique_factor_values[int_factor].item()
                 class_name = metadata.class_names[int_class]
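As a small aside on the cast above: `np.nonzero` yields numpy integer scalars, and the explicit `int(...)` converts them to builtin ints before they are used as indices and reported values. A toy sketch (matrix values are made up):

import numpy as np

contingency_matrix = np.array([[3, 7], [0, 2]])
counts = np.nonzero(contingency_matrix < 5)

for _factor, _class in zip(counts[0], counts[1]):
    int_factor, int_class = int(_factor), int(_class)
    # numpy scalar type vs builtin int, e.g. int64 -> int
    print(type(_factor).__name__, "->", type(int_factor).__name__)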
--- dataeval-0.86.2/src/dataeval/metrics/stats/_labelstats.py
+++ dataeval-0.86.3/src/dataeval/metrics/stats/_labelstats.py
@@ -2,9 +2,10 @@ from __future__ import annotations

 __all__ = []

-from collections import Counter, defaultdict
 from typing import Any, Mapping, TypeVar

+import polars as pl
+
 from dataeval.data._metadata import Metadata
 from dataeval.outputs import LabelStatsOutput
 from dataeval.outputs._base import set_metadata
@@ -52,39 +53,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
     pig: 2 - 2
     chicken: 5 - 5
     """
-    dataset = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
-
-    label_counts: Counter[int] = Counter()
-    image_counts: Counter[int] = Counter()
-    index_location = defaultdict(list[int])
-    label_per_image: list[int] = []
-
-    index2label = dict(enumerate(dataset.class_names))
-
-    for i, target in enumerate(dataset.targets):
-        group = target.labels.tolist()
+    metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
+    metadata_df = metadata.dataframe

-        # Count occurrences of each label in all sublists
-        label_counts.update(group)
+    # Count occurrences of each label across all images
+    label_counts_df = metadata_df.group_by("class_label").len()
+    label_counts = label_counts_df.sort("class_label")["len"].to_list()

-        # Get the number of labels per image
-        label_per_image.append(len(group))
+    # Count unique images per label (how many images contain each label)
+    image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
+    image_counts = image_counts_df.sort("class_label")["len"].to_list()

-        # Create a set of unique items in the current sublist
-        unique_items: set[int] = set(group)
+    # Create index_location mapping (which images contain each label)
+    index_location: list[list[int]] = [[] for _ in range(len(metadata.class_names))]
+    for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
+        indices = row["image_index"]
+        index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]

-        # Update image counts and index locations
-        image_counts.update(unique_items)
-        for item in unique_items:
-            index_location[item].append(i)
+    # Count labels per image
+    label_per_image_df = metadata_df.group_by("image_index").agg(pl.count().alias("label_count"))
+    label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()

     return LabelStatsOutput(
-        label_counts_per_class=_sort_to_list(label_counts),
+        label_counts_per_class=label_counts,
         label_counts_per_image=label_per_image,
-        image_counts_per_class=_sort_to_list(image_counts),
-        image_indices_per_class=_sort_to_list(index_location),
+        image_counts_per_class=image_counts,
+        image_indices_per_class=index_location,
         image_count=len(label_per_image),
-        class_count=len(label_counts),
-        label_count=sum(label_counts.values()),
-        class_names=list(index2label.values()),
+        class_count=len(metadata.class_names),
+        label_count=sum(label_counts),
+        class_names=metadata.class_names,
     )
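To see the polars counting pattern from this hunk in isolation, a self-contained toy example (the data values are made up):

import polars as pl

# Toy per-target table in the same shape as Metadata.dataframe:
# one row per label occurrence, tagged with its source image.
df = pl.DataFrame({
    "image_index": [0, 0, 1, 1, 2],
    "class_label": [0, 1, 1, 1, 0],
})

# Total occurrences of each label (label_counts_per_class)
label_counts = df.group_by("class_label").len().sort("class_label")["len"].to_list()

# Distinct images containing each label (image_counts_per_class)
image_counts = (
    df.select(["image_index", "class_label"]).unique().group_by("class_label").len().sort("class_label")["len"].to_list()
)

print(label_counts)  # [2, 3] - label 0 appears twice, label 1 three times
print(image_counts)  # [2, 2] - each label occurs in two distinct images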
--- dataeval-0.86.2/src/dataeval/outputs/_base.py
+++ dataeval-0.86.3/src/dataeval/outputs/_base.py
@@ -147,7 +147,7 @@ P = ParamSpec("P")
 R = TypeVar("R", bound=GenericOutput)


-def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:
+def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
     """Decorator to stamp Output classes with runtime metadata"""

     if fn is None:
--- dataeval-0.86.2/src/dataeval/outputs/_bias.py
+++ dataeval-0.86.3/src/dataeval/outputs/_bias.py
@@ -4,7 +4,7 @@ __all__ = []

 import contextlib
 from dataclasses import asdict, dataclass
-from typing import Any, TypeVar
+from typing import Any, Mapping, Sequence, TypeVar

 import numpy as np
 import pandas as pd
@@ -39,7 +39,7 @@ class ToDataFrameMixin:
         This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
         """
         return pd.DataFrame(
-            index=self.factor_names,  # type: ignore - list[str] is documented as acceptable index type
+            index=self.factor_names,  # type: ignore - Sequence[str] is documented as acceptable index type
             data={
                 "score": self.score.round(2),
                 "p-value": self.p_value.round(2),
@@ -58,7 +58,7 @@ class ParityOutput(ToDataFrameMixin, Output):
         chi-squared score(s) of the test
     p_value : NDArray[np.float64]
         p-value(s) of the test
-    factor_names : list[str]
+    factor_names : Sequence[str]
         Names of each metadata factor
     insufficient_data: dict
         Dictionary of metadata factors with less than 5 class occurrences per value
@@ -66,8 +66,8 @@

     score: NDArray[np.float64]
     p_value: NDArray[np.float64]
-    factor_names: list[str]
-    insufficient_data: dict[str, dict[int, dict[str, int]]]
+    factor_names: Sequence[str]
+    insufficient_data: Mapping[str, Mapping[int, Mapping[str, int]]]


 @dataclass(frozen=True)
@@ -145,12 +145,15 @@ class CoverageOutput(Output):
         cols = min(3, num_images)
         fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))

-        for image, ax in zip(images[:num_images], axs.flat):
+        # Flatten axes using numpy array explicitly for compatibility
+        axs_flat = np.asarray(axs).flatten()
+
+        for image, ax in zip(images[:num_images], axs_flat):
             image = channels_first_to_last(as_numpy(image))
             ax.imshow(image)
             ax.axis("off")

-        for ax in axs.flat[num_images:]:
+        for ax in axs_flat[num_images:]:
             ax.axis("off")

         fig.tight_layout()
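The `np.asarray(axs).flatten()` pattern above guards against `plt.subplots` returning a bare `Axes` object (rather than an ndarray) when only one subplot is requested; a quick standalone demonstration:

import matplotlib
matplotlib.use("Agg")  # non-interactive backend, assumed for this demo
import matplotlib.pyplot as plt
import numpy as np

fig, axs = plt.subplots(1, 1)         # a single Axes object, not an ndarray
axs_flat = np.asarray(axs).flatten()  # always a 1-D array of Axes
print(hasattr(axs, "flat"), len(axs_flat))  # False 1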
@@ -187,22 +190,22 @@ class BalanceOutput(Output):
         Estimate of inter/intra-factor mutual information
     classwise : NDArray[np.float64]
         Estimate of mutual information between metadata factors and individual class labels
-    factor_names : list[str]
+    factor_names : Sequence[str]
         Names of each metadata factor
-    class_names : list[str]
+    class_names : Sequence[str]
         List of the class labels present in the dataset
     """

     balance: NDArray[np.float64]
     factors: NDArray[np.float64]
     classwise: NDArray[np.float64]
-    factor_names: list[str]
-    class_names: list[str]
+    factor_names: Sequence[str]
+    class_names: Sequence[str]

     def plot(
         self,
-        row_labels: list[Any] | NDArray[Any] | None = None,
-        col_labels: list[Any] | NDArray[Any] | None = None,
+        row_labels: Sequence[Any] | NDArray[Any] | None = None,
+        col_labels: Sequence[Any] | NDArray[Any] | None = None,
         plot_classwise: bool = False,
     ) -> Figure:
         """
@@ -276,16 +279,16 @@ class DiversityOutput(Output):
         :term:`Diversity` index for classes and factors
     classwise : NDArray[np.double]
         Classwise diversity index [n_class x n_factor]
-    factor_names : list[str]
+    factor_names : Sequence[str]
         Names of each metadata factor
-    class_names : list[str]
+    class_names : Sequence[str]
         Class labels for each value in the dataset
     """

     diversity_index: NDArray[np.double]
     classwise: NDArray[np.double]
-    factor_names: list[str]
-    class_names: list[str]
+    factor_names: Sequence[str]
+    class_names: Sequence[str]

     def plot(
         self,
@@ -333,7 +336,7 @@ class DiversityOutput(Output):
         import matplotlib.pyplot as plt

         fig, ax = plt.subplots(figsize=(8, 8))
-        heat_labels = ["class_labels"] + self.factor_names
+        heat_labels = ["class_labels"] + list(self.factor_names)
         ax.bar(heat_labels, self.diversity_index)
         ax.set_xlabel("Factors")
         plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
--- dataeval-0.86.2/src/dataeval/outputs/_estimators.py
+++ dataeval-0.86.3/src/dataeval/outputs/_estimators.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 __all__ = []

 from dataclasses import dataclass
+from typing import Sequence

 import numpy as np
 from numpy.typing import NDArray
@@ -64,7 +65,7 @@ class ClustererOutput(Output):
         """
         return np.nonzero(self.clusters == -1)[0]

-    def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
+    def find_duplicates(self) -> tuple[Sequence[Sequence[int]], Sequence[Sequence[int]]]:
         """
         Finds duplicate and near duplicate data based on cluster average distance

--- dataeval-0.86.2/src/dataeval/outputs/_linters.py
+++ dataeval-0.86.3/src/dataeval/outputs/_linters.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []

 from dataclasses import dataclass
-from typing import Generic, TypeVar, Union
+from typing import Generic, Mapping, Sequence, TypeVar, Union

 import pandas as pd
 from typing_extensions import TypeAlias
@@ -11,13 +11,13 @@ from typing_extensions import TypeAlias
 from dataeval.outputs._base import Output
 from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput

-DuplicateGroup: TypeAlias = list[int]
-DatasetDuplicateGroupMap: TypeAlias = dict[int, DuplicateGroup]
+DuplicateGroup: TypeAlias = Sequence[int]
+DatasetDuplicateGroupMap: TypeAlias = Mapping[int, DuplicateGroup]
 TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)

-IndexIssueMap: TypeAlias = dict[int, dict[str, float]]
+IndexIssueMap: TypeAlias = Mapping[int, Mapping[str, float]]
 OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
-TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
+TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, Sequence[IndexIssueMap])


 @dataclass(frozen=True)
@@ -27,9 +27,9 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):

     Attributes
     ----------
-    exact : list[list[int] | dict[int, list[int]]]
+    exact : Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
         Indices of images that are exact matches
-    near: list[list[int] | dict[int, list[int]]]
+    near: Sequence[Sequence[int] | Mapping[int, Sequence[int]]]
         Indices of images that are near matches

     Notes
@@ -39,13 +39,13 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
     index of the dataset, and the value is the list index groups from that dataset.
     """

-    exact: list[TIndexCollection]
-    near: list[TIndexCollection]
+    exact: Sequence[TIndexCollection]
+    near: Sequence[TIndexCollection]


 def _reorganize_by_class_and_metric(
     result: IndexIssueMap, lstats: LabelStatsOutput
-) -> tuple[dict[str, list[int]], dict[str, dict[str, int]]]:
+) -> tuple[Mapping[str, Sequence[int]], Mapping[str, Mapping[str, int]]]:
     """Flip result from grouping by image to grouping by class and metric"""
     metrics: dict[str, list[int]] = {}
     class_wise: dict[str, dict[str, int]] = {label: {} for label in lstats.class_names}
@@ -61,7 +61,7 @@ def _reorganize_by_class_and_metric(
     return metrics, class_wise


-def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str, int]]) -> list[str]:
+def _create_table(metrics: Mapping[str, Sequence[int]], class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[str]:
     """Create table for displaying the results"""
     max_class_length = max(len(str(label)) for label in class_wise) + 2
     max_total = max(len(metrics[group]) for group in metrics) + 2
@@ -71,7 +71,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
         + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
         + [f"{'Total':<{max_total}}"]
     )
-    table_rows: list[str] = []
+    table_rows: Sequence[str] = []

     for class_cat, results in class_wise.items():
         table_value = [f"{class_cat:>{max_class_length}}"]
@@ -86,7 +86,7 @@ def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str,
     return [table_header] + table_rows


-def _create_pandas_dataframe(class_wise: dict[str, dict[str, int]]) -> list[dict[str, str | int]]:
+def _create_pandas_dataframe(class_wise: Mapping[str, Mapping[str, int]]) -> Sequence[Mapping[str, str | int]]:
     """Create data for pandas dataframe"""
     data = []
     for label, metrics_dict in class_wise.items():
@@ -105,7 +105,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):

     Attributes
     ----------
-    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
+    issues : Mapping[int, Mapping[str, float]] | Sequence[Mapping[int, Mapping[str, float]]]
         Indices of image Outliers with their associated issue type and calculated values.

         - For a single dataset, a dictionary containing the indices of outliers and
@@ -117,7 +117,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
     issues: TIndexIssueMap

     def __len__(self) -> int:
-        if isinstance(self.issues, dict):
+        if isinstance(self.issues, Mapping):
             return len(self.issues)
         return sum(len(d) for d in self.issues)

@@ -134,7 +134,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
         -------
         str
         """
-        if isinstance(self.issues, dict):
+        if isinstance(self.issues, Mapping):
             metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
             listed_table = _create_table(metrics, classwise)
             table = "\n".join(listed_table)
@@ -165,7 +165,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
         -----
         This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
         """
-        if isinstance(self.issues, dict):
+        if isinstance(self.issues, Mapping):
             _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
             data = _create_pandas_dataframe(classwise)
             df = pd.DataFrame(data)
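The `isinstance(self.issues, Mapping)` checks above accept any mapping type rather than only `dict`; a minimal illustration (the `UserDict` here is just a stand-in for an arbitrary Mapping implementation):

from collections import UserDict
from typing import Mapping

issues = UserDict({0: {"brightness": 0.99}})

print(isinstance(issues, dict))     # False - UserDict is not a dict subclass
print(isinstance(issues, Mapping))  # True  - but it is a Mapping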