dataeval 0.86.1__tar.gz → 0.86.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. {dataeval-0.86.1 → dataeval-0.86.2}/PKG-INFO +2 -1
  2. {dataeval-0.86.1 → dataeval-0.86.2}/pyproject.toml +2 -1
  3. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/__init__.py +1 -1
  4. dataeval-0.86.2/src/dataeval/data/_metadata.py +392 -0
  5. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_split.py +2 -2
  6. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_distance.py +10 -7
  7. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_ood.py +11 -103
  8. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_balance.py +23 -33
  9. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_diversity.py +16 -14
  10. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_parity.py +9 -6
  11. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_bias.py +7 -51
  12. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/_dataset.py +22 -8
  13. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_milco.py +3 -1
  14. dataeval-0.86.1/src/dataeval/data/_metadata.py +0 -393
  15. {dataeval-0.86.1 → dataeval-0.86.2}/LICENSE.txt +0 -0
  16. {dataeval-0.86.1 → dataeval-0.86.2}/README.md +0 -0
  17. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/_log.py +0 -0
  18. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/config.py +0 -0
  19. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/__init__.py +0 -0
  20. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_embeddings.py +0 -0
  21. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_images.py +0 -0
  22. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_selection.py +0 -0
  23. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_targets.py +0 -0
  24. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/__init__.py +0 -0
  25. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_classbalance.py +0 -0
  26. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_classfilter.py +0 -0
  27. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_indices.py +0 -0
  28. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_limit.py +0 -0
  29. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_prioritize.py +0 -0
  30. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_reverse.py +0 -0
  31. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_shuffle.py +0 -0
  32. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/__init__.py +0 -0
  33. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/__init__.py +0 -0
  34. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_base.py +0 -0
  35. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_cvm.py +0 -0
  36. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_ks.py +0 -0
  37. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_mmd.py +0 -0
  38. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_mvdc.py +0 -0
  39. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
  40. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
  41. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
  42. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
  43. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
  44. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
  45. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
  46. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/updates.py +0 -0
  47. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/linters/__init__.py +0 -0
  48. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/linters/duplicates.py +0 -0
  49. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/linters/outliers.py +0 -0
  50. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/__init__.py +0 -0
  51. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/ae.py +0 -0
  52. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/base.py +0 -0
  53. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/mixin.py +0 -0
  54. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/__init__.py +0 -0
  55. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_utils.py +0 -0
  56. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/__init__.py +0 -0
  57. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/__init__.py +0 -0
  58. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_completeness.py +0 -0
  59. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_coverage.py +0 -0
  60. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/__init__.py +0 -0
  61. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_ber.py +0 -0
  62. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
  63. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_divergence.py +0 -0
  64. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_uap.py +0 -0
  65. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/__init__.py +0 -0
  66. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_base.py +0 -0
  67. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
  68. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
  69. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_hashstats.py +0 -0
  70. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_imagestats.py +0 -0
  71. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_labelstats.py +0 -0
  72. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
  73. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_visualstats.py +0 -0
  74. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/__init__.py +0 -0
  75. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_base.py +0 -0
  76. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_drift.py +0 -0
  77. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_estimators.py +0 -0
  78. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_linters.py +0 -0
  79. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_metadata.py +0 -0
  80. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_ood.py +0 -0
  81. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_stats.py +0 -0
  82. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_utils.py +0 -0
  83. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_workflows.py +0 -0
  84. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/py.typed +0 -0
  85. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/typing.py +0 -0
  86. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/__init__.py +0 -0
  87. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_array.py +0 -0
  88. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_bin.py +0 -0
  89. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_clusterer.py +0 -0
  90. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_fast_mst.py +0 -0
  91. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_image.py +0 -0
  92. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_method.py +0 -0
  93. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_mst.py +0 -0
  94. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_plot.py +0 -0
  95. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/__init__.py +0 -0
  96. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/collate.py +0 -0
  97. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/metadata.py +0 -0
  98. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/__init__.py +0 -0
  99. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_antiuav.py +0 -0
  100. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_base.py +0 -0
  101. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_cifar10.py +0 -0
  102. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_fileio.py +0 -0
  103. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_mixin.py +0 -0
  104. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_mnist.py +0 -0
  105. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_ships.py +0 -0
  106. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_types.py +0 -0
  107. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_voc.py +0 -0
  108. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/__init__.py +0 -0
  109. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/_blocks.py +0 -0
  110. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/_gmm.py +0 -0
  111. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/_internal.py +0 -0
  112. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/models.py +0 -0
  113. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/trainer.py +0 -0
  114. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/workflows/__init__.py +0 -0
  115. {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/workflows/sufficiency.py +0 -0
{dataeval-0.86.1 → dataeval-0.86.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.86.1
+Version: 0.86.2
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -29,6 +29,7 @@ Requires-Dist: numba (>=0.59.1)
 Requires-Dist: numpy (>=1.24.2)
 Requires-Dist: pandas (>=2.0)
 Requires-Dist: pillow (>=10.3.0)
+Requires-Dist: polars (>=1.0.0)
 Requires-Dist: requests
 Requires-Dist: scikit-learn (>=1.5.0)
 Requires-Dist: scipy (>=1.10)
{dataeval-0.86.1 → dataeval-0.86.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.86.1" # dynamic
+version = "0.86.2" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -49,6 +49,7 @@ numba = {version = ">=0.59.1"}
 numpy = {version = ">=1.24.2"}
 pandas = {version = ">=2.0"}
 pillow = {version = ">=10.3.0"}
+polars = {version = ">=1.0.0"}
 requests = {version = "*"}
 scipy = {version = ">=1.10"}
 scikit-learn = {version = ">=1.5.0"}
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/__init__.py
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 
 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.86.1"
+__version__ = "0.86.2"
 
 import logging
 
dataeval-0.86.2/src/dataeval/data/_metadata.py (new file)
@@ -0,0 +1,392 @@
+from __future__ import annotations
+
+__all__ = []
+
+import warnings
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Iterable, Literal, Mapping, Sequence, Sized
+
+import numpy as np
+import polars as pl
+from numpy.typing import NDArray
+
+from dataeval.typing import (
+    AnnotatedDataset,
+    Array,
+    ObjectDetectionTarget,
+)
+from dataeval.utils._array import as_numpy
+from dataeval.utils._bin import bin_data, digitize_data
+from dataeval.utils.data.metadata import merge
+
+if TYPE_CHECKING:
+    from dataeval.data import Targets
+else:
+    from dataeval.data._targets import Targets
+
+
+@dataclass
+class FactorInfo:
+    factor_type: Literal["categorical", "continuous", "discrete"] | None = None
+    discretized_col: str | None = None
+
+
+class Metadata:
+    """
+    Class containing binned metadata using Polars DataFrames.
+
+    Parameters
+    ----------
+    dataset : ImageClassificationDataset or ObjectDetectionDataset
+        Dataset to access original targets and metadata from.
+    continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None
+        Mapping from continuous factor name to the number of bins or bin edges
+    auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"
+        Method for automatically determining the number of bins for continuous factors
+    exclude : Sequence[str] | None, default None
+        Filter metadata factors to exclude the specified factors, cannot be set with `include`
+    include : Sequence[str] | None, default None
+        Filter metadata factors to include the specified factors, cannot be set with `exclude`
+    """
+
+    def __init__(
+        self,
+        dataset: AnnotatedDataset[tuple[Any, Any, dict[str, Any]]],
+        *,
+        continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
+        auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
+        exclude: Sequence[str] | None = None,
+        include: Sequence[str] | None = None,
+    ) -> None:
+        self._targets: Targets
+        self._class_labels: NDArray[np.intp]
+        self._class_names: list[str]
+        self._image_indices: NDArray[np.intp]
+        self._factors: dict[str, FactorInfo]
+        self._dropped_factors: dict[str, list[str]]
+        self._dataframe: pl.DataFrame
+
+        self._is_structured = False
+        self._is_binned = False
+
+        self._dataset = dataset
+        self._continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else {}
+        self._auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = auto_bin_method
+
+        if exclude is not None and include is not None:
+            raise ValueError("Filters for `exclude` and `include` are mutually exclusive.")
+
+        self._exclude = set(exclude or ())
+        self._include = set(include or ())
+
+    @property
+    def targets(self) -> Targets:
+        """Target information for the dataset."""
+        self._structure()
+        return self._targets
+
+    @property
+    def raw(self) -> list[dict[str, Any]]:
+        """The raw list of metadata dictionaries for the dataset."""
+        self._structure()
+        return self._raw
+
+    @property
+    def exclude(self) -> set[str]:
+        """Factors to exclude from the metadata."""
+        return self._exclude
+
+    @exclude.setter
+    def exclude(self, value: Sequence[str]) -> None:
+        exclude = set(value)
+        if self._exclude != exclude:
+            self._exclude = exclude
+            self._include = set()
+            self._is_binned = False
+
+    @property
+    def include(self) -> set[str]:
+        """Factors to include from the metadata."""
+        return self._include
+
+    @include.setter
+    def include(self, value: Sequence[str]) -> None:
+        include = set(value)
+        if self._include != include:
+            self._include = include
+            self._exclude = set()
+            self._is_binned = False
+
+    @property
+    def continuous_factor_bins(self) -> Mapping[str, int | Sequence[float]]:
+        """Map of factor names to bin counts or bin edges."""
+        return self._continuous_factor_bins
+
+    @continuous_factor_bins.setter
+    def continuous_factor_bins(self, bins: Mapping[str, int | Sequence[float]]) -> None:
+        if self._continuous_factor_bins != bins:
+            self._continuous_factor_bins = dict(bins)
+            self._reset_bins(bins)
+
+    @property
+    def auto_bin_method(self) -> Literal["uniform_width", "uniform_count", "clusters"]:
+        """Binning method to use when continuous_factor_bins is not defined."""
+        return self._auto_bin_method
+
+    @auto_bin_method.setter
+    def auto_bin_method(self, method: Literal["uniform_width", "uniform_count", "clusters"]) -> None:
+        if self._auto_bin_method != method:
+            self._auto_bin_method = method
+            self._reset_bins()
+
+    @property
+    def dataframe(self) -> pl.DataFrame:
+        """Dataframe containing target information and metadata factors."""
+        self._structure()
+        return self._dataframe
+
+    @property
+    def dropped_factors(self) -> dict[str, list[str]]:
+        """Factors that were dropped during preprocessing and the reasons why they were dropped."""
+        self._structure()
+        return self._dropped_factors
+
+    @property
+    def discretized_data(self) -> NDArray[np.int64]:
+        """Factor data with continuous data discretized."""
+        if not self.factor_names:
+            return np.array([], dtype=np.int64)
+
+        self._bin()
+        return (
+            self.dataframe.select([info.discretized_col or name for name, info in self.factor_info.items()])
+            .to_numpy()
+            .astype(np.int64)
+        )
+
+    @property
+    def factor_names(self) -> list[str]:
+        """Factor names of the metadata."""
+        self._structure()
+        return list(self._factors)
+
+    @property
+    def factor_info(self) -> dict[str, FactorInfo]:
+        """Factor types of the metadata."""
+        self._bin()
+        return self._factors
+
+    @property
+    def factor_data(self) -> NDArray[Any]:
+        """Factor data as a NumPy array."""
+        if not self.factor_names:
+            return np.array([], dtype=np.float64)
+
+        # Extract continuous columns and convert to NumPy array
+        return self.dataframe.select(self.factor_names).to_numpy()
+
+    @property
+    def class_labels(self) -> NDArray[np.intp]:
+        """Class labels as a NumPy array."""
+        self._structure()
+        return self._class_labels
+
+    @property
+    def class_names(self) -> list[str]:
+        """Class names as a list of strings."""
+        self._structure()
+        return self._class_names
+
+    @property
+    def image_indices(self) -> NDArray[np.intp]:
+        """Indices of images as a NumPy array."""
+        self._bin()
+        return self._image_indices
+
+    @property
+    def image_count(self) -> int:
+        self._bin()
+        return int(self._image_indices.max() + 1)
+
+    def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
+        if self._is_binned:
+            columns = self._dataframe.columns
+            for col in (col for col in cols or columns if f"{col}[|]" in columns):
+                self._dataframe.drop_in_place(f"{col}[|]")
+                self._factors[col] = FactorInfo()
+            self._is_binned = False
+
+    def _structure(self) -> None:
+        if self._is_structured:
+            return
+
+        raw: list[dict[str, Any]] = []
+
+        labels = []
+        bboxes = []
+        scores = []
+        srcidx = []
+        is_od = None
+        for i in range(len(self._dataset)):
+            _, target, metadata = self._dataset[i]
+
+            raw.append(metadata)
+
+            if is_od_target := isinstance(target, ObjectDetectionTarget):
+                target_labels = as_numpy(target.labels)
+                target_len = len(target_labels)
+                labels.extend(target_labels.tolist())
+                bboxes.extend(as_numpy(target.boxes).tolist())
+                scores.extend(as_numpy(target.scores).tolist())
+                srcidx.extend([i] * target_len)
+            elif isinstance(target, Array):
+                target_len = 1
+                labels.append(int(np.argmax(as_numpy(target))))
+                scores.append(target)
+            else:
+                raise TypeError("Encountered unsupported target type in dataset")
+
+            is_od = is_od_target if is_od is None else is_od
+            if is_od != is_od_target:
+                raise ValueError("Encountered unexpected target type in dataset")
+
+        labels = as_numpy(labels).astype(np.intp)
+        scores = as_numpy(scores).astype(np.float32)
+        bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
+        srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
+
+        target_dict = {
+            "image_index": srcidx if srcidx is not None else np.arange(len(labels)),
+            "class_label": labels,
+            "score": scores,
+            "box": bboxes if bboxes is not None else [None] * len(labels),
+        }
+
+        self._targets = Targets(labels, scores, bboxes, srcidx)
+        self._raw = raw
+
+        index2label = self._dataset.metadata.get("index2label", {})
+        self._class_labels = labels
+        self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
+        self._image_indices = target_dict["image_index"]
+
+        targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
+        merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
+
+        reserved = ["image_index", "class_label", "score", "box"]
+        factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
+
+        self._factors = dict.fromkeys(factor_dict, FactorInfo())
+        self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
+        self._dropped_factors = merged[1]
+        self._is_structured = True
+
+    def _bin(self) -> None:
+        """Populate factor info and bin non-categorical factors."""
+        if self._is_binned:
+            return
+
+        # Start with an empty set of factor info
+        factor_info: dict[str, FactorInfo] = {}
+
+        # Create a mutable DataFrame for updates
+        df = self.dataframe.clone()
+        factor_bins = self.continuous_factor_bins
+
+        # Check for invalid keys
+        invalid_keys = set(factor_bins.keys()) - set(df.columns)
+        if invalid_keys:
+            warnings.warn(
+                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
+                "but are not columns in the metadata DataFrame. Unknown keys will be ignored."
+            )
+
+        column_set = set(df.columns)
+        for col in (col for col in self.factor_names if f"{col}[|]" not in column_set):
+            # Get data as numpy array for processing
+            data = df[col].to_numpy()
+            col_dz = f"{col}[|]"
+            if col in factor_bins:
+                # User provided binning
+                bins = factor_bins[col]
+                df = df.with_columns(pl.Series(name=col_dz, values=digitize_data(data, bins).astype(np.int64)))
+                factor_info[col] = FactorInfo("continuous", col_dz)
+            else:
+                # Check if data is numeric
+                unique, ordinal = np.unique(data, return_inverse=True)
+                if not np.issubdtype(data.dtype, np.number) or unique.size <= max(20, data.size * 0.01):
+                    # Non-numeric data or small number of unique values - convert to categorical
+                    df = df.with_columns(pl.Series(name=col_dz, values=ordinal.astype(np.int64)))
+                    factor_info[col] = FactorInfo("categorical", col_dz)
+                elif data.dtype == float:
+                    # Many unique values - discretize by binning
+                    warnings.warn(
+                        f"A user defined binning was not provided for {col}. "
+                        f"Using the {self.auto_bin_method} method to discretize the data. "
+                        "It is recommended that the user rerun and supply the desired "
+                        "bins using the continuous_factor_bins parameter.",
+                        UserWarning,
+                    )
+                    # Create binned version
+                    binned_data = bin_data(data, self.auto_bin_method)
+                    df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
+                    factor_info[col] = FactorInfo("continuous", col_dz)
+                else:
+                    factor_info[col] = FactorInfo("discrete", col_dz)
+
+        # Store the results
+        self._dataframe = df
+        self._factors.update(factor_info)
+        self._is_binned = True
+
+    def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> list[str]:
+        """
+        Get the names of factors of a specific type.
+
+        Parameters
+        ----------
+        factor_type : Literal["categorical", "continuous", "discrete"]
+            The type of factors to retrieve.
+
+        Returns
+        -------
+        list[str]
+            List of factor names of the specified type.
+        """
+        self._bin()
+        return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
+
+    def add_factors(self, factors: Mapping[str, Any]) -> None:
+        """
+        Add additional factors to the metadata.
+
+        The number of measures per factor must match the number of images
+        in the dataset or the number of detections in the dataset.
+
+        Parameters
+        ----------
+        factors : Mapping[str, ArrayLike]
+            Dictionary of factors to add to the metadata.
+        """
+        self._structure()
+
+        targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
+        images = self.image_count
+        lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
+        targets_match = all(f == targets for f in lengths.values())
+        images_match = targets_match if images == targets else all(f == images for f in lengths.values())
+        if not targets_match and not images_match:
+            raise ValueError(
+                "The lists/arrays in the provided factors have a different length than the current metadata factors."
+            )
+
+        new_columns = []
+        for k, v in factors.items():
+            v = as_numpy(v)
+            data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
+            new_columns.append(pl.Series(name=k, values=data))
+            self._factors[k] = FactorInfo()
+
+        if new_columns:
+            self._dataframe = self.dataframe.with_columns(new_columns)
+            self._is_binned = False
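
For orientation, a minimal hypothetical sketch of driving the new Polars-backed Metadata class above. The TinyDataset stub and its factor values are invented for illustration; it only satisfies the protocol that _structure() actually uses (len(), integer indexing yielding (image, target, metadata), and a .metadata mapping), and it assumes Metadata is re-exported from dataeval.data:

import numpy as np

from dataeval.data import Metadata  # assumed re-export of the class defined above


class TinyDataset:
    # Invented stand-in for an image classification dataset: _structure() only
    # needs len(), indexing -> (image, target, metadata), and a .metadata dict.
    metadata = {"id": "tiny", "index2label": {0: "cat", 1: "dog"}}

    def __len__(self) -> int:
        return 4

    def __getitem__(self, i):
        image = np.zeros((3, 8, 8))
        target = np.eye(2)[i % 2]  # one-hot class scores satisfy the Array branch
        factors = {"brightness": 0.1 * i, "sensor": "A" if i < 2 else "B"}
        return image, target, factors


md = Metadata(TinyDataset(), continuous_factor_bins={"brightness": 2})
print(md.factor_names)                       # e.g. ['brightness', 'sensor']
print(md.get_factors_by_type("continuous"))  # ['brightness'] (user-binned)
print(md.discretized_data.shape)             # (4, 2): int64 codes per factor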
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_split.py
@@ -207,8 +207,8 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np.intp] | None:
         return None
 
     split_set = set(split_on)
-    indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
-    binned_features = metadata.discrete_data[:, indices]
+    indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
+    binned_features = metadata.discretized_data[:, indices]
    return np.unique(binned_features, axis=0, return_inverse=True)[1]
 
 
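The _split.py change above swaps the removed discrete_* accessors for the unified factor_names/discretized_data pair. A self-contained sketch of what the rewritten get_groups body computes, with invented values standing in for the Metadata accessors:

import numpy as np

# Stand-ins for metadata.factor_names and metadata.discretized_data
factor_names = ["sensor", "altitude"]
discretized_data = np.array([[0, 1], [0, 2], [1, 1], [0, 1]])

split_set = {"sensor"}
indices = [i for i, name in enumerate(factor_names) if name in split_set]
binned_features = discretized_data[:, indices]

# Samples with identical binned values receive the same group id
groups = np.unique(binned_features, axis=0, return_inverse=True)[1]
print(groups)  # [0 0 1 0]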
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_distance.py
@@ -80,14 +80,17 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDistanceOutput:
     MetadataDistanceValues(statistic=1.0, location=0.44354838709677413, dist=2.7, pvalue=0.0)
     """
 
-    _compare_keys(metadata1.continuous_factor_names, metadata2.continuous_factor_names)
-    fnames = metadata1.continuous_factor_names
+    _compare_keys(metadata1.factor_names, metadata2.factor_names)
+    cont_fnames = metadata1.get_factors_by_type("continuous")
 
-    cont1 = np.atleast_2d(metadata1.continuous_data)  # (S, F)
-    cont2 = np.atleast_2d(metadata2.continuous_data)  # (S, F)
+    if not cont_fnames:
+        return MetadataDistanceOutput({})
 
-    _validate_factors_and_data(fnames, cont1)
-    _validate_factors_and_data(fnames, cont2)
+    cont1 = np.atleast_2d(metadata1.dataframe[cont_fnames].to_numpy())  # (S, F)
+    cont2 = np.atleast_2d(metadata2.dataframe[cont_fnames].to_numpy())  # (S, F)
+
+    _validate_factors_and_data(cont_fnames, cont1)
+    _validate_factors_and_data(cont_fnames, cont2)
 
     N = len(cont1)
     M = len(cont2)
@@ -104,7 +107,7 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDistanceOutput:
     results: dict[str, MetadataDistanceValues] = {}
 
     # Per factor
-    for i, fname in enumerate(fnames):
+    for i, fname in enumerate(cont_fnames):
         fdata1 = cont1[:, i]  # (S, 1)
         fdata2 = cont2[:, i]  # (S, 1)
 
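In sketch form, the new flow above compares keys across all factors but measures distances only over factors typed "continuous", returning an empty MetadataDistanceOutput when there are none. A hedged usage example (the two Metadata fixtures are assumed to exist, and mapping-style access mirrors the docstring example in this file):

from dataeval.metadata import metadata_distance

output = metadata_distance(md_reference, md_test)  # assumed Metadata fixtures
for fname in md_reference.get_factors_by_type("continuous"):
    print(fname, output[fname])  # MetadataDistanceValues(statistic=..., ...)

# With no continuous factors on either side, the new early return yields
# an empty result: MetadataDistanceOutput({})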
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_ood.py
@@ -15,95 +15,6 @@ from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorOutput
 from dataeval.outputs._base import set_metadata
 
 
-def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
-    """Combines the discrete and continuous data of a :class:`Metadata` object
-
-    Returns
-    -------
-    Tuple[list[str], NDArray]
-        The combined list of factors names and the combined discrete and continuous data
-
-    Note
-    ----
-    Discrete and continuous data must have the same number of samples
-    """
-    names = []
-    data = []
-
-    if metadata.discrete_factor_names and metadata.discrete_data.size != 0:
-        names.extend(metadata.discrete_factor_names)
-        data.append(metadata.discrete_data)
-
-    if metadata.continuous_factor_names and metadata.continuous_data.size != 0:
-        names.extend(metadata.continuous_factor_names)
-        data.append(metadata.continuous_data)
-
-    return names, np.hstack(data, dtype=np.float64) if data else np.array([], dtype=np.float64)
-
-
-def _combine_metadata(
-    metadata_1: Metadata, metadata_2: Metadata
-) -> tuple[list[str], list[NDArray[np.float64 | np.int64]], list[NDArray[np.int64 | np.float64]]]:
-    """
-    Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
-    match exactly and data has the same number of columns (factors).
-
-    Parameters
-    ----------
-    metadata_1 : Metadata
-        The set of factor names used as reference to determine the correct factor names and length of data
-    metadata_2 : Metadata
-        The compared set of factor names and data that must match metadata_1
-
-    Returns
-    -------
-    list[str]
-        The combined discrete and continuous factor names in that order.
-    list[NDArray]
-        Combined discrete and continuous data of metadata_1
-    list[NDArray]
-        Combined discrete and continuous data of metadata_2
-
-    Raises
-    ------
-    ValueError
-        If keys do not match in metadata_1 and metadata_2
-    ValueError
-        If the length of keys do not match the length of the data
-    """
-    factor_names: list[str] = []
-    m1_data: list[NDArray[np.int64 | np.float64]] = []
-    m2_data: list[NDArray[np.int64 | np.float64]] = []
-
-    # Both metadata must have the same number of factors (cols), but not necessarily samples (row)
-    if metadata_1.total_num_factors != metadata_2.total_num_factors:
-        raise ValueError(
-            f"Number of factors differs between metadata_1 ({metadata_1.total_num_factors}) "
-            f"and metadata_2 ({metadata_2.total_num_factors})"
-        )
-
-    # Validate and attach discrete data
-    if metadata_1.discrete_factor_names:
-        _compare_keys(metadata_1.discrete_factor_names, metadata_2.discrete_factor_names)
-        _validate_factors_and_data(metadata_1.discrete_factor_names, metadata_1.discrete_data)
-
-        factor_names.extend(metadata_1.discrete_factor_names)
-        m1_data.append(metadata_1.discrete_data)
-        m2_data.append(metadata_2.discrete_data)
-
-    # Validate and attach continuous data
-    if metadata_1.continuous_factor_names:
-        _compare_keys(metadata_1.continuous_factor_names, metadata_2.continuous_factor_names)
-        _validate_factors_and_data(metadata_1.continuous_factor_names, metadata_1.continuous_data)
-
-        factor_names.extend(metadata_1.continuous_factor_names)
-        m1_data.append(metadata_1.continuous_data)
-        m2_data.append(metadata_2.continuous_data)
-
-    # Turns list of discrete and continuous into one array
-    return factor_names, m1_data, m2_data
-
-
 def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:
     """
     Calculates deviations of the test data from the median of the reference data
@@ -207,16 +118,13 @@ def find_most_deviated_factors(
     if not any(ood_mask):
         return MostDeviatedFactorsOutput([])
 
-    # Combines reference and test factor names and data if exists and match exactly
-    # shape -> (samples, factors)
-    factor_names, md_1, md_2 = _combine_metadata(
-        metadata_1=metadata_ref,
-        metadata_2=metadata_tst,
-    )
+    factor_names = metadata_ref.factor_names
+    ref_data = metadata_ref.factor_data
+    tst_data = metadata_tst.factor_data
 
-    # Stack discrete and continuous factors as separate factors. Must have equal sample counts
-    ref_data = np.hstack(md_1) if md_1 else np.array([])  # (S, Fd + Fc)
-    tst_data = np.hstack(md_2) if md_2 else np.array([])  # (S, Fd + Fc)
+    _compare_keys(factor_names, metadata_tst.factor_names)
+    _validate_factors_and_data(factor_names, ref_data)
+    _validate_factors_and_data(factor_names, tst_data)
 
     if len(ref_data) < 3:
         warnings.warn(
@@ -256,6 +164,7 @@ which is what many library functions return, multiply it by _NATS2BITS to get it
 """
 
 
+@set_metadata
 def find_ood_predictors(
     metadata: Metadata,
     ood: OODOutput,
@@ -305,8 +214,8 @@
 
     ood_mask: NDArray[np.bool_] = ood.is_ood
 
-    discrete_features_count = len(metadata.discrete_factor_names)
-    factors, data = _combine_discrete_continuous(metadata)  # (F, ), (S, F) => F = Fd + Fc
+    factors = metadata.factor_names
+    data = metadata.factor_data
 
     # No metadata correlated with out of distribution data, return 0.0 for all factors
     if not any(ood_mask):
@@ -320,14 +229,13 @@
     # Calculate mean, std of each factor over all samples
     scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1)  # (S, F)
 
-    discrete_features = np.zeros_like(factors, dtype=np.bool_)
-    discrete_features[:discrete_features_count] = True
+    discrete_features = [info.factor_type != "continuous" for info in metadata.factor_info.values()]
 
     mutual_info_values = (
         mutual_info_classif(
            X=scaled_data,
            y=ood_mask,
-            discrete_features=discrete_features,  # type: ignore -> sklearn issue - NDArray[bool] not of accepted type Union[ArrayLike, 'auto']
+            discrete_features=discrete_features,  # type: ignore - sklearn function not typed
            random_state=get_seed(),
         )
         * _NATS2BITS
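
To make the rewritten discrete_features logic above concrete: every factor whose FactorInfo is not typed "continuous" (categorical, discrete, or not yet typed) is flagged as discrete for sklearn's mutual information estimator. An isolated sketch with invented factor types:

from dataclasses import dataclass


@dataclass
class FactorInfo:  # mirrors the dataclass added in _metadata.py above
    factor_type: str | None = None
    discretized_col: str | None = None


factor_info = {
    "sensor": FactorInfo("categorical"),
    "altitude": FactorInfo("continuous"),
    "num_objects": FactorInfo("discrete"),
}

# Same comprehension as in the diff: non-continuous factors are discrete
discrete_features = [info.factor_type != "continuous" for info in factor_info.values()]
print(discrete_features)  # [True, False, True]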