dataeval 0.86.9__py3-none-any.whl → 0.88.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_log.py +1 -1
- dataeval/_version.py +2 -2
- dataeval/config.py +4 -19
- dataeval/data/_embeddings.py +78 -35
- dataeval/data/_images.py +41 -8
- dataeval/data/_metadata.py +348 -66
- dataeval/data/_selection.py +22 -7
- dataeval/data/_split.py +3 -2
- dataeval/data/selections/_classbalance.py +4 -3
- dataeval/data/selections/_classfilter.py +9 -8
- dataeval/data/selections/_indices.py +4 -3
- dataeval/data/selections/_prioritize.py +249 -29
- dataeval/data/selections/_reverse.py +1 -1
- dataeval/data/selections/_shuffle.py +5 -4
- dataeval/detectors/drift/_base.py +2 -1
- dataeval/detectors/drift/_mmd.py +2 -1
- dataeval/detectors/drift/_nml/_base.py +1 -1
- dataeval/detectors/drift/_nml/_chunk.py +2 -1
- dataeval/detectors/drift/_nml/_result.py +3 -2
- dataeval/detectors/drift/_nml/_thresholds.py +6 -5
- dataeval/detectors/drift/_uncertainty.py +2 -1
- dataeval/detectors/linters/duplicates.py +2 -1
- dataeval/detectors/linters/outliers.py +4 -3
- dataeval/detectors/ood/__init__.py +2 -1
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/detectors/ood/base.py +39 -1
- dataeval/detectors/ood/knn.py +95 -0
- dataeval/detectors/ood/mixin.py +2 -1
- dataeval/metadata/_utils.py +1 -1
- dataeval/metrics/bias/_balance.py +29 -22
- dataeval/metrics/bias/_diversity.py +4 -4
- dataeval/metrics/bias/_parity.py +2 -2
- dataeval/metrics/stats/_base.py +3 -29
- dataeval/metrics/stats/_boxratiostats.py +2 -1
- dataeval/metrics/stats/_dimensionstats.py +2 -1
- dataeval/metrics/stats/_hashstats.py +21 -3
- dataeval/metrics/stats/_pixelstats.py +2 -1
- dataeval/metrics/stats/_visualstats.py +2 -1
- dataeval/outputs/_base.py +2 -3
- dataeval/outputs/_bias.py +2 -1
- dataeval/outputs/_estimators.py +1 -1
- dataeval/outputs/_linters.py +3 -3
- dataeval/outputs/_stats.py +3 -3
- dataeval/outputs/_utils.py +1 -1
- dataeval/outputs/_workflows.py +49 -31
- dataeval/typing.py +23 -9
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +3 -2
- dataeval/utils/_bin.py +9 -7
- dataeval/utils/_method.py +2 -3
- dataeval/utils/_multiprocessing.py +34 -0
- dataeval/utils/_plot.py +2 -1
- dataeval/utils/data/__init__.py +6 -5
- dataeval/utils/data/{metadata.py → _merge.py} +3 -2
- dataeval/utils/data/_validate.py +170 -0
- dataeval/utils/data/collate.py +2 -1
- dataeval/utils/torch/_internal.py +2 -1
- dataeval/utils/torch/trainer.py +1 -1
- dataeval/workflows/sufficiency.py +13 -9
- {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/METADATA +8 -21
- dataeval-0.88.0.dist-info/RECORD +105 -0
- dataeval/utils/data/_dataset.py +0 -246
- dataeval/utils/datasets/__init__.py +0 -21
- dataeval/utils/datasets/_antiuav.py +0 -189
- dataeval/utils/datasets/_base.py +0 -266
- dataeval/utils/datasets/_cifar10.py +0 -201
- dataeval/utils/datasets/_fileio.py +0 -142
- dataeval/utils/datasets/_milco.py +0 -197
- dataeval/utils/datasets/_mixin.py +0 -54
- dataeval/utils/datasets/_mnist.py +0 -202
- dataeval/utils/datasets/_seadrone.py +0 -512
- dataeval/utils/datasets/_ships.py +0 -144
- dataeval/utils/datasets/_types.py +0 -48
- dataeval/utils/datasets/_voc.py +0 -583
- dataeval-0.86.9.dist-info/RECORD +0 -115
- {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
- /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.88.0.dist-info/licenses/LICENSE +0 -0
dataeval/data/_metadata.py
CHANGED
```diff
@@ -3,12 +3,14 @@ from __future__ import annotations
 __all__ = []
 
 import warnings
+from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
 from dataclasses import dataclass
-from typing import Any,
+from typing import Any, Literal
 
 import numpy as np
 import polars as pl
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
 from dataeval.typing import (
     AnnotatedDataset,
@@ -16,36 +18,60 @@ from dataeval.typing import (
     ObjectDetectionTarget,
 )
 from dataeval.utils._array import as_numpy
-from dataeval.utils._bin import bin_data, digitize_data
-from dataeval.utils.data.
+from dataeval.utils._bin import bin_data, digitize_data, is_continuous
+from dataeval.utils.data._merge import merge
 
 
 def _binned(name: str) -> str:
-    return f"{name}
+    return f"{name}↕"
+
+
+def _digitized(name: str) -> str:
+    return f"{name}#"
 
 
 @dataclass
 class FactorInfo:
-    factor_type: Literal["categorical", "continuous", "discrete"]
-
+    factor_type: Literal["categorical", "continuous", "discrete"]
+    is_binned: bool = False
+    is_digitized: bool = False
+
+
+def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
+    if binned and info.is_binned:
+        return _binned(name)
+    if info.is_digitized:
+        return _digitized(name)
+    return name
 
 
 class Metadata:
-    """
-
+    """Collection of binned metadata using Polars DataFrames.
+
+    Processes dataset metadata by automatically binning continuous factors and digitizing
+    categorical factors for analysis and visualization workflows.
 
     Parameters
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
-        Dataset
+        Dataset that provides original targets and metadata for processing.
     continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None
-        Mapping from continuous factor
+        Mapping from continuous factor names to bin counts or explicit bin edges.
+        When None, uses automatic discretization.
     auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"
-
+        Binning strategy for continuous factors without explicit bins. Default "uniform_width"
+        provides intuitive equal-width intervals for most distributions.
     exclude : Sequence[str] | None, default None
-
+        Factor names to exclude from processing. Cannot be used with `include` parameter.
+        When None, processes all available factors.
     include : Sequence[str] | None, default None
-
+        Factor names to include in processing. Cannot be used with `exclude` parameter.
+        When None, processes all available factors.
+
+    Raises
+    ------
+    ValueError
+        When both exclude and include parameters are specified simultaneously.
     """
 
     def __init__(
@@ -60,7 +86,7 @@ class Metadata:
         self._class_labels: NDArray[np.intp]
         self._class_names: list[str]
         self._image_indices: NDArray[np.intp]
-        self._factors: dict[str, FactorInfo]
+        self._factors: dict[str, FactorInfo | None]
         self._dropped_factors: dict[str, list[str]]
         self._dataframe: pl.DataFrame
         self._raw: Sequence[Mapping[str, Any]]
@@ -81,17 +107,48 @@ class Metadata:
 
     @property
     def raw(self) -> Sequence[Mapping[str, Any]]:
-        """
+        """Original metadata dictionaries extracted from the dataset.
+
+        Access the unprocessed metadata as it was provided in the original dataset before
+        any binning, filtering, or transformation operations.
+
+        Returns
+        -------
+        Sequence[Mapping[str, Any]]
+            List of metadata dictionaries, one per dataset item, containing the original key-value
+            pairs as provided in the source data
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._raw
 
     @property
     def exclude(self) -> set[str]:
-        """
+        """Factor names excluded from metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are filtered out during processing.
+            Empty set when no exclusions are active.
+
+        """
         return self._exclude
 
     @exclude.setter
     def exclude(self, value: Sequence[str]) -> None:
+        """Set factor names to exclude from processing.
+
+        Automatically clears include filter and resets binning state when exclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to exclude from metadata analysis.
+        """
         exclude = set(value)
         if self._exclude != exclude:
             self._exclude = exclude
@@ -100,11 +157,27 @@ class Metadata:
 
     @property
     def include(self) -> set[str]:
-        """
+        """Factor names included in metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are processed during analysis. Empty set when no inclusion filter is active.
+        """
         return self._include
 
     @include.setter
     def include(self, value: Sequence[str]) -> None:
+        """Set factor names to include in processing.
+
+        Automatically clears exclude filter and resets binning state when
+        inclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to include in metadata analysis.
+        """
         include = set(value)
         if self._include != include:
             self._include = include
@@ -113,66 +186,214 @@ class Metadata:
 
     @property
     def continuous_factor_bins(self) -> Mapping[str, int | Sequence[float]]:
-        """
+        """Binning configuration for continuous factors.
+
+        Returns
+        -------
+        Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to either the number of bins
+            (int) or explicit bin edges (sequence of floats).
+        """
         return self._continuous_factor_bins
 
     @continuous_factor_bins.setter
     def continuous_factor_bins(self, bins: Mapping[str, int | Sequence[float]]) -> None:
+        """Update binning configuration for continuous factors.
+
+        Triggers re-binning when configuration changes to ensure data
+        consistency with new bin specifications.
+
+        Parameters
+        ----------
+        bins : Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to bin counts or explicit edges.
+        """
         if self._continuous_factor_bins != bins:
             self._continuous_factor_bins = dict(bins)
             self._reset_bins(bins)
 
     @property
     def auto_bin_method(self) -> Literal["uniform_width", "uniform_count", "clusters"]:
-        """
+        """Automatic binning strategy for continuous factors.
+
+        Returns
+        -------
+        {"uniform_width", "uniform_count", "clusters"}
+            Current method used for automatic discretization of continuous
+            factors that lack explicit bin specifications.
+        """
         return self._auto_bin_method
 
     @auto_bin_method.setter
     def auto_bin_method(self, method: Literal["uniform_width", "uniform_count", "clusters"]) -> None:
+        """Set automatic binning strategy for continuous factors.
+
+        Triggers re-binning with the new method when strategy changes to
+        ensure consistent discretization across all factors.
+
+        Parameters
+        ----------
+        method : {"uniform_width", "uniform_count", "clusters"}
+            Binning strategy to apply for continuous factors without
+            explicit bin configurations.
+        """
         if self._auto_bin_method != method:
             self._auto_bin_method = method
             self._reset_bins()
 
     @property
     def dataframe(self) -> pl.DataFrame:
-        """
+        """Processed DataFrame containing targets and metadata factors.
+
+        Access the main data structure with target information (class labels,
+        scores, bounding boxes) and processed metadata factors ready for analysis.
+
+        Returns
+        -------
+        pl.DataFrame
+            DataFrame with columns for image indices, class labels, scores,
+            bounding boxes (when applicable), and all processed metadata factors.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor binning occurs automatically when accessing factor-related data.
+        """
         self._structure()
         return self._dataframe
 
     @property
     def dropped_factors(self) -> Mapping[str, Sequence[str]]:
-        """Factors
+        """Factors removed during preprocessing with removal reasons.
+
+        Returns
+        -------
+        Mapping[str, Sequence[str]]
+            Dictionary mapping dropped factor names to lists of reasons
+            why they were excluded from the final dataset.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Common removal reasons include incompatible data types, excessive
+        missing values, or insufficient variation.
+        """
         self._structure()
         return self._dropped_factors
 
     @property
-    def
-        """Factor data with
+    def digitized_data(self) -> NDArray[np.int64]:
+        """Factor data with categorical values converted to integer codes.
+
+        Access processed factor data where categorical factors are digitized
+        to integer codes but continuous factors remain in their original form.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing integer-coded
+            categorical data. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms that can handle mixed categorical and
+        continuous data types.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.int64)
 
         self._bin()
         return (
-            self.dataframe.select([
+            self.dataframe.select([_to_col(k, v, False) for k, v in self.factor_info.items()])
+            .to_numpy()
+            .astype(np.int64)
+        )
+
+    @property
+    def binned_data(self) -> NDArray[np.int64]:
+        """Factor data with continuous values discretized into bins.
+
+        Access fully processed factor data where both categorical and
+        continuous factors are converted to integer bin indices.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing binned integer
+            data ready for categorical analysis algorithms. Returns empty array
+            when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms requiring purely discrete input data.
+        """
+        if not self.factor_names:
+            return np.array([], dtype=np.int64)
+
+        self._bin()
+        return (
+            self.dataframe.select([_to_col(k, v, True) for k, v in self.factor_info.items()])
             .to_numpy()
             .astype(np.int64)
         )
 
     @property
     def factor_names(self) -> Sequence[str]:
-        """
+        """Names of all processed metadata factors.
+
+        Returns
+        -------
+        Sequence[str]
+            List of factor names that passed filtering and preprocessing steps.
+            Order matches columns in factor_data, digitized_data, and binned_data.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor names respect include/exclude filtering settings.
+        """
         self._structure()
         return list(filter(self._filter, self._factors))
 
     @property
     def factor_info(self) -> Mapping[str, FactorInfo]:
-        """
+        """Type information and processing status for each factor.
+
+        Returns
+        -------
+        Mapping[str, FactorInfo]
+            Dictionary mapping factor names to FactorInfo objects containing
+            data type classification and processing flags (binned, digitized).
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Only includes factors that survived preprocessing and filtering.
+        """
         self._bin()
-        return dict(filter(self._filter, self._factors.items()))
+        return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
 
     @property
     def factor_data(self) -> NDArray[Any]:
-        """
+        """Raw factor values before binning or digitization.
+
+        Access unprocessed factor data in its original numeric form before
+        any categorical encoding or binning transformations are applied.
+
+        Returns
+        -------
+        NDArray[Any]
+            Array with shape (n_samples, n_factors) containing original factor
+            values. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        Use this for algorithms that can work with mixed data types or when
+        you need access to original continuous values. For analysis-ready
+        integer data, use binned_data or digitized_data instead.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.float64)
 
@@ -181,24 +402,67 @@ class Metadata:
 
     @property
     def class_labels(self) -> NDArray[np.intp]:
-        """
+        """Target class labels as integer indices.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array of class indices corresponding to dataset targets. For
+            object detection datasets, contains one label per detection.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Use class_names property to get human-readable label names.
+        """
         self._structure()
         return self._class_labels
 
     @property
     def class_names(self) -> Sequence[str]:
-        """
+        """Human-readable names corresponding to class labels.
+
+        Returns
+        -------
+        Sequence[str]
+            List of class names where index corresponds to class label value.
+            Derived from dataset metadata or auto-generated from label indices.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._class_names
 
     @property
     def image_indices(self) -> NDArray[np.intp]:
-        """
-
+        """Dataset indices linking targets back to source images.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array mapping each target/detection back to its source image
+            index in the original dataset. Essential for object detection
+            datasets where multiple detections come from single images.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
+        self._structure()
         return self._image_indices
 
     @property
     def image_count(self) -> int:
+        """Total number of images in the dataset.
+
+        Returns
+        -------
+        int
+            Count of unique images in the source dataset, regardless of
+            how many targets/detections each image contains.
+        """
         if self._count == 0:
             self._structure()
         return self._count
@@ -212,7 +476,7 @@ class Metadata:
         columns = self._dataframe.columns
         for col in (col for col in cols or columns if _binned(col) in columns):
             self._dataframe.drop_in_place(_binned(col))
-            self._factors[col] =
+            self._factors[col] = None
         self._is_binned = False
 
     def _structure(self) -> None:
@@ -226,7 +490,7 @@ class Metadata:
         scores = []
         srcidx = []
         is_od = None
-        for i in range(len(self._dataset)):
+        for i in tqdm(range(len(self._dataset))):
             _, target, metadata = self._dataset[i]
 
             raw.append(metadata)
@@ -235,15 +499,15 @@ class Metadata:
                 target_labels = as_numpy(target.labels)
                 target_len = len(target_labels)
                 if target_len:
-                    labels.
-                    bboxes.
-                    scores.
+                    labels.append(target_labels)
+                    bboxes.append(as_numpy(target.boxes))
+                    scores.append(as_numpy(target.scores))
                     srcidx.extend([i] * target_len)
             elif isinstance(target, Array):
-
-
-                    labels.append(
-                    scores.append(
+                target_scores = as_numpy(target)
+                if len(target_scores):
+                    labels.append([np.argmax(target_scores)])
+                    scores.append([target_scores])
                     srcidx.append(i)
             else:
                 raise TypeError("Encountered unsupported target type in dataset")
@@ -252,10 +516,11 @@ class Metadata:
             if is_od != is_od_target:
                 raise ValueError("Encountered unexpected target type in dataset")
 
-
-
-
-
+        np_asarray: Callable[..., np.ndarray] = np.concatenate if srcidx else np.asarray
+        labels = np_asarray(labels, dtype=np.intp)
+        scores = np_asarray(scores, dtype=np.float32)
+        bboxes = np_asarray(bboxes, dtype=np.float32) if is_od else None
+        srcidx = np.asarray(srcidx, dtype=np.intp)
 
         index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
 
@@ -277,7 +542,7 @@ class Metadata:
         self._class_labels = labels
         self._class_names = list(index2label.values())
         self._image_indices = target_dict["image_index"]
-        self._factors = dict.fromkeys(factor_dict,
+        self._factors = dict.fromkeys(factor_dict, None)
         self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
         self._dropped_factors = merged[1]
         self._is_structured = True
@@ -303,24 +568,25 @@ class Metadata:
         )
 
         column_set = set(df.columns)
-        for col in (col for col in self.factor_names if _binned(col)
+        for col in (col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set):
             # Get data as numpy array for processing
             data = df[col].to_numpy()
-            col_dz = _binned(col)
             if col in factor_bins:
                 # User provided binning
                 bins = factor_bins[col]
-
-
+                col_bn = _binned(col)
+                df = df.with_columns(pl.Series(name=col_bn, values=digitize_data(data, bins).astype(np.int64)))
+                factor_info[col] = FactorInfo("continuous", is_binned=True)
             else:
                 # Check if data is numeric
-
-                if not np.issubdtype(data.dtype, np.number)
-                    # Non-numeric data
-
-
-
+                _, ordinal = np.unique(data, return_inverse=True)
+                if not np.issubdtype(data.dtype, np.number):
+                    # Non-numeric data - convert to categorical
+                    col_dg = _digitized(col)
+                    df = df.with_columns(pl.Series(name=col_dg, values=ordinal.astype(np.int64)))
+                    factor_info[col] = FactorInfo("categorical", is_digitized=True)
+                elif is_continuous(data, self.image_indices):
+                    # Continuous values - discretize by binning
                     warnings.warn(
                         f"A user defined binning was not provided for {col}. "
                         f"Using the {self.auto_bin_method} method to discretize the data. "
@@ -330,10 +596,12 @@ class Metadata:
                     )
                     # Create binned version
                     binned_data = bin_data(data, self.auto_bin_method)
-
-
+                    col_bn = _binned(col)
+                    df = df.with_columns(pl.Series(name=col_bn, values=binned_data.astype(np.int64)))
+                    factor_info[col] = FactorInfo("continuous", is_binned=True)
                 else:
-
+                    # Non-continuous values - treat as discrete
+                    factor_info[col] = FactorInfo("discrete")
 
         # Store the results
         self._dataframe = df
@@ -341,16 +609,30 @@ class Metadata:
         self._is_binned = True
 
     def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
-        """
-        Add additional factors to the metadata.
+        """Add additional factors to metadata collection.
 
-
-
+        Extend the current metadata with new factors, automatically handling
+        length validation and integration with existing data structures.
 
         Parameters
         ----------
         factors : Mapping[str, Array | Sequence[Any]]
-            Dictionary
+            Dictionary mapping factor names to their values. Factor length must
+            match either the number of images or number of detections in the dataset.
+
+        Raises
+        ------
+        ValueError
+            When factor lengths do not match dataset dimensions.
+
+        Examples
+        --------
+        >>> metadata = Metadata(dataset)
+        >>> new_factors = {
+        ...     "brightness": [0.2, 0.8, 0.5, 0.3, 0.4, 0.1, 0.3, 0.2],
+        ...     "contrast": [1.1, 0.9, 1.0, 0.8, 1.2, 1.0, 0.7, 1.3],
+        ... }
+        >>> metadata.add_factors(new_factors)
         """
         self._structure()
 
@@ -367,7 +649,7 @@ class Metadata:
         for k, v in factors.items():
             data = as_numpy(v)[self.image_indices]
             new_columns.append(pl.Series(name=k, values=data))
-            self._factors[k] =
+            self._factors[k] = None
 
         if new_columns:
            self._dataframe = self.dataframe.with_columns(new_columns)
```