dataeval-0.86.9-py3-none-any.whl → dataeval-0.88.0-py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (78)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/_version.py +2 -2
  4. dataeval/config.py +4 -19
  5. dataeval/data/_embeddings.py +78 -35
  6. dataeval/data/_images.py +41 -8
  7. dataeval/data/_metadata.py +348 -66
  8. dataeval/data/_selection.py +22 -7
  9. dataeval/data/_split.py +3 -2
  10. dataeval/data/selections/_classbalance.py +4 -3
  11. dataeval/data/selections/_classfilter.py +9 -8
  12. dataeval/data/selections/_indices.py +4 -3
  13. dataeval/data/selections/_prioritize.py +249 -29
  14. dataeval/data/selections/_reverse.py +1 -1
  15. dataeval/data/selections/_shuffle.py +5 -4
  16. dataeval/detectors/drift/_base.py +2 -1
  17. dataeval/detectors/drift/_mmd.py +2 -1
  18. dataeval/detectors/drift/_nml/_base.py +1 -1
  19. dataeval/detectors/drift/_nml/_chunk.py +2 -1
  20. dataeval/detectors/drift/_nml/_result.py +3 -2
  21. dataeval/detectors/drift/_nml/_thresholds.py +6 -5
  22. dataeval/detectors/drift/_uncertainty.py +2 -1
  23. dataeval/detectors/linters/duplicates.py +2 -1
  24. dataeval/detectors/linters/outliers.py +4 -3
  25. dataeval/detectors/ood/__init__.py +2 -1
  26. dataeval/detectors/ood/ae.py +1 -1
  27. dataeval/detectors/ood/base.py +39 -1
  28. dataeval/detectors/ood/knn.py +95 -0
  29. dataeval/detectors/ood/mixin.py +2 -1
  30. dataeval/metadata/_utils.py +1 -1
  31. dataeval/metrics/bias/_balance.py +29 -22
  32. dataeval/metrics/bias/_diversity.py +4 -4
  33. dataeval/metrics/bias/_parity.py +2 -2
  34. dataeval/metrics/stats/_base.py +3 -29
  35. dataeval/metrics/stats/_boxratiostats.py +2 -1
  36. dataeval/metrics/stats/_dimensionstats.py +2 -1
  37. dataeval/metrics/stats/_hashstats.py +21 -3
  38. dataeval/metrics/stats/_pixelstats.py +2 -1
  39. dataeval/metrics/stats/_visualstats.py +2 -1
  40. dataeval/outputs/_base.py +2 -3
  41. dataeval/outputs/_bias.py +2 -1
  42. dataeval/outputs/_estimators.py +1 -1
  43. dataeval/outputs/_linters.py +3 -3
  44. dataeval/outputs/_stats.py +3 -3
  45. dataeval/outputs/_utils.py +1 -1
  46. dataeval/outputs/_workflows.py +49 -31
  47. dataeval/typing.py +23 -9
  48. dataeval/utils/__init__.py +2 -2
  49. dataeval/utils/_array.py +3 -2
  50. dataeval/utils/_bin.py +9 -7
  51. dataeval/utils/_method.py +2 -3
  52. dataeval/utils/_multiprocessing.py +34 -0
  53. dataeval/utils/_plot.py +2 -1
  54. dataeval/utils/data/__init__.py +6 -5
  55. dataeval/utils/data/{metadata.py → _merge.py} +3 -2
  56. dataeval/utils/data/_validate.py +170 -0
  57. dataeval/utils/data/collate.py +2 -1
  58. dataeval/utils/torch/_internal.py +2 -1
  59. dataeval/utils/torch/trainer.py +1 -1
  60. dataeval/workflows/sufficiency.py +13 -9
  61. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/METADATA +8 -21
  62. dataeval-0.88.0.dist-info/RECORD +105 -0
  63. dataeval/utils/data/_dataset.py +0 -246
  64. dataeval/utils/datasets/__init__.py +0 -21
  65. dataeval/utils/datasets/_antiuav.py +0 -189
  66. dataeval/utils/datasets/_base.py +0 -266
  67. dataeval/utils/datasets/_cifar10.py +0 -201
  68. dataeval/utils/datasets/_fileio.py +0 -142
  69. dataeval/utils/datasets/_milco.py +0 -197
  70. dataeval/utils/datasets/_mixin.py +0 -54
  71. dataeval/utils/datasets/_mnist.py +0 -202
  72. dataeval/utils/datasets/_seadrone.py +0 -512
  73. dataeval/utils/datasets/_ships.py +0 -144
  74. dataeval/utils/datasets/_types.py +0 -48
  75. dataeval/utils/datasets/_voc.py +0 -583
  76. dataeval-0.86.9.dist-info/RECORD +0 -115
  77. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
  78. /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.88.0.dist-info/licenses/LICENSE +0 -0
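Notable in the file list: the bundled sample-dataset code (dataeval/utils/datasets/ and dataeval/utils/data/_dataset.py) is removed outright, a KNN-based OOD detector is added (dataeval/detectors/ood/knn.py), and dataeval/utils/data/metadata.py moves to the private module dataeval/utils/data/_merge.py. A minimal sketch of the import adjustment implied by the hunks below; whether dataeval.utils.data still re-exports merge publicly is not confirmed by this diff:

# dataeval 0.86.9
from dataeval.utils.data.metadata import merge

# dataeval 0.88.0 (internal path used by _metadata.py below; prefer a public
# re-export from dataeval.utils.data if one exists)
from dataeval.utils.data._merge import merge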
@@ -3,12 +3,14 @@ from __future__ import annotations
 __all__ = []
 
 import warnings
+from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
 from dataclasses import dataclass
-from typing import Any, Iterable, Literal, Mapping, Sequence, Sized
+from typing import Any, Literal
 
 import numpy as np
 import polars as pl
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
 from dataeval.typing import (
     AnnotatedDataset,
@@ -16,36 +18,60 @@ from dataeval.typing import (
     ObjectDetectionTarget,
 )
 from dataeval.utils._array import as_numpy
-from dataeval.utils._bin import bin_data, digitize_data
-from dataeval.utils.data.metadata import merge
+from dataeval.utils._bin import bin_data, digitize_data, is_continuous
+from dataeval.utils.data._merge import merge
 
 
 def _binned(name: str) -> str:
-    return f"{name}[]"
+    return f"{name}[|]"
+
+
+def _digitized(name: str) -> str:
+    return f"{name}[#]"
 
 
 @dataclass
 class FactorInfo:
-    factor_type: Literal["categorical", "continuous", "discrete"] | None = None
-    discretized_col: str | None = None
+    factor_type: Literal["categorical", "continuous", "discrete"]
+    is_binned: bool = False
+    is_digitized: bool = False
+
+
+def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
+    if binned and info.is_binned:
+        return _binned(name)
+    if info.is_digitized:
+        return _digitized(name)
+    return name
 
 
 class Metadata:
-    """
-    Class containing binned metadata using Polars DataFrames.
+    """Collection of binned metadata using Polars DataFrames.
+
+    Processes dataset metadata by automatically binning continuous factors and digitizing
+    categorical factors for analysis and visualization workflows.
 
     Parameters
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
-        Dataset to access original targets and metadata from.
+        Dataset that provides original targets and metadata for processing.
     continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None
-        Mapping from continuous factor name to the number of bins or bin edges
+        Mapping from continuous factor names to bin counts or explicit bin edges.
+        When None, uses automatic discretization.
     auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"
-        Method for automatically determining the number of bins for continuous factors
+        Binning strategy for continuous factors without explicit bins. Default "uniform_width"
+        provides intuitive equal-width intervals for most distributions.
     exclude : Sequence[str] | None, default None
-        Filter metadata factors to exclude the specified factors, cannot be set with `include`
+        Factor names to exclude from processing. Cannot be used with `include` parameter.
+        When None, processes all available factors.
    include : Sequence[str] | None, default None
-        Filter metadata factors to include the specified factors, cannot be set with `exclude`
+        Factor names to include in processing. Cannot be used with `exclude` parameter.
+        When None, processes all available factors.
+
+    Raises
+    ------
+    ValueError
+        When both exclude and include parameters are specified simultaneously.
     """
 
     def __init__(
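Review note: FactorInfo.discretized_col is replaced by two flags, and column lookup is routed through the new _to_col helper. A self-contained sketch of the resolution logic; the [|] and [#] suffixes follow the reconstructed _binned/_digitized helpers above and may differ in the released source:

from dataclasses import dataclass
from typing import Literal


def _binned(name: str) -> str:
    return f"{name}[|]"


def _digitized(name: str) -> str:
    return f"{name}[#]"


@dataclass
class FactorInfo:
    factor_type: Literal["categorical", "continuous", "discrete"]
    is_binned: bool = False
    is_digitized: bool = False


def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
    # the binned view prefers the bin column; otherwise fall back to the
    # digitized column, then to the raw factor column
    if binned and info.is_binned:
        return _binned(name)
    if info.is_digitized:
        return _digitized(name)
    return name


print(_to_col("altitude", FactorInfo("continuous", is_binned=True)))          # altitude[|]
print(_to_col("altitude", FactorInfo("continuous", is_binned=True), False))   # altitude
print(_to_col("weather", FactorInfo("categorical", is_digitized=True)))       # weather[#]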
@@ -60,7 +86,7 @@ class Metadata:
         self._class_labels: NDArray[np.intp]
         self._class_names: list[str]
         self._image_indices: NDArray[np.intp]
-        self._factors: dict[str, FactorInfo]
+        self._factors: dict[str, FactorInfo | None]
         self._dropped_factors: dict[str, list[str]]
         self._dataframe: pl.DataFrame
         self._raw: Sequence[Mapping[str, Any]]
@@ -81,17 +107,48 @@
 
     @property
     def raw(self) -> Sequence[Mapping[str, Any]]:
-        """The raw list of metadata dictionaries for the dataset."""
+        """Original metadata dictionaries extracted from the dataset.
+
+        Access the unprocessed metadata as it was provided in the original dataset before
+        any binning, filtering, or transformation operations.
+
+        Returns
+        -------
+        Sequence[Mapping[str, Any]]
+            List of metadata dictionaries, one per dataset item, containing the original key-value
+            pairs as provided in the source data
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._raw
 
     @property
     def exclude(self) -> set[str]:
-        """Factors to exclude from the metadata."""
+        """Factor names excluded from metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are filtered out during processing.
+            Empty set when no exclusions are active.
+
+        """
         return self._exclude
 
     @exclude.setter
     def exclude(self, value: Sequence[str]) -> None:
+        """Set factor names to exclude from processing.
+
+        Automatically clears include filter and resets binning state when exclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to exclude from metadata analysis.
+        """
         exclude = set(value)
         if self._exclude != exclude:
             self._exclude = exclude
@@ -100,11 +157,27 @@
 
     @property
     def include(self) -> set[str]:
-        """Factors to include from the metadata."""
+        """Factor names included in metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are processed during analysis. Empty set when no inclusion filter is active.
+        """
         return self._include
 
     @include.setter
     def include(self, value: Sequence[str]) -> None:
+        """Set factor names to include in processing.
+
+        Automatically clears exclude filter and resets binning state when
+        inclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to include in metadata analysis.
+        """
         include = set(value)
         if self._include != include:
             self._include = include
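Review note: the new setter docstrings make the interplay explicit — each filter clears the other and resets binning. A hypothetical interaction sketch, assuming a constructed metadata instance:

metadata.exclude = ["timestamp"]  # clears any include filter, resets binning state
metadata.include = ["altitude"]   # clears the exclude filter, resets binning state
assert metadata.exclude == set()  # emptied by the include setter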
@@ -113,66 +186,214 @@ class Metadata:
 
     @property
     def continuous_factor_bins(self) -> Mapping[str, int | Sequence[float]]:
-        """Map of factor names to bin counts or bin edges."""
+        """Binning configuration for continuous factors.
+
+        Returns
+        -------
+        Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to either the number of bins
+            (int) or explicit bin edges (sequence of floats).
+        """
         return self._continuous_factor_bins
 
     @continuous_factor_bins.setter
     def continuous_factor_bins(self, bins: Mapping[str, int | Sequence[float]]) -> None:
+        """Update binning configuration for continuous factors.
+
+        Triggers re-binning when configuration changes to ensure data
+        consistency with new bin specifications.
+
+        Parameters
+        ----------
+        bins : Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to bin counts or explicit edges.
+        """
         if self._continuous_factor_bins != bins:
             self._continuous_factor_bins = dict(bins)
             self._reset_bins(bins)
 
     @property
     def auto_bin_method(self) -> Literal["uniform_width", "uniform_count", "clusters"]:
-        """Binning method to use when continuous_factor_bins is not defined."""
+        """Automatic binning strategy for continuous factors.
+
+        Returns
+        -------
+        {"uniform_width", "uniform_count", "clusters"}
+            Current method used for automatic discretization of continuous
+            factors that lack explicit bin specifications.
+        """
         return self._auto_bin_method
 
     @auto_bin_method.setter
     def auto_bin_method(self, method: Literal["uniform_width", "uniform_count", "clusters"]) -> None:
+        """Set automatic binning strategy for continuous factors.
+
+        Triggers re-binning with the new method when strategy changes to
+        ensure consistent discretization across all factors.
+
+        Parameters
+        ----------
+        method : {"uniform_width", "uniform_count", "clusters"}
+            Binning strategy to apply for continuous factors without
+            explicit bin configurations.
+        """
         if self._auto_bin_method != method:
             self._auto_bin_method = method
             self._reset_bins()
 
     @property
     def dataframe(self) -> pl.DataFrame:
-        """Dataframe containing target information and metadata factors."""
+        """Processed DataFrame containing targets and metadata factors.
+
+        Access the main data structure with target information (class labels,
+        scores, bounding boxes) and processed metadata factors ready for analysis.
+
+        Returns
+        -------
+        pl.DataFrame
+            DataFrame with columns for image indices, class labels, scores,
+            bounding boxes (when applicable), and all processed metadata factors.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor binning occurs automatically when accessing factor-related data.
+        """
         self._structure()
         return self._dataframe
 
     @property
     def dropped_factors(self) -> Mapping[str, Sequence[str]]:
-        """Factors that were dropped during preprocessing and the reasons why they were dropped."""
+        """Factors removed during preprocessing with removal reasons.
+
+        Returns
+        -------
+        Mapping[str, Sequence[str]]
+            Dictionary mapping dropped factor names to lists of reasons
+            why they were excluded from the final dataset.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Common removal reasons include incompatible data types, excessive
+        missing values, or insufficient variation.
+        """
         self._structure()
         return self._dropped_factors
 
     @property
-    def discretized_data(self) -> NDArray[np.int64]:
-        """Factor data with continuous data discretized."""
+    def digitized_data(self) -> NDArray[np.int64]:
+        """Factor data with categorical values converted to integer codes.
+
+        Access processed factor data where categorical factors are digitized
+        to integer codes but continuous factors remain in their original form.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing integer-coded
+            categorical data. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms that can handle mixed categorical and
+        continuous data types.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.int64)
 
         self._bin()
         return (
-            self.dataframe.select([info.discretized_col or name for name, info in self.factor_info.items()])
+            self.dataframe.select([_to_col(k, v, False) for k, v in self.factor_info.items()])
+            .to_numpy()
+            .astype(np.int64)
+        )
+
+    @property
+    def binned_data(self) -> NDArray[np.int64]:
+        """Factor data with continuous values discretized into bins.
+
+        Access fully processed factor data where both categorical and
+        continuous factors are converted to integer bin indices.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing binned integer
+            data ready for categorical analysis algorithms. Returns empty array
+            when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms requiring purely discrete input data.
+        """
+        if not self.factor_names:
+            return np.array([], dtype=np.int64)
+
+        self._bin()
+        return (
+            self.dataframe.select([_to_col(k, v, True) for k, v in self.factor_info.items()])
             .to_numpy()
             .astype(np.int64)
         )
 
     @property
     def factor_names(self) -> Sequence[str]:
-        """Factor names of the metadata."""
+        """Names of all processed metadata factors.
+
+        Returns
+        -------
+        Sequence[str]
+            List of factor names that passed filtering and preprocessing steps.
+            Order matches columns in factor_data, digitized_data, and binned_data.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor names respect include/exclude filtering settings.
+        """
         self._structure()
         return list(filter(self._filter, self._factors))
 
     @property
     def factor_info(self) -> Mapping[str, FactorInfo]:
-        """Factor types of the metadata."""
+        """Type information and processing status for each factor.
+
+        Returns
+        -------
+        Mapping[str, FactorInfo]
+            Dictionary mapping factor names to FactorInfo objects containing
+            data type classification and processing flags (binned, digitized).
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Only includes factors that survived preprocessing and filtering.
+        """
         self._bin()
-        return dict(filter(self._filter, self._factors.items()))
+        return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
 
     @property
     def factor_data(self) -> NDArray[Any]:
-        """Factor data as a NumPy array."""
+        """Raw factor values before binning or digitization.
+
+        Access unprocessed factor data in its original numeric form before
+        any categorical encoding or binning transformations are applied.
+
+        Returns
+        -------
+        NDArray[Any]
+            Array with shape (n_samples, n_factors) containing original factor
+            values. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        Use this for algorithms that can work with mixed data types or when
+        you need access to original continuous values. For analysis-ready
+        integer data, use binned_data or digitized_data instead.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.float64)
 
@@ -181,24 +402,67 @@
 
     @property
     def class_labels(self) -> NDArray[np.intp]:
-        """Class labels as a NumPy array."""
+        """Target class labels as integer indices.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array of class indices corresponding to dataset targets. For
+            object detection datasets, contains one label per detection.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Use class_names property to get human-readable label names.
+        """
         self._structure()
         return self._class_labels
 
     @property
     def class_names(self) -> Sequence[str]:
-        """Class names as a list of strings."""
+        """Human-readable names corresponding to class labels.
+
+        Returns
+        -------
+        Sequence[str]
+            List of class names where index corresponds to class label value.
+            Derived from dataset metadata or auto-generated from label indices.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._class_names
 
     @property
     def image_indices(self) -> NDArray[np.intp]:
-        """Indices of images as a NumPy array."""
-        self._bin()
+        """Dataset indices linking targets back to source images.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array mapping each target/detection back to its source image
+            index in the original dataset. Essential for object detection
+            datasets where multiple detections come from single images.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
+        self._structure()
         return self._image_indices
 
     @property
     def image_count(self) -> int:
+        """Total number of images in the dataset.
+
+        Returns
+        -------
+        int
+            Count of unique images in the source dataset, regardless of
+            how many targets/detections each image contains.
+        """
         if self._count == 0:
             self._structure()
         return self._count
@@ -212,7 +476,7 @@
         columns = self._dataframe.columns
         for col in (col for col in cols or columns if _binned(col) in columns):
             self._dataframe.drop_in_place(_binned(col))
-            self._factors[col] = FactorInfo()
+            self._factors[col] = None
         self._is_binned = False
 
     def _structure(self) -> None:
@@ -226,7 +490,7 @@
         scores = []
         srcidx = []
         is_od = None
-        for i in range(len(self._dataset)):
+        for i in tqdm(range(len(self._dataset))):
             _, target, metadata = self._dataset[i]
 
             raw.append(metadata)
@@ -235,15 +499,15 @@
                 target_labels = as_numpy(target.labels)
                 target_len = len(target_labels)
                 if target_len:
-                    labels.extend(target_labels.tolist())
-                    bboxes.extend(as_numpy(target.boxes).tolist())
-                    scores.extend(as_numpy(target.scores).tolist())
+                    labels.append(target_labels)
+                    bboxes.append(as_numpy(target.boxes))
+                    scores.append(as_numpy(target.scores))
                 srcidx.extend([i] * target_len)
             elif isinstance(target, Array):
-                if len(target):
-                    target_len = 1
-                    labels.append(int(np.argmax(as_numpy(target))))
-                    scores.append(target)
+                target_scores = as_numpy(target)
+                if len(target_scores):
+                    labels.append([np.argmax(target_scores)])
+                    scores.append([target_scores])
                     srcidx.append(i)
             else:
                 raise TypeError("Encountered unsupported target type in dataset")
@@ -252,10 +516,11 @@
             if is_od != is_od_target:
                 raise ValueError("Encountered unexpected target type in dataset")
 
-        labels = as_numpy(labels).astype(np.intp)
-        scores = as_numpy(scores).astype(np.float32)
-        bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
-        srcidx = as_numpy(srcidx).astype(np.intp)
+        np_asarray: Callable[..., np.ndarray] = np.concatenate if srcidx else np.asarray
+        labels = np_asarray(labels, dtype=np.intp)
+        scores = np_asarray(scores, dtype=np.float32)
+        bboxes = np_asarray(bboxes, dtype=np.float32) if is_od else None
+        srcidx = np.asarray(srcidx, dtype=np.intp)
 
         index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
 
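Review note: _structure now accumulates per-target NumPy arrays and flattens them with a single np.concatenate (falling back to np.asarray when no targets were seen), rather than extending Python lists item by item. A small illustration with made-up values:

import numpy as np

# hypothetical: two images with 2 and 1 detections respectively
labels = [np.array([0, 2]), np.array([1])]
srcidx = [0, 0, 1]

np_asarray = np.concatenate if srcidx else np.asarray
print(np_asarray(labels, dtype=np.intp))  # [0 2 1]
print(np.asarray(srcidx, dtype=np.intp))  # [0 0 1]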
@@ -277,7 +542,7 @@
         self._class_labels = labels
         self._class_names = list(index2label.values())
         self._image_indices = target_dict["image_index"]
-        self._factors = dict.fromkeys(factor_dict, FactorInfo())
+        self._factors = dict.fromkeys(factor_dict, None)
         self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
         self._dropped_factors = merged[1]
         self._is_structured = True
@@ -303,24 +568,25 @@
         )
 
         column_set = set(df.columns)
-        for col in (col for col in self.factor_names if _binned(col) not in column_set):
+        for col in (col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set):
             # Get data as numpy array for processing
             data = df[col].to_numpy()
-            col_dz = _binned(col)
             if col in factor_bins:
                 # User provided binning
                 bins = factor_bins[col]
-                df = df.with_columns(pl.Series(name=col_dz, values=digitize_data(data, bins).astype(np.int64)))
-                factor_info[col] = FactorInfo("continuous", col_dz)
+                col_bn = _binned(col)
+                df = df.with_columns(pl.Series(name=col_bn, values=digitize_data(data, bins).astype(np.int64)))
+                factor_info[col] = FactorInfo("continuous", is_binned=True)
             else:
                 # Check if data is numeric
-                unique, ordinal = np.unique(data, return_inverse=True)
-                if not np.issubdtype(data.dtype, np.number) or unique.size <= max(20, data.size * 0.01):
-                    # Non-numeric data or small number of unique values - convert to categorical
-                    df = df.with_columns(pl.Series(name=col_dz, values=ordinal.astype(np.int64)))
-                    factor_info[col] = FactorInfo("categorical", col_dz)
-                elif data.dtype == float:
-                    # Many unique values - discretize by binning
+                _, ordinal = np.unique(data, return_inverse=True)
+                if not np.issubdtype(data.dtype, np.number):
+                    # Non-numeric data - convert to categorical
+                    col_dg = _digitized(col)
+                    df = df.with_columns(pl.Series(name=col_dg, values=ordinal.astype(np.int64)))
+                    factor_info[col] = FactorInfo("categorical", is_digitized=True)
+                elif is_continuous(data, self.image_indices):
+                    # Continuous values - discretize by binning
                     warnings.warn(
                         f"A user defined binning was not provided for {col}. "
                         f"Using the {self.auto_bin_method} method to discretize the data. "
@@ -330,10 +596,12 @@
                     )
                     # Create binned version
                     binned_data = bin_data(data, self.auto_bin_method)
-                    df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
-                    factor_info[col] = FactorInfo("continuous", col_dz)
+                    col_bn = _binned(col)
+                    df = df.with_columns(pl.Series(name=col_bn, values=binned_data.astype(np.int64)))
+                    factor_info[col] = FactorInfo("continuous", is_binned=True)
                 else:
-                    factor_info[col] = FactorInfo("discrete", col)
+                    # Non-continuous values - treat as discrete
+                    factor_info[col] = FactorInfo("discrete")
 
         # Store the results
         self._dataframe = df
341
609
  self._is_binned = True
342
610
 
343
611
  def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
344
- """
345
- Add additional factors to the metadata.
612
+ """Add additional factors to metadata collection.
346
613
 
347
- The number of measures per factor must match the number of images
348
- in the dataset or the number of detections in the dataset.
614
+ Extend the current metadata with new factors, automatically handling
615
+ length validation and integration with existing data structures.
349
616
 
350
617
  Parameters
351
618
  ----------
352
619
  factors : Mapping[str, Array | Sequence[Any]]
353
- Dictionary of factors to add to the metadata.
620
+ Dictionary mapping factor names to their values. Factor length must
621
+ match either the number of images or number of detections in the dataset.
622
+
623
+ Raises
624
+ ------
625
+ ValueError
626
+ When factor lengths do not match dataset dimensions.
627
+
628
+ Examples
629
+ --------
630
+ >>> metadata = Metadata(dataset)
631
+ >>> new_factors = {
632
+ ... "brightness": [0.2, 0.8, 0.5, 0.3, 0.4, 0.1, 0.3, 0.2],
633
+ ... "contrast": [1.1, 0.9, 1.0, 0.8, 1.2, 1.0, 0.7, 1.3],
634
+ ... }
635
+ >>> metadata.add_factors(new_factors)
354
636
  """
355
637
  self._structure()
356
638
 
@@ -367,7 +649,7 @@
         for k, v in factors.items():
             data = as_numpy(v)[self.image_indices]
             new_columns.append(pl.Series(name=k, values=data))
-            self._factors[k] = FactorInfo()
+            self._factors[k] = None
 
         if new_columns:
             self._dataframe = self.dataframe.with_columns(new_columns)
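Net effect of the _metadata.py changes: the single discretized_data view splits into digitized_data (categorical factors coded, continuous factors left as-is) and binned_data (every factor reduced to integer bin indices), factor bookkeeping uses None until binning runs, and _structure gains a tqdm progress bar. A hedged usage sketch of the 0.88.0 surface, assuming dataset is an annotated image-classification or object-detection dataset and that Metadata remains importable from dataeval.data:

from dataeval.data import Metadata

metadata = Metadata(dataset, continuous_factor_bins={"altitude": 5})

metadata.factor_names    # factors that survived preprocessing and filtering
metadata.factor_info     # {name: FactorInfo(factor_type, is_binned, is_digitized)}
metadata.factor_data     # raw values, shape (n_samples, n_factors)
metadata.digitized_data  # categorical factors as integer codes
metadata.binned_data     # all factors as integer bin indices

# added factors must match the image count or the detection count
metadata.add_factors({"sensor_gain": [1.0] * metadata.image_count})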