dataeval 1.0.3__tar.gz → 1.0.5__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (111)
  1. {dataeval-1.0.3 → dataeval-1.0.5}/PKG-INFO +1 -1
  2. {dataeval-1.0.3 → dataeval-1.0.5}/pyproject.toml +18 -2
  3. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_embeddings.py +2 -2
  4. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_experimental.py +5 -5
  5. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_metadata.py +44 -26
  6. dataeval-1.0.5/src/dataeval/_version.py +24 -0
  7. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/_balance.py +4 -2
  8. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/_diversity.py +2 -2
  9. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/config.py +3 -3
  10. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_bin.py +2 -2
  11. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_cache.py +11 -2
  12. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_pixelstats.py +14 -2
  13. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_clusterer.py +4 -4
  14. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_compute_ratios.py +3 -3
  15. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_compute_stats.py +31 -10
  16. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +6 -6
  17. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_mst.py +1 -1
  18. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_label_stats.py +1 -1
  19. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_metadata_insights.py +2 -2
  20. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_mutual_info.py +11 -15
  21. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_parity.py +1 -1
  22. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_uap.py +8 -10
  23. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_onnx.py +2 -2
  24. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_torch.py +1 -1
  25. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/_output.py +3 -3
  26. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/_duplicates.py +106 -61
  27. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/_outliers.py +42 -37
  28. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/_shared.py +3 -2
  29. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/scope/_prioritize.py +2 -2
  30. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_classbalance.py +1 -1
  31. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_classfilter.py +1 -1
  32. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_base.py +1 -1
  33. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_univariate.py +1 -1
  34. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/_reconstruction.py +39 -35
  35. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/types.py +4 -4
  36. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/_internal.py +6 -6
  37. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/data.py +4 -4
  38. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/onnx.py +1 -1
  39. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/preprocessing.py +10 -4
  40. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/thresholds.py +3 -3
  41. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/training.py +1 -1
  42. dataeval-1.0.3/src/dataeval/_version.py +0 -34
  43. {dataeval-1.0.3 → dataeval-1.0.5}/.gitignore +0 -0
  44. {dataeval-1.0.3 → dataeval-1.0.5}/LICENSE +0 -0
  45. {dataeval-1.0.3 → dataeval-1.0.5}/README.md +0 -0
  46. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/__init__.py +0 -0
  47. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_helpers.py +0 -0
  48. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_log.py +0 -0
  49. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_warm_cache.py +0 -0
  50. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/__init__.py +0 -0
  51. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/_parity.py +0 -0
  52. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/__init__.py +0 -0
  53. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_ber.py +0 -0
  54. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/__init__.py +0 -0
  55. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_base.py +0 -0
  56. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  57. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_hashstats.py +0 -0
  58. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_register.py +0 -0
  59. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_registry.py +0 -0
  60. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  61. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_completeness.py +0 -0
  62. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_coverage.py +0 -0
  63. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_divergence.py +0 -0
  64. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_diversity.py +0 -0
  65. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  66. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_feature_distance.py +0 -0
  67. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_hash.py +0 -0
  68. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_label_errors.py +0 -0
  69. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_label_parity.py +0 -0
  70. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_mst.py +0 -0
  71. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_nullmodel.py +0 -0
  72. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_rank.py +0 -0
  73. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/exceptions.py +0 -0
  74. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/__init__.py +0 -0
  75. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_bovw.py +0 -0
  76. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_flatten.py +0 -0
  77. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_uncertainty.py +0 -0
  78. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/flags.py +0 -0
  79. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/__init__.py +0 -0
  80. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/_aggregator.py +0 -0
  81. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/_sufficiency.py +0 -0
  82. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/schedules.py +0 -0
  83. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/protocols.py +0 -0
  84. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/py.typed +0 -0
  85. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/__init__.py +0 -0
  86. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/scope/__init__.py +0 -0
  87. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/__init__.py +0 -0
  88. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_indices.py +0 -0
  89. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_limit.py +0 -0
  90. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_reverse.py +0 -0
  91. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_select.py +0 -0
  92. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_shuffle.py +0 -0
  93. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/__init__.py +0 -0
  94. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/__init__.py +0 -0
  95. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_chunk.py +0 -0
  96. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
  97. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
  98. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_mmd.py +0 -0
  99. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
  100. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/__init__.py +0 -0
  101. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_base.py +0 -0
  102. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_domain_classifier.py +0 -0
  103. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  104. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  105. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/__init__.py +0 -0
  106. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
  107. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
  108. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/update_strategies.py +0 -0
  109. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/__init__.py +0 -0
  110. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/losses.py +0 -0
  111. {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/models.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -118,6 +118,14 @@ docs = [
118
118
  "markupsafe>=3,<3.0.2",
119
119
  "jupytext>=1.19.1",
120
120
  ]
121
+ security = [ # keep in sync with [tool.uv.constraint-dependencies]
122
+ "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
123
+ "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
124
+ "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
125
+ "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
126
+ "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
127
+ # CVE-2026-24049: (wheel) privilege escalation via unpack
128
+ ]
121
129
  dev = [
122
130
  { include-group = "base" },
123
131
  { include-group = "lint" },
@@ -141,7 +149,9 @@ conflicts = [
141
149
  ]
142
150
  constraint-dependencies = [
143
151
  "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
152
+ "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
144
153
  "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
154
+ "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
145
155
  "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
146
156
  # CVE-2026-24049: (wheel) privilege escalation via unpack
147
157
  ]
@@ -262,20 +272,23 @@ exclude = [
262
272
  ".jupyter_cache",
263
273
  "*env*",
264
274
  "output",
275
+ "_build",
265
276
  "build",
266
277
  ".nox",
267
278
  ".tox",
279
+ "prototype",
268
280
  "src/dataeval/_version.py",
269
281
  ]
270
282
  line-length = 120
271
283
  indent-width = 4
272
284
  target-version = "py310"
285
+ extend-include = ["*.ipynb"]
273
286
 
274
287
  [tool.ruff.lint]
275
288
  select = ["F", "E", "W", "C90", "I", "N", "D", "UP", "YTT", "ANN", "S", "BLE", "B", "A",
276
289
  "COM", "C4", "T10", "ISC", "ICN", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM",
277
- "TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF100", "PERF"]
278
- ignore = ["ANN401", "COM812", "NPY002", "SLF001"]
290
+ "TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF027", "RUF100", "PERF"]
291
+ ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF001"]
279
292
  fixable = ["ALL"]
280
293
  unfixable = []
281
294
  dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
@@ -287,6 +300,9 @@ builtins-strict-checking = false
287
300
  [tool.ruff.lint.isort]
288
301
  known-first-party = ["dataeval"]
289
302
 
303
+ [tool.ruff.lint.mccabe]
304
+ max-complexity = 5
305
+
290
306
  [tool.ruff.lint.pydocstyle]
291
307
  convention = "numpy"
292
308
 
@@ -509,7 +509,7 @@ class Embeddings(Array, FeatureExtractor):
509
509
  images.append(image)
510
510
  return images
511
511
 
512
- def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]:
512
+ def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
513
513
  """Process indices in batches using the extractor."""
514
514
  if self._dataset is None:
515
515
  raise NotFittedError("No dataset bound. Call bind() first.")
@@ -559,7 +559,7 @@ class Embeddings(Array, FeatureExtractor):
559
559
  batch_indices = list(indices[batch_start : batch_start + self._batch_size])
560
560
  yield self._embeddings[batch_indices]
561
561
 
562
- def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]:
562
+ def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]: # noqa: C901
563
563
  """
564
564
  Access embeddings by index, indices or slice.
565
565
 
@@ -13,7 +13,7 @@ from dataeval.exceptions import DeprecatedWarning, ExperimentalWarning
13
13
  F = TypeVar("F", bound=Callable[..., Any])
14
14
 
15
15
 
16
- def _make_warning_message(
16
+ def _make_warning_message( # noqa: C901
17
17
  name: str,
18
18
  kind: str,
19
19
  *,
@@ -51,7 +51,7 @@ def _prepend_doc_note(doc: str | None, note: str) -> str:
51
51
  def experimental(_target: F) -> F: ...
52
52
  @overload
53
53
  def experimental(*, alternative: str | None = None, details: str | None = None) -> Callable[[F], F]: ...
54
- def experimental(
54
+ def experimental( # noqa: C901
55
55
  _target: F | None = None,
56
56
  *,
57
57
  alternative: str | None = None,
@@ -72,7 +72,7 @@ def experimental(
72
72
  def my_func(): ...
73
73
  """
74
74
 
75
- def decorator(target: F) -> F:
75
+ def decorator(target: F) -> F: # noqa: C901
76
76
  name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
77
77
  msg = _make_warning_message(name, "experimental", alternative=alternative, details=details)
78
78
  warned = False
@@ -118,7 +118,7 @@ def deprecated(
118
118
  alternative: str | None = None,
119
119
  details: str | None = None,
120
120
  ) -> Callable[[F], F]: ...
121
- def deprecated(
121
+ def deprecated( # noqa: C901
122
122
  _target: F | None = None,
123
123
  *,
124
124
  since: str | None = None,
@@ -141,7 +141,7 @@ def deprecated(
141
141
  def old_func(): ...
142
142
  """
143
143
 
144
- def decorator(target: F) -> F:
144
+ def decorator(target: F) -> F: # noqa: C901
145
145
  name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
146
146
  msg = _make_warning_message(
147
147
  name,
@@ -39,6 +39,7 @@ class FactorInfo:
39
39
  factor_type: Literal["categorical", "continuous", "discrete"]
40
40
  is_binned: bool = False
41
41
  is_digitized: bool = False
42
+ level: Literal["image", "target"] = "image"
42
43
 
43
44
 
44
45
  def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
@@ -110,8 +111,8 @@ class Metadata(Array, FeatureExtractor):
110
111
  *,
111
112
  continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
112
113
  auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
113
- exclude: Sequence[str] | None = None,
114
- include: Sequence[str] | None = None,
114
+ exclude: str | Sequence[str] | None = None,
115
+ include: str | Sequence[str] | None = None,
115
116
  ) -> None:
116
117
  self._class_labels: NDArray[np.intp]
117
118
  self._item_indices: NDArray[np.intp]
@@ -132,8 +133,8 @@ class Metadata(Array, FeatureExtractor):
132
133
  if exclude is not None and include is not None:
133
134
  raise ValueError("Filters for `exclude` and `include` are mutually exclusive.")
134
135
 
135
- self._exclude = set(exclude or ())
136
- self._include = set(include or ())
136
+ self._exclude = {exclude} if isinstance(exclude, str) else set(exclude or ())
137
+ self._include = {include} if isinstance(include, str) else set(include or ())
137
138
  self._target_factors_only = False
138
139
 
139
140
  def __repr__(self) -> str:
@@ -281,7 +282,7 @@ class Metadata(Array, FeatureExtractor):
281
282
  raise NotFittedError("No dataset bound. Call bind() first.")
282
283
  yield from self.factor_data
283
284
 
284
- def __getitem__(self, index: int | str | slice) -> Array:
285
+ def __getitem__(self, index: int | str | slice) -> Array: # noqa: C901
285
286
  """Get binned metadata for specific indices or factors.
286
287
 
287
288
  Parameters
@@ -423,17 +424,17 @@ class Metadata(Array, FeatureExtractor):
423
424
  return self._exclude
424
425
 
425
426
  @exclude.setter
426
- def exclude(self, value: Sequence[str]) -> None:
427
+ def exclude(self, value: str | Sequence[str]) -> None:
427
428
  """Set factor names to exclude from processing.
428
429
 
429
430
  Automatically clears include filter and resets binning state when exclusion list changes.
430
431
 
431
432
  Parameters
432
433
  ----------
433
- value : Sequence[str]
434
- Factor names to exclude from metadata analysis.
434
+ value : str | Sequence[str]
435
+ Factor name or names to exclude from metadata analysis.
435
436
  """
436
- exclude = set(value)
437
+ exclude = {value} if isinstance(value, str) else set(value)
437
438
  if self._exclude != exclude:
438
439
  self._exclude = exclude
439
440
  self._include = set()
@@ -451,7 +452,7 @@ class Metadata(Array, FeatureExtractor):
451
452
  return self._include
452
453
 
453
454
  @include.setter
454
- def include(self, value: Sequence[str]) -> None:
455
+ def include(self, value: str | Sequence[str]) -> None:
455
456
  """Set factor names to include in processing.
456
457
 
457
458
  Automatically clears exclude filter and resets binning state when
@@ -459,10 +460,10 @@ class Metadata(Array, FeatureExtractor):
459
460
 
460
461
  Parameters
461
462
  ----------
462
- value : Sequence[str]
463
- Factor names to include in metadata analysis.
463
+ value : str | Sequence[str]
464
+ Factor name or names to include in metadata analysis.
464
465
  """
465
- include = set(value)
466
+ include = {value} if isinstance(value, str) else set(value)
466
467
  if self._include != include:
467
468
  self._include = include
468
469
  self._exclude = set()
@@ -933,7 +934,7 @@ class Metadata(Array, FeatureExtractor):
933
934
  factor = factor[0] if isinstance(factor, tuple) else factor
934
935
  return factor in self.include if self.include else factor not in self.exclude
935
936
 
936
- def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
937
+ def _reset_bins(self, cols: Iterable[str] | None = None) -> None: # noqa: C901
937
938
  if self._is_binned:
938
939
  columns = self._dataframe.columns
939
940
  for col in cols or columns:
@@ -1005,7 +1006,7 @@ class Metadata(Array, FeatureExtractor):
1005
1006
  )
1006
1007
  return target_rows
1007
1008
 
1008
- def _get_target_factor_values(
1009
+ def _get_target_factor_values( # noqa: C901
1009
1010
  self,
1010
1011
  factor_name: str,
1011
1012
  factor_values: Any,
@@ -1115,13 +1116,26 @@ class Metadata(Array, FeatureExtractor):
1115
1116
  raise ValueError(f"Invalid level: {level}. Must be 'image', 'target', or 'auto'")
1116
1117
 
1117
1118
  def _create_factor_column(self, data_array: NDArray, level: str, num_image_rows: int) -> list:
1118
- """Create a factor column with values at the appropriate level."""
1119
+ """Create a factor column with values at the appropriate level.
1120
+
1121
+ For OD datasets with image-level factors, values are stored in image rows
1122
+ and replicated to target rows using item_index mapping, so that bias
1123
+ evaluators can access them via target_data.
1124
+ """
1119
1125
  if level == "image":
1120
- # Create column: image-level values in image rows, None in target rows
1121
- full_data = [None] * len(self.dataframe)
1122
- for idx, val in enumerate(data_array):
1123
- full_data[idx] = val # Image rows come first in our structure
1124
- return full_data
1126
+ # Image rows get the values directly
1127
+ image_values: list = data_array.tolist()
1128
+
1129
+ if self.has_targets():
1130
+ # For OD datasets, replicate image-level values to target rows
1131
+ # using the item_index column which maps each target to its source image
1132
+ target_df = self._dataframe.filter(pl.col("target_index").is_not_null())
1133
+ target_image_indices = target_df["item_index"].to_numpy()
1134
+ target_values = data_array[target_image_indices].tolist()
1135
+ else:
1136
+ target_values = []
1137
+
1138
+ return image_values + target_values
1125
1139
  # level == "target"
1126
1140
  # Create column: None in image rows, target-level values in target rows
1127
1141
  return [None] * num_image_rows + list(data_array)
@@ -1138,7 +1152,7 @@ class Metadata(Array, FeatureExtractor):
1138
1152
  self._structure()
1139
1153
  return bool(self._has_targets)
1140
1154
 
1141
- def _process_targets(
1155
+ def _process_targets( # noqa: C901
1142
1156
  self,
1143
1157
  raw: list,
1144
1158
  labels: list,
@@ -1267,9 +1281,10 @@ class Metadata(Array, FeatureExtractor):
1267
1281
  k for k in factors if not isinstance(self._dataframe.schema.get(k), pl.List | pl.Struct | pl.Array)
1268
1282
  }
1269
1283
 
1270
- self._factors = dict.fromkeys(usable_factors, None)
1284
+ existing = self._factors if hasattr(self, "_factors") else {}
1285
+ self._factors = {k: existing.get(k) for k in usable_factors}
1271
1286
 
1272
- def _structure(
1287
+ def _structure( # noqa: C901
1273
1288
  self,
1274
1289
  *,
1275
1290
  progress_callback: ProgressCallback | None = None,
@@ -1463,7 +1478,7 @@ class Metadata(Array, FeatureExtractor):
1463
1478
  df = self._add_column_with_padding(df, col_dg, ordinal.astype(np.int64), is_od)
1464
1479
  return df, FactorInfo("discrete", is_digitized=True)
1465
1480
 
1466
- def _bin(
1481
+ def _bin( # noqa: C901
1467
1482
  self,
1468
1483
  *,
1469
1484
  progress_callback: ProgressCallback | None = None,
@@ -1492,9 +1507,12 @@ class Metadata(Array, FeatureExtractor):
1492
1507
  factors_to_process = [col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set]
1493
1508
  total_factors = len(factors_to_process)
1494
1509
 
1510
+ target_only = self._target_factors - self._image_factors if is_od else set()
1495
1511
  for i, col in enumerate(factors_to_process):
1496
1512
  data = data_df[col].to_numpy()
1497
1513
  df, info = self._process_factor(df, col, data, factor_bins, is_od)
1514
+ if is_od and col in target_only:
1515
+ info.level = "target"
1498
1516
  factor_info[col] = info
1499
1517
 
1500
1518
  if progress_callback:
@@ -1505,7 +1523,7 @@ class Metadata(Array, FeatureExtractor):
1505
1523
  self._factors.update(factor_info)
1506
1524
  self._is_binned = True
1507
1525
 
1508
- def add_factors(
1526
+ def add_factors( # noqa: C901
1509
1527
  self,
1510
1528
  factors: Mapping[str, Array1D[Any]],
1511
1529
  level: Literal["image", "target", "auto"] = "auto",
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '1.0.5'
22
+ __version_tuple__ = version_tuple = (1, 0, 5)
23
+
24
+ __commit_id__ = commit_id = None
@@ -3,6 +3,7 @@ __all__ = []
3
3
  from dataclasses import dataclass
4
4
  from typing import Any, Literal
5
5
 
6
+ import numpy as np
6
7
  import polars as pl
7
8
 
8
9
  from dataeval import Metadata
@@ -146,7 +147,7 @@ class Balance(Evaluator):
146
147
  super().__init__(locals())
147
148
 
148
149
  @set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
149
- def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput:
150
+ def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
150
151
  """
151
152
  Compute mutual information between factors and identify imbalanced classes.
152
153
 
@@ -269,8 +270,9 @@ class Balance(Evaluator):
269
270
  # Include class_label as the first factor (index 0), then all metadata factors
270
271
  all_factor_names = ["class_label"] + factor_names
271
272
 
273
+ u_classes = np.unique(self.metadata.class_labels)
272
274
  for class_idx in range(classwise.shape[0]):
273
- class_name = index2label.get(class_idx, str(class_idx))
275
+ class_name = index2label.get(int(u_classes[class_idx]), str(u_classes[class_idx]))
274
276
  for factor_idx in range(classwise.shape[1]):
275
277
  mi_value = classwise[class_idx, factor_idx]
276
278
  class_name_col.append(class_name)
@@ -135,7 +135,7 @@ class Diversity(Evaluator):
135
135
  super().__init__(locals())
136
136
 
137
137
  @set_metadata(state=["method", "threshold"])
138
- def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput:
138
+ def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput: # noqa: C901
139
139
  """
140
140
  Compute diversity and classwise diversity for the dataset.
141
141
 
@@ -251,7 +251,7 @@ class Diversity(Evaluator):
251
251
  is_low_diversity_col: list[bool] = []
252
252
 
253
253
  for class_idx in range(classwise_div.shape[0]):
254
- class_name = index2label.get(class_idx, str(class_idx))
254
+ class_name = index2label.get(int(u_classes[class_idx]), str(u_classes[class_idx]))
255
255
  for factor_idx in range(num_factors):
256
256
  div_value = classwise_div[class_idx, factor_idx]
257
257
  if not np.isnan(div_value):
@@ -23,7 +23,7 @@ from pydantic import BaseModel, ConfigDict, field_validator
23
23
 
24
24
  from dataeval.protocols import DeviceLike
25
25
 
26
- ### GLOBAL CONFIG ###
26
+ # GLOBAL CONFIG ###
27
27
 
28
28
 
29
29
  class GlobalConfig(BaseModel):
@@ -77,7 +77,7 @@ class GlobalConfig(BaseModel):
77
77
  _config = GlobalConfig()
78
78
 
79
79
 
80
- ### CONTEXT MANAGER ###
80
+ # CONTEXT MANAGER ###
81
81
 
82
82
 
83
83
  class _ConfigContextManager:
@@ -96,7 +96,7 @@ class _ConfigContextManager:
96
96
  setattr(_config, self._attr_name, self._old)
97
97
 
98
98
 
99
- ### FUNCS ###
99
+ # FUNCS ###
100
100
 
101
101
 
102
102
  def _todevice(device: DeviceLike) -> torch.device:
@@ -92,7 +92,7 @@ def bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.intp]:
92
92
  return np.digitize(data, bin_edges)
93
93
 
94
94
 
95
- def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool:
95
+ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool: # noqa: C901
96
96
  """
97
97
  Determine whether the data is continuous or discrete using the Wasserstein distance.
98
98
 
@@ -144,7 +144,7 @@ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.numbe
144
144
  return bool(shift < DISCRETE_MIN_WD) # if NNN is close enough to uniform, consider the sample continuous.
145
145
 
146
146
 
147
- def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
147
+ def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]: # noqa: C901
148
148
  """
149
149
  Bin continuous data by using the Clusterer to identify clusters.
150
150
 
@@ -22,7 +22,13 @@ class CalculatorCache:
22
22
  This class adapts based on the data type passed in.
23
23
  """
24
24
 
25
- def __init__(self, datum: Any, box: BoundingBox | None = None, per_channel: bool = False) -> None:
25
+ def __init__(
26
+ self,
27
+ datum: Any,
28
+ box: BoundingBox | None = None,
29
+ per_channel: bool = False,
30
+ normalize_pixel_values: bool = False,
31
+ ) -> None:
26
32
  is_spatial = len(datum.shape) >= 2
27
33
  self.raw = datum
28
34
  # Assume image data for now (will be generic in future)
@@ -30,6 +36,7 @@ class CalculatorCache:
30
36
  self.height: int = datum.shape[-2] if is_spatial else 0
31
37
  self.shape: tuple[int, ...] = datum.shape
32
38
  self.per_channel_mode = per_channel
39
+ self.normalize_pixel_values = normalize_pixel_values
33
40
  self.has_box = box is not None
34
41
 
35
42
  # Ensure bounding box
@@ -54,7 +61,9 @@ class CalculatorCache:
54
61
 
55
62
  @cached_property
56
63
  def scaled(self) -> NDArray[Any]:
57
- return rescale(self.image)
64
+ if self.normalize_pixel_values:
65
+ return rescale(self.image)
66
+ return self.image
58
67
 
59
68
  @cached_property
60
69
  def per_channel(self) -> NDArray[Any]:
@@ -39,11 +39,23 @@ class PixelStatCalculator(Calculator[ImageStats]):
39
39
  def _var_func(self, data: NDArray[Any], **kw: Any) -> Any:
40
40
  return np.nanvar(data, **kw) if self._has_nan else np.var(data, **kw)
41
41
 
42
+ @cached_property
43
+ def _histogram_range(self) -> tuple[float, float]:
44
+ if self.cache.normalize_pixel_values:
45
+ return (0.0, 1.0)
46
+ from dataeval.utils.preprocessing import get_bitdepth
47
+
48
+ bitdepth = get_bitdepth(self.cache.scaled)
49
+ if bitdepth.depth == 0:
50
+ return (0.0, 1.0)
51
+ return (0.0, float(bitdepth.pmax))
52
+
42
53
  @cached_property
43
54
  def histogram(self) -> NDArray[np.float64]:
55
+ r = self._histogram_range
44
56
  if self.per_channel_mode:
45
- return np.apply_along_axis(lambda y: np.histogram(y, bins=256, range=(0, 1))[0], 1, self.cache.per_channel)
46
- return np.histogram(self.cache.scaled, bins=256, range=(0, 1))[0]
57
+ return np.apply_along_axis(lambda y: np.histogram(y, bins=256, range=r)[0], 1, self.cache.per_channel)
58
+ return np.histogram(self.cache.scaled, bins=256, range=r)[0]
47
59
 
48
60
  def get_applicable_flags(self) -> ImageStats:
49
61
  """Return which flags this calculator handles."""
@@ -99,7 +99,7 @@ class _Clusters:
99
99
  prob: NDArray[np.float64] = exp / np.sum(exp)
100
100
  return prob
101
101
 
102
- def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]:
102
+ def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]: # noqa: C901
103
103
  """Sort samples using complexity-based weighted sampling."""
104
104
  labels = self._get_labels(embeddings)
105
105
  pr = self._complexity(embeddings)
@@ -356,7 +356,7 @@ class _HDBSCAN:
356
356
  self.cluster_selection_epsilon = 0.0
357
357
  self.cluster_selection_method = "eom"
358
358
 
359
- def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN":
359
+ def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN": # noqa: C901
360
360
  """
361
361
  Find clusters based on hierarchical density-based clustering.
362
362
 
@@ -541,7 +541,7 @@ class ClusterStats(TypedDict):
541
541
  nearest_cluster_idx: NDArray[np.int64]
542
542
 
543
543
 
544
- def compute_cluster_stats(
544
+ def compute_cluster_stats( # noqa: C901
545
545
  embeddings: NDArray[np.floating],
546
546
  cluster_labels: _Clusters | NDArray[np.int64],
547
547
  ) -> ClusterStats:
@@ -642,7 +642,7 @@ def compute_cluster_stats(
642
642
  )
643
643
 
644
644
 
645
- def cluster(
645
+ def cluster( # noqa: C901
646
646
  embeddings: ArrayND[float],
647
647
  algorithm: Literal["kmeans", "hdbscan"] = "hdbscan",
648
648
  n_clusters: int | None = None,
@@ -101,7 +101,7 @@ def _build_image_lookup(source_indices: Sequence[SourceIndex]) -> dict[tuple[int
101
101
  return lookup
102
102
 
103
103
 
104
- def _calculate_ratio_for_stat(
104
+ def _calculate_ratio_for_stat( # noqa: C901
105
105
  stat_name: str,
106
106
  box_value: Any,
107
107
  img_value: Any,
@@ -160,7 +160,7 @@ def _calculate_ratio_for_stat(
160
160
  return box_value
161
161
 
162
162
 
163
- def _validate_separate_inputs(
163
+ def _validate_separate_inputs( # noqa: C901
164
164
  stats_output: StatsResult,
165
165
  box_stats_output: StatsResult,
166
166
  ) -> tuple[Sequence[SourceIndex], Sequence[SourceIndex]]:
@@ -241,7 +241,7 @@ def _validate_unified_input(source_indices: Sequence[SourceIndex]) -> None:
241
241
  )
242
242
 
243
243
 
244
- def compute_ratios(
244
+ def compute_ratios( # noqa: C901
245
245
  stats_output: StatsResult,
246
246
  *,
247
247
  target_stats_output: StatsResult | None = None,