dataeval 1.0.4__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {dataeval-1.0.4 → dataeval-1.0.6}/PKG-INFO +1 -1
  2. {dataeval-1.0.4 → dataeval-1.0.6}/pyproject.toml +36 -3
  3. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/_embeddings.py +2 -2
  4. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/_experimental.py +5 -5
  5. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/_metadata.py +8 -8
  6. dataeval-1.0.6/src/dataeval/_version.py +24 -0
  7. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/bias/_balance.py +20 -17
  8. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/bias/_diversity.py +5 -3
  9. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/bias/_parity.py +0 -2
  10. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/config.py +3 -3
  11. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_ber.py +10 -2
  12. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_bin.py +2 -2
  13. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_clusterer.py +5 -5
  14. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_compute_ratios.py +3 -3
  15. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_compute_stats.py +3 -3
  16. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_coverage.py +31 -13
  17. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +6 -6
  18. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_mst.py +1 -1
  19. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_feature_distance.py +4 -3
  20. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_label_parity.py +3 -3
  21. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_label_stats.py +1 -1
  22. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_metadata_insights.py +26 -22
  23. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_mst.py +1 -1
  24. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_mutual_info.py +31 -35
  25. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_parity.py +1 -1
  26. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_uap.py +10 -12
  27. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/extractors/_onnx.py +2 -2
  28. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/extractors/_torch.py +1 -1
  29. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/performance/_output.py +3 -3
  30. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/performance/_sufficiency.py +1 -1
  31. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/quality/_duplicates.py +49 -36
  32. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/quality/_outliers.py +37 -37
  33. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/quality/_shared.py +3 -2
  34. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/scope/_prioritize.py +8 -5
  35. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_classbalance.py +1 -1
  36. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_classfilter.py +1 -1
  37. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_base.py +3 -2
  38. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_univariate.py +1 -1
  39. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_ood/_domain_classifier.py +25 -9
  40. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_shared/_reconstruction.py +39 -35
  41. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/types.py +4 -4
  42. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/_internal.py +6 -6
  43. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/data.py +4 -4
  44. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/onnx.py +1 -1
  45. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/preprocessing.py +3 -3
  46. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/thresholds.py +3 -3
  47. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/training.py +1 -1
  48. dataeval-1.0.4/src/dataeval/_version.py +0 -34
  49. {dataeval-1.0.4 → dataeval-1.0.6}/.gitignore +0 -0
  50. {dataeval-1.0.4 → dataeval-1.0.6}/LICENSE +0 -0
  51. {dataeval-1.0.4 → dataeval-1.0.6}/README.md +0 -0
  52. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/__init__.py +0 -0
  53. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/_helpers.py +0 -0
  54. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/_log.py +0 -0
  55. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/_warm_cache.py +0 -0
  56. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/bias/__init__.py +0 -0
  57. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/__init__.py +0 -0
  58. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/__init__.py +0 -0
  59. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_base.py +0 -0
  60. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_cache.py +0 -0
  61. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  62. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_hashstats.py +0 -0
  63. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
  64. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_register.py +0 -0
  65. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_registry.py +0 -0
  66. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  67. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_completeness.py +0 -0
  68. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_divergence.py +0 -0
  69. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_diversity.py +0 -0
  70. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  71. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_hash.py +0 -0
  72. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_label_errors.py +0 -0
  73. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_nullmodel.py +0 -0
  74. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/core/_rank.py +0 -0
  75. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/exceptions.py +0 -0
  76. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/extractors/__init__.py +0 -0
  77. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/extractors/_bovw.py +0 -0
  78. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/extractors/_flatten.py +0 -0
  79. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/extractors/_uncertainty.py +0 -0
  80. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/flags.py +0 -0
  81. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/performance/__init__.py +0 -0
  82. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/performance/_aggregator.py +0 -0
  83. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/performance/schedules.py +0 -0
  84. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/protocols.py +0 -0
  85. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/py.typed +0 -0
  86. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/quality/__init__.py +0 -0
  87. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/scope/__init__.py +0 -0
  88. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/__init__.py +0 -0
  89. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_indices.py +0 -0
  90. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_limit.py +0 -0
  91. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_reverse.py +0 -0
  92. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_select.py +0 -0
  93. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/selection/_shuffle.py +0 -0
  94. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/__init__.py +0 -0
  95. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/__init__.py +0 -0
  96. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_chunk.py +0 -0
  97. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
  98. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
  99. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_mmd.py +0 -0
  100. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
  101. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_ood/__init__.py +0 -0
  102. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_ood/_base.py +0 -0
  103. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  104. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  105. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_shared/__init__.py +0 -0
  106. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
  107. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
  108. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/shift/update_strategies.py +0 -0
  109. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/__init__.py +0 -0
  110. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/losses.py +0 -0
  111. {dataeval-1.0.4 → dataeval-1.0.6}/src/dataeval/utils/models.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.4
3
+ Version: 1.0.6
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -114,10 +114,24 @@ docs = [
114
114
  "sphinx-tabs>=3.4.7",
115
115
  "Sphinx>=7.2.6,<9.0.0", # sphinx-immaterial <= 0.13.9 is not compatible with sphinx >=9.0
116
116
  "torchmetrics>=1.0.0",
117
- "torchvision>=0.17.0",
118
117
  "markupsafe>=3,<3.0.2",
119
118
  "jupytext>=1.19.1",
120
119
  ]
120
+ security = [ # keep in sync with [tool.uv.constraint-dependencies]
121
+ "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
122
+ "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
123
+ "onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
124
+ # CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
125
+ # CVE-2026-27489: Vulnerable to Path Traversal via Symlink
126
+ # GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_data
127
+ "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
128
+ "poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
129
+ "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
130
+ "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
131
+ # CVE-2026-24049: (wheel) privilege escalation via unpack
132
+ "tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
133
+ # CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
134
+ ]
121
135
  dev = [
122
136
  { include-group = "base" },
123
137
  { include-group = "lint" },
@@ -141,9 +155,18 @@ conflicts = [
141
155
  ]
142
156
  constraint-dependencies = [
143
157
  "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
158
+ "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
159
+ "onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
160
+ # CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
161
+ # CVE-2026-27489: Vulnerable to Path Traversal via Symlink
162
+ # GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_data
144
163
  "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
164
+ "poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
165
+ "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
145
166
  "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
146
167
  # CVE-2026-24049: (wheel) privilege escalation via unpack
168
+ "tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
169
+ # CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
147
170
  ]
148
171
 
149
172
  [[tool.uv.index]]
@@ -201,6 +224,9 @@ version-file = "src/dataeval/_version.py"
201
224
  [tool.poetry]
202
225
  version = "0.0.0" # unused
203
226
 
227
+ [tool.poetry.dependencies]
228
+ python = ">=3.10,<3.15"
229
+
204
230
  [tool.pyproject2conda.dependencies]
205
231
  numpy = { skip = true, packages = "numpy>=1.24.2" }
206
232
  scikit-learn = { skip = true, packages = "scikit-learn>=1.5.0" }
@@ -262,20 +288,23 @@ exclude = [
262
288
  ".jupyter_cache",
263
289
  "*env*",
264
290
  "output",
291
+ "_build",
265
292
  "build",
266
293
  ".nox",
267
294
  ".tox",
295
+ "prototype",
268
296
  "src/dataeval/_version.py",
269
297
  ]
270
298
  line-length = 120
271
299
  indent-width = 4
272
300
  target-version = "py310"
301
+ extend-include = ["*.ipynb"]
273
302
 
274
303
  [tool.ruff.lint]
275
304
  select = ["F", "E", "W", "C90", "I", "N", "D", "UP", "YTT", "ANN", "S", "BLE", "B", "A",
276
305
  "COM", "C4", "T10", "ISC", "ICN", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM",
277
- "TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF100", "PERF"]
278
- ignore = ["ANN401", "COM812", "NPY002", "SLF001"]
306
+ "TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF027", "RUF100", "PERF"]
307
+ ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF001"]
279
308
  fixable = ["ALL"]
280
309
  unfixable = []
281
310
  dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
@@ -287,10 +316,14 @@ builtins-strict-checking = false
287
316
  [tool.ruff.lint.isort]
288
317
  known-first-party = ["dataeval"]
289
318
 
319
+ [tool.ruff.lint.mccabe]
320
+ max-complexity = 5
321
+
290
322
  [tool.ruff.lint.pydocstyle]
291
323
  convention = "numpy"
292
324
 
293
325
  [tool.ruff.format]
326
+ preview = true
294
327
  quote-style = "double"
295
328
  indent-style = "space"
296
329
  skip-magic-trailing-comma = false
@@ -509,7 +509,7 @@ class Embeddings(Array, FeatureExtractor):
509
509
  images.append(image)
510
510
  return images
511
511
 
512
- def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]:
512
+ def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
513
513
  """Process indices in batches using the extractor."""
514
514
  if self._dataset is None:
515
515
  raise NotFittedError("No dataset bound. Call bind() first.")
@@ -559,7 +559,7 @@ class Embeddings(Array, FeatureExtractor):
559
559
  batch_indices = list(indices[batch_start : batch_start + self._batch_size])
560
560
  yield self._embeddings[batch_indices]
561
561
 
562
- def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]:
562
+ def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]: # noqa: C901
563
563
  """
564
564
  Access embeddings by index, indices or slice.
565
565
 
@@ -13,7 +13,7 @@ from dataeval.exceptions import DeprecatedWarning, ExperimentalWarning
13
13
  F = TypeVar("F", bound=Callable[..., Any])
14
14
 
15
15
 
16
- def _make_warning_message(
16
+ def _make_warning_message( # noqa: C901
17
17
  name: str,
18
18
  kind: str,
19
19
  *,
@@ -51,7 +51,7 @@ def _prepend_doc_note(doc: str | None, note: str) -> str:
51
51
  def experimental(_target: F) -> F: ...
52
52
  @overload
53
53
  def experimental(*, alternative: str | None = None, details: str | None = None) -> Callable[[F], F]: ...
54
- def experimental(
54
+ def experimental( # noqa: C901
55
55
  _target: F | None = None,
56
56
  *,
57
57
  alternative: str | None = None,
@@ -72,7 +72,7 @@ def experimental(
72
72
  def my_func(): ...
73
73
  """
74
74
 
75
- def decorator(target: F) -> F:
75
+ def decorator(target: F) -> F: # noqa: C901
76
76
  name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
77
77
  msg = _make_warning_message(name, "experimental", alternative=alternative, details=details)
78
78
  warned = False
@@ -118,7 +118,7 @@ def deprecated(
118
118
  alternative: str | None = None,
119
119
  details: str | None = None,
120
120
  ) -> Callable[[F], F]: ...
121
- def deprecated(
121
+ def deprecated( # noqa: C901
122
122
  _target: F | None = None,
123
123
  *,
124
124
  since: str | None = None,
@@ -141,7 +141,7 @@ def deprecated(
141
141
  def old_func(): ...
142
142
  """
143
143
 
144
- def decorator(target: F) -> F:
144
+ def decorator(target: F) -> F: # noqa: C901
145
145
  name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
146
146
  msg = _make_warning_message(
147
147
  name,
@@ -282,7 +282,7 @@ class Metadata(Array, FeatureExtractor):
282
282
  raise NotFittedError("No dataset bound. Call bind() first.")
283
283
  yield from self.factor_data
284
284
 
285
- def __getitem__(self, index: int | str | slice) -> Array:
285
+ def __getitem__(self, index: int | str | slice) -> Array: # noqa: C901
286
286
  """Get binned metadata for specific indices or factors.
287
287
 
288
288
  Parameters
@@ -650,7 +650,7 @@ class Metadata(Array, FeatureExtractor):
650
650
  -------
651
651
  Sequence[str]
652
652
  List of factor names that passed filtering and preprocessing steps.
653
- Order matches columns in factor_data and binned_data.
653
+ Order matches columns in factor_data.
654
654
 
655
655
  Notes
656
656
  -----
@@ -934,7 +934,7 @@ class Metadata(Array, FeatureExtractor):
934
934
  factor = factor[0] if isinstance(factor, tuple) else factor
935
935
  return factor in self.include if self.include else factor not in self.exclude
936
936
 
937
- def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
937
+ def _reset_bins(self, cols: Iterable[str] | None = None) -> None: # noqa: C901
938
938
  if self._is_binned:
939
939
  columns = self._dataframe.columns
940
940
  for col in cols or columns:
@@ -1006,7 +1006,7 @@ class Metadata(Array, FeatureExtractor):
1006
1006
  )
1007
1007
  return target_rows
1008
1008
 
1009
- def _get_target_factor_values(
1009
+ def _get_target_factor_values( # noqa: C901
1010
1010
  self,
1011
1011
  factor_name: str,
1012
1012
  factor_values: Any,
@@ -1152,7 +1152,7 @@ class Metadata(Array, FeatureExtractor):
1152
1152
  self._structure()
1153
1153
  return bool(self._has_targets)
1154
1154
 
1155
- def _process_targets(
1155
+ def _process_targets( # noqa: C901
1156
1156
  self,
1157
1157
  raw: list,
1158
1158
  labels: list,
@@ -1284,7 +1284,7 @@ class Metadata(Array, FeatureExtractor):
1284
1284
  existing = self._factors if hasattr(self, "_factors") else {}
1285
1285
  self._factors = {k: existing.get(k) for k in usable_factors}
1286
1286
 
1287
- def _structure(
1287
+ def _structure( # noqa: C901
1288
1288
  self,
1289
1289
  *,
1290
1290
  progress_callback: ProgressCallback | None = None,
@@ -1478,7 +1478,7 @@ class Metadata(Array, FeatureExtractor):
1478
1478
  df = self._add_column_with_padding(df, col_dg, ordinal.astype(np.int64), is_od)
1479
1479
  return df, FactorInfo("discrete", is_digitized=True)
1480
1480
 
1481
- def _bin(
1481
+ def _bin( # noqa: C901
1482
1482
  self,
1483
1483
  *,
1484
1484
  progress_callback: ProgressCallback | None = None,
@@ -1523,7 +1523,7 @@ class Metadata(Array, FeatureExtractor):
1523
1523
  self._factors.update(factor_info)
1524
1524
  self._is_binned = True
1525
1525
 
1526
- def add_factors(
1526
+ def add_factors( # noqa: C901
1527
1527
  self,
1528
1528
  factors: Mapping[str, Array1D[Any]],
1529
1529
  level: Literal["image", "target", "auto"] = "auto",
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '1.0.6'
22
+ __version_tuple__ = version_tuple = (1, 0, 6)
23
+
24
+ __commit_id__ = commit_id = None
@@ -22,28 +22,30 @@ class BalanceOutput(DictOutput):
22
22
  """
23
23
  Output class for the :class:`.Balance` :term:`bias<Bias>` evaluator.
24
24
 
25
- Contains three polars DataFrames with mutual information scores and threshold flags.
25
+ Contains three polars DataFrames with normalized mutual information scores and threshold flags.
26
26
 
27
27
  Attributes
28
28
  ----------
29
29
  balance : pl.DataFrame
30
- DataFrame with global class-to-factor mutual information:
30
+ DataFrame with global class-to-factor normalized mutual information:
31
31
 
32
- - factor_name: str - Name of the metadata factor
33
- - mi_value: float - Mutual information value between this factor and class labels
32
+ - factor_name: str - Name of the metadata factor. Includes "class_label"
33
+ which represents the self-information (always 1.0).
34
+ - mi_value: float - Normalized mutual information value between this
35
+ factor and class labels
34
36
  factors : pl.DataFrame
35
- DataFrame with inter-factor mutual information correlations:
37
+ DataFrame with inter-factor normalized mutual information correlations:
36
38
 
37
39
  - factor1: str - Name of the first factor
38
40
  - factor2: str - Name of the second factor
39
- - mi_value: float - Mutual information value
41
+ - mi_value: float - Normalized mutual information value
40
42
  - is_correlated: bool - True if mi_value > factor_correlation_threshold
41
43
  classwise : pl.DataFrame
42
- DataFrame with per-class-to-factor mutual information:
44
+ DataFrame with per-class-to-factor normalized mutual information:
43
45
 
44
46
  - class_name: str - Name of the class
45
47
  - factor_name: str - Name of the metadata factor
46
- - mi_value: float - Mutual information value
48
+ - mi_value: float - Normalized mutual information value
47
49
  - is_imbalanced: bool - True if mi_value > class_imbalance_threshold
48
50
  """
49
51
 
@@ -58,21 +60,21 @@ class BalanceOutput(DictOutput):
58
60
 
59
61
  class Balance(Evaluator):
60
62
  """
61
- Computes mutual information (MI) between factors (class label, metadata, label/image properties).
63
+ Computes normalized mutual information (NMI) between factors (class label, metadata, label/image properties).
62
64
 
63
65
  Identifies imbalanced classes and highly correlated metadata factors based on
64
- mutual information thresholds.
66
+ NMI thresholds.
65
67
 
66
68
  Parameters
67
69
  ----------
68
70
  num_neighbors : int, default 5
69
71
  Number of points to consider as neighbors
70
72
  class_imbalance_threshold : float, default 0.3
71
- Threshold for identifying imbalanced classes. Classes with MI above this
73
+ Threshold for identifying imbalanced classes. Classes with NMI above this
72
74
  threshold with any metadata factor are considered imbalanced.
73
75
  factor_correlation_threshold : float, default 0.5
74
76
  Threshold for identifying highly correlated metadata factors. Factor pairs
75
- with MI above this threshold are considered highly correlated.
77
+ with NMI above this threshold are considered highly correlated.
76
78
 
77
79
  Attributes
78
80
  ----------
@@ -89,7 +91,8 @@ class Balance(Evaluator):
89
91
  -----
90
92
  We use `mutual_info_classif` from sklearn since class label is categorical.
91
93
  `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
92
- seed. MI is computed differently for categorical and continuous variables.
94
+ seed. MI is computed differently for categorical and continuous variables, and
95
+ in all cases normalized or transformed to [0, 1] prior to being returned.
93
96
 
94
97
  Examples
95
98
  --------
@@ -147,9 +150,9 @@ class Balance(Evaluator):
147
150
  super().__init__(locals())
148
151
 
149
152
  @set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
150
- def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput:
153
+ def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
151
154
  """
152
- Compute mutual information between factors and identify imbalanced classes.
155
+ Compute normalized mutual information between factors and identify imbalanced classes.
153
156
 
154
157
  Parameters
155
158
  ----------
@@ -160,7 +163,7 @@ class Balance(Evaluator):
160
163
  Returns
161
164
  -------
162
165
  BalanceOutput
163
- Three DataFrames containing MI scores and threshold flags:
166
+ Three DataFrames containing NMI scores and threshold flags:
164
167
 
165
168
  - balance: Global class-to-factor normalized mutual information
166
169
  - factors: Inter-factor normalized mutual information
@@ -168,7 +171,7 @@ class Balance(Evaluator):
168
171
 
169
172
  Example
170
173
  -------
171
- Return balance (mutual information) of factors with class_labels
174
+ Return balance (NMI) of factors with class_labels
172
175
 
173
176
  >>> from dataeval import Metadata
174
177
  >>> metadata = Metadata(dataset)
@@ -56,7 +56,7 @@ class Diversity(Evaluator):
56
56
  Through standard histogram binning, for continuous variables.
57
57
 
58
58
  The method specified defines diversity as the inverse Simpson diversity index linearly rescaled to
59
- the unit interval, or the normalized form of the Shannon entropy.
59
+ the unit interval [0, 1], or the normalized form of the Shannon entropy.
60
60
 
61
61
  diversity = 1 implies that samples are evenly distributed across a particular factor
62
62
  diversity = 0 implies that all samples belong to one category/bin
@@ -66,7 +66,9 @@ class Diversity(Evaluator):
66
66
  Parameters
67
67
  ----------
68
68
  method : "simpson" or "shannon", default "simpson"
69
- The methodology used for defining diversity
69
+ The methodology used for defining diversity. When "simpson" is used,
70
+ the index is linearly rescaled so that 1.0 represents maximum diversity
71
+ (even distribution) and 0.0 represents minimum diversity (all samples in one bin).
70
72
  threshold : float, default 0.5
71
73
  Threshold for identifying low diversity. Factors with diversity values
72
74
  at or below this threshold are flagged as having low diversity.
@@ -135,7 +137,7 @@ class Diversity(Evaluator):
135
137
  super().__init__(locals())
136
138
 
137
139
  @set_metadata(state=["method", "threshold"])
138
- def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput:
140
+ def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput: # noqa: C901
139
141
  """
140
142
  Compute diversity and classwise diversity for the dataset.
141
143
 
@@ -118,8 +118,6 @@ class Parity(Evaluator):
118
118
 
119
119
  >>> config = Parity.Config(score_threshold=0.4, p_value_threshold=0.01)
120
120
  >>> parity = Parity(config=config)
121
-
122
- output = parity(metadata.binned_data, metadata.class_labels.tolist())
123
121
  """
124
122
 
125
123
  class Config(EvaluatorConfig):
@@ -23,7 +23,7 @@ from pydantic import BaseModel, ConfigDict, field_validator
23
23
 
24
24
  from dataeval.protocols import DeviceLike
25
25
 
26
- ### GLOBAL CONFIG ###
26
+ # GLOBAL CONFIG ###
27
27
 
28
28
 
29
29
  class GlobalConfig(BaseModel):
@@ -77,7 +77,7 @@ class GlobalConfig(BaseModel):
77
77
  _config = GlobalConfig()
78
78
 
79
79
 
80
- ### CONTEXT MANAGER ###
80
+ # CONTEXT MANAGER ###
81
81
 
82
82
 
83
83
  class _ConfigContextManager:
@@ -96,7 +96,7 @@ class _ConfigContextManager:
96
96
  setattr(_config, self._attr_name, self._old)
97
97
 
98
98
 
99
- ### FUNCS ###
99
+ # FUNCS ###
100
100
 
101
101
 
102
102
  def _todevice(device: DeviceLike) -> torch.device:
@@ -78,6 +78,8 @@ def ber_mst(embeddings: ArrayND[float], class_labels: Array1D[int]) -> BERResult
78
78
  """
79
79
  Estimate Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree.
80
80
 
81
+ BER bounds the irreducible classification error given the current feature
82
+ representation — the error attributable to class overlap in embedding space.
81
83
  Uses FR with a minimum spanning tree (MST) test statistic basis.
82
84
 
83
85
  Parameters
@@ -137,7 +139,13 @@ def ber_knn(embeddings: ArrayND[float], class_labels: Array1D[int], k: int) -> B
137
139
  """
138
140
  Estimate Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using KNN.
139
141
 
140
- Uses KNN test statistic basis.
142
+ BER bounds the irreducible classification error given the current feature
143
+ representation — the error attributable to class overlap in embedding space.
144
+ Uses KNN test statistic basis. The estimator's behavior depends on the value of k:
145
+ - k=1: Uses 1-NN for the lower bound and 2-NN for the upper bound.
146
+ - k=2: Uses 2-NN for the lower bound and 3-NN for the upper bound.
147
+ - 2<k<=5: Uses k-NN for the lower bound and (k+1)-NN for the upper bound.
148
+ - k>5: Only available for binary classification; uses k-NN for both bounds with specialized asymptotic weights.
141
149
 
142
150
  Parameters
143
151
  ----------
@@ -146,7 +154,7 @@ def ber_knn(embeddings: ArrayND[float], class_labels: Array1D[int], k: int) -> B
146
154
  class_labels : Array1D[int]
147
155
  Array of class labels for each image. Can be a 1D list, or array-like object.
148
156
  k : int
149
- Number of nearest neighbors for KNN estimator
157
+ Number of nearest neighbors for KNN estimator. Should be between 1 and the number of samples.
150
158
 
151
159
  Returns
152
160
  -------
@@ -92,7 +92,7 @@ def bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.intp]:
92
92
  return np.digitize(data, bin_edges)
93
93
 
94
94
 
95
- def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool:
95
+ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool: # noqa: C901
96
96
  """
97
97
  Determine whether the data is continuous or discrete using the Wasserstein distance.
98
98
 
@@ -144,7 +144,7 @@ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.numbe
144
144
  return bool(shift < DISCRETE_MIN_WD) # if NNN is close enough to uniform, consider the sample continuous.
145
145
 
146
146
 
147
- def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
147
+ def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]: # noqa: C901
148
148
  """
149
149
  Bin continuous data by using the Clusterer to identify clusters.
150
150
 
@@ -99,7 +99,7 @@ class _Clusters:
99
99
  prob: NDArray[np.float64] = exp / np.sum(exp)
100
100
  return prob
101
101
 
102
- def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]:
102
+ def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]: # noqa: C901
103
103
  """Sort samples using complexity-based weighted sampling."""
104
104
  labels = self._get_labels(embeddings)
105
105
  pr = self._complexity(embeddings)
@@ -241,7 +241,7 @@ class _HDBSCANSorter:
241
241
  n_samples_per_cluster = np.bincount(labels)
242
242
  _logger.debug(
243
243
  "HDBSCAN clustering complete: %d clusters, samples per cluster: min=%d, max=%d, mean=%.1f",
244
- clst.unique_clusters,
244
+ len(clst.unique_clusters),
245
245
  np.min(n_samples_per_cluster),
246
246
  np.max(n_samples_per_cluster),
247
247
  np.mean(n_samples_per_cluster),
@@ -356,7 +356,7 @@ class _HDBSCAN:
356
356
  self.cluster_selection_epsilon = 0.0
357
357
  self.cluster_selection_method = "eom"
358
358
 
359
- def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN":
359
+ def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN": # noqa: C901
360
360
  """
361
361
  Find clusters based on hierarchical density-based clustering.
362
362
 
@@ -541,7 +541,7 @@ class ClusterStats(TypedDict):
541
541
  nearest_cluster_idx: NDArray[np.int64]
542
542
 
543
543
 
544
- def compute_cluster_stats(
544
+ def compute_cluster_stats( # noqa: C901
545
545
  embeddings: NDArray[np.floating],
546
546
  cluster_labels: _Clusters | NDArray[np.int64],
547
547
  ) -> ClusterStats:
@@ -642,7 +642,7 @@ def compute_cluster_stats(
642
642
  )
643
643
 
644
644
 
645
- def cluster(
645
+ def cluster( # noqa: C901
646
646
  embeddings: ArrayND[float],
647
647
  algorithm: Literal["kmeans", "hdbscan"] = "hdbscan",
648
648
  n_clusters: int | None = None,
@@ -101,7 +101,7 @@ def _build_image_lookup(source_indices: Sequence[SourceIndex]) -> dict[tuple[int
101
101
  return lookup
102
102
 
103
103
 
104
- def _calculate_ratio_for_stat(
104
+ def _calculate_ratio_for_stat( # noqa: C901
105
105
  stat_name: str,
106
106
  box_value: Any,
107
107
  img_value: Any,
@@ -160,7 +160,7 @@ def _calculate_ratio_for_stat(
160
160
  return box_value
161
161
 
162
162
 
163
- def _validate_separate_inputs(
163
+ def _validate_separate_inputs( # noqa: C901
164
164
  stats_output: StatsResult,
165
165
  box_stats_output: StatsResult,
166
166
  ) -> tuple[Sequence[SourceIndex], Sequence[SourceIndex]]:
@@ -241,7 +241,7 @@ def _validate_unified_input(source_indices: Sequence[SourceIndex]) -> None:
241
241
  )
242
242
 
243
243
 
244
- def compute_ratios(
244
+ def compute_ratios( # noqa: C901
245
245
  stats_output: StatsResult,
246
246
  *,
247
247
  target_stats_output: StatsResult | None = None,
@@ -158,7 +158,7 @@ def _determine_channel_indices(calculator_output: list[dict[str, list[Any]]], nu
158
158
  return sorted(channel_indices_needed, key=lambda x: -1 if x is None else x)
159
159
 
160
160
 
161
- def _reconcile_stats(
161
+ def _reconcile_stats( # noqa: C901
162
162
  calculator_output: list[dict[str, list[Any]]],
163
163
  sorted_channels: list[int | None],
164
164
  empty_values_map: dict[str, Any],
@@ -344,7 +344,7 @@ def _aggregate_batch(
344
344
  _UNSET = object()
345
345
 
346
346
 
347
- def compute_stats(
347
+ def compute_stats( # noqa: C901
348
348
  data: Iterable[ArrayLike] | Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
349
349
  *,
350
350
  boxes: Iterable[Iterable[BoxLike] | None] | None = None,
@@ -544,7 +544,7 @@ def compute_stats(
544
544
  )
545
545
 
546
546
 
547
- def combine_stats_results(
547
+ def combine_stats_results( # noqa: C901
548
548
  results: StatsResult | Sequence[StatsResult],
549
549
  ) -> tuple[StatsMap, list[SourceIndex], list[int]]:
550
550
  """Combine one or more StatsResults into unified stats, source_index, and dataset_steps.