dataeval 1.0.4__tar.gz → 1.0.5__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (111)
  1. {dataeval-1.0.4 → dataeval-1.0.5}/PKG-INFO +1 -1
  2. {dataeval-1.0.4 → dataeval-1.0.5}/pyproject.toml +18 -2
  3. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_embeddings.py +2 -2
  4. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_experimental.py +5 -5
  5. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_metadata.py +7 -7
  6. dataeval-1.0.5/src/dataeval/_version.py +24 -0
  7. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/_balance.py +1 -1
  8. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/_diversity.py +1 -1
  9. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/config.py +3 -3
  10. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_bin.py +2 -2
  11. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_clusterer.py +4 -4
  12. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_compute_ratios.py +3 -3
  13. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_compute_stats.py +3 -3
  14. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +6 -6
  15. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_mst.py +1 -1
  16. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_label_stats.py +1 -1
  17. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_metadata_insights.py +2 -2
  18. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_mutual_info.py +11 -15
  19. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_parity.py +1 -1
  20. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_uap.py +8 -10
  21. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_onnx.py +2 -2
  22. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_torch.py +1 -1
  23. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/_output.py +3 -3
  24. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/_duplicates.py +48 -35
  25. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/_outliers.py +35 -35
  26. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/_shared.py +3 -2
  27. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/scope/_prioritize.py +2 -2
  28. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_classbalance.py +1 -1
  29. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_classfilter.py +1 -1
  30. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_base.py +1 -1
  31. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_univariate.py +1 -1
  32. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/_reconstruction.py +39 -35
  33. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/types.py +4 -4
  34. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/_internal.py +6 -6
  35. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/data.py +4 -4
  36. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/onnx.py +1 -1
  37. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/preprocessing.py +3 -3
  38. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/thresholds.py +3 -3
  39. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/training.py +1 -1
  40. dataeval-1.0.4/src/dataeval/_version.py +0 -34
  41. {dataeval-1.0.4 → dataeval-1.0.5}/.gitignore +0 -0
  42. {dataeval-1.0.4 → dataeval-1.0.5}/LICENSE +0 -0
  43. {dataeval-1.0.4 → dataeval-1.0.5}/README.md +0 -0
  44. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/__init__.py +0 -0
  45. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_helpers.py +0 -0
  46. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_log.py +0 -0
  47. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_warm_cache.py +0 -0
  48. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/__init__.py +0 -0
  49. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/_parity.py +0 -0
  50. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/__init__.py +0 -0
  51. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_ber.py +0 -0
  52. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/__init__.py +0 -0
  53. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_base.py +0 -0
  54. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_cache.py +0 -0
  55. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  56. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_hashstats.py +0 -0
  57. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
  58. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_register.py +0 -0
  59. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_registry.py +0 -0
  60. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  61. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_completeness.py +0 -0
  62. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_coverage.py +0 -0
  63. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_divergence.py +0 -0
  64. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_diversity.py +0 -0
  65. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  66. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_feature_distance.py +0 -0
  67. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_hash.py +0 -0
  68. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_label_errors.py +0 -0
  69. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_label_parity.py +0 -0
  70. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_mst.py +0 -0
  71. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_nullmodel.py +0 -0
  72. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_rank.py +0 -0
  73. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/exceptions.py +0 -0
  74. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/__init__.py +0 -0
  75. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_bovw.py +0 -0
  76. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_flatten.py +0 -0
  77. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_uncertainty.py +0 -0
  78. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/flags.py +0 -0
  79. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/__init__.py +0 -0
  80. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/_aggregator.py +0 -0
  81. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/_sufficiency.py +0 -0
  82. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/schedules.py +0 -0
  83. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/protocols.py +0 -0
  84. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/py.typed +0 -0
  85. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/__init__.py +0 -0
  86. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/scope/__init__.py +0 -0
  87. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/__init__.py +0 -0
  88. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_indices.py +0 -0
  89. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_limit.py +0 -0
  90. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_reverse.py +0 -0
  91. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_select.py +0 -0
  92. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_shuffle.py +0 -0
  93. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/__init__.py +0 -0
  94. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/__init__.py +0 -0
  95. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_chunk.py +0 -0
  96. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
  97. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
  98. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_mmd.py +0 -0
  99. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
  100. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/__init__.py +0 -0
  101. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_base.py +0 -0
  102. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_domain_classifier.py +0 -0
  103. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  104. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  105. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/__init__.py +0 -0
  106. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
  107. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
  108. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/update_strategies.py +0 -0
  109. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/__init__.py +0 -0
  110. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/losses.py +0 -0
  111. {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/models.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.4
3
+ Version: 1.0.5
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -118,6 +118,14 @@ docs = [
118
118
  "markupsafe>=3,<3.0.2",
119
119
  "jupytext>=1.19.1",
120
120
  ]
121
+ security = [ # keep in sync with [tool.uv.constraint-dependencies]
122
+ "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
123
+ "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
124
+ "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
125
+ "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
126
+ "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
127
+ # CVE-2026-24049: (wheel) privilege escalation via unpack
128
+ ]
121
129
  dev = [
122
130
  { include-group = "base" },
123
131
  { include-group = "lint" },
@@ -141,7 +149,9 @@ conflicts = [
141
149
  ]
142
150
  constraint-dependencies = [
143
151
  "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
152
+ "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
144
153
  "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
154
+ "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
145
155
  "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
146
156
  # CVE-2026-24049: (wheel) privilege escalation via unpack
147
157
  ]
@@ -262,20 +272,23 @@ exclude = [
262
272
  ".jupyter_cache",
263
273
  "*env*",
264
274
  "output",
275
+ "_build",
265
276
  "build",
266
277
  ".nox",
267
278
  ".tox",
279
+ "prototype",
268
280
  "src/dataeval/_version.py",
269
281
  ]
270
282
  line-length = 120
271
283
  indent-width = 4
272
284
  target-version = "py310"
285
+ extend-include = ["*.ipynb"]
273
286
 
274
287
  [tool.ruff.lint]
275
288
  select = ["F", "E", "W", "C90", "I", "N", "D", "UP", "YTT", "ANN", "S", "BLE", "B", "A",
276
289
  "COM", "C4", "T10", "ISC", "ICN", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM",
277
- "TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF100", "PERF"]
278
- ignore = ["ANN401", "COM812", "NPY002", "SLF001"]
290
+ "TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF027", "RUF100", "PERF"]
291
+ ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF001"]
279
292
  fixable = ["ALL"]
280
293
  unfixable = []
281
294
  dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
@@ -287,6 +300,9 @@ builtins-strict-checking = false
287
300
  [tool.ruff.lint.isort]
288
301
  known-first-party = ["dataeval"]
289
302
 
303
+ [tool.ruff.lint.mccabe]
304
+ max-complexity = 5
305
+
290
306
  [tool.ruff.lint.pydocstyle]
291
307
  convention = "numpy"
292
308
 
@@ -509,7 +509,7 @@ class Embeddings(Array, FeatureExtractor):
509
509
  images.append(image)
510
510
  return images
511
511
 
512
- def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]:
512
+ def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
513
513
  """Process indices in batches using the extractor."""
514
514
  if self._dataset is None:
515
515
  raise NotFittedError("No dataset bound. Call bind() first.")
@@ -559,7 +559,7 @@ class Embeddings(Array, FeatureExtractor):
559
559
  batch_indices = list(indices[batch_start : batch_start + self._batch_size])
560
560
  yield self._embeddings[batch_indices]
561
561
 
562
- def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]:
562
+ def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]: # noqa: C901
563
563
  """
564
564
  Access embeddings by index, indices or slice.
565
565
 
@@ -13,7 +13,7 @@ from dataeval.exceptions import DeprecatedWarning, ExperimentalWarning
13
13
  F = TypeVar("F", bound=Callable[..., Any])
14
14
 
15
15
 
16
- def _make_warning_message(
16
+ def _make_warning_message( # noqa: C901
17
17
  name: str,
18
18
  kind: str,
19
19
  *,
@@ -51,7 +51,7 @@ def _prepend_doc_note(doc: str | None, note: str) -> str:
51
51
  def experimental(_target: F) -> F: ...
52
52
  @overload
53
53
  def experimental(*, alternative: str | None = None, details: str | None = None) -> Callable[[F], F]: ...
54
- def experimental(
54
+ def experimental( # noqa: C901
55
55
  _target: F | None = None,
56
56
  *,
57
57
  alternative: str | None = None,
@@ -72,7 +72,7 @@ def experimental(
72
72
  def my_func(): ...
73
73
  """
74
74
 
75
- def decorator(target: F) -> F:
75
+ def decorator(target: F) -> F: # noqa: C901
76
76
  name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
77
77
  msg = _make_warning_message(name, "experimental", alternative=alternative, details=details)
78
78
  warned = False
@@ -118,7 +118,7 @@ def deprecated(
118
118
  alternative: str | None = None,
119
119
  details: str | None = None,
120
120
  ) -> Callable[[F], F]: ...
121
- def deprecated(
121
+ def deprecated( # noqa: C901
122
122
  _target: F | None = None,
123
123
  *,
124
124
  since: str | None = None,
@@ -141,7 +141,7 @@ def deprecated(
141
141
  def old_func(): ...
142
142
  """
143
143
 
144
- def decorator(target: F) -> F:
144
+ def decorator(target: F) -> F: # noqa: C901
145
145
  name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
146
146
  msg = _make_warning_message(
147
147
  name,
@@ -282,7 +282,7 @@ class Metadata(Array, FeatureExtractor):
282
282
  raise NotFittedError("No dataset bound. Call bind() first.")
283
283
  yield from self.factor_data
284
284
 
285
- def __getitem__(self, index: int | str | slice) -> Array:
285
+ def __getitem__(self, index: int | str | slice) -> Array: # noqa: C901
286
286
  """Get binned metadata for specific indices or factors.
287
287
 
288
288
  Parameters
@@ -934,7 +934,7 @@ class Metadata(Array, FeatureExtractor):
934
934
  factor = factor[0] if isinstance(factor, tuple) else factor
935
935
  return factor in self.include if self.include else factor not in self.exclude
936
936
 
937
- def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
937
+ def _reset_bins(self, cols: Iterable[str] | None = None) -> None: # noqa: C901
938
938
  if self._is_binned:
939
939
  columns = self._dataframe.columns
940
940
  for col in cols or columns:
@@ -1006,7 +1006,7 @@ class Metadata(Array, FeatureExtractor):
1006
1006
  )
1007
1007
  return target_rows
1008
1008
 
1009
- def _get_target_factor_values(
1009
+ def _get_target_factor_values( # noqa: C901
1010
1010
  self,
1011
1011
  factor_name: str,
1012
1012
  factor_values: Any,
@@ -1152,7 +1152,7 @@ class Metadata(Array, FeatureExtractor):
1152
1152
  self._structure()
1153
1153
  return bool(self._has_targets)
1154
1154
 
1155
- def _process_targets(
1155
+ def _process_targets( # noqa: C901
1156
1156
  self,
1157
1157
  raw: list,
1158
1158
  labels: list,
@@ -1284,7 +1284,7 @@ class Metadata(Array, FeatureExtractor):
1284
1284
  existing = self._factors if hasattr(self, "_factors") else {}
1285
1285
  self._factors = {k: existing.get(k) for k in usable_factors}
1286
1286
 
1287
- def _structure(
1287
+ def _structure( # noqa: C901
1288
1288
  self,
1289
1289
  *,
1290
1290
  progress_callback: ProgressCallback | None = None,
@@ -1478,7 +1478,7 @@ class Metadata(Array, FeatureExtractor):
1478
1478
  df = self._add_column_with_padding(df, col_dg, ordinal.astype(np.int64), is_od)
1479
1479
  return df, FactorInfo("discrete", is_digitized=True)
1480
1480
 
1481
- def _bin(
1481
+ def _bin( # noqa: C901
1482
1482
  self,
1483
1483
  *,
1484
1484
  progress_callback: ProgressCallback | None = None,
@@ -1523,7 +1523,7 @@ class Metadata(Array, FeatureExtractor):
1523
1523
  self._factors.update(factor_info)
1524
1524
  self._is_binned = True
1525
1525
 
1526
- def add_factors(
1526
+ def add_factors( # noqa: C901
1527
1527
  self,
1528
1528
  factors: Mapping[str, Array1D[Any]],
1529
1529
  level: Literal["image", "target", "auto"] = "auto",
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '1.0.5'
22
+ __version_tuple__ = version_tuple = (1, 0, 5)
23
+
24
+ __commit_id__ = commit_id = None
@@ -147,7 +147,7 @@ class Balance(Evaluator):
147
147
  super().__init__(locals())
148
148
 
149
149
  @set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
150
- def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput:
150
+ def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
151
151
  """
152
152
  Compute mutual information between factors and identify imbalanced classes.
153
153
 
@@ -135,7 +135,7 @@ class Diversity(Evaluator):
135
135
  super().__init__(locals())
136
136
 
137
137
  @set_metadata(state=["method", "threshold"])
138
- def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput:
138
+ def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput: # noqa: C901
139
139
  """
140
140
  Compute diversity and classwise diversity for the dataset.
141
141
 
@@ -23,7 +23,7 @@ from pydantic import BaseModel, ConfigDict, field_validator
23
23
 
24
24
  from dataeval.protocols import DeviceLike
25
25
 
26
- ### GLOBAL CONFIG ###
26
+ # GLOBAL CONFIG ###
27
27
 
28
28
 
29
29
  class GlobalConfig(BaseModel):
@@ -77,7 +77,7 @@ class GlobalConfig(BaseModel):
77
77
  _config = GlobalConfig()
78
78
 
79
79
 
80
- ### CONTEXT MANAGER ###
80
+ # CONTEXT MANAGER ###
81
81
 
82
82
 
83
83
  class _ConfigContextManager:
@@ -96,7 +96,7 @@ class _ConfigContextManager:
96
96
  setattr(_config, self._attr_name, self._old)
97
97
 
98
98
 
99
- ### FUNCS ###
99
+ # FUNCS ###
100
100
 
101
101
 
102
102
  def _todevice(device: DeviceLike) -> torch.device:
@@ -92,7 +92,7 @@ def bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.intp]:
92
92
  return np.digitize(data, bin_edges)
93
93
 
94
94
 
95
- def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool:
95
+ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool: # noqa: C901
96
96
  """
97
97
  Determine whether the data is continuous or discrete using the Wasserstein distance.
98
98
 
@@ -144,7 +144,7 @@ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.numbe
144
144
  return bool(shift < DISCRETE_MIN_WD) # if NNN is close enough to uniform, consider the sample continuous.
145
145
 
146
146
 
147
- def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
147
+ def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]: # noqa: C901
148
148
  """
149
149
  Bin continuous data by using the Clusterer to identify clusters.
150
150
 
@@ -99,7 +99,7 @@ class _Clusters:
99
99
  prob: NDArray[np.float64] = exp / np.sum(exp)
100
100
  return prob
101
101
 
102
- def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]:
102
+ def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]: # noqa: C901
103
103
  """Sort samples using complexity-based weighted sampling."""
104
104
  labels = self._get_labels(embeddings)
105
105
  pr = self._complexity(embeddings)
@@ -356,7 +356,7 @@ class _HDBSCAN:
356
356
  self.cluster_selection_epsilon = 0.0
357
357
  self.cluster_selection_method = "eom"
358
358
 
359
- def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN":
359
+ def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN": # noqa: C901
360
360
  """
361
361
  Find clusters based on hierarchical density-based clustering.
362
362
 
@@ -541,7 +541,7 @@ class ClusterStats(TypedDict):
541
541
  nearest_cluster_idx: NDArray[np.int64]
542
542
 
543
543
 
544
- def compute_cluster_stats(
544
+ def compute_cluster_stats( # noqa: C901
545
545
  embeddings: NDArray[np.floating],
546
546
  cluster_labels: _Clusters | NDArray[np.int64],
547
547
  ) -> ClusterStats:
@@ -642,7 +642,7 @@ def compute_cluster_stats(
642
642
  )
643
643
 
644
644
 
645
- def cluster(
645
+ def cluster( # noqa: C901
646
646
  embeddings: ArrayND[float],
647
647
  algorithm: Literal["kmeans", "hdbscan"] = "hdbscan",
648
648
  n_clusters: int | None = None,
@@ -101,7 +101,7 @@ def _build_image_lookup(source_indices: Sequence[SourceIndex]) -> dict[tuple[int
101
101
  return lookup
102
102
 
103
103
 
104
- def _calculate_ratio_for_stat(
104
+ def _calculate_ratio_for_stat( # noqa: C901
105
105
  stat_name: str,
106
106
  box_value: Any,
107
107
  img_value: Any,
@@ -160,7 +160,7 @@ def _calculate_ratio_for_stat(
160
160
  return box_value
161
161
 
162
162
 
163
- def _validate_separate_inputs(
163
+ def _validate_separate_inputs( # noqa: C901
164
164
  stats_output: StatsResult,
165
165
  box_stats_output: StatsResult,
166
166
  ) -> tuple[Sequence[SourceIndex], Sequence[SourceIndex]]:
@@ -241,7 +241,7 @@ def _validate_unified_input(source_indices: Sequence[SourceIndex]) -> None:
241
241
  )
242
242
 
243
243
 
244
- def compute_ratios(
244
+ def compute_ratios( # noqa: C901
245
245
  stats_output: StatsResult,
246
246
  *,
247
247
  target_stats_output: StatsResult | None = None,
@@ -158,7 +158,7 @@ def _determine_channel_indices(calculator_output: list[dict[str, list[Any]]], nu
158
158
  return sorted(channel_indices_needed, key=lambda x: -1 if x is None else x)
159
159
 
160
160
 
161
- def _reconcile_stats(
161
+ def _reconcile_stats( # noqa: C901
162
162
  calculator_output: list[dict[str, list[Any]]],
163
163
  sorted_channels: list[int | None],
164
164
  empty_values_map: dict[str, Any],
@@ -344,7 +344,7 @@ def _aggregate_batch(
344
344
  _UNSET = object()
345
345
 
346
346
 
347
- def compute_stats(
347
+ def compute_stats( # noqa: C901
348
348
  data: Iterable[ArrayLike] | Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
349
349
  *,
350
350
  boxes: Iterable[Iterable[BoxLike] | None] | None = None,
@@ -544,7 +544,7 @@ def compute_stats(
544
544
  )
545
545
 
546
546
 
547
- def combine_stats_results(
547
+ def combine_stats_results( # noqa: C901
548
548
  results: StatsResult | Sequence[StatsResult],
549
549
  ) -> tuple[StatsMap, list[SourceIndex], list[int]]:
550
550
  """Combine one or more StatsResults into unified stats, source_index, and dataset_steps.
@@ -315,9 +315,9 @@ def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_
315
315
  mask1 = condensed_tree.child_size > 1
316
316
  mask2 = condensed_tree.child_size == 1
317
317
  mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
318
- mask4 = np.array(
319
- [parent in cluster_tree_parents for parent in condensed_tree.parent]
320
- ) # check that it's not a leaf cluster
318
+ mask4 = np.array([
319
+ parent in cluster_tree_parents for parent in condensed_tree.parent
320
+ ]) # check that it's not a leaf cluster
321
321
 
322
322
  mask = mask1 | (mask2 & mask3 & mask4)
323
323
 
@@ -501,9 +501,9 @@ def extract_clusters_bcubed(
501
501
  unselected_nodes,
502
502
  )
503
503
 
504
- return np.asarray(
505
- [node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))]
506
- )
504
+ return np.asarray([
505
+ node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))
506
+ ])
507
507
 
508
508
 
509
509
  @numba.njit(cache=True)
@@ -80,7 +80,7 @@ def _expand_tree(
80
80
 
81
81
 
82
82
  @numba.njit(locals={"i": numba.types.intp, "nbr": numba.types.intp, "dist": numba.types.float32}, cache=True)
83
- def _update_tree(
83
+ def _update_tree( # noqa: C901
84
84
  tree: NDArray[np.float32],
85
85
  total_edge: int,
86
86
  disjoint_set: tuple[NDArray[np.int64], NDArray[np.int64]],
@@ -54,7 +54,7 @@ class LabelStatsResult(TypedDict):
54
54
  empty_image_count: int
55
55
 
56
56
 
57
- def label_stats(
57
+ def label_stats( # noqa: C901
58
58
  class_labels: Iterable[int],
59
59
  item_indices: Iterable[int] | None = None,
60
60
  index2label: Mapping[int, str] | None = None,
@@ -68,7 +68,7 @@ def _calc_median_deviations(reference: NDArray[Any], test: NDArray[Any]) -> NDAr
68
68
  return np.abs(np.where(test_dev >= 0, test_dev / pscale, test_dev / nscale)) # (S_t, F)
69
69
 
70
70
 
71
- def factor_deviation(
71
+ def factor_deviation( # noqa: C901
72
72
  reference_factors: Mapping[str, NDArray[Any]],
73
73
  test_factors: Mapping[str, NDArray[Any]],
74
74
  indices: SequenceLike[int],
@@ -183,7 +183,7 @@ def factor_deviation(
183
183
  return results
184
184
 
185
185
 
186
- def factor_predictors(
186
+ def factor_predictors( # noqa: C901
187
187
  factors: Mapping[str, NDArray[Any]],
188
188
  indices: SequenceLike[int],
189
189
  discrete_features: list[bool] | None = None,
@@ -74,7 +74,7 @@ def _merge_labels_and_factors(
74
74
  return data, discrete_features
75
75
 
76
76
 
77
- def mutual_info(
77
+ def mutual_info( # noqa: C901
78
78
  class_labels: Array1D[int],
79
79
  factor_data: Array2D[int | float],
80
80
  discrete_features: Array1D[bool] | None = None,
@@ -124,13 +124,11 @@ def mutual_info(
124
124
 
125
125
  >>> rng = np.random.default_rng(175)
126
126
  >>> class_labels = rng.choice([0, 1, 2], size=100)
127
- >>> factor_data = np.column_stack(
128
- ... [
129
- ... rng.choice([25, 35, 45, 55], size=100), # age
130
- ... rng.choice([50000, 65000, 80000], size=100), # income
131
- ... rng.choice([0, 1], size=100), # gender
132
- ... ]
133
- ... )
127
+ >>> factor_data = np.column_stack([
128
+ ... rng.choice([25, 35, 45, 55], size=100), # age
129
+ ... rng.choice([50000, 65000, 80000], size=100), # income
130
+ ... rng.choice([0, 1], size=100), # gender
131
+ ... ])
134
132
  >>> result = mutual_info(class_labels=class_labels, factor_data=factor_data)
135
133
  >>> result["class_to_factor"]
136
134
  array([1. , 0.034, 0.026, 0.004])
@@ -250,13 +248,11 @@ def mutual_info_classwise(
250
248
 
251
249
  >>> rng = np.random.default_rng(175)
252
250
  >>> class_labels = rng.choice([0, 1, 2], size=100)
253
- >>> factor_data = np.column_stack(
254
- ... [
255
- ... rng.choice([25, 35, 45, 55], size=100), # age
256
- ... rng.choice([50000, 65000, 80000], size=100), # income
257
- ... rng.choice([0, 1], size=100), # gender
258
- ... ]
259
- ... )
251
+ >>> factor_data = np.column_stack([
252
+ ... rng.choice([25, 35, 45, 55], size=100), # age
253
+ ... rng.choice([50000, 65000, 80000], size=100), # income
254
+ ... rng.choice([0, 1], size=100), # gender
255
+ ... ])
260
256
  >>> mutual_info_classwise(class_labels=class_labels, factor_data=factor_data)
261
257
  array([[1.000e+00, 2.077e-02, 2.296e-03, 7.317e-04],
262
258
  [1.000e+00, 4.893e-02, 2.451e-02, 4.362e-03],
@@ -42,7 +42,7 @@ class ParityResult(TypedDict):
42
42
 
43
43
 
44
44
  @experimental
45
- def parity(
45
+ def parity( # noqa: C901
46
46
  factor_data: Array2D[int],
47
47
  class_labels: Array1D[int],
48
48
  ) -> ParityResult:
@@ -57,16 +57,14 @@ def uap(labels: Array2D[int], scores: Array2D[float]) -> float:
57
57
  0.8333333333333333
58
58
 
59
59
  >>> y_true = np.array([0, 0, 1, 1, 2, 2])
60
- >>> y_scores = np.array(
61
- ... [
62
- ... [0.7, 0.2, 0.1],
63
- ... [0.4, 0.3, 0.3],
64
- ... [0.1, 0.8, 0.1],
65
- ... [0.2, 0.3, 0.5],
66
- ... [0.4, 0.4, 0.2],
67
- ... [0.1, 0.2, 0.7],
68
- ... ]
69
- ... )
60
+ >>> y_scores = np.array([
61
+ ... [0.7, 0.2, 0.1],
62
+ ... [0.4, 0.3, 0.3],
63
+ ... [0.1, 0.8, 0.1],
64
+ ... [0.2, 0.3, 0.5],
65
+ ... [0.4, 0.4, 0.2],
66
+ ... [0.1, 0.2, 0.7],
67
+ ... ])
70
68
  >>> uap(y_true, y_scores)
71
69
  0.7777777777777777
72
70
  """
@@ -156,7 +156,7 @@ class OnnxExtractor(ReprMixin):
156
156
  return [transforms]
157
157
  return list(transforms)
158
158
 
159
- def _load_model(self) -> None:
159
+ def _load_model(self) -> None: # noqa: C901
160
160
  """Load the ONNX model and validate configuration."""
161
161
  session_cls = _get_inference_session()
162
162
  providers = _get_execution_providers()
@@ -220,7 +220,7 @@ class OnnxExtractor(ReprMixin):
220
220
  # Ensure float32 for ONNX
221
221
  return result.astype(np.float32)
222
222
 
223
- def __call__(self, data: Any) -> Array:
223
+ def __call__(self, data: Any) -> Array: # noqa: C901
224
224
  """
225
225
  Extract features from a batch of images.
226
226
 
@@ -137,7 +137,7 @@ class TorchExtractor(ReprMixin):
137
137
 
138
138
  return modules_dict[layer_name]
139
139
 
140
- def __call__(self, data: Any) -> Array:
140
+ def __call__(self, data: Any) -> Array: # noqa: C901
141
141
  """
142
142
  Extract features from a batch of images.
143
143
 
@@ -167,7 +167,7 @@ class SufficiencyOutput(DictOutput):
167
167
 
168
168
  return self._build_dataframe(proj_array, output)
169
169
 
170
- def inv_project(self, targets: Mapping[str, ArrayLike] | ArrayLike, n_iter: int = 1000) -> pl.DataFrame:
170
+ def inv_project(self, targets: Mapping[str, ArrayLike] | ArrayLike, n_iter: int = 1000) -> pl.DataFrame: # noqa: C901
171
171
  """
172
172
  Compute training samples needed to achieve target metric values.
173
173
 
@@ -214,7 +214,7 @@ class SufficiencyOutput(DictOutput):
214
214
  return self._build_inv_project_dataframe(results)
215
215
 
216
216
  @staticmethod
217
- def _build_inv_project_dataframe(
217
+ def _build_inv_project_dataframe( # noqa: C901
218
218
  results: Mapping[str, tuple[NDArray[Any], NDArray[np.int64]]],
219
219
  ) -> pl.DataFrame:
220
220
  """Build wide-format DataFrame from inverse projection results."""
@@ -405,7 +405,7 @@ def linear_initialization(metric: NDArray[Any], sizes: NDArray[Any], bounds: Con
405
405
  return np.array([scale, -negative_exponent, asymptote], dtype=np.float64)
406
406
 
407
407
 
408
- def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int, unit_interval: bool) -> NDArray[np.float64]:
408
+ def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int, unit_interval: bool) -> NDArray[np.float64]: # noqa: C901
409
409
  """
410
410
  Retrieve the inverse power curve coefficients for the line of best fit.
411
411