dataeval 1.0.4__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-1.0.4 → dataeval-1.0.5}/PKG-INFO +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/pyproject.toml +18 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_embeddings.py +2 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_experimental.py +5 -5
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_metadata.py +7 -7
- dataeval-1.0.5/src/dataeval/_version.py +24 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/_balance.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/_diversity.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/config.py +3 -3
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_bin.py +2 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_clusterer.py +4 -4
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_compute_ratios.py +3 -3
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_compute_stats.py +3 -3
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +6 -6
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_mst.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_label_stats.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_metadata_insights.py +2 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_mutual_info.py +11 -15
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_parity.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_uap.py +8 -10
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_onnx.py +2 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_torch.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/_output.py +3 -3
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/_duplicates.py +48 -35
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/_outliers.py +35 -35
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/_shared.py +3 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/scope/_prioritize.py +2 -2
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_classbalance.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_classfilter.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_base.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_univariate.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/_reconstruction.py +39 -35
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/types.py +4 -4
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/_internal.py +6 -6
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/data.py +4 -4
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/onnx.py +1 -1
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/preprocessing.py +3 -3
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/thresholds.py +3 -3
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/training.py +1 -1
- dataeval-1.0.4/src/dataeval/_version.py +0 -34
- {dataeval-1.0.4 → dataeval-1.0.5}/.gitignore +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/LICENSE +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/README.md +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_helpers.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_log.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/_warm_cache.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/bias/_parity.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_ber.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_base.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_cache.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_hashstats.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_register.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_registry.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_calculators/_visualstats.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_completeness.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_coverage.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_divergence.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_diversity.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_feature_distance.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_hash.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_label_errors.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_label_parity.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_mst.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_nullmodel.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/core/_rank.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/exceptions.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_bovw.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_flatten.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/extractors/_uncertainty.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/flags.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/_aggregator.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/_sufficiency.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/performance/schedules.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/protocols.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/py.typed +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/quality/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/scope/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_indices.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_limit.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_reverse.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_select.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/selection/_shuffle.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_chunk.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_mmd.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_base.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_domain_classifier.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/shift/update_strategies.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/losses.py +0 -0
- {dataeval-1.0.4 → dataeval-1.0.5}/src/dataeval/utils/models.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataeval
|
|
3
|
-
Version: 1.0.4
|
|
3
|
+
Version: 1.0.5
|
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
|
@@ -118,6 +118,14 @@ docs = [
|
|
|
118
118
|
"markupsafe>=3,<3.0.2",
|
|
119
119
|
"jupytext>=1.19.1",
|
|
120
120
|
]
|
|
121
|
+
security = [ # keep in sync with [tool.uv.constraint-dependencies]
|
|
122
|
+
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
123
|
+
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
124
|
+
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
125
|
+
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
126
|
+
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
127
|
+
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
128
|
+
]
|
|
121
129
|
dev = [
|
|
122
130
|
{ include-group = "base" },
|
|
123
131
|
{ include-group = "lint" },
|
|
@@ -141,7 +149,9 @@ conflicts = [
|
|
|
141
149
|
]
|
|
142
150
|
constraint-dependencies = [
|
|
143
151
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
152
|
+
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
144
153
|
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
154
|
+
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
145
155
|
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
146
156
|
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
147
157
|
]
|
|
@@ -262,20 +272,23 @@ exclude = [
|
|
|
262
272
|
".jupyter_cache",
|
|
263
273
|
"*env*",
|
|
264
274
|
"output",
|
|
275
|
+
"_build",
|
|
265
276
|
"build",
|
|
266
277
|
".nox",
|
|
267
278
|
".tox",
|
|
279
|
+
"prototype",
|
|
268
280
|
"src/dataeval/_version.py",
|
|
269
281
|
]
|
|
270
282
|
line-length = 120
|
|
271
283
|
indent-width = 4
|
|
272
284
|
target-version = "py310"
|
|
285
|
+
extend-include = ["*.ipynb"]
|
|
273
286
|
|
|
274
287
|
[tool.ruff.lint]
|
|
275
288
|
select = ["F", "E", "W", "C90", "I", "N", "D", "UP", "YTT", "ANN", "S", "BLE", "B", "A",
|
|
276
289
|
"COM", "C4", "T10", "ISC", "ICN", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM",
|
|
277
|
-
"TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF100", "PERF"]
|
|
278
|
-
ignore = ["ANN401", "COM812", "NPY002", "SLF001"]
|
|
290
|
+
"TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF027", "RUF100", "PERF"]
|
|
291
|
+
ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF001"]
|
|
279
292
|
fixable = ["ALL"]
|
|
280
293
|
unfixable = []
|
|
281
294
|
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
|
|
@@ -287,6 +300,9 @@ builtins-strict-checking = false
|
|
|
287
300
|
[tool.ruff.lint.isort]
|
|
288
301
|
known-first-party = ["dataeval"]
|
|
289
302
|
|
|
303
|
+
[tool.ruff.lint.mccabe]
|
|
304
|
+
max-complexity = 5
|
|
305
|
+
|
|
290
306
|
[tool.ruff.lint.pydocstyle]
|
|
291
307
|
convention = "numpy"
|
|
292
308
|
|
|
@@ -509,7 +509,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
509
509
|
images.append(image)
|
|
510
510
|
return images
|
|
511
511
|
|
|
512
|
-
def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]:
|
|
512
|
+
def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
|
|
513
513
|
"""Process indices in batches using the extractor."""
|
|
514
514
|
if self._dataset is None:
|
|
515
515
|
raise NotFittedError("No dataset bound. Call bind() first.")
|
|
@@ -559,7 +559,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
559
559
|
batch_indices = list(indices[batch_start : batch_start + self._batch_size])
|
|
560
560
|
yield self._embeddings[batch_indices]
|
|
561
561
|
|
|
562
|
-
def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]:
|
|
562
|
+
def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]: # noqa: C901
|
|
563
563
|
"""
|
|
564
564
|
Access embeddings by index, indices or slice.
|
|
565
565
|
|
|
@@ -13,7 +13,7 @@ from dataeval.exceptions import DeprecatedWarning, ExperimentalWarning
|
|
|
13
13
|
F = TypeVar("F", bound=Callable[..., Any])
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
def _make_warning_message(
|
|
16
|
+
def _make_warning_message( # noqa: C901
|
|
17
17
|
name: str,
|
|
18
18
|
kind: str,
|
|
19
19
|
*,
|
|
@@ -51,7 +51,7 @@ def _prepend_doc_note(doc: str | None, note: str) -> str:
|
|
|
51
51
|
def experimental(_target: F) -> F: ...
|
|
52
52
|
@overload
|
|
53
53
|
def experimental(*, alternative: str | None = None, details: str | None = None) -> Callable[[F], F]: ...
|
|
54
|
-
def experimental(
|
|
54
|
+
def experimental( # noqa: C901
|
|
55
55
|
_target: F | None = None,
|
|
56
56
|
*,
|
|
57
57
|
alternative: str | None = None,
|
|
@@ -72,7 +72,7 @@ def experimental(
|
|
|
72
72
|
def my_func(): ...
|
|
73
73
|
"""
|
|
74
74
|
|
|
75
|
-
def decorator(target: F) -> F:
|
|
75
|
+
def decorator(target: F) -> F: # noqa: C901
|
|
76
76
|
name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
|
|
77
77
|
msg = _make_warning_message(name, "experimental", alternative=alternative, details=details)
|
|
78
78
|
warned = False
|
|
@@ -118,7 +118,7 @@ def deprecated(
|
|
|
118
118
|
alternative: str | None = None,
|
|
119
119
|
details: str | None = None,
|
|
120
120
|
) -> Callable[[F], F]: ...
|
|
121
|
-
def deprecated(
|
|
121
|
+
def deprecated( # noqa: C901
|
|
122
122
|
_target: F | None = None,
|
|
123
123
|
*,
|
|
124
124
|
since: str | None = None,
|
|
@@ -141,7 +141,7 @@ def deprecated(
|
|
|
141
141
|
def old_func(): ...
|
|
142
142
|
"""
|
|
143
143
|
|
|
144
|
-
def decorator(target: F) -> F:
|
|
144
|
+
def decorator(target: F) -> F: # noqa: C901
|
|
145
145
|
name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
|
|
146
146
|
msg = _make_warning_message(
|
|
147
147
|
name,
|
|
@@ -282,7 +282,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
282
282
|
raise NotFittedError("No dataset bound. Call bind() first.")
|
|
283
283
|
yield from self.factor_data
|
|
284
284
|
|
|
285
|
-
def __getitem__(self, index: int | str | slice) -> Array:
|
|
285
|
+
def __getitem__(self, index: int | str | slice) -> Array: # noqa: C901
|
|
286
286
|
"""Get binned metadata for specific indices or factors.
|
|
287
287
|
|
|
288
288
|
Parameters
|
|
@@ -934,7 +934,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
934
934
|
factor = factor[0] if isinstance(factor, tuple) else factor
|
|
935
935
|
return factor in self.include if self.include else factor not in self.exclude
|
|
936
936
|
|
|
937
|
-
def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
|
|
937
|
+
def _reset_bins(self, cols: Iterable[str] | None = None) -> None: # noqa: C901
|
|
938
938
|
if self._is_binned:
|
|
939
939
|
columns = self._dataframe.columns
|
|
940
940
|
for col in cols or columns:
|
|
@@ -1006,7 +1006,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1006
1006
|
)
|
|
1007
1007
|
return target_rows
|
|
1008
1008
|
|
|
1009
|
-
def _get_target_factor_values(
|
|
1009
|
+
def _get_target_factor_values( # noqa: C901
|
|
1010
1010
|
self,
|
|
1011
1011
|
factor_name: str,
|
|
1012
1012
|
factor_values: Any,
|
|
@@ -1152,7 +1152,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1152
1152
|
self._structure()
|
|
1153
1153
|
return bool(self._has_targets)
|
|
1154
1154
|
|
|
1155
|
-
def _process_targets(
|
|
1155
|
+
def _process_targets( # noqa: C901
|
|
1156
1156
|
self,
|
|
1157
1157
|
raw: list,
|
|
1158
1158
|
labels: list,
|
|
@@ -1284,7 +1284,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1284
1284
|
existing = self._factors if hasattr(self, "_factors") else {}
|
|
1285
1285
|
self._factors = {k: existing.get(k) for k in usable_factors}
|
|
1286
1286
|
|
|
1287
|
-
def _structure(
|
|
1287
|
+
def _structure( # noqa: C901
|
|
1288
1288
|
self,
|
|
1289
1289
|
*,
|
|
1290
1290
|
progress_callback: ProgressCallback | None = None,
|
|
@@ -1478,7 +1478,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1478
1478
|
df = self._add_column_with_padding(df, col_dg, ordinal.astype(np.int64), is_od)
|
|
1479
1479
|
return df, FactorInfo("discrete", is_digitized=True)
|
|
1480
1480
|
|
|
1481
|
-
def _bin(
|
|
1481
|
+
def _bin( # noqa: C901
|
|
1482
1482
|
self,
|
|
1483
1483
|
*,
|
|
1484
1484
|
progress_callback: ProgressCallback | None = None,
|
|
@@ -1523,7 +1523,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1523
1523
|
self._factors.update(factor_info)
|
|
1524
1524
|
self._is_binned = True
|
|
1525
1525
|
|
|
1526
|
-
def add_factors(
|
|
1526
|
+
def add_factors( # noqa: C901
|
|
1527
1527
|
self,
|
|
1528
1528
|
factors: Mapping[str, Array1D[Any]],
|
|
1529
1529
|
level: Literal["image", "target", "auto"] = "auto",
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '1.0.5'
|
|
22
|
+
__version_tuple__ = version_tuple = (1, 0, 5)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = None
|
|
@@ -147,7 +147,7 @@ class Balance(Evaluator):
|
|
|
147
147
|
super().__init__(locals())
|
|
148
148
|
|
|
149
149
|
@set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
|
|
150
|
-
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput:
|
|
150
|
+
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
|
|
151
151
|
"""
|
|
152
152
|
Compute mutual information between factors and identify imbalanced classes.
|
|
153
153
|
|
|
@@ -135,7 +135,7 @@ class Diversity(Evaluator):
|
|
|
135
135
|
super().__init__(locals())
|
|
136
136
|
|
|
137
137
|
@set_metadata(state=["method", "threshold"])
|
|
138
|
-
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput:
|
|
138
|
+
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput: # noqa: C901
|
|
139
139
|
"""
|
|
140
140
|
Compute diversity and classwise diversity for the dataset.
|
|
141
141
|
|
|
@@ -23,7 +23,7 @@ from pydantic import BaseModel, ConfigDict, field_validator
|
|
|
23
23
|
|
|
24
24
|
from dataeval.protocols import DeviceLike
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
# GLOBAL CONFIG ###
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class GlobalConfig(BaseModel):
|
|
@@ -77,7 +77,7 @@ class GlobalConfig(BaseModel):
|
|
|
77
77
|
_config = GlobalConfig()
|
|
78
78
|
|
|
79
79
|
|
|
80
|
-
|
|
80
|
+
# CONTEXT MANAGER ###
|
|
81
81
|
|
|
82
82
|
|
|
83
83
|
class _ConfigContextManager:
|
|
@@ -96,7 +96,7 @@ class _ConfigContextManager:
|
|
|
96
96
|
setattr(_config, self._attr_name, self._old)
|
|
97
97
|
|
|
98
98
|
|
|
99
|
-
|
|
99
|
+
# FUNCS ###
|
|
100
100
|
|
|
101
101
|
|
|
102
102
|
def _todevice(device: DeviceLike) -> torch.device:
|
|
@@ -92,7 +92,7 @@ def bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.intp]:
|
|
|
92
92
|
return np.digitize(data, bin_edges)
|
|
93
93
|
|
|
94
94
|
|
|
95
|
-
def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool:
|
|
95
|
+
def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool: # noqa: C901
|
|
96
96
|
"""
|
|
97
97
|
Determine whether the data is continuous or discrete using the Wasserstein distance.
|
|
98
98
|
|
|
@@ -144,7 +144,7 @@ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.numbe
|
|
|
144
144
|
return bool(shift < DISCRETE_MIN_WD) # if NNN is close enough to uniform, consider the sample continuous.
|
|
145
145
|
|
|
146
146
|
|
|
147
|
-
def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
|
|
147
|
+
def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]: # noqa: C901
|
|
148
148
|
"""
|
|
149
149
|
Bin continuous data by using the Clusterer to identify clusters.
|
|
150
150
|
|
|
@@ -99,7 +99,7 @@ class _Clusters:
|
|
|
99
99
|
prob: NDArray[np.float64] = exp / np.sum(exp)
|
|
100
100
|
return prob
|
|
101
101
|
|
|
102
|
-
def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]:
|
|
102
|
+
def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]: # noqa: C901
|
|
103
103
|
"""Sort samples using complexity-based weighted sampling."""
|
|
104
104
|
labels = self._get_labels(embeddings)
|
|
105
105
|
pr = self._complexity(embeddings)
|
|
@@ -356,7 +356,7 @@ class _HDBSCAN:
|
|
|
356
356
|
self.cluster_selection_epsilon = 0.0
|
|
357
357
|
self.cluster_selection_method = "eom"
|
|
358
358
|
|
|
359
|
-
def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN":
|
|
359
|
+
def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN": # noqa: C901
|
|
360
360
|
"""
|
|
361
361
|
Find clusters based on hierarchical density-based clustering.
|
|
362
362
|
|
|
@@ -541,7 +541,7 @@ class ClusterStats(TypedDict):
|
|
|
541
541
|
nearest_cluster_idx: NDArray[np.int64]
|
|
542
542
|
|
|
543
543
|
|
|
544
|
-
def compute_cluster_stats(
|
|
544
|
+
def compute_cluster_stats( # noqa: C901
|
|
545
545
|
embeddings: NDArray[np.floating],
|
|
546
546
|
cluster_labels: _Clusters | NDArray[np.int64],
|
|
547
547
|
) -> ClusterStats:
|
|
@@ -642,7 +642,7 @@ def compute_cluster_stats(
|
|
|
642
642
|
)
|
|
643
643
|
|
|
644
644
|
|
|
645
|
-
def cluster(
|
|
645
|
+
def cluster( # noqa: C901
|
|
646
646
|
embeddings: ArrayND[float],
|
|
647
647
|
algorithm: Literal["kmeans", "hdbscan"] = "hdbscan",
|
|
648
648
|
n_clusters: int | None = None,
|
|
@@ -101,7 +101,7 @@ def _build_image_lookup(source_indices: Sequence[SourceIndex]) -> dict[tuple[int
|
|
|
101
101
|
return lookup
|
|
102
102
|
|
|
103
103
|
|
|
104
|
-
def _calculate_ratio_for_stat(
|
|
104
|
+
def _calculate_ratio_for_stat( # noqa: C901
|
|
105
105
|
stat_name: str,
|
|
106
106
|
box_value: Any,
|
|
107
107
|
img_value: Any,
|
|
@@ -160,7 +160,7 @@ def _calculate_ratio_for_stat(
|
|
|
160
160
|
return box_value
|
|
161
161
|
|
|
162
162
|
|
|
163
|
-
def _validate_separate_inputs(
|
|
163
|
+
def _validate_separate_inputs( # noqa: C901
|
|
164
164
|
stats_output: StatsResult,
|
|
165
165
|
box_stats_output: StatsResult,
|
|
166
166
|
) -> tuple[Sequence[SourceIndex], Sequence[SourceIndex]]:
|
|
@@ -241,7 +241,7 @@ def _validate_unified_input(source_indices: Sequence[SourceIndex]) -> None:
|
|
|
241
241
|
)
|
|
242
242
|
|
|
243
243
|
|
|
244
|
-
def compute_ratios(
|
|
244
|
+
def compute_ratios( # noqa: C901
|
|
245
245
|
stats_output: StatsResult,
|
|
246
246
|
*,
|
|
247
247
|
target_stats_output: StatsResult | None = None,
|
|
@@ -158,7 +158,7 @@ def _determine_channel_indices(calculator_output: list[dict[str, list[Any]]], nu
|
|
|
158
158
|
return sorted(channel_indices_needed, key=lambda x: -1 if x is None else x)
|
|
159
159
|
|
|
160
160
|
|
|
161
|
-
def _reconcile_stats(
|
|
161
|
+
def _reconcile_stats( # noqa: C901
|
|
162
162
|
calculator_output: list[dict[str, list[Any]]],
|
|
163
163
|
sorted_channels: list[int | None],
|
|
164
164
|
empty_values_map: dict[str, Any],
|
|
@@ -344,7 +344,7 @@ def _aggregate_batch(
|
|
|
344
344
|
_UNSET = object()
|
|
345
345
|
|
|
346
346
|
|
|
347
|
-
def compute_stats(
|
|
347
|
+
def compute_stats( # noqa: C901
|
|
348
348
|
data: Iterable[ArrayLike] | Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
|
349
349
|
*,
|
|
350
350
|
boxes: Iterable[Iterable[BoxLike] | None] | None = None,
|
|
@@ -544,7 +544,7 @@ def compute_stats(
|
|
|
544
544
|
)
|
|
545
545
|
|
|
546
546
|
|
|
547
|
-
def combine_stats_results(
|
|
547
|
+
def combine_stats_results( # noqa: C901
|
|
548
548
|
results: StatsResult | Sequence[StatsResult],
|
|
549
549
|
) -> tuple[StatsMap, list[SourceIndex], list[int]]:
|
|
550
550
|
"""Combine one or more StatsResults into unified stats, source_index, and dataset_steps.
|
|
@@ -315,9 +315,9 @@ def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_
|
|
|
315
315
|
mask1 = condensed_tree.child_size > 1
|
|
316
316
|
mask2 = condensed_tree.child_size == 1
|
|
317
317
|
mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
|
|
318
|
-
mask4 = np.array(
|
|
319
|
-
[parent in cluster_tree_parents for parent in condensed_tree.parent]
|
|
320
|
-
) # check that it's not a leaf cluster
|
|
318
|
+
mask4 = np.array([
|
|
319
|
+
parent in cluster_tree_parents for parent in condensed_tree.parent
|
|
320
|
+
]) # check that it's not a leaf cluster
|
|
321
321
|
|
|
322
322
|
mask = mask1 | (mask2 & mask3 & mask4)
|
|
323
323
|
|
|
@@ -501,9 +501,9 @@ def extract_clusters_bcubed(
|
|
|
501
501
|
unselected_nodes,
|
|
502
502
|
)
|
|
503
503
|
|
|
504
|
-
return np.asarray(
|
|
505
|
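-
[node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))]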
|
|
506
|
-
)
|
|
504
|
+
return np.asarray([
|
|
505
|
+
node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))
|
|
506
|
+
])
|
|
507
507
|
|
|
508
508
|
|
|
509
509
|
@numba.njit(cache=True)
|
|
@@ -80,7 +80,7 @@ def _expand_tree(
|
|
|
80
80
|
|
|
81
81
|
|
|
82
82
|
@numba.njit(locals={"i": numba.types.intp, "nbr": numba.types.intp, "dist": numba.types.float32}, cache=True)
|
|
83
|
-
def _update_tree(
|
|
83
|
+
def _update_tree( # noqa: C901
|
|
84
84
|
tree: NDArray[np.float32],
|
|
85
85
|
total_edge: int,
|
|
86
86
|
disjoint_set: tuple[NDArray[np.int64], NDArray[np.int64]],
|
|
@@ -68,7 +68,7 @@ def _calc_median_deviations(reference: NDArray[Any], test: NDArray[Any]) -> NDAr
|
|
|
68
68
|
return np.abs(np.where(test_dev >= 0, test_dev / pscale, test_dev / nscale)) # (S_t, F)
|
|
69
69
|
|
|
70
70
|
|
|
71
|
-
def factor_deviation(
|
|
71
|
+
def factor_deviation( # noqa: C901
|
|
72
72
|
reference_factors: Mapping[str, NDArray[Any]],
|
|
73
73
|
test_factors: Mapping[str, NDArray[Any]],
|
|
74
74
|
indices: SequenceLike[int],
|
|
@@ -183,7 +183,7 @@ def factor_deviation(
|
|
|
183
183
|
return results
|
|
184
184
|
|
|
185
185
|
|
|
186
|
-
def factor_predictors(
|
|
186
|
+
def factor_predictors( # noqa: C901
|
|
187
187
|
factors: Mapping[str, NDArray[Any]],
|
|
188
188
|
indices: SequenceLike[int],
|
|
189
189
|
discrete_features: list[bool] | None = None,
|
|
@@ -74,7 +74,7 @@ def _merge_labels_and_factors(
|
|
|
74
74
|
return data, discrete_features
|
|
75
75
|
|
|
76
76
|
|
|
77
|
-
def mutual_info(
|
|
77
|
+
def mutual_info( # noqa: C901
|
|
78
78
|
class_labels: Array1D[int],
|
|
79
79
|
factor_data: Array2D[int | float],
|
|
80
80
|
discrete_features: Array1D[bool] | None = None,
|
|
@@ -124,13 +124,11 @@ def mutual_info(
|
|
|
124
124
|
|
|
125
125
|
>>> rng = np.random.default_rng(175)
|
|
126
126
|
>>> class_labels = rng.choice([0, 1, 2], size=100)
|
|
127
|
-
>>> factor_data = np.column_stack(
|
|
128
|
-
... [
|
|
129
|
-
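... rng.choice([25, 35, 45, 55], size=100), # age
|
|
130
|
-
... rng.choice([50000, 65000, 80000], size=100), # income
|
|
131
|
-
... rng.choice([0, 1], size=100), # gender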
|
|
132
|
-
... ]
|
|
133
|
-
... )
|
|
127
|
+
>>> factor_data = np.column_stack([
|
|
128
|
+
... rng.choice([25, 35, 45, 55], size=100), # age
|
|
129
|
+
... rng.choice([50000, 65000, 80000], size=100), # income
|
|
130
|
+
... rng.choice([0, 1], size=100), # gender
|
|
131
|
+
... ])
|
|
134
132
|
>>> result = mutual_info(class_labels=class_labels, factor_data=factor_data)
|
|
135
133
|
>>> result["class_to_factor"]
|
|
136
134
|
array([1. , 0.034, 0.026, 0.004])
|
|
@@ -250,13 +248,11 @@ def mutual_info_classwise(
|
|
|
250
248
|
|
|
251
249
|
>>> rng = np.random.default_rng(175)
|
|
252
250
|
>>> class_labels = rng.choice([0, 1, 2], size=100)
|
|
253
|
-
>>> factor_data = np.column_stack(
|
|
254
|
-
... [
|
|
255
|
-
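... rng.choice([25, 35, 45, 55], size=100), # age
|
|
256
|
-
... rng.choice([50000, 65000, 80000], size=100), # income
|
|
257
|
-
... rng.choice([0, 1], size=100), # gender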
|
|
258
|
-
... ]
|
|
259
|
-
... )
|
|
251
|
+
>>> factor_data = np.column_stack([
|
|
252
|
+
... rng.choice([25, 35, 45, 55], size=100), # age
|
|
253
|
+
... rng.choice([50000, 65000, 80000], size=100), # income
|
|
254
|
+
... rng.choice([0, 1], size=100), # gender
|
|
255
|
+
... ])
|
|
260
256
|
>>> mutual_info_classwise(class_labels=class_labels, factor_data=factor_data)
|
|
261
257
|
array([[1.000e+00, 2.077e-02, 2.296e-03, 7.317e-04],
|
|
262
258
|
[1.000e+00, 4.893e-02, 2.451e-02, 4.362e-03],
|
|
@@ -57,16 +57,14 @@ def uap(labels: Array2D[int], scores: Array2D[float]) -> float:
|
|
|
57
57
|
0.8333333333333333
|
|
58
58
|
|
|
59
59
|
>>> y_true = np.array([0, 0, 1, 1, 2, 2])
|
|
60
|
-
>>> y_scores = np.array(
|
|
61
|
-
... [
|
|
62
|
-
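... [0.7, 0.2, 0.1],
|
|
63
|
-
... [0.4, 0.3, 0.3],
|
|
64
|
-
... [0.1, 0.8, 0.1],
|
|
65
|
-
... [0.2, 0.3, 0.5],
|
|
66
|
-
... [0.4, 0.4, 0.2],
|
|
67
|
-
... [0.1, 0.2, 0.7],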
|
|
68
|
-
... ]
|
|
69
|
-
... )
|
|
60
|
+
>>> y_scores = np.array([
|
|
61
|
+
... [0.7, 0.2, 0.1],
|
|
62
|
+
... [0.4, 0.3, 0.3],
|
|
63
|
+
... [0.1, 0.8, 0.1],
|
|
64
|
+
... [0.2, 0.3, 0.5],
|
|
65
|
+
... [0.4, 0.4, 0.2],
|
|
66
|
+
... [0.1, 0.2, 0.7],
|
|
67
|
+
... ])
|
|
70
68
|
>>> uap(y_true, y_scores)
|
|
71
69
|
0.7777777777777777
|
|
72
70
|
"""
|
|
@@ -156,7 +156,7 @@ class OnnxExtractor(ReprMixin):
|
|
|
156
156
|
return [transforms]
|
|
157
157
|
return list(transforms)
|
|
158
158
|
|
|
159
|
-
def _load_model(self) -> None:
|
|
159
|
+
def _load_model(self) -> None: # noqa: C901
|
|
160
160
|
"""Load the ONNX model and validate configuration."""
|
|
161
161
|
session_cls = _get_inference_session()
|
|
162
162
|
providers = _get_execution_providers()
|
|
@@ -220,7 +220,7 @@ class OnnxExtractor(ReprMixin):
|
|
|
220
220
|
# Ensure float32 for ONNX
|
|
221
221
|
return result.astype(np.float32)
|
|
222
222
|
|
|
223
|
-
def __call__(self, data: Any) -> Array:
|
|
223
|
+
def __call__(self, data: Any) -> Array: # noqa: C901
|
|
224
224
|
"""
|
|
225
225
|
Extract features from a batch of images.
|
|
226
226
|
|
|
@@ -167,7 +167,7 @@ class SufficiencyOutput(DictOutput):
|
|
|
167
167
|
|
|
168
168
|
return self._build_dataframe(proj_array, output)
|
|
169
169
|
|
|
170
|
-
def inv_project(self, targets: Mapping[str, ArrayLike] | ArrayLike, n_iter: int = 1000) -> pl.DataFrame:
|
|
170
|
+
def inv_project(self, targets: Mapping[str, ArrayLike] | ArrayLike, n_iter: int = 1000) -> pl.DataFrame: # noqa: C901
|
|
171
171
|
"""
|
|
172
172
|
Compute training samples needed to achieve target metric values.
|
|
173
173
|
|
|
@@ -214,7 +214,7 @@ class SufficiencyOutput(DictOutput):
|
|
|
214
214
|
return self._build_inv_project_dataframe(results)
|
|
215
215
|
|
|
216
216
|
@staticmethod
|
|
217
|
-
def _build_inv_project_dataframe(
|
|
217
|
+
def _build_inv_project_dataframe( # noqa: C901
|
|
218
218
|
results: Mapping[str, tuple[NDArray[Any], NDArray[np.int64]]],
|
|
219
219
|
) -> pl.DataFrame:
|
|
220
220
|
"""Build wide-format DataFrame from inverse projection results."""
|
|
@@ -405,7 +405,7 @@ def linear_initialization(metric: NDArray[Any], sizes: NDArray[Any], bounds: Con
|
|
|
405
405
|
return np.array([scale, -negative_exponent, asymptote], dtype=np.float64)
|
|
406
406
|
|
|
407
407
|
|
|
408
|
-
def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int, unit_interval: bool) -> NDArray[np.float64]:
|
|
408
|
+
def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int, unit_interval: bool) -> NDArray[np.float64]: # noqa: C901
|
|
409
409
|
"""
|
|
410
410
|
Retrieve the inverse power curve coefficients for the line of best fit.
|
|
411
411
|
|