dataeval 1.0.3__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-1.0.3 → dataeval-1.0.5}/PKG-INFO +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/pyproject.toml +18 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_embeddings.py +2 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_experimental.py +5 -5
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_metadata.py +44 -26
- dataeval-1.0.5/src/dataeval/_version.py +24 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/_balance.py +4 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/_diversity.py +2 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/config.py +3 -3
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_bin.py +2 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_cache.py +11 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_pixelstats.py +14 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_clusterer.py +4 -4
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_compute_ratios.py +3 -3
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_compute_stats.py +31 -10
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +6 -6
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_mst.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_label_stats.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_metadata_insights.py +2 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_mutual_info.py +11 -15
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_parity.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_uap.py +8 -10
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_onnx.py +2 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_torch.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/_output.py +3 -3
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/_duplicates.py +106 -61
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/_outliers.py +42 -37
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/_shared.py +3 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/scope/_prioritize.py +2 -2
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_classbalance.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_classfilter.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_base.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_univariate.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/_reconstruction.py +39 -35
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/types.py +4 -4
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/_internal.py +6 -6
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/data.py +4 -4
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/onnx.py +1 -1
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/preprocessing.py +10 -4
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/thresholds.py +3 -3
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/training.py +1 -1
- dataeval-1.0.3/src/dataeval/_version.py +0 -34
- {dataeval-1.0.3 → dataeval-1.0.5}/.gitignore +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/LICENSE +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/README.md +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_helpers.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_log.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/_warm_cache.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/bias/_parity.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_ber.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_base.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_hashstats.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_register.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_registry.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_calculators/_visualstats.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_completeness.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_coverage.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_divergence.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_diversity.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_feature_distance.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_hash.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_label_errors.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_label_parity.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_mst.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_nullmodel.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/core/_rank.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/exceptions.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_bovw.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_flatten.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/extractors/_uncertainty.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/flags.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/_aggregator.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/_sufficiency.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/performance/schedules.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/protocols.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/py.typed +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/quality/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/scope/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_indices.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_limit.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_reverse.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_select.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/selection/_shuffle.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_chunk.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_mmd.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_base.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_domain_classifier.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/shift/update_strategies.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/losses.py +0 -0
- {dataeval-1.0.3 → dataeval-1.0.5}/src/dataeval/utils/models.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataeval
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.0.5
|
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
|
@@ -118,6 +118,14 @@ docs = [
|
|
|
118
118
|
"markupsafe>=3,<3.0.2",
|
|
119
119
|
"jupytext>=1.19.1",
|
|
120
120
|
]
|
|
121
|
+
security = [ # keep in sync with [tool.uv.constraint-dependencies]
|
|
122
|
+
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
123
|
+
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
124
|
+
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
125
|
+
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
126
|
+
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
127
|
+
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
128
|
+
]
|
|
121
129
|
dev = [
|
|
122
130
|
{ include-group = "base" },
|
|
123
131
|
{ include-group = "lint" },
|
|
@@ -141,7 +149,9 @@ conflicts = [
|
|
|
141
149
|
]
|
|
142
150
|
constraint-dependencies = [
|
|
143
151
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
152
|
+
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
144
153
|
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
154
|
+
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
145
155
|
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
146
156
|
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
147
157
|
]
|
|
@@ -262,20 +272,23 @@ exclude = [
|
|
|
262
272
|
".jupyter_cache",
|
|
263
273
|
"*env*",
|
|
264
274
|
"output",
|
|
275
|
+
"_build",
|
|
265
276
|
"build",
|
|
266
277
|
".nox",
|
|
267
278
|
".tox",
|
|
279
|
+
"prototype",
|
|
268
280
|
"src/dataeval/_version.py",
|
|
269
281
|
]
|
|
270
282
|
line-length = 120
|
|
271
283
|
indent-width = 4
|
|
272
284
|
target-version = "py310"
|
|
285
|
+
extend-include = ["*.ipynb"]
|
|
273
286
|
|
|
274
287
|
[tool.ruff.lint]
|
|
275
288
|
select = ["F", "E", "W", "C90", "I", "N", "D", "UP", "YTT", "ANN", "S", "BLE", "B", "A",
|
|
276
289
|
"COM", "C4", "T10", "ISC", "ICN", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM",
|
|
277
|
-
"TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF100", "PERF"]
|
|
278
|
-
ignore = ["ANN401", "COM812", "NPY002", "SLF001"]
|
|
290
|
+
"TID252", "ARG", "FIX", "PD", "FLY", "NPY", "RUF027", "RUF100", "PERF"]
|
|
291
|
+
ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF001"]
|
|
279
292
|
fixable = ["ALL"]
|
|
280
293
|
unfixable = []
|
|
281
294
|
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
|
|
@@ -287,6 +300,9 @@ builtins-strict-checking = false
|
|
|
287
300
|
[tool.ruff.lint.isort]
|
|
288
301
|
known-first-party = ["dataeval"]
|
|
289
302
|
|
|
303
|
+
[tool.ruff.lint.mccabe]
|
|
304
|
+
max-complexity = 5
|
|
305
|
+
|
|
290
306
|
[tool.ruff.lint.pydocstyle]
|
|
291
307
|
convention = "numpy"
|
|
292
308
|
|
|
@@ -509,7 +509,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
509
509
|
images.append(image)
|
|
510
510
|
return images
|
|
511
511
|
|
|
512
|
-
def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]:
|
|
512
|
+
def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
|
|
513
513
|
"""Process indices in batches using the extractor."""
|
|
514
514
|
if self._dataset is None:
|
|
515
515
|
raise NotFittedError("No dataset bound. Call bind() first.")
|
|
@@ -559,7 +559,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
559
559
|
batch_indices = list(indices[batch_start : batch_start + self._batch_size])
|
|
560
560
|
yield self._embeddings[batch_indices]
|
|
561
561
|
|
|
562
|
-
def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]:
|
|
562
|
+
def __getitem__(self, key: int | Iterable[int] | slice, /) -> NDArray[Any]: # noqa: C901
|
|
563
563
|
"""
|
|
564
564
|
Access embeddings by index, indices or slice.
|
|
565
565
|
|
|
@@ -13,7 +13,7 @@ from dataeval.exceptions import DeprecatedWarning, ExperimentalWarning
|
|
|
13
13
|
F = TypeVar("F", bound=Callable[..., Any])
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
def _make_warning_message(
|
|
16
|
+
def _make_warning_message( # noqa: C901
|
|
17
17
|
name: str,
|
|
18
18
|
kind: str,
|
|
19
19
|
*,
|
|
@@ -51,7 +51,7 @@ def _prepend_doc_note(doc: str | None, note: str) -> str:
|
|
|
51
51
|
def experimental(_target: F) -> F: ...
|
|
52
52
|
@overload
|
|
53
53
|
def experimental(*, alternative: str | None = None, details: str | None = None) -> Callable[[F], F]: ...
|
|
54
|
-
def experimental(
|
|
54
|
+
def experimental( # noqa: C901
|
|
55
55
|
_target: F | None = None,
|
|
56
56
|
*,
|
|
57
57
|
alternative: str | None = None,
|
|
@@ -72,7 +72,7 @@ def experimental(
|
|
|
72
72
|
def my_func(): ...
|
|
73
73
|
"""
|
|
74
74
|
|
|
75
|
-
def decorator(target: F) -> F:
|
|
75
|
+
def decorator(target: F) -> F: # noqa: C901
|
|
76
76
|
name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
|
|
77
77
|
msg = _make_warning_message(name, "experimental", alternative=alternative, details=details)
|
|
78
78
|
warned = False
|
|
@@ -118,7 +118,7 @@ def deprecated(
|
|
|
118
118
|
alternative: str | None = None,
|
|
119
119
|
details: str | None = None,
|
|
120
120
|
) -> Callable[[F], F]: ...
|
|
121
|
-
def deprecated(
|
|
121
|
+
def deprecated( # noqa: C901
|
|
122
122
|
_target: F | None = None,
|
|
123
123
|
*,
|
|
124
124
|
since: str | None = None,
|
|
@@ -141,7 +141,7 @@ def deprecated(
|
|
|
141
141
|
def old_func(): ...
|
|
142
142
|
"""
|
|
143
143
|
|
|
144
|
-
def decorator(target: F) -> F:
|
|
144
|
+
def decorator(target: F) -> F: # noqa: C901
|
|
145
145
|
name = getattr(target, "__qualname__", getattr(target, "__name__", str(target)))
|
|
146
146
|
msg = _make_warning_message(
|
|
147
147
|
name,
|
|
@@ -39,6 +39,7 @@ class FactorInfo:
|
|
|
39
39
|
factor_type: Literal["categorical", "continuous", "discrete"]
|
|
40
40
|
is_binned: bool = False
|
|
41
41
|
is_digitized: bool = False
|
|
42
|
+
level: Literal["image", "target"] = "image"
|
|
42
43
|
|
|
43
44
|
|
|
44
45
|
def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
|
|
@@ -110,8 +111,8 @@ class Metadata(Array, FeatureExtractor):
|
|
|
110
111
|
*,
|
|
111
112
|
continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
|
|
112
113
|
auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
|
|
113
|
-
exclude: Sequence[str] | None = None,
|
|
114
|
-
include: Sequence[str] | None = None,
|
|
114
|
+
exclude: str | Sequence[str] | None = None,
|
|
115
|
+
include: str | Sequence[str] | None = None,
|
|
115
116
|
) -> None:
|
|
116
117
|
self._class_labels: NDArray[np.intp]
|
|
117
118
|
self._item_indices: NDArray[np.intp]
|
|
@@ -132,8 +133,8 @@ class Metadata(Array, FeatureExtractor):
|
|
|
132
133
|
if exclude is not None and include is not None:
|
|
133
134
|
raise ValueError("Filters for `exclude` and `include` are mutually exclusive.")
|
|
134
135
|
|
|
135
|
-
self._exclude = set(exclude or ())
|
|
136
|
-
self._include = set(include or ())
|
|
136
|
+
self._exclude = {exclude} if isinstance(exclude, str) else set(exclude or ())
|
|
137
|
+
self._include = {include} if isinstance(include, str) else set(include or ())
|
|
137
138
|
self._target_factors_only = False
|
|
138
139
|
|
|
139
140
|
def __repr__(self) -> str:
|
|
@@ -281,7 +282,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
281
282
|
raise NotFittedError("No dataset bound. Call bind() first.")
|
|
282
283
|
yield from self.factor_data
|
|
283
284
|
|
|
284
|
-
def __getitem__(self, index: int | str | slice) -> Array:
|
|
285
|
+
def __getitem__(self, index: int | str | slice) -> Array: # noqa: C901
|
|
285
286
|
"""Get binned metadata for specific indices or factors.
|
|
286
287
|
|
|
287
288
|
Parameters
|
|
@@ -423,17 +424,17 @@ class Metadata(Array, FeatureExtractor):
|
|
|
423
424
|
return self._exclude
|
|
424
425
|
|
|
425
426
|
@exclude.setter
|
|
426
|
-
def exclude(self, value: Sequence[str]) -> None:
|
|
427
|
+
def exclude(self, value: str | Sequence[str]) -> None:
|
|
427
428
|
"""Set factor names to exclude from processing.
|
|
428
429
|
|
|
429
430
|
Automatically clears include filter and resets binning state when exclusion list changes.
|
|
430
431
|
|
|
431
432
|
Parameters
|
|
432
433
|
----------
|
|
433
|
-
value : Sequence[str]
|
|
434
|
-
Factor names to exclude from metadata analysis.
|
|
434
|
+
value : str | Sequence[str]
|
|
435
|
+
Factor name or names to exclude from metadata analysis.
|
|
435
436
|
"""
|
|
436
|
-
exclude = set(value)
|
|
437
|
+
exclude = {value} if isinstance(value, str) else set(value)
|
|
437
438
|
if self._exclude != exclude:
|
|
438
439
|
self._exclude = exclude
|
|
439
440
|
self._include = set()
|
|
@@ -451,7 +452,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
451
452
|
return self._include
|
|
452
453
|
|
|
453
454
|
@include.setter
|
|
454
|
-
def include(self, value: Sequence[str]) -> None:
|
|
455
|
+
def include(self, value: str | Sequence[str]) -> None:
|
|
455
456
|
"""Set factor names to include in processing.
|
|
456
457
|
|
|
457
458
|
Automatically clears exclude filter and resets binning state when
|
|
@@ -459,10 +460,10 @@ class Metadata(Array, FeatureExtractor):
|
|
|
459
460
|
|
|
460
461
|
Parameters
|
|
461
462
|
----------
|
|
462
|
-
value : Sequence[str]
|
|
463
|
-
Factor names to include in metadata analysis.
|
|
463
|
+
value : str | Sequence[str]
|
|
464
|
+
Factor name or names to include in metadata analysis.
|
|
464
465
|
"""
|
|
465
|
-
include = set(value)
|
|
466
|
+
include = {value} if isinstance(value, str) else set(value)
|
|
466
467
|
if self._include != include:
|
|
467
468
|
self._include = include
|
|
468
469
|
self._exclude = set()
|
|
@@ -933,7 +934,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
933
934
|
factor = factor[0] if isinstance(factor, tuple) else factor
|
|
934
935
|
return factor in self.include if self.include else factor not in self.exclude
|
|
935
936
|
|
|
936
|
-
def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
|
|
937
|
+
def _reset_bins(self, cols: Iterable[str] | None = None) -> None: # noqa: C901
|
|
937
938
|
if self._is_binned:
|
|
938
939
|
columns = self._dataframe.columns
|
|
939
940
|
for col in cols or columns:
|
|
@@ -1005,7 +1006,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1005
1006
|
)
|
|
1006
1007
|
return target_rows
|
|
1007
1008
|
|
|
1008
|
-
def _get_target_factor_values(
|
|
1009
|
+
def _get_target_factor_values( # noqa: C901
|
|
1009
1010
|
self,
|
|
1010
1011
|
factor_name: str,
|
|
1011
1012
|
factor_values: Any,
|
|
@@ -1115,13 +1116,26 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1115
1116
|
raise ValueError(f"Invalid level: {level}. Must be 'image', 'target', or 'auto'")
|
|
1116
1117
|
|
|
1117
1118
|
def _create_factor_column(self, data_array: NDArray, level: str, num_image_rows: int) -> list:
|
|
1118
|
-
"""Create a factor column with values at the appropriate level.
|
|
1119
|
+
"""Create a factor column with values at the appropriate level.
|
|
1120
|
+
|
|
1121
|
+
For OD datasets with image-level factors, values are stored in image rows
|
|
1122
|
+
and replicated to target rows using item_index mapping, so that bias
|
|
1123
|
+
evaluators can access them via target_data.
|
|
1124
|
+
"""
|
|
1119
1125
|
if level == "image":
|
|
1120
|
-
#
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1126
|
+
# Image rows get the values directly
|
|
1127
|
+
image_values: list = data_array.tolist()
|
|
1128
|
+
|
|
1129
|
+
if self.has_targets():
|
|
1130
|
+
# For OD datasets, replicate image-level values to target rows
|
|
1131
|
+
# using the item_index column which maps each target to its source image
|
|
1132
|
+
target_df = self._dataframe.filter(pl.col("target_index").is_not_null())
|
|
1133
|
+
target_image_indices = target_df["item_index"].to_numpy()
|
|
1134
|
+
target_values = data_array[target_image_indices].tolist()
|
|
1135
|
+
else:
|
|
1136
|
+
target_values = []
|
|
1137
|
+
|
|
1138
|
+
return image_values + target_values
|
|
1125
1139
|
# level == "target"
|
|
1126
1140
|
# Create column: None in image rows, target-level values in target rows
|
|
1127
1141
|
return [None] * num_image_rows + list(data_array)
|
|
@@ -1138,7 +1152,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1138
1152
|
self._structure()
|
|
1139
1153
|
return bool(self._has_targets)
|
|
1140
1154
|
|
|
1141
|
-
def _process_targets(
|
|
1155
|
+
def _process_targets( # noqa: C901
|
|
1142
1156
|
self,
|
|
1143
1157
|
raw: list,
|
|
1144
1158
|
labels: list,
|
|
@@ -1267,9 +1281,10 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1267
1281
|
k for k in factors if not isinstance(self._dataframe.schema.get(k), pl.List | pl.Struct | pl.Array)
|
|
1268
1282
|
}
|
|
1269
1283
|
|
|
1270
|
-
self._factors
|
|
1284
|
+
existing = self._factors if hasattr(self, "_factors") else {}
|
|
1285
|
+
self._factors = {k: existing.get(k) for k in usable_factors}
|
|
1271
1286
|
|
|
1272
|
-
def _structure(
|
|
1287
|
+
def _structure( # noqa: C901
|
|
1273
1288
|
self,
|
|
1274
1289
|
*,
|
|
1275
1290
|
progress_callback: ProgressCallback | None = None,
|
|
@@ -1463,7 +1478,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1463
1478
|
df = self._add_column_with_padding(df, col_dg, ordinal.astype(np.int64), is_od)
|
|
1464
1479
|
return df, FactorInfo("discrete", is_digitized=True)
|
|
1465
1480
|
|
|
1466
|
-
def _bin(
|
|
1481
|
+
def _bin( # noqa: C901
|
|
1467
1482
|
self,
|
|
1468
1483
|
*,
|
|
1469
1484
|
progress_callback: ProgressCallback | None = None,
|
|
@@ -1492,9 +1507,12 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1492
1507
|
factors_to_process = [col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set]
|
|
1493
1508
|
total_factors = len(factors_to_process)
|
|
1494
1509
|
|
|
1510
|
+
target_only = self._target_factors - self._image_factors if is_od else set()
|
|
1495
1511
|
for i, col in enumerate(factors_to_process):
|
|
1496
1512
|
data = data_df[col].to_numpy()
|
|
1497
1513
|
df, info = self._process_factor(df, col, data, factor_bins, is_od)
|
|
1514
|
+
if is_od and col in target_only:
|
|
1515
|
+
info.level = "target"
|
|
1498
1516
|
factor_info[col] = info
|
|
1499
1517
|
|
|
1500
1518
|
if progress_callback:
|
|
@@ -1505,7 +1523,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1505
1523
|
self._factors.update(factor_info)
|
|
1506
1524
|
self._is_binned = True
|
|
1507
1525
|
|
|
1508
|
-
def add_factors(
|
|
1526
|
+
def add_factors( # noqa: C901
|
|
1509
1527
|
self,
|
|
1510
1528
|
factors: Mapping[str, Array1D[Any]],
|
|
1511
1529
|
level: Literal["image", "target", "auto"] = "auto",
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '1.0.5'
|
|
22
|
+
__version_tuple__ = version_tuple = (1, 0, 5)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = None
|
|
@@ -3,6 +3,7 @@ __all__ = []
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from typing import Any, Literal
|
|
5
5
|
|
|
6
|
+
import numpy as np
|
|
6
7
|
import polars as pl
|
|
7
8
|
|
|
8
9
|
from dataeval import Metadata
|
|
@@ -146,7 +147,7 @@ class Balance(Evaluator):
|
|
|
146
147
|
super().__init__(locals())
|
|
147
148
|
|
|
148
149
|
@set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
|
|
149
|
-
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput:
|
|
150
|
+
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
|
|
150
151
|
"""
|
|
151
152
|
Compute mutual information between factors and identify imbalanced classes.
|
|
152
153
|
|
|
@@ -269,8 +270,9 @@ class Balance(Evaluator):
|
|
|
269
270
|
# Include class_label as the first factor (index 0), then all metadata factors
|
|
270
271
|
all_factor_names = ["class_label"] + factor_names
|
|
271
272
|
|
|
273
|
+
u_classes = np.unique(self.metadata.class_labels)
|
|
272
274
|
for class_idx in range(classwise.shape[0]):
|
|
273
|
-
class_name = index2label.get(class_idx, str(class_idx))
|
|
275
|
+
class_name = index2label.get(int(u_classes[class_idx]), str(u_classes[class_idx]))
|
|
274
276
|
for factor_idx in range(classwise.shape[1]):
|
|
275
277
|
mi_value = classwise[class_idx, factor_idx]
|
|
276
278
|
class_name_col.append(class_name)
|
|
@@ -135,7 +135,7 @@ class Diversity(Evaluator):
|
|
|
135
135
|
super().__init__(locals())
|
|
136
136
|
|
|
137
137
|
@set_metadata(state=["method", "threshold"])
|
|
138
|
-
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput:
|
|
138
|
+
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> DiversityOutput: # noqa: C901
|
|
139
139
|
"""
|
|
140
140
|
Compute diversity and classwise diversity for the dataset.
|
|
141
141
|
|
|
@@ -251,7 +251,7 @@ class Diversity(Evaluator):
|
|
|
251
251
|
is_low_diversity_col: list[bool] = []
|
|
252
252
|
|
|
253
253
|
for class_idx in range(classwise_div.shape[0]):
|
|
254
|
-
class_name = index2label.get(class_idx, str(class_idx))
|
|
254
|
+
class_name = index2label.get(int(u_classes[class_idx]), str(u_classes[class_idx]))
|
|
255
255
|
for factor_idx in range(num_factors):
|
|
256
256
|
div_value = classwise_div[class_idx, factor_idx]
|
|
257
257
|
if not np.isnan(div_value):
|
|
@@ -23,7 +23,7 @@ from pydantic import BaseModel, ConfigDict, field_validator
|
|
|
23
23
|
|
|
24
24
|
from dataeval.protocols import DeviceLike
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
# GLOBAL CONFIG ###
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class GlobalConfig(BaseModel):
|
|
@@ -77,7 +77,7 @@ class GlobalConfig(BaseModel):
|
|
|
77
77
|
_config = GlobalConfig()
|
|
78
78
|
|
|
79
79
|
|
|
80
|
-
|
|
80
|
+
# CONTEXT MANAGER ###
|
|
81
81
|
|
|
82
82
|
|
|
83
83
|
class _ConfigContextManager:
|
|
@@ -96,7 +96,7 @@ class _ConfigContextManager:
|
|
|
96
96
|
setattr(_config, self._attr_name, self._old)
|
|
97
97
|
|
|
98
98
|
|
|
99
|
-
|
|
99
|
+
# FUNCS ###
|
|
100
100
|
|
|
101
101
|
|
|
102
102
|
def _todevice(device: DeviceLike) -> torch.device:
|
|
@@ -92,7 +92,7 @@ def bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.intp]:
|
|
|
92
92
|
return np.digitize(data, bin_edges)
|
|
93
93
|
|
|
94
94
|
|
|
95
|
-
def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool:
|
|
95
|
+
def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.number[Any]] | None = None) -> bool: # noqa: C901
|
|
96
96
|
"""
|
|
97
97
|
Determine whether the data is continuous or discrete using the Wasserstein distance.
|
|
98
98
|
|
|
@@ -144,7 +144,7 @@ def is_continuous(data: NDArray[np.number[Any]], image_indices: NDArray[np.numbe
|
|
|
144
144
|
return bool(shift < DISCRETE_MIN_WD) # if NNN is close enough to uniform, consider the sample continuous.
|
|
145
145
|
|
|
146
146
|
|
|
147
|
-
def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
|
|
147
|
+
def _bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]: # noqa: C901
|
|
148
148
|
"""
|
|
149
149
|
Bin continuous data by using the Clusterer to identify clusters.
|
|
150
150
|
|
|
@@ -22,7 +22,13 @@ class CalculatorCache:
|
|
|
22
22
|
This class adapts based on the data type passed in.
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
|
-
def __init__(
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
datum: Any,
|
|
28
|
+
box: BoundingBox | None = None,
|
|
29
|
+
per_channel: bool = False,
|
|
30
|
+
normalize_pixel_values: bool = False,
|
|
31
|
+
) -> None:
|
|
26
32
|
is_spatial = len(datum.shape) >= 2
|
|
27
33
|
self.raw = datum
|
|
28
34
|
# Assume image data for now (will be generic in future)
|
|
@@ -30,6 +36,7 @@ class CalculatorCache:
|
|
|
30
36
|
self.height: int = datum.shape[-2] if is_spatial else 0
|
|
31
37
|
self.shape: tuple[int, ...] = datum.shape
|
|
32
38
|
self.per_channel_mode = per_channel
|
|
39
|
+
self.normalize_pixel_values = normalize_pixel_values
|
|
33
40
|
self.has_box = box is not None
|
|
34
41
|
|
|
35
42
|
# Ensure bounding box
|
|
@@ -54,7 +61,9 @@ class CalculatorCache:
|
|
|
54
61
|
|
|
55
62
|
@cached_property
|
|
56
63
|
def scaled(self) -> NDArray[Any]:
|
|
57
|
-
|
|
64
|
+
if self.normalize_pixel_values:
|
|
65
|
+
return rescale(self.image)
|
|
66
|
+
return self.image
|
|
58
67
|
|
|
59
68
|
@cached_property
|
|
60
69
|
def per_channel(self) -> NDArray[Any]:
|
|
@@ -39,11 +39,23 @@ class PixelStatCalculator(Calculator[ImageStats]):
|
|
|
39
39
|
def _var_func(self, data: NDArray[Any], **kw: Any) -> Any:
|
|
40
40
|
return np.nanvar(data, **kw) if self._has_nan else np.var(data, **kw)
|
|
41
41
|
|
|
42
|
+
@cached_property
|
|
43
|
+
def _histogram_range(self) -> tuple[float, float]:
|
|
44
|
+
if self.cache.normalize_pixel_values:
|
|
45
|
+
return (0.0, 1.0)
|
|
46
|
+
from dataeval.utils.preprocessing import get_bitdepth
|
|
47
|
+
|
|
48
|
+
bitdepth = get_bitdepth(self.cache.scaled)
|
|
49
|
+
if bitdepth.depth == 0:
|
|
50
|
+
return (0.0, 1.0)
|
|
51
|
+
return (0.0, float(bitdepth.pmax))
|
|
52
|
+
|
|
42
53
|
@cached_property
|
|
43
54
|
def histogram(self) -> NDArray[np.float64]:
|
|
55
|
+
r = self._histogram_range
|
|
44
56
|
if self.per_channel_mode:
|
|
45
|
-
return np.apply_along_axis(lambda y: np.histogram(y, bins=256, range=
|
|
46
|
-
return np.histogram(self.cache.scaled, bins=256, range=
|
|
57
|
+
return np.apply_along_axis(lambda y: np.histogram(y, bins=256, range=r)[0], 1, self.cache.per_channel)
|
|
58
|
+
return np.histogram(self.cache.scaled, bins=256, range=r)[0]
|
|
47
59
|
|
|
48
60
|
def get_applicable_flags(self) -> ImageStats:
|
|
49
61
|
"""Return which flags this calculator handles."""
|
|
@@ -99,7 +99,7 @@ class _Clusters:
|
|
|
99
99
|
prob: NDArray[np.float64] = exp / np.sum(exp)
|
|
100
100
|
return prob
|
|
101
101
|
|
|
102
|
-
def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]:
|
|
102
|
+
def _sort_by_weights(self, embeddings: NDArray[np.float64]) -> NDArray[np.intp]: # noqa: C901
|
|
103
103
|
"""Sort samples using complexity-based weighted sampling."""
|
|
104
104
|
labels = self._get_labels(embeddings)
|
|
105
105
|
pr = self._complexity(embeddings)
|
|
@@ -356,7 +356,7 @@ class _HDBSCAN:
|
|
|
356
356
|
self.cluster_selection_epsilon = 0.0
|
|
357
357
|
self.cluster_selection_method = "eom"
|
|
358
358
|
|
|
359
|
-
def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN":
|
|
359
|
+
def fit(self, embeddings: NDArray[np.floating]) -> "_HDBSCAN": # noqa: C901
|
|
360
360
|
"""
|
|
361
361
|
Find clusters based on hierarchical density-based clustering.
|
|
362
362
|
|
|
@@ -541,7 +541,7 @@ class ClusterStats(TypedDict):
|
|
|
541
541
|
nearest_cluster_idx: NDArray[np.int64]
|
|
542
542
|
|
|
543
543
|
|
|
544
|
-
def compute_cluster_stats(
|
|
544
|
+
def compute_cluster_stats( # noqa: C901
|
|
545
545
|
embeddings: NDArray[np.floating],
|
|
546
546
|
cluster_labels: _Clusters | NDArray[np.int64],
|
|
547
547
|
) -> ClusterStats:
|
|
@@ -642,7 +642,7 @@ def compute_cluster_stats(
|
|
|
642
642
|
)
|
|
643
643
|
|
|
644
644
|
|
|
645
|
-
def cluster(
|
|
645
|
+
def cluster( # noqa: C901
|
|
646
646
|
embeddings: ArrayND[float],
|
|
647
647
|
algorithm: Literal["kmeans", "hdbscan"] = "hdbscan",
|
|
648
648
|
n_clusters: int | None = None,
|
|
@@ -101,7 +101,7 @@ def _build_image_lookup(source_indices: Sequence[SourceIndex]) -> dict[tuple[int
|
|
|
101
101
|
return lookup
|
|
102
102
|
|
|
103
103
|
|
|
104
|
-
def _calculate_ratio_for_stat(
|
|
104
|
+
def _calculate_ratio_for_stat( # noqa: C901
|
|
105
105
|
stat_name: str,
|
|
106
106
|
box_value: Any,
|
|
107
107
|
img_value: Any,
|
|
@@ -160,7 +160,7 @@ def _calculate_ratio_for_stat(
|
|
|
160
160
|
return box_value
|
|
161
161
|
|
|
162
162
|
|
|
163
|
-
def _validate_separate_inputs(
|
|
163
|
+
def _validate_separate_inputs( # noqa: C901
|
|
164
164
|
stats_output: StatsResult,
|
|
165
165
|
box_stats_output: StatsResult,
|
|
166
166
|
) -> tuple[Sequence[SourceIndex], Sequence[SourceIndex]]:
|
|
@@ -241,7 +241,7 @@ def _validate_unified_input(source_indices: Sequence[SourceIndex]) -> None:
|
|
|
241
241
|
)
|
|
242
242
|
|
|
243
243
|
|
|
244
|
-
def compute_ratios(
|
|
244
|
+
def compute_ratios( # noqa: C901
|
|
245
245
|
stats_output: StatsResult,
|
|
246
246
|
*,
|
|
247
247
|
target_stats_output: StatsResult | None = None,
|