dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +10 -11
- dataeval/{_internal/detectors → detectors}/drift/base.py +51 -102
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +9 -8
- dataeval/{_internal/detectors → detectors}/drift/ks.py +11 -10
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +33 -34
- dataeval/{_internal/detectors → detectors}/drift/torch.py +15 -13
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +12 -9
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +47 -45
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +20 -10
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +19 -26
- dataeval/detectors/ood/__init__.py +8 -16
- dataeval/{_internal/detectors → detectors}/ood/ae.py +9 -9
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +10 -30
- dataeval/{_internal/detectors → detectors}/ood/base.py +27 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +27 -23
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +11 -13
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +70 -4
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +10 -8
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +54 -20
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +21 -17
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +31 -28
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +15 -16
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +8 -6
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +66 -40
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +19 -15
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +19 -17
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +12 -10
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +8 -6
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +12 -11
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +14 -13
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -4
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/utils/split_dataset.py +486 -0
- dataeval/utils/tensorflow/__init__.py +9 -7
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +64 -68
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +10 -9
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +18 -22
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +18 -18
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +49 -43
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +12 -141
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +42 -37
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA +7 -5
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -7
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.0.dist-info/RECORD +0 -80
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
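
Most of the changes above are module moves out of the private `dataeval._internal` package into the public module tree. As a rough sketch of what that means for imports (the two imports shown are taken directly from the sufficiency.py diff below; any other path should be checked against the new RECORD), code that previously reached into the private layout now imports from the flattened public modules:

```python
# Sketch of the import-path migration implied by the file moves above.
# Old paths are the 0.72.0 private layout; new paths are the 0.72.2 public modules.

# 0.72.0 (private layout, removed in 0.72.2):
#   from dataeval._internal.interop import as_numpy
#   from dataeval._internal.output import OutputMetadata, set_metadata

# 0.72.2 (public modules listed in the new RECORD):
from dataeval.interop import as_numpy
from dataeval.output import OutputMetadata, set_metadata
```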
dataeval/{_internal/workflows → workflows}/sufficiency.py

```diff
@@ -1,8 +1,10 @@
 from __future__ import annotations

+__all__ = ["SufficiencyOutput", "Sufficiency"]
+
 import warnings
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, Mapping, Sequence, cast
+from typing import Any, Callable, Generic, Iterable, Mapping, Sequence, TypeVar, cast

 import matplotlib.pyplot as plt
 import numpy as np
@@ -13,8 +15,8 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.optimize import basinhopping
 from torch.utils.data import Dataset

-from dataeval._internal.interop import as_numpy
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import as_numpy
+from dataeval.output import OutputMetadata, set_metadata


 @dataclass(frozen=True)
@@ -36,7 +38,7 @@ class SufficiencyOutput(OutputMetadata):
     params: dict[str, NDArray[np.float64]]
     measures: dict[str, NDArray[np.float64]]

-    def __post_init__(self):
+    def __post_init__(self) -> None:
         c = len(self.steps)
         if set(self.params) != set(self.measures):
             raise ValueError("params and measures have a key mismatch")
@@ -45,7 +47,7 @@ class SufficiencyOutput(OutputMetadata):
             if c != c_v:
                 raise ValueError(f"{m} does not contain the expected number ({c}) of data points.")

-    @set_metadata(
+    @set_metadata()
     def project(
         self,
         projection: int | Iterable[int],
@@ -86,7 +88,7 @@ class SufficiencyOutput(OutputMetadata):
         return SufficiencyOutput(projection, self.params, output)

     def plot(self, class_names: Sequence[str] | None = None) -> list[Figure]:
-        """Plotting function for data
+        """Plotting function for data :term:`sufficience<Sufficiency>` tasks

         Parameters
         ----------
@@ -170,7 +172,7 @@ class SufficiencyOutput(OutputMetadata):
         return projection


-def f_out(n_i: NDArray, x: NDArray) -> NDArray:
+def f_out(n_i: NDArray[Any], x: NDArray[Any]) -> NDArray[Any]:
    """
    Calculates the line of best fit based on its free parameters

@@ -189,7 +191,7 @@ def f_out(n_i: NDArray, x: NDArray) -> NDArray:
    return x[0] * n_i ** (-x[1]) + x[2]


-def f_inv_out(y_i: NDArray, x: NDArray) -> NDArray[np.uint64]:
+def f_inv_out(y_i: NDArray[Any], x: NDArray[Any]) -> NDArray[np.uint64]:
    """
    Inverse function for f_out()

@@ -209,7 +211,7 @@ def f_inv_out(y_i: NDArray, x: NDArray) -> NDArray[np.uint64]:
    return np.asarray(n_i, dtype=np.uint64)


-def calc_params(p_i: NDArray, n_i: NDArray, niter: int) -> NDArray:
+def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int) -> NDArray[Any]:
    """
    Retrieves the inverse power curve coefficients for the line of best fit.
    Global minimization is done via basin hopping. More info on this algorithm
@@ -254,7 +256,7 @@ def calc_params(p_i: NDArray, n_i: NDArray, niter: int) -> NDArray:
    return res.x


-def reset_parameters(model: nn.Module):
+def reset_parameters(model: nn.Module) -> nn.Module:
    """
    Re-initializes each layer in the model using
    the layer's defined weight_init function
@@ -272,7 +274,7 @@ def reset_parameters(model: nn.Module):
    return model.apply(fn=weight_reset)


-def validate_dataset_len(dataset: Dataset) -> int:
+def validate_dataset_len(dataset: Dataset[Any]) -> int:
    if not hasattr(dataset, "__len__"):
        raise TypeError("Must provide a dataset with a length attribute")
    length: int = dataset.__len__()  # type: ignore
@@ -281,7 +283,7 @@ def validate_dataset_len(dataset: Dataset) -> int:
    return length


-def project_steps(params: NDArray, projection: NDArray) -> NDArray:
+def project_steps(params: NDArray[Any], projection: NDArray[Any]) -> NDArray[Any]:
    """Projects the measures for each value of X

    Parameters
@@ -300,7 +302,7 @@ def project_steps(params: NDArray, projection: NDArray) -> NDArray:
    return 1 - f_out(projection, params)


-def inv_project_steps(params: NDArray, targets: NDArray) -> NDArray[np.uint64]:
+def inv_project_steps(params: NDArray[Any], targets: NDArray[Any]) -> NDArray[np.uint64]:
    """Inverse function for project_steps()

    Parameters
@@ -320,7 +322,7 @@ def inv_project_steps(params: NDArray, targets: NDArray) -> NDArray[np.uint64]:
    return np.ceil(steps)


-def get_curve_params(measures: dict[str, NDArray], ranges: NDArray, niter: int) -> dict[str, NDArray]:
+def get_curve_params(measures: dict[str, NDArray[Any]], ranges: NDArray[Any], niter: int) -> dict[str, NDArray[Any]]:
    """Calculates and aggregates parameters for both single and multi-class metrics"""
    output = {}
    for name, measure in measures.items():
@@ -337,10 +339,10 @@ def get_curve_params(measures: dict[str, NDArray], ranges: NDArray, niter: int)

 def plot_measure(
     name: str,
-    steps: NDArray,
-    measure: NDArray,
-    params: NDArray,
-    projection: NDArray,
+    steps: NDArray[Any],
+    measure: NDArray[Any],
+    params: NDArray[Any],
+    projection: NDArray[Any],
 ) -> Figure:
     fig = plt.figure()
     fig = cast(Figure, fig)
@@ -367,9 +369,12 @@ def plot_measure(
     return fig


-class Sufficiency:
+T = TypeVar("T")
+
+
+class Sufficiency(Generic[T]):
     """
-    Project dataset sufficiency using given a model and evaluation criteria
+    Project dataset :term:`sufficiency<Sufficiency>` using given a model and evaluation criteria

     Parameters
     ----------
@@ -401,10 +406,10 @@ class Sufficiency:
     def __init__(
         self,
         model: nn.Module,
-        train_ds: Dataset,
-        test_ds: Dataset,
-        train_fn: Callable[[nn.Module, Dataset, Sequence[int]], None],
-        eval_fn: Callable[[nn.Module, Dataset], Mapping[str, float] | Mapping[str, ArrayLike]],
+        train_ds: Dataset[T],
+        test_ds: Dataset[T],
+        train_fn: Callable[[nn.Module, Dataset[T], Sequence[int]], None],
+        eval_fn: Callable[[nn.Module, Dataset[T]], Mapping[str, float] | Mapping[str, ArrayLike]],
         runs: int = 1,
         substeps: int = 5,
         train_kwargs: Mapping[str, Any] | None = None,
@@ -421,29 +426,29 @@ class Sufficiency:
         self.eval_kwargs = eval_kwargs

     @property
-    def train_ds(self):
+    def train_ds(self) -> Dataset[T]:
         return self._train_ds

     @train_ds.setter
-    def train_ds(self, value: Dataset):
+    def train_ds(self, value: Dataset[T]) -> None:
         self._train_ds = value
         self._length = validate_dataset_len(value)

     @property
-    def test_ds(self):
+    def test_ds(self) -> Dataset[T]:
         return self._test_ds

     @test_ds.setter
-    def test_ds(self, value: Dataset):
+    def test_ds(self, value: Dataset[T]) -> None:
         validate_dataset_len(value)
         self._test_ds = value

     @property
-    def train_fn(self) -> Callable[[nn.Module, Dataset, Sequence[int]], None]:
+    def train_fn(self) -> Callable[[nn.Module, Dataset[T], Sequence[int]], None]:
         return self._train_fn

     @train_fn.setter
-    def train_fn(self, value: Callable[[nn.Module, Dataset, Sequence[int]], None]):
+    def train_fn(self, value: Callable[[nn.Module, Dataset[T], Sequence[int]], None]) -> None:
         if not callable(value):
             raise TypeError("Must provide a callable for train_fn.")
         self._train_fn = value
@@ -451,14 +456,14 @@ class Sufficiency:
     @property
     def eval_fn(
         self,
-    ) -> Callable[[nn.Module, Dataset], dict[str, float] | Mapping[str, ArrayLike]]:
+    ) -> Callable[[nn.Module, Dataset[T]], dict[str, float] | Mapping[str, ArrayLike]]:
         return self._eval_fn

     @eval_fn.setter
     def eval_fn(
         self,
-        value: Callable[[nn.Module, Dataset], dict[str, float] | Mapping[str, ArrayLike]],
-    ):
+        value: Callable[[nn.Module, Dataset[T]], dict[str, float] | Mapping[str, ArrayLike]],
+    ) -> None:
         if not callable(value):
             raise TypeError("Must provide a callable for eval_fn.")
         self._eval_fn = value
@@ -468,7 +473,7 @@ class Sufficiency:
         return self._train_kwargs

     @train_kwargs.setter
-    def train_kwargs(self, value: Mapping[str, Any] | None):
+    def train_kwargs(self, value: Mapping[str, Any] | None) -> None:
         self._train_kwargs = {} if value is None else value

     @property
@@ -476,10 +481,10 @@ class Sufficiency:
         return self._eval_kwargs

     @eval_kwargs.setter
-    def eval_kwargs(self, value: Mapping[str, Any] | None):
+    def eval_kwargs(self, value: Mapping[str, Any] | None) -> None:
         self._eval_kwargs = {} if value is None else value

-    @set_metadata(
+    @set_metadata(["runs", "substeps"])
     def evaluate(self, eval_at: int | Iterable[int] | None = None, niter: int = 1000) -> SufficiencyOutput:
         """
         Creates data indices, trains models, and returns plotting data
@@ -488,7 +493,7 @@ class Sufficiency:
         ----------
         eval_at : int | Iterable[int] | None, default None
             Specify this to collect accuracies over a specific set of dataset lengths, rather
-            than letting Sufficiency internally create the lengths to evaluate at.
+            than letting :term:`sufficiency<Sufficiency>` internally create the lengths to evaluate at.
         niter : int, default 1000
             Iterations to perform when using the basin-hopping method to curve-fit measure(s).

```
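
The `Sufficiency` workflow itself is functionally unchanged here; the edits add `Generic[T]` typing and explicit return annotations. A minimal usage sketch under the new annotations follows (the model, datasets, and measure are placeholders, and `from dataeval.workflows import Sufficiency` assumes the package re-exports the class per the new `__all__`):

```python
from typing import Mapping, Sequence

import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset

from dataeval.workflows import Sufficiency  # assumed re-export of workflows/sufficiency.py

# Placeholder model and datasets: 16-dim features, 2 classes.
model = nn.Linear(16, 2)
train_ds: Dataset = TensorDataset(torch.randn(100, 16), torch.randint(0, 2, (100,)))
test_ds: Dataset = TensorDataset(torch.randn(20, 16), torch.randint(0, 2, (20,)))

def train_fn(model: nn.Module, dataset: Dataset, indices: Sequence[int]) -> None:
    """Train `model` on the subset of `dataset` selected by `indices` (no-op placeholder)."""

def eval_fn(model: nn.Module, dataset: Dataset) -> Mapping[str, float]:
    """Return one or more named measures used for the curve fit (constant placeholder)."""
    return {"accuracy": 0.5}

suff = Sufficiency(
    model=model,
    train_ds=train_ds,   # typed as Dataset[T] after this change
    test_ds=test_ds,
    train_fn=train_fn,
    eval_fn=eval_fn,
    runs=1,
    substeps=5,
)

output = suff.evaluate(niter=100)           # SufficiencyOutput(steps, params, measures)
projected = output.project([1_000, 5_000])  # extrapolate the fitted curve to larger dataset sizes
```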
{dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.72.0
+Version: 0.72.2
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -8,7 +8,7 @@ Author: Andrew Weng
 Author-email: andrew.weng@ariacoustics.com
 Maintainer: ARiA
 Maintainer-email: dataeval@ariacoustics.com
-Requires-Python: >=3.9,<3.12
+Requires-Python: >=3.9,<3.13
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
@@ -17,24 +17,26 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering
 Provides-Extra: all
 Provides-Extra: tensorflow
 Provides-Extra: torch
 Requires-Dist: hdbscan (>=0.8.36)
+Requires-Dist: markupsafe (<3.0.2) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
 Requires-Dist: numpy (>1.24.3)
-Requires-Dist: nvidia-cudnn-cu11 (>=8.6.0.163) ; extra == "tensorflow" or extra == "torch" or extra == "all"
 Requires-Dist: pillow (>=10.3.0)
 Requires-Dist: scikit-learn (>=1.5.0)
 Requires-Dist: scipy (>=1.10)
-Requires-Dist: tensorflow (>=2.16) ; extra == "tensorflow" or extra == "all"
+Requires-Dist: tensorflow (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: tensorflow_probability (>=0.24) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: tf-keras (>=2.16) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
 Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
 Requires-Dist: tqdm
+Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "3.10"
 Requires-Dist: xxhash (>=3.3)
 Project-URL: Documentation, https://dataeval.readthedocs.io/
 Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -44,7 +46,7 @@ Description-Content-Type: text/markdown

 ## About DataEval

-DataEval focuses on characterizing image data and its impact on model performance across
+DataEval focuses on characterizing image data and its impact on model performance across Classification and object-detection tasks.

 <!-- start about -->

```
dataeval-0.72.2.dist-info/RECORD

```diff
@@ -0,0 +1,72 @@
+dataeval/__init__.py,sha256=UYhkwned7TR5hiU_c8I_qUaKogO1EODTBgT-9_t0ofI,641
+dataeval/detectors/__init__.py,sha256=xdp8LYOFjV5tVbAwu0Y03KU9EajHkSFy_M3raqbxpDc,383
+dataeval/detectors/drift/__init__.py,sha256=MRPWFOaoVoqAHW36nA5F3wk7QXJU4oecND2RbtgG9oY,757
+dataeval/detectors/drift/base.py,sha256=0S-0MFpIFaJ4_8IGreFKSmyna2L50FBn7DVaoNWmw8E,14509
+dataeval/detectors/drift/cvm.py,sha256=kc59w2_wtxFGNnLcaJRvX5v_38gPXiebSGNiFVdunEQ,4142
+dataeval/detectors/drift/ks.py,sha256=gcpe1WIQeNeZdLYkdMZCFLXUp1bHMQUxwJE6-RLVOXs,4229
+dataeval/detectors/drift/mmd.py,sha256=TqGOnUNYKwpS0GQPV3dSl-_qRa0g2flmoQ-dxzW_JfY,7586
+dataeval/detectors/drift/torch.py,sha256=D46J72OPW8-PpP3w9ODMBfcDSdailIgVjgHVFpbYfws,11649
+dataeval/detectors/drift/uncertainty.py,sha256=Xz2yzJjtJfw1vLag234jwRvaa_HK36nMajGx8bQaNRs,5322
+dataeval/detectors/drift/updates.py,sha256=UJ0z5hlunRi7twnkLABfdJG3tT2EqX4y9IGx8_USYvo,1780
+dataeval/detectors/linters/__init__.py,sha256=BvpaB1RUpkEhhXk3Mqi5NYoOcJKZRFSBOJCmQOIfYRU,483
+dataeval/detectors/linters/clusterer.py,sha256=OtBE5rglAGdTTQRmKUHP6J-uWmnh2E3lZxeqJCnc87U,21014
+dataeval/detectors/linters/duplicates.py,sha256=tOD43rJkvheIA3mznbUqHhft2yD3xRZQdCt61daIca4,5665
+dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
+dataeval/detectors/linters/outliers.py,sha256=BUVvtbKHo04KnRmrgb84MBr0l1gtcY3-xNCHjetFrEQ,10117
+dataeval/detectors/ood/__init__.py,sha256=FVyVuaxVKAOgSTaaBf-j2OXXDarSBFcJ7CTlMV6w88s,661
+dataeval/detectors/ood/ae.py,sha256=cdwrgCpQkueK_HQoQbeXw7s0oTE-6FKVtXe9vETDe5M,2117
+dataeval/detectors/ood/aegmm.py,sha256=jK5aN1UjwwZaSLB3BpzH25eLp5wBqzlgylsfphaoZaE,1814
+dataeval/detectors/ood/base.py,sha256=S9jl4xH2zB_-ixalysQJZEvRCGOqMQSruacvfd4Dnfc,8687
+dataeval/detectors/ood/llr.py,sha256=HUNsro-cV7RR5Mht6pJ4NWCRR7aWeVdjwkBNurs5LbM,10378
+dataeval/detectors/ood/metadata_ks_compare.py,sha256=jH7uDwyyBIIcTrRhQEdnLAdrwf7LfNczKBw0CpJyF5c,4282
+dataeval/detectors/ood/metadata_least_likely.py,sha256=nxMCXUOjOfWHDTGT2SLE7OYBCydRq8zHLd8t17k7hMM,5193
+dataeval/detectors/ood/metadata_ood_mi.py,sha256=KLay2BmgHrStBV92VpIs_B1yEfQKllsMTgzOQEng01I,4065
+dataeval/detectors/ood/vae.py,sha256=O1jpGkpavtJAqn4WrmocPRMtkX4iSdkpiCDUPBF1Ano,2925
+dataeval/detectors/ood/vaegmm.py,sha256=37epPiQKeicy6SZD0D7O7hCFQSajZ-8wvga1pmJiq2s,2183
+dataeval/interop.py,sha256=CFtGyVTwTqkJFkNfhHYhnBRVwxKIQ9f-9Zuuz_uQDqo,1589
+dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
+dataeval/metrics/bias/__init__.py,sha256=puf645-hAO5hFHNHlZ239TPopqWIoN-uLGXFB8-hA_o,599
+dataeval/metrics/bias/balance.py,sha256=pgxaIqFvRcygYlAUbM_BKrbi45WU7fRV08HBrI7Z5q4,8569
+dataeval/metrics/bias/coverage.py,sha256=Ku9l-qvc6YrRiQ0PRzkpfjInyOhkAKKSO_bf_LnOwNg,3623
+dataeval/metrics/bias/diversity.py,sha256=-cmh-vyAUrn4rbn6-ZXvLuaO43Ncj28GKyeTmhWRzfE,8973
+dataeval/metrics/bias/metadata.py,sha256=nUZRwhcKaJM0GVwXn5k11Fa1s56_OtOBF7tmXjMDpsM,8919
+dataeval/metrics/bias/parity.py,sha256=uJ3p8m6id5mZpDNnS1NmxCThb5V6v75lJv_0TGAhCRA,16668
+dataeval/metrics/estimators/__init__.py,sha256=O6ocxJq8XDkfJWwXeJnnnzbOyRnFPKF4kTIVTTZYOA8,380
+dataeval/metrics/estimators/ber.py,sha256=SVT-BIC_GLs0l2l2NhWu4OpRbgn96w-OwTSoPHTnQbE,5037
+dataeval/metrics/estimators/divergence.py,sha256=pImaa216-YYTgGWDCSTcpJrC-dfl7150yVrPfW_TyGc,4293
+dataeval/metrics/estimators/uap.py,sha256=Tz1VZOyUa68HlTh94Rl-wnXCWdTAVjTQc3LtSPEWVu4,2175
+dataeval/metrics/stats/__init__.py,sha256=igLRaAt1nX6yRwC4xI0zNPBADi3u7EsSxWP3OZ8AqcU,1086
+dataeval/metrics/stats/base.py,sha256=9M5g2FAWvd50HT-T2h-MCmYLpvk--em_yWro1qWGHFs,12177
+dataeval/metrics/stats/boxratiostats.py,sha256=iNr-FdppiJ7XAeeLY-o7gL_PSxvT8j86iwRijKca2Eg,6465
+dataeval/metrics/stats/datasetstats.py,sha256=LAMFCIS9v0RjLrdKUFuo8nY-3HLVvRlqQIXGMKtsHEw,6255
+dataeval/metrics/stats/dimensionstats.py,sha256=xdTp2AbGH3xefUUsB4sDjgSKiojJ73DCHyuCOPKsErc,4056
+dataeval/metrics/stats/hashstats.py,sha256=X6aSouaMhDcGZMLuCTje3G4QOr2i-Td6H3SyBFDF6mA,4960
+dataeval/metrics/stats/labelstats.py,sha256=BKwSmyxCr2wYq8IMraCUS-b5wqacfT_BukJUYNfqeCo,4114
+dataeval/metrics/stats/pixelstats.py,sha256=x90O10IqVjEORtYwueFLvJnVYTxhPBOOx5HMweBQnJY,4578
+dataeval/metrics/stats/visualstats.py,sha256=y0xIvst7epcajk8vz2jngiAiz0T7DZC-M97Rs1-vV9I,4950
+dataeval/output.py,sha256=jWXXNxFNBEaY1rN7Z-6LZl6bQT-I7z_wqr91Rhrdt_0,3061
+dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+dataeval/utils/__init__.py,sha256=zTgPsmloPy0qZMzb4xipNNdIWpaHtseGph68pIAD-hQ,684
+dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
+dataeval/utils/shared.py,sha256=BvEeYPMNQTmx4LSaImGeC0VkvcbEY3Byqtxa-jQ3xgc,3623
+dataeval/utils/split_dataset.py,sha256=IopyxwC3FaZwgVriW4OXze-mDMpOlvRr83OADA5Jydk,19454
+dataeval/utils/tensorflow/__init__.py,sha256=l4OjIA75JJXeNWDCkST1xtDMVYsw97lZ-9JXFBlyuYg,539
+dataeval/utils/tensorflow/_internal/autoencoder.py,sha256=-pm4VqMEjHcrgre-K8uhMvaEVHyeqZsZbejrnlM6OtY,10430
+dataeval/utils/tensorflow/_internal/gmm.py,sha256=QoEgbeax1GETqRmUF7A2ih9uFOZfFAjGzgH2ljExlAc,3669
+dataeval/utils/tensorflow/_internal/loss.py,sha256=IXW_kxovLaTLd6UkMOIQLPEAGrOMILHDKagvRYgj-DE,4065
+dataeval/utils/tensorflow/_internal/pixelcnn.py,sha256=Aa7koa7YxqhHmFequpsfMw2-61KO03evWWcvvFTuaco,48518
+dataeval/utils/tensorflow/_internal/trainer.py,sha256=ld7pisl4ZXjEA6nxBStRNDEuNJme0IPo08oWqal6bYc,4167
+dataeval/utils/tensorflow/_internal/utils.py,sha256=k1mjy44oE63SIkckvU8BTlqtWsCnGynJF4eYyw1pebQ,8799
+dataeval/utils/tensorflow/loss/__init__.py,sha256=Q-66vt91Oe1ByYfo28tW32zXDq2MqQ2gngWgmIVmof8,227
+dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
+dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
+dataeval/utils/torch/datasets.py,sha256=9YV9-Uhq6NCMuu1hPhMnQXjmeI-Ld8ve1z_haxre88o,15023
+dataeval/utils/torch/models.py,sha256=0BsXmLK8W1OZ8nnEGb1f9LzIeCgtevQC37dvKS1v1vA,3236
+dataeval/utils/torch/trainer.py,sha256=EraOKiXxiMNiycStZNMR5yRz3ehgp87d9ewR9a9dV4w,5559
+dataeval/utils/torch/utils.py,sha256=FI4LJ6DvXFQJVff8fxSCP7LRkp8H9BIUgYX0kk7_Cuo,1537
+dataeval/workflows/__init__.py,sha256=x2JnOoKmLUCZOsB6RNPqMdVvxEb6Hpda5GPJnD_k0v0,310
+dataeval/workflows/sufficiency.py,sha256=1jSYhH9i4oesmJYs5PZvWS1LGXf8ekOgNhpFtMPLPXk,18552
+dataeval-0.72.2.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
+dataeval-0.72.2.dist-info/METADATA,sha256=ddOmTZA6nX7VceQhOmyQ-cQ1aBv2VU9Za32vnmjP-VE,4702
+dataeval-0.72.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+dataeval-0.72.2.dist-info/RECORD,,
```
File without changes
|
File without changes
|
File without changes
|
File without changes
|
dataeval/_internal/metrics/stats/hashstats.py

```diff
@@ -1,75 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Iterable
-
-from numpy.typing import ArrayLike
-
-from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
-from dataeval._internal.metrics.utils import pchash, xxhash
-from dataeval._internal.output import set_metadata
-
-
-@dataclass(frozen=True)
-class HashStatsOutput(BaseStatsOutput):
-    """
-    Output class for :func:`hashstats` stats metric
-
-    Attributes
-    ----------
-    xxhash : List[str]
-        xxHash hash of the images as a hex string
-    pchash : List[str]
-        Perception hash of the images as a hex string
-    """
-
-    xxhash: list[str]
-    pchash: list[str]
-
-
-class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
-    output_class = HashStatsOutput
-    image_function_map = {
-        "xxhash": lambda x: xxhash(x.image),
-        "pchash": lambda x: pchash(x.image),
-    }
-
-
-@set_metadata("dataeval.metrics")
-def hashstats(
-    images: Iterable[ArrayLike],
-    bboxes: Iterable[ArrayLike] | None = None,
-) -> HashStatsOutput:
-    """
-    Calculates hashes for each image
-
-    This function computes hashes from the images including exact hashes and perception-based
-    hashes. These hash values can be used to determine if images are exact or near matches.
-
-    Parameters
-    ----------
-    images : ArrayLike
-        Images to hashing
-    bboxes : Iterable[ArrayLike] or None
-        Bounding boxes in `xyxy` format for each image
-
-    Returns
-    -------
-    HashStatsOutput
-        A dictionary-like object containing the computed hashes for each image.
-
-    See Also
-    --------
-    Duplicates
-
-    Examples
-    --------
-    Calculating the statistics on the images, whose shape is (C, H, W)
-
-    >>> results = hashstats(images)
-    >>> print(results.xxhash)
-    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
-    >>> print(results.pchash)
-    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
-    """
-    return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
```