dataeval-0.76.1-py3-none-any.whl → dataeval-0.82.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -4,41 +4,24 @@ import warnings
4
4
 
5
5
  __all__ = []
6
6
 
7
- from dataclasses import dataclass
8
- from typing import Callable, Iterable
7
+ from typing import Any, Callable
9
8
 
10
9
  import numpy as np
11
10
  import xxhash as xxh
12
- from numpy.typing import ArrayLike
13
11
  from PIL import Image
14
12
  from scipy.fftpack import dct
15
13
 
16
- from dataeval.interop import as_numpy
17
- from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
18
- from dataeval.output import set_metadata
19
- from dataeval.utils.image import normalize_image_shape, rescale
14
+ from dataeval.metrics.stats._base import StatsProcessor, run_stats
15
+ from dataeval.outputs import HashStatsOutput
16
+ from dataeval.outputs._base import set_metadata
17
+ from dataeval.typing import ArrayLike, Dataset
18
+ from dataeval.utils._array import as_numpy
19
+ from dataeval.utils._image import normalize_image_shape, rescale
20
20
 
21
21
  HASH_SIZE = 8
22
22
  MAX_FACTOR = 4
23
23
 
24
24
 
25
- @dataclass(frozen=True)
26
- class HashStatsOutput(BaseStatsOutput):
27
- """
28
- Output class for :func:`hashstats` stats metric.
29
-
30
- Attributes
31
- ----------
32
- xxhash : List[str]
33
- xxHash hash of the images as a hex string
34
- pchash : List[str]
35
- :term:`Perception-based Hash` of the images as a hex string
36
- """
37
-
38
- xxhash: list[str]
39
- pchash: list[str]
40
-
41
-
42
25
  def pchash(image: ArrayLike) -> str:
43
26
  """
44
27
  Performs a perceptual hash on an image by resizing to a square NxN image
@@ -122,8 +105,9 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
122
105
 
123
106
  @set_metadata
124
107
  def hashstats(
125
- images: Iterable[ArrayLike],
126
- bboxes: Iterable[ArrayLike] | None = None,
108
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
109
+ *,
110
+ per_box: bool = False,
127
111
  ) -> HashStatsOutput:
128
112
  """
129
113
  Calculates hashes for each image.
@@ -133,10 +117,10 @@ def hashstats(
133
117
 
134
118
  Parameters
135
119
  ----------
136
- images : ArrayLike
137
- Images to hashing
138
- bboxes : Iterable[ArrayLike] or None
139
- Bounding boxes in `xyxy` format for each image
120
+ dataset : Dataset
121
+ Dataset to perform calculations on.
122
+ per_box : bool, default False
123
+ If True, perform calculations on each bounding box.
140
124
 
141
125
  Returns
142
126
  -------
@@ -149,12 +133,12 @@ def hashstats(
149
133
 
150
134
  Examples
151
135
  --------
152
- Calculating the statistics on the images, whose shape is (C, H, W)
136
+ Calculate the hashes of a dataset of images, whose shape is (C, H, W)
153
137
 
154
- >>> results = hashstats(stats_images)
155
- >>> print(results.xxhash)
156
- ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
157
- >>> print(results.pchash)
158
- ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
138
+ >>> results = hashstats(dataset)
139
+ >>> print(results.xxhash[:5])
140
+ ['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
141
+ >>> print(results.pchash[:5])
142
+ ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
159
143
  """
160
- return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
144
+ return run_stats(dataset, per_box, False, [HashStatsProcessor])[0]
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from typing import Any, Literal, overload
6
+
7
+ from dataeval.metrics.stats._base import run_stats
8
+ from dataeval.metrics.stats._dimensionstats import DimensionStatsProcessor
9
+ from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
10
+ from dataeval.metrics.stats._visualstats import VisualStatsProcessor
11
+ from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
12
+ from dataeval.outputs._base import set_metadata
13
+ from dataeval.typing import ArrayLike, Dataset
14
+
15
+
16
+ @overload
17
+ def imagestats(
18
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
19
+ *,
20
+ per_box: bool = False,
21
+ per_channel: Literal[True],
22
+ ) -> ChannelStatsOutput: ...
23
+
24
+
25
+ @overload
26
+ def imagestats(
27
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
28
+ *,
29
+ per_box: bool = False,
30
+ per_channel: Literal[False] = False,
31
+ ) -> ImageStatsOutput: ...
32
+
33
+
34
+ @set_metadata
35
+ def imagestats(
36
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
37
+ *,
38
+ per_box: bool = False,
39
+ per_channel: bool = False,
40
+ ) -> ImageStatsOutput | ChannelStatsOutput:
41
+ """
42
+ Calculates various :term:`statistics<Statistics>` for each image.
43
+
44
+ This function computes dimension, pixel and visual metrics
45
+ on the images or individual bounding boxes for each image as
46
+ well as label statistics if provided.
47
+
48
+ Parameters
49
+ ----------
50
+ dataset : Dataset
51
+ Dataset to perform calculations on.
52
+ per_box : bool, default False
53
+ If True, perform calculations on each bounding box.
54
+ per_channel : bool, default False
55
+ If True, perform calculations on each channel.
56
+
57
+ Returns
58
+ -------
59
+ ImageStatsOutput or ChannelStatsOutput
60
+ Output class containing the outputs of various stats functions
61
+
62
+ See Also
63
+ --------
64
+ dimensionstats, labelstats, pixelstats, visualstats, Outliers
65
+
66
+ Examples
67
+ --------
68
+ Calculate dimension, pixel and visual statistics for a dataset containing 8
69
+ images.
70
+
71
+ >>> stats = imagestats(dataset)
72
+ >>> print(stats.aspect_ratio)
73
+ [1. 1. 1.333 1. 0.667 1. 1. 1. ]
74
+
75
+ >>> print(stats.sharpness)
76
+ [20.23 20.23 23.33 20.23 77.06 20.23 20.23 20.23]
77
+
78
+ Calculate the pixel and visual stats for a dataset containing 6 3-channel
79
+ images and 2 1-channel images for a total of 20 channels.
80
+
81
+ >>> ch_stats = imagestats(dataset, per_channel=True)
82
+ >>> print(ch_stats.brightness)
83
+ [0.027 0.152 0.277 0.127 0.135 0.142 0.259 0.377 0.385 0.392 0.508 0.626
84
+ 0.634 0.642 0.751 0.759 0.767 0.876 0.884 0.892]
85
+ """
86
+ if per_channel:
87
+ processors = [PixelStatsProcessor, VisualStatsProcessor]
88
+ output_cls = ChannelStatsOutput
89
+ else:
90
+ processors = [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor]
91
+ output_cls = ImageStatsOutput
92
+
93
+ outputs = run_stats(dataset, per_box, per_channel, processors)
94
+ return output_cls(**{k: v for d in outputs for k, v in d.dict().items()})
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from collections import Counter, defaultdict
6
+ from typing import Any, Mapping, TypeVar
7
+
8
+ import numpy as np
9
+
10
+ from dataeval.outputs import LabelStatsOutput
11
+ from dataeval.outputs._base import set_metadata
12
+ from dataeval.typing import AnnotatedDataset, ArrayLike
13
+ from dataeval.utils._array import as_numpy
14
+ from dataeval.utils.data._metadata import Metadata
15
+
16
+ TValue = TypeVar("TValue")
17
+
18
+
19
+ def _ensure_2d(labels: ArrayLike) -> ArrayLike:
20
+ if isinstance(labels, np.ndarray):
21
+ return labels[:, None]
22
+ else:
23
+ return [[lbl] for lbl in labels] # type: ignore
24
+
25
+
26
+ def _get_list_depth(lst):
27
+ if isinstance(lst, list) and lst:
28
+ return 1 + max(_get_list_depth(item) for item in lst)
29
+ return 0
30
+
31
+
32
+ def _check_labels_dimension(labels: ArrayLike) -> ArrayLike:
33
+ # Check for nested lists beyond 2 levels
34
+
35
+ if isinstance(labels, np.ndarray):
36
+ if labels.ndim == 1:
37
+ return _ensure_2d(labels)
38
+ elif labels.ndim == 2:
39
+ return labels
40
+ else:
41
+ raise ValueError("The label array must not have more than 2 dimensions.")
42
+ elif isinstance(labels, list):
43
+ depth = _get_list_depth(labels)
44
+ if depth == 1:
45
+ return _ensure_2d(labels)
46
+ elif depth == 2:
47
+ return labels
48
+ else:
49
+ raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
50
+ else:
51
+ raise TypeError("Labels must be either a NumPy array or a list.")
52
+
53
+
54
+ def _sort_to_list(d: Mapping[int, TValue]) -> list[TValue]:
55
+ return [v for _, v in sorted(d.items())]
56
+
57
+
58
+ @set_metadata
59
+ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
60
+ """
61
+ Calculates :term:`statistics<Statistics>` for data labels.
62
+
63
+ This function computes counting metrics (e.g., total per class, total per image)
64
+ on the labels.
65
+
66
+ Parameters
67
+ ----------
68
+ dataset : Metadata or ImageClassificationDataset or ObjectDetectionDataset
69
+
70
+ Returns
71
+ -------
72
+ LabelStatsOutput
73
+ A dataclass containing the computed counting metrics for the labels.
74
+
75
+ Examples
76
+ --------
77
+ Calculate basic :term:`statistics<Statistics>` on labels for a dataset.
78
+
79
+ >>> from dataeval.utils.data import Metadata
80
+ >>> stats = labelstats(Metadata(dataset))
81
+ >>> print(stats.to_table())
82
+ Class Count: 5
83
+ Label Count: 15
84
+ Average # Labels per Image: 1.88
85
+ --------------------------------------
86
+ Label: Total Count - Image Count
87
+ horse: 2 - 2
88
+ cow: 4 - 3
89
+ sheep: 2 - 2
90
+ pig: 2 - 2
91
+ chicken: 5 - 5
92
+ """
93
+ dataset = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
94
+
95
+ label_counts: Counter[int] = Counter()
96
+ image_counts: Counter[int] = Counter()
97
+ index_location = defaultdict(list[int])
98
+ label_per_image: list[int] = []
99
+
100
+ index2label = dict(enumerate(dataset.class_names))
101
+ labels = [target.labels.tolist() for target in dataset.targets]
102
+
103
+ labels_2d = _check_labels_dimension(labels)
104
+
105
+ for i, group in enumerate(labels_2d):
106
+ group = as_numpy(group).tolist()
107
+
108
+ # Count occurrences of each label in all sublists
109
+ label_counts.update(group)
110
+
111
+ # Get the number of labels per image
112
+ label_per_image.append(len(group))
113
+
114
+ # Create a set of unique items in the current sublist
115
+ unique_items: set[int] = set(group)
116
+
117
+ # Update image counts and index locations
118
+ image_counts.update(unique_items)
119
+ for item in unique_items:
120
+ index_location[item].append(i)
121
+
122
+ return LabelStatsOutput(
123
+ label_counts_per_class=_sort_to_list(label_counts),
124
+ label_counts_per_image=label_per_image,
125
+ image_counts_per_class=_sort_to_list(image_counts),
126
+ image_indices_per_class=_sort_to_list(index_location),
127
+ image_count=len(label_per_image),
128
+ class_count=len(label_counts),
129
+ label_count=sum(label_counts.values()),
130
+ class_names=list(index2label.values()),
131
+ )
@@ -2,49 +2,15 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from dataclasses import dataclass
6
- from typing import Any, Callable, Iterable
5
+ from typing import Any, Callable
7
6
 
8
7
  import numpy as np
9
- from numpy.typing import ArrayLike, NDArray
10
8
  from scipy.stats import entropy, kurtosis, skew
11
9
 
12
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
13
- from dataeval.output import set_metadata
14
-
15
-
16
- @dataclass(frozen=True)
17
- class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
18
- """
19
- Output class for :func:`pixelstats` stats metric.
20
-
21
- Attributes
22
- ----------
23
- mean : NDArray[np.float16]
24
- Mean of the pixel values of the images
25
- std : NDArray[np.float16]
26
- Standard deviation of the pixel values of the images
27
- var : NDArray[np.float16]
28
- :term:`Variance` of the pixel values of the images
29
- skew : NDArray[np.float16]
30
- Skew of the pixel values of the images
31
- kurtosis : NDArray[np.float16]
32
- Kurtosis of the pixel values of the images
33
- histogram : NDArray[np.uint32]
34
- Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
35
- entropy : NDArray[np.float16]
36
- Entropy of the pixel values of the images
37
- """
38
-
39
- mean: NDArray[np.float16]
40
- std: NDArray[np.float16]
41
- var: NDArray[np.float16]
42
- skew: NDArray[np.float16]
43
- kurtosis: NDArray[np.float16]
44
- histogram: NDArray[np.uint32]
45
- entropy: NDArray[np.float16]
46
-
47
- _excluded_keys = ["histogram"]
10
+ from dataeval.metrics.stats._base import StatsProcessor, run_stats
11
+ from dataeval.outputs import PixelStatsOutput
12
+ from dataeval.outputs._base import set_metadata
13
+ from dataeval.typing import ArrayLike, Dataset
48
14
 
49
15
 
50
16
  class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
@@ -71,8 +37,9 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
71
37
 
72
38
  @set_metadata
73
39
  def pixelstats(
74
- images: Iterable[ArrayLike],
75
- bboxes: Iterable[ArrayLike] | None = None,
40
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
41
+ *,
42
+ per_box: bool = False,
76
43
  per_channel: bool = False,
77
44
  ) -> PixelStatsOutput:
78
45
  """
@@ -83,10 +50,12 @@ def pixelstats(
83
50
 
84
51
  Parameters
85
52
  ----------
86
- images : Iterable[ArrayLike]
87
- Images to perform calculations on
88
- bboxes : Iterable[ArrayLike] or None
89
- Bounding boxes in `xyxy` format for each image to perform calculations
53
+ dataset : Dataset
54
+ Dataset to perform calculations on.
55
+ per_box : bool, default False
56
+ If True, perform calculations on each bounding box.
57
+ per_channel : bool, default False
58
+ If True, perform calculations on each channel.
90
59
 
91
60
  Returns
92
61
  -------
@@ -106,12 +75,12 @@ def pixelstats(
106
75
 
107
76
  Examples
108
77
  --------
109
- Calculating the statistics on the images, whose shape is (C, H, W)
78
+ Calculate the pixel statistics of a dataset of 8 images, whose shape is (C, H, W).
110
79
 
111
- >>> results = pixelstats(stats_images)
80
+ >>> results = pixelstats(dataset)
112
81
  >>> print(results.mean)
113
- [0.2903 0.2108 0.397 0.596 0.743 ]
82
+ [0.181 0.132 0.248 0.373 0.464 0.613 0.734 0.854]
114
83
  >>> print(results.entropy)
115
- [4.99 2.371 1.179 2.406 0.668]
84
+ [4.527 1.883 0.811 1.883 0.298 1.883 1.883 1.883]
116
85
  """
117
- return run_stats(images, bboxes, per_channel, [PixelStatsProcessor])[0]
86
+ return run_stats(dataset, per_box, per_channel, [PixelStatsProcessor])[0]
@@ -2,60 +2,26 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from dataclasses import dataclass
6
- from typing import Any, Callable, Iterable
5
+ from typing import Any, Callable
7
6
 
8
7
  import numpy as np
9
- from numpy.typing import ArrayLike, NDArray
10
8
 
11
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
12
- from dataeval.output import set_metadata
13
- from dataeval.utils.image import edge_filter
9
+ from dataeval.metrics.stats._base import StatsProcessor, run_stats
10
+ from dataeval.outputs import VisualStatsOutput
11
+ from dataeval.outputs._base import set_metadata
12
+ from dataeval.typing import ArrayLike, Dataset
13
+ from dataeval.utils._image import edge_filter
14
14
 
15
15
  QUARTILES = (0, 25, 50, 75, 100)
16
16
 
17
17
 
18
- @dataclass(frozen=True)
19
- class VisualStatsOutput(BaseStatsOutput, HistogramPlotMixin):
20
- """
21
- Output class for :func:`visualstats` stats metric.
22
-
23
- Attributes
24
- ----------
25
- brightness : NDArray[np.float16]
26
- Brightness of the images
27
- contrast : NDArray[np.float16]
28
- Image contrast ratio
29
- darkness : NDArray[np.float16]
30
- Darkness of the images
31
- missing : NDArray[np.float16]
32
- Percentage of the images with missing pixels
33
- sharpness : NDArray[np.float16]
34
- Sharpness of the images
35
- zeros : NDArray[np.float16]
36
- Percentage of the images with zero value pixels
37
- percentiles : NDArray[np.float16]
38
- Percentiles of the pixel values of the images with quartiles of (0, 25, 50, 75, 100)
39
- """
40
-
41
- brightness: NDArray[np.float16]
42
- contrast: NDArray[np.float16]
43
- darkness: NDArray[np.float16]
44
- missing: NDArray[np.float16]
45
- sharpness: NDArray[np.float16]
46
- zeros: NDArray[np.float16]
47
- percentiles: NDArray[np.float16]
48
-
49
- _excluded_keys = ["percentiles"]
50
-
51
-
52
18
  class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
53
19
  output_class: type = VisualStatsOutput
54
20
  image_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
55
21
  "brightness": lambda x: x.get("percentiles")[1],
56
- "contrast": lambda x: np.nan_to_num(
57
- (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
58
- ),
22
+ "contrast": lambda x: 0
23
+ if np.mean(x.get("percentiles")) == 0
24
+ else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
59
25
  "darkness": lambda x: x.get("percentiles")[-2],
60
26
  "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
61
27
  "sharpness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
@@ -78,8 +44,9 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
78
44
 
79
45
  @set_metadata
80
46
  def visualstats(
81
- images: Iterable[ArrayLike],
82
- bboxes: Iterable[ArrayLike] | None = None,
47
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
48
+ *,
49
+ per_box: bool = False,
83
50
  per_channel: bool = False,
84
51
  ) -> VisualStatsOutput:
85
52
  """
@@ -90,10 +57,12 @@ def visualstats(
90
57
 
91
58
  Parameters
92
59
  ----------
93
- images : Iterable[ArrayLike]
94
- Images to perform calculations on
95
- bboxes : Iterable[ArrayLike] or None
96
- Bounding boxes in `xyxy` format for each image to perform calculations on
60
+ dataset : Dataset
61
+ Dataset to perform calculations on.
62
+ per_box : bool, default False
63
+ If True, perform calculations on each bounding box.
64
+ per_channel : bool, default False
65
+ If True, perform calculations on each channel.
97
66
 
98
67
  Returns
99
68
  -------
@@ -112,12 +81,12 @@ def visualstats(
112
81
 
113
82
  Examples
114
83
  --------
115
- Calculating the :term:`statistics<Statistics>` on the images, whose shape is (C, H, W)
84
+ Calculate the visual statistics of a dataset of 8 images, whose shape is (C, H, W).
116
85
 
117
- >>> results = visualstats(stats_images)
86
+ >>> results = visualstats(dataset)
118
87
  >>> print(results.brightness)
119
- [0.1353 0.2085 0.4143 0.6084 0.8135]
88
+ [0.084 0.13 0.259 0.38 0.508 0.63 0.755 0.88 ]
120
89
  >>> print(results.contrast)
121
- [2.04 1.331 1.261 1.279 1.253]
90
+ [2.04 1.331 1.261 1.279 1.253 1.268 1.265 1.263]
122
91
  """
123
- return run_stats(images, bboxes, per_channel, [VisualStatsProcessor])[0]
92
+ return run_stats(dataset, per_box, per_channel, [VisualStatsProcessor])[0]
@@ -0,0 +1,53 @@
1
+ """
2
+ Output classes for DataEval to store function and method outputs
3
+ as well as runtime metadata for reproducibility and logging.
4
+ """
5
+
6
+ from ._base import ExecutionMetadata
7
+ from ._bias import BalanceOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
8
+ from ._drift import DriftMMDOutput, DriftOutput
9
+ from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
10
+ from ._linters import DuplicatesOutput, OutliersOutput
11
+ from ._ood import OODOutput, OODScoreOutput
12
+ from ._stats import (
13
+ ChannelStatsOutput,
14
+ DimensionStatsOutput,
15
+ HashStatsOutput,
16
+ ImageStatsOutput,
17
+ LabelStatsOutput,
18
+ PixelStatsOutput,
19
+ SourceIndex,
20
+ VisualStatsOutput,
21
+ )
22
+ from ._utils import SplitDatasetOutput, TrainValSplit
23
+ from ._workflows import SufficiencyOutput
24
+
25
+ __all__ = [
26
+ "BEROutput",
27
+ "BalanceOutput",
28
+ "ChannelStatsOutput",
29
+ "ClustererOutput",
30
+ "CoverageOutput",
31
+ "DimensionStatsOutput",
32
+ "DivergenceOutput",
33
+ "DiversityOutput",
34
+ "DriftMMDOutput",
35
+ "DriftOutput",
36
+ "DuplicatesOutput",
37
+ "ExecutionMetadata",
38
+ "HashStatsOutput",
39
+ "ImageStatsOutput",
40
+ "LabelParityOutput",
41
+ "LabelStatsOutput",
42
+ "OODOutput",
43
+ "OODScoreOutput",
44
+ "OutliersOutput",
45
+ "ParityOutput",
46
+ "PixelStatsOutput",
47
+ "SourceIndex",
48
+ "SplitDatasetOutput",
49
+ "SufficiencyOutput",
50
+ "TrainValSplit",
51
+ "UAPOutput",
52
+ "VisualStatsOutput",
53
+ ]