dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. dataeval/__init__.py +27 -23
  2. dataeval/detectors/__init__.py +2 -2
  3. dataeval/detectors/drift/__init__.py +14 -12
  4. dataeval/detectors/drift/base.py +3 -3
  5. dataeval/detectors/drift/cvm.py +1 -1
  6. dataeval/detectors/drift/ks.py +3 -2
  7. dataeval/detectors/drift/mmd.py +9 -7
  8. dataeval/detectors/drift/torch.py +12 -12
  9. dataeval/detectors/drift/uncertainty.py +5 -4
  10. dataeval/detectors/drift/updates.py +1 -1
  11. dataeval/detectors/linters/__init__.py +4 -4
  12. dataeval/detectors/linters/clusterer.py +5 -9
  13. dataeval/detectors/linters/duplicates.py +10 -14
  14. dataeval/detectors/linters/outliers.py +100 -5
  15. dataeval/detectors/ood/__init__.py +4 -11
  16. dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
  17. dataeval/detectors/ood/base.py +47 -160
  18. dataeval/detectors/ood/metadata_ks_compare.py +34 -42
  19. dataeval/detectors/ood/metadata_least_likely.py +3 -3
  20. dataeval/detectors/ood/metadata_ood_mi.py +6 -5
  21. dataeval/detectors/ood/mixin.py +146 -0
  22. dataeval/detectors/ood/output.py +63 -0
  23. dataeval/interop.py +7 -6
  24. dataeval/{logging.py → log.py} +2 -0
  25. dataeval/metrics/__init__.py +3 -3
  26. dataeval/metrics/bias/__init__.py +10 -13
  27. dataeval/metrics/bias/balance.py +13 -11
  28. dataeval/metrics/bias/coverage.py +53 -5
  29. dataeval/metrics/bias/diversity.py +56 -24
  30. dataeval/metrics/bias/parity.py +20 -17
  31. dataeval/metrics/estimators/__init__.py +2 -2
  32. dataeval/metrics/estimators/ber.py +7 -4
  33. dataeval/metrics/estimators/divergence.py +4 -4
  34. dataeval/metrics/estimators/uap.py +4 -4
  35. dataeval/metrics/stats/__init__.py +19 -19
  36. dataeval/metrics/stats/base.py +28 -12
  37. dataeval/metrics/stats/boxratiostats.py +13 -14
  38. dataeval/metrics/stats/datasetstats.py +49 -20
  39. dataeval/metrics/stats/dimensionstats.py +8 -8
  40. dataeval/metrics/stats/hashstats.py +14 -10
  41. dataeval/metrics/stats/labelstats.py +94 -11
  42. dataeval/metrics/stats/pixelstats.py +11 -14
  43. dataeval/metrics/stats/visualstats.py +10 -13
  44. dataeval/output.py +23 -14
  45. dataeval/utils/__init__.py +5 -14
  46. dataeval/utils/dataset/__init__.py +7 -0
  47. dataeval/utils/{torch → dataset}/datasets.py +2 -0
  48. dataeval/utils/dataset/read.py +63 -0
  49. dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
  50. dataeval/utils/image.py +2 -2
  51. dataeval/utils/metadata.py +317 -14
  52. dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
  53. dataeval/utils/torch/__init__.py +2 -17
  54. dataeval/utils/torch/gmm.py +29 -6
  55. dataeval/utils/torch/{utils.py → internal.py} +82 -58
  56. dataeval/utils/torch/models.py +10 -8
  57. dataeval/utils/torch/trainer.py +6 -85
  58. dataeval/workflows/__init__.py +2 -5
  59. dataeval/workflows/sufficiency.py +18 -8
  60. {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
  61. dataeval-0.76.0.dist-info/METADATA +137 -0
  62. dataeval-0.76.0.dist-info/RECORD +67 -0
  63. dataeval/detectors/ood/base_torch.py +0 -109
  64. dataeval/metrics/bias/metadata_preprocessing.py +0 -285
  65. dataeval/utils/gmm.py +0 -26
  66. dataeval-0.74.2.dist-info/METADATA +0 -120
  67. dataeval-0.74.2.dist-info/RECORD +0 -66
  68. {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataeval.utils.plot import histogram_plot
4
+
3
5
  __all__ = []
4
6
 
5
7
  import re
@@ -100,19 +102,33 @@ class BaseStatsOutput(Output):
100
102
  for source_index in list(self.source_index) + [None]:
101
103
  if source_index is None or source_index.image > cur_image:
102
104
  mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
103
- if source_index is None:
104
- break
105
- cur_image = source_index.image
106
- cur_max_channel = 0
107
- cur_mask.clear()
108
- cur_mask.append(matches(source_index.channel, channel_index))
109
- cur_max_channel = max(cur_max_channel, source_index.channel or 0)
105
+ if source_index is not None:
106
+ cur_image = source_index.image
107
+ cur_max_channel = 0
108
+ cur_mask.clear()
109
+ if source_index is not None:
110
+ cur_mask.append(matches(source_index.channel, channel_index))
111
+ cur_max_channel = max(cur_max_channel, source_index.channel or 0)
110
112
  return mask
111
113
 
112
114
  def __len__(self) -> int:
113
115
  return len(self.source_index)
114
116
 
115
117
 
118
+ def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
119
+ return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
120
+
121
+
122
+ class HistogramPlotMixin:
123
+ _excluded_keys: Iterable[str] = []
124
+
125
+ def dict(self) -> dict[str, Any]: ...
126
+
127
+ def plot(self, log: bool) -> None:
128
+ data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
129
+ histogram_plot(data_dict, log)
130
+
131
+
116
132
  TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
117
133
 
118
134
 
@@ -126,7 +142,7 @@ class StatsProcessor(Generic[TStatsOutput]):
126
142
  self.raw = image
127
143
  self.width: int = image.shape[-1]
128
144
  self.height: int = image.shape[-2]
129
- self.box: NDArray[Any] = np.array([0, 0, self.width, self.height]) if box is None else box
145
+ self.box: NDArray[np.int64] = np.array([0, 0, self.width, self.height]) if box is None else box.astype(np.int64)
130
146
  self._per_channel = per_channel
131
147
  self._image = None
132
148
  self._shape = None
@@ -193,7 +209,7 @@ class StatsProcessorOutput(NamedTuple):
193
209
  results: list[dict[str, Any]]
194
210
  source_indices: list[SourceIndex]
195
211
  box_counts: list[int]
196
- warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]]
212
+ warnings_list: list[str]
197
213
 
198
214
 
199
215
  def process_stats(
@@ -206,13 +222,13 @@ def process_stats(
206
222
  results_list: list[dict[str, Any]] = []
207
223
  source_indices: list[SourceIndex] = []
208
224
  box_counts: list[int] = []
209
- warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]] = []
225
+ warnings_list: list[str] = []
210
226
  nboxes = [None] if boxes is None else normalize_box_shape(boxes)
211
227
  for i_b, box in enumerate(nboxes):
212
228
  i_b = None if box is None else i_b
213
229
  processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
214
230
  if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
215
- warnings_list.append((i, i_b, box, image.shape))
231
+ warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
216
232
  results_list.append({k: v for p in processor_list for k, v in p.process().items()})
217
233
  if per_channel:
218
234
  source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
@@ -302,7 +318,7 @@ def run_stats(
302
318
 
303
319
  # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
304
320
  for w in warning_list:
305
- warnings.warn(f"Bounding box [{w[0]}][{w[1]}]: {w[2]} is out of bounds of {w[3]}.", UserWarning)
321
+ warnings.warn(w, UserWarning)
306
322
 
307
323
  output = {}
308
324
  for results in results_list:
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["boxratiostats"]
3
+ __all__ = []
4
4
 
5
5
  import copy
6
6
  from typing import Any, Callable, Generic, TypeVar, cast
@@ -26,7 +26,7 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
26
26
  def __getitem__(self, key: str) -> NDArray[np.float64]:
27
27
  _stat = cast(np.ndarray, getattr(self._stats, key)).astype(np.float64)
28
28
  _shape = _stat[0].shape
29
- _slice = _stat[self._slice[0] : self._slice[1]]
29
+ _slice = _stat[int(self._slice[0]) : int(self._slice[1])]
30
30
  return _slice.reshape(-1, self._channels, *_shape) if self._channels else _slice.reshape(-1, *_shape)
31
31
 
32
32
  box: StatSlicer
@@ -102,7 +102,7 @@ def boxratiostats(
102
102
  imgstats: TStatOutput,
103
103
  ) -> TStatOutput:
104
104
  """
105
- Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs
105
+ Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs.
106
106
 
107
107
  Parameters
108
108
  ----------
@@ -130,17 +130,16 @@ def boxratiostats(
130
130
  --------
131
131
  Calculating the box ratio statistics using the dimension stats of the boxes and images
132
132
 
133
- >>> imagestats = dimensionstats(images)
134
- >>> boxstats = dimensionstats(images, bboxes)
133
+ >>> from dataeval.metrics.stats import dimensionstats
134
+ >>> imagestats = dimensionstats(stats_images)
135
+ >>> boxstats = dimensionstats(stats_images, bboxes)
135
136
  >>> ratiostats = boxratiostats(boxstats, imagestats)
136
137
  >>> print(ratiostats.aspect_ratio)
137
- [ 1.15169271 0.78450521 21.33333333 1.5234375 2.25651042 0.77799479
138
- 0.88867188 3.40625 1.73307292 1.11132812 0.75018315 0.45018315
139
- 0.69596354 20. 5.11197917 2.33333333 0.75 0.70019531]
138
+ [ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
139
+ 0.66650391 3.83296703 1.95018315]
140
140
  >>> print(ratiostats.size)
141
- [0.03401693 0.01383464 0.00130208 0.01822917 0.02327474 0.00683594
142
- 0.01220703 0.0168457 0.01057943 0.00976562 0.00130208 0.01098633
143
- 0.02246094 0.0012207 0.01123047 0.00911458 0.02636719 0.06835938]
141
+ [0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
142
+ 0.00915527 0.03369141 0.02115885]
144
143
  """
145
144
  output_cls = type(boxstats)
146
145
  if type(boxstats) is not type(imgstats):
@@ -148,13 +147,13 @@ def boxratiostats(
148
147
  if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
149
148
  raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
150
149
  if all(count == 0 for count in boxstats.box_count):
151
- raise TypeError("Input for boxstats must contain box information.")
150
+ raise ValueError("Input for boxstats must contain box information.")
152
151
  if any(count != 0 for count in imgstats.box_count):
153
- raise TypeError("Input for imgstats must not contain box information.")
152
+ raise ValueError("Input for imgstats must not contain box information.")
154
153
  boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
155
154
  imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
156
155
  if boxstats_has_channels != imgstats_has_channels:
157
- raise TypeError("Input for boxstats and imgstats must have matching channel information.")
156
+ raise ValueError("Input for boxstats and imgstats must have matching channel information.")
158
157
 
159
158
  output_dict = {}
160
159
  for key in boxstats.dict():
@@ -1,13 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["DatasetStatsOutput", "ChannelStatsOutput", "datasetstats", "channelstats"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Iterable
7
7
 
8
8
  from numpy.typing import ArrayLike
9
9
 
10
- from dataeval.metrics.stats.base import BaseStatsOutput, run_stats
10
+ from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
11
11
  from dataeval.metrics.stats.dimensionstats import (
12
12
  DimensionStatsOutput,
13
13
  DimensionStatsProcessor,
@@ -16,16 +16,17 @@ from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
16
16
  from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
17
17
  from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
18
18
  from dataeval.output import Output, set_metadata
19
+ from dataeval.utils.plot import channel_histogram_plot
19
20
 
20
21
 
21
22
  @dataclass(frozen=True)
22
- class DatasetStatsOutput(Output):
23
+ class DatasetStatsOutput(Output, HistogramPlotMixin):
23
24
  """
24
- Output class for :func:`datasetstats` stats metric
25
+ Output class for :func:`datasetstats` stats metric.
25
26
 
26
27
  This class represents the outputs of various stats functions against a single
27
28
  dataset, such that each index across all stat outputs are representative of
28
- the same source image. Modifying or mixing outputs will result in inaccurate
29
+ the same source image. Modifying or mixing outputs will result in inaccurate
29
30
  outlier calculations if not created correctly.
30
31
 
31
32
  Attributes
@@ -41,6 +42,8 @@ class DatasetStatsOutput(Output):
41
42
  visualstats: VisualStatsOutput
42
43
  labelstats: LabelStatsOutput | None = None
43
44
 
45
+ _excluded_keys = ["histogram", "percentiles"]
46
+
44
47
  def _outputs(self) -> list[Output]:
45
48
  return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats) if s is not None]
46
49
 
@@ -53,14 +56,37 @@ class DatasetStatsOutput(Output):
53
56
  raise ValueError("All StatsOutput classes must contain the same number of image sources.")
54
57
 
55
58
 
59
+ def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
60
+ raw_channels = max([si.channel for si in cls.dict()["source_index"]]) + 1
61
+ if isinstance(channel_index, int):
62
+ max_channels = 1 if channel_index < raw_channels else raw_channels
63
+ ch_mask = cls.pixelstats.get_channel_mask(channel_index)
64
+ elif isinstance(channel_index, Iterable) and all(isinstance(val, int) for val in list(channel_index)):
65
+ max_channels = len(list(channel_index))
66
+ ch_mask = cls.pixelstats.get_channel_mask(channel_index)
67
+ elif isinstance(channel_limit, int):
68
+ max_channels = channel_limit
69
+ ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
70
+ else:
71
+ max_channels = raw_channels
72
+ ch_mask = None
73
+
74
+ if max_channels > raw_channels:
75
+ max_channels = raw_channels
76
+ if ch_mask is not None and not any(ch_mask):
77
+ ch_mask = None
78
+
79
+ return max_channels, ch_mask
80
+
81
+
56
82
  @dataclass(frozen=True)
57
83
  class ChannelStatsOutput(Output):
58
84
  """
59
- Output class for :func:`channelstats` stats metric
85
+ Output class for :func:`channelstats` stats metric.
60
86
 
61
87
  This class represents the outputs of various per-channel stats functions against
62
88
  a single dataset, such that each index across all stat outputs are representative
63
- of the same source image. Modifying or mixing outputs will result in inaccurate
89
+ of the same source image. Modifying or mixing outputs will result in inaccurate
64
90
  outlier calculations if not created correctly.
65
91
 
66
92
  Attributes
@@ -83,6 +109,13 @@ class ChannelStatsOutput(Output):
83
109
  if not all(length == lengths[0] for length in lengths):
84
110
  raise ValueError("All StatsOutput classes must contain the same number of image sources.")
85
111
 
112
+ def plot(
113
+ self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
114
+ ) -> None:
115
+ max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
116
+ data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, ("histogram", "percentiles"))}
117
+ channel_histogram_plot(data_dict, log, max_channels, ch_mask)
118
+
86
119
 
87
120
  @set_metadata
88
121
  def datasetstats(
@@ -91,7 +124,7 @@ def datasetstats(
91
124
  labels: Iterable[ArrayLike] | None = None,
92
125
  ) -> DatasetStatsOutput:
93
126
  """
94
- Calculates various :term:`statistics<Statistics>` for each image
127
+ Calculates various :term:`statistics<Statistics>` for each image.
95
128
 
96
129
  This function computes dimension, pixel and visual metrics
97
130
  on the images or individual bounding boxes for each image as
@@ -119,13 +152,11 @@ def datasetstats(
119
152
  --------
120
153
  Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
121
154
 
122
- >>> stats = datasetstats(images, bboxes)
155
+ >>> stats = datasetstats(stats_images, bboxes)
123
156
  >>> print(stats.dimensionstats.aspect_ratio)
124
- [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3
125
- 0.8335 1. 0.6 0.522 15. 3.834 1.75 0.75 0.7 ]
126
- >>> print(stats.visualstats.contrast)
127
- [1.744 1.946 0.1164 0.0635 0.0633 0.06274 0.0429 0.0317 0.0317
128
- 0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
157
+ [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
158
+ >>> print(stats.visualstats.sharpness)
159
+ [4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
129
160
  """
130
161
  outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])
131
162
  return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None) # type: ignore
@@ -137,7 +168,7 @@ def channelstats(
137
168
  bboxes: Iterable[ArrayLike] | None = None,
138
169
  ) -> ChannelStatsOutput:
139
170
  """
140
- Calculates various per-channel statistics for each image
171
+ Calculates various per-channel :term:`statistics` for each image.
141
172
 
142
173
  This function computes pixel and visual metrics on the images
143
174
  or individual bounding boxes for each image.
@@ -162,12 +193,10 @@ def channelstats(
162
193
  --------
163
194
  Calculating the per-channel pixel and visual stats for a dataset
164
195
 
165
- >>> stats = channelstats(images)
196
+ >>> stats = channelstats(stats_images)
166
197
  >>> print(stats.visualstats.darkness)
167
- [0.07495 0.1748 0.275 0.1047 0.11096 0.1172 0.2047 0.2109 0.2172
168
- 0.3047 0.311 0.3171 0.4048 0.411 0.4172 0.505 0.5107 0.517
169
- 0.6045 0.611 0.617 0.7046 0.711 0.7173 0.8047 0.811 0.8174
170
- 0.905 0.911 0.917 ]
198
+ [0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
199
+ 0.8154]
171
200
  """
172
201
  outputs = run_stats(images, bboxes, True, [PixelStatsProcessor, VisualStatsProcessor])
173
202
  return ChannelStatsOutput(*outputs) # type: ignore
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["DimensionStatsOutput", "dimensionstats"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Callable, Iterable
@@ -8,15 +8,15 @@ from typing import Any, Callable, Iterable
8
8
  import numpy as np
9
9
  from numpy.typing import ArrayLike, NDArray
10
10
 
11
- from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
11
+ from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
12
12
  from dataeval.output import set_metadata
13
13
  from dataeval.utils.image import get_bitdepth
14
14
 
15
15
 
16
16
  @dataclass(frozen=True)
17
- class DimensionStatsOutput(BaseStatsOutput):
17
+ class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
18
18
  """
19
- Output class for :func:`dimensionstats` stats metric
19
+ Output class for :func:`dimensionstats` stats metric.
20
20
 
21
21
  Attributes
22
22
  ----------
@@ -79,7 +79,7 @@ def dimensionstats(
79
79
  bboxes: Iterable[ArrayLike] | None = None,
80
80
  ) -> DimensionStatsOutput:
81
81
  """
82
- Calculates dimension :term:`statistics<Statistics>` for each image
82
+ Calculates dimension :term:`statistics<Statistics>` for each image.
83
83
 
84
84
  This function computes various dimensional metrics (e.g., width, height, channels)
85
85
  on the images or individual bounding boxes for each image.
@@ -106,10 +106,10 @@ def dimensionstats(
106
106
  --------
107
107
  Calculating the dimension statistics on the images, whose shape is (C, H, W)
108
108
 
109
- >>> results = dimensionstats(images)
109
+ >>> results = dimensionstats(stats_images)
110
110
  >>> print(results.aspect_ratio)
111
- [0.75 0.75 0.75 0.75 0.75 0.75 1.333 0.75 0.75 1. ]
111
+ [1. 1. 1.333 1. 0.6665]
112
112
  >>> print(results.channels)
113
- [1 1 1 1 1 1 3 1 1 3]
113
+ [3 3 1 3 1]
114
114
  """
115
115
  return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["HashStatsOutput", "hashstats"]
3
+ import warnings
4
+
5
+ __all__ = []
4
6
 
5
7
  from dataclasses import dataclass
6
8
  from typing import Callable, Iterable
@@ -23,7 +25,7 @@ MAX_FACTOR = 4
23
25
  @dataclass(frozen=True)
24
26
  class HashStatsOutput(BaseStatsOutput):
25
27
  """
26
- Output class for :func:`hashstats` stats metric
28
+ Output class for :func:`hashstats` stats metric.
27
29
 
28
30
  Attributes
29
31
  ----------
@@ -41,7 +43,7 @@ def pchash(image: ArrayLike) -> str:
41
43
  """
42
44
  Performs a perceptual hash on an image by resizing to a square NxN image
43
45
  using the Lanczos algorithm where N is 32x32 or the largest multiple of
44
- 8 that is smaller than the input image dimensions. The resampled image
46
+ 8 that is smaller than the input image dimensions. The resampled image
45
47
  is compressed using a discrete cosine transform and the lowest frequency
46
48
  component is encoded as a bit array of greater or less than median value
47
49
  and returned as a hex string.
@@ -54,13 +56,15 @@ def pchash(image: ArrayLike) -> str:
54
56
  Returns
55
57
  -------
56
58
  str
57
- The hex string hash of the image using perceptual hashing
59
+ The hex string hash of the image using perceptual hashing, or empty
60
+ string if the image is too small to be hashed
58
61
  """
59
62
  # Verify that the image is at least larger than an 8x8 image
60
63
  arr = as_numpy(image)
61
64
  min_dim = min(arr.shape[-2:])
62
65
  if min_dim < HASH_SIZE + 1:
63
- raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
66
+ warnings.warn(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
67
+ return ""
64
68
 
65
69
  # Calculates the dimensions of the resized square image
66
70
  resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
@@ -92,7 +96,7 @@ def pchash(image: ArrayLike) -> str:
92
96
  def xxhash(image: ArrayLike) -> str:
93
97
  """
94
98
  Performs a fast non-cryptographic hash using the xxhash algorithm
95
- (xxhash.com) against the image as a flattened bytearray. The hash
99
+ (xxhash.com) against the image as a flattened bytearray. The hash
96
100
  is returned as a hex string.
97
101
 
98
102
  Parameters
@@ -122,7 +126,7 @@ def hashstats(
122
126
  bboxes: Iterable[ArrayLike] | None = None,
123
127
  ) -> HashStatsOutput:
124
128
  """
125
- Calculates hashes for each image
129
+ Calculates hashes for each image.
126
130
 
127
131
  This function computes hashes from the images including exact hashes and perception-based
128
132
  hashes. These hash values can be used to determine if images are exact or near matches.
@@ -147,10 +151,10 @@ def hashstats(
147
151
  --------
148
152
  Calculating the statistics on the images, whose shape is (C, H, W)
149
153
 
150
- >>> results = hashstats(images)
154
+ >>> results = hashstats(stats_images)
151
155
  >>> print(results.xxhash)
152
- ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
156
+ ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
153
157
  >>> print(results.pchash)
154
- ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
158
+ ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
155
159
  """
156
160
  return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
@@ -1,21 +1,26 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["LabelStatsOutput", "labelstats"]
3
+ __all__ = []
4
4
 
5
+ # import contextlib
5
6
  from collections import Counter, defaultdict
6
7
  from dataclasses import dataclass
7
8
  from typing import Any, Iterable, Mapping, TypeVar
8
9
 
10
+ import numpy as np
9
11
  from numpy.typing import ArrayLike
10
12
 
11
- from dataeval.interop import to_numpy
13
+ from dataeval.interop import as_numpy
12
14
  from dataeval.output import Output, set_metadata
13
15
 
16
+ # with contextlib.suppress(ImportError):
17
+ # import pandas as pd
18
+
14
19
 
15
20
  @dataclass(frozen=True)
16
21
  class LabelStatsOutput(Output):
17
22
  """
18
- Output class for :func:`labelstats` stats metric
23
+ Output class for :func:`labelstats` stats metric.
19
24
 
20
25
  Attributes
21
26
  ----------
@@ -46,6 +51,47 @@ class LabelStatsOutput(Output):
46
51
  class_count: int
47
52
  label_count: int
48
53
 
54
+ def to_table(self) -> str:
55
+ max_char = max(len(key) if isinstance(key, str) else key // 10 + 1 for key in self.label_counts_per_class)
56
+ max_char = max(max_char, 5)
57
+ max_label = max(list(self.label_counts_per_class.values()))
58
+ max_img = max(list(self.image_counts_per_label.values()))
59
+ max_num = int(np.ceil(np.log10(max(max_label, max_img))))
60
+ max_num = max(max_num, 11)
61
+
62
+ # Display basic counts
63
+ table_str = f"Class Count: {self.class_count}\n"
64
+ table_str += f"Label Count: {self.label_count}\n"
65
+ table_str += f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}\n"
66
+ table_str += "--------------------------------------\n"
67
+
68
+ # Display counts per class
69
+ table_str += f"{'Label':>{max_char}}: Total Count - Image Count\n"
70
+ for cls in self.label_counts_per_class:
71
+ table_str += f"{cls:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}} "
72
+ table_str += f"- {self.image_counts_per_label[cls]:^{max_num}}\n"
73
+
74
+ return table_str
75
+
76
+ # def to_dataframe(self) -> pd.DataFrame:
77
+ # import pandas as pd
78
+
79
+ # class_list = []
80
+ # total_count = []
81
+ # image_count = []
82
+ # for cls in self.label_counts_per_class:
83
+ # class_list.append(cls)
84
+ # total_count.append(self.label_counts_per_class[cls])
85
+ # image_count.append(self.image_counts_per_label[cls])
86
+
87
+ # return pd.DataFrame(
88
+ # {
89
+ # "Label": class_list,
90
+ # "Total Count": total_count,
91
+ # "Image Count": image_count,
92
+ # }
93
+ # )
94
+
49
95
 
50
96
  TKey = TypeVar("TKey", int, str)
51
97
 
@@ -57,12 +103,47 @@ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
57
103
  return dict(sorted(d.items(), key=lambda x: x[0]))
58
104
 
59
105
 
106
+ def _ensure_2d(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
107
+ if isinstance(labels, np.ndarray):
108
+ return labels[:, None]
109
+ else:
110
+ return [[lbl] for lbl in labels] # type: ignore
111
+
112
+
113
+ def _get_list_depth(lst):
114
+ if isinstance(lst, list) and lst:
115
+ return 1 + max(_get_list_depth(item) for item in lst)
116
+ return 0
117
+
118
+
119
+ def _check_labels_dimension(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
120
+ # Check for nested lists beyond 2 levels
121
+
122
+ if isinstance(labels, np.ndarray):
123
+ if labels.ndim == 1:
124
+ return _ensure_2d(labels)
125
+ elif labels.ndim == 2:
126
+ return labels
127
+ else:
128
+ raise ValueError("The label array must not have more than 2 dimensions.")
129
+ elif isinstance(labels, list):
130
+ depth = _get_list_depth(labels)
131
+ if depth == 1:
132
+ return _ensure_2d(labels)
133
+ elif depth == 2:
134
+ return labels
135
+ else:
136
+ raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
137
+ else:
138
+ raise TypeError("Labels must be either a NumPy array or a list.")
139
+
140
+
60
141
  @set_metadata
61
142
  def labelstats(
62
143
  labels: Iterable[ArrayLike],
63
144
  ) -> LabelStatsOutput:
64
145
  """
65
- Calculates :term:`statistics<Statistics>` for data labels
146
+ Calculates :term:`statistics<Statistics>` for data labels.
66
147
 
67
148
  This function computes counting metrics (e.g., total per class, total per image)
68
149
  on the labels.
@@ -86,23 +167,25 @@ def labelstats(
86
167
 
87
168
  >>> stats = labelstats(labels)
88
169
  >>> stats.label_counts_per_class
89
- {'chicken': 3, 'cow': 8, 'horse': 9, 'pig': 7, 'sheep': 7}
170
+ {'chicken': 12, 'cow': 5, 'horse': 4, 'pig': 7, 'sheep': 4}
90
171
  >>> stats.label_counts_per_image
91
- [3, 2, 3, 4, 1, 5, 4, 4, 4, 4]
172
+ [3, 3, 5, 3, 2, 5, 5, 2, 2, 2]
92
173
  >>> stats.image_counts_per_label
93
- {'chicken': 2, 'cow': 6, 'horse': 7, 'pig': 5, 'sheep': 7}
174
+ {'chicken': 8, 'cow': 4, 'horse': 4, 'pig': 7, 'sheep': 4}
94
175
  >>> (stats.image_count, stats.class_count, stats.label_count)
95
- (10, 5, 34)
176
+ (10, 5, 32)
96
177
  """
97
178
  label_counts = Counter()
98
179
  image_counts = Counter()
99
180
  index_location = defaultdict(list[int])
100
181
  label_per_image: list[int] = []
101
182
 
102
- for i, group in enumerate(labels):
103
- # Count occurrences of each label in all sublists
104
- group = to_numpy(group)
183
+ labels_2d = _check_labels_dimension(labels)
184
+
185
+ for i, group in enumerate(labels_2d):
186
+ group = as_numpy(group)
105
187
 
188
+ # Count occurrences of each label in all sublists
106
189
  label_counts.update(group)
107
190
 
108
191
  # Get the number of labels per image
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["PixelStatsOutput", "pixelstats"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Callable, Iterable
@@ -9,14 +9,14 @@ import numpy as np
9
9
  from numpy.typing import ArrayLike, NDArray
10
10
  from scipy.stats import entropy, kurtosis, skew
11
11
 
12
- from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
12
+ from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
13
13
  from dataeval.output import set_metadata
14
14
 
15
15
 
16
16
  @dataclass(frozen=True)
17
- class PixelStatsOutput(BaseStatsOutput):
17
+ class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
18
18
  """
19
- Output class for :func:`pixelstats` stats metric
19
+ Output class for :func:`pixelstats` stats metric.
20
20
 
21
21
  Attributes
22
22
  ----------
@@ -44,11 +44,13 @@ class PixelStatsOutput(BaseStatsOutput):
44
44
  histogram: NDArray[np.uint32]
45
45
  entropy: NDArray[np.float16]
46
46
 
47
+ _excluded_keys = ["histogram"]
48
+
47
49
 
48
50
  class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
49
51
  output_class: type = PixelStatsOutput
50
52
  image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
51
- "mean": lambda self: np.mean(self.scaled),
53
+ "mean": lambda x: np.mean(x.scaled),
52
54
  "std": lambda x: np.std(x.scaled),
53
55
  "var": lambda x: np.var(x.scaled),
54
56
  "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
@@ -74,7 +76,7 @@ def pixelstats(
74
76
  per_channel: bool = False,
75
77
  ) -> PixelStatsOutput:
76
78
  """
77
- Calculates pixel :term:`statistics<Statistics>` for each image
79
+ Calculates pixel :term:`statistics<Statistics>` for each image.
78
80
 
79
81
  This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
80
82
  on the images as a whole.
@@ -106,15 +108,10 @@ def pixelstats(
106
108
  --------
107
109
  Calculating the statistics on the images, whose shape is (C, H, W)
108
110
 
109
- >>> results = pixelstats(images)
111
+ >>> results = pixelstats(stats_images)
110
112
  >>> print(results.mean)
111
- [0.04828 0.562 0.06726 0.09937 0.1315 0.1636 0.1957 0.2278 0.26
112
- 0.292 0.3242 0.3562 0.3884 0.4204 0.4526 0.4846 0.5166 0.549
113
- 0.581 0.6133 0.6455 0.6772 0.7095 0.7417 0.774 0.8057 0.838
114
- 0.87 0.9023 0.934 ]
113
+ [0.2903 0.2108 0.397 0.596 0.743 ]
115
114
  >>> print(results.entropy)
116
- [3.238 3.303 0.8125 1.028 0.8223 1.046 0.8247 1.041 0.8203 1.012
117
- 0.812 0.9883 0.795 0.9243 0.9243 0.795 0.9907 0.8125 1.028 0.8223
118
- 1.046 0.8247 1.041 0.8203 1.012 0.812 0.9883 0.795 0.9243 0.9243]
115
+ [4.99 2.371 1.179 2.406 0.668]
119
116
  """
120
117
  return run_stats(images, bboxes, per_channel, [PixelStatsProcessor])[0]