dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
dataeval/metrics/estimators/{divergence.py → _divergence.py}

@@ -7,32 +7,17 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
 from typing import Literal
 
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 
-from dataeval.interop import as_numpy
-from dataeval.output import Output, set_metadata
-from dataeval.utils.shared import compute_neighbors, get_method, minimum_spanning_tree
-
-
-@dataclass(frozen=True)
-class DivergenceOutput(Output):
-    """
-    Output class for :func:`divergence` estimator metric.
-
-    Attributes
-    ----------
-    divergence : float
-        :term:`Divergence` value calculated between 2 datasets ranging between 0.0 and 1.0
-    errors : int
-        The number of differing edges between the datasets
-    """
-
-    divergence: float
-    errors: int
+from dataeval.outputs import DivergenceOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import ensure_embeddings
+from dataeval.utils._method import get_method
+from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
 
 
 def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
@@ -78,18 +63,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     return errors
 
 
+_DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
+
+
 @set_metadata
-def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
+def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
     """
     Calculates the :term:`divergence` and any errors between the datasets.
 
     Parameters
     ----------
-    data_a : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format to compare.
+    emb_a : ArrayLike, shape - (N, P)
+        Image embeddings in an ArrayLike format to compare.
         Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
-    data_b : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format to compare.
+    emb_b : ArrayLike, shape - (N, P)
+        Image embeddings in an ArrayLike format to compare.
        Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
     method : Literal["MST, "FNN"], default "FNN"
        Method used to estimate dataset :term:`divergence<Divergence>`
@@ -125,9 +113,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
     >>> divergence(datasetA, datasetB)
     DivergenceOutput(divergence=0.28, errors=36)
     """
-    div_fn = get_method({"FNN": divergence_fnn, "MST": divergence_mst}, method)
-    a = as_numpy(data_a)
-    b = as_numpy(data_b)
+    div_fn = get_method(_DIVERGENCE_FN_MAP, method)
+    a = ensure_embeddings(emb_a, dtype=np.float64)
+    b = ensure_embeddings(emb_b, dtype=np.float64)
     N = a.shape[0]
     M = b.shape[0]
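For orientation, a minimal sketch of calling the renamed API, assuming `divergence` is still re-exported from `dataeval.metrics.estimators` (the module itself is now the private `_divergence.py`) and that `ensure_embeddings` accepts plain float arrays; the embeddings below are placeholders:

import numpy as np

from dataeval.metrics.estimators import divergence  # assumed re-export path

# Placeholder embeddings: N observations in a P-dimensional space
emb_a = np.random.default_rng(0).normal(size=(128, 16))
emb_b = np.random.default_rng(1).normal(size=(128, 16))

result = divergence(emb_a, emb_b, method="FNN")
print(result.divergence, result.errors)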
 
dataeval/metrics/estimators/{uap.py → _uap.py}

@@ -8,27 +8,13 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
 
-from numpy.typing import ArrayLike
 from sklearn.metrics import average_precision_score
 
-from dataeval.interop import as_numpy
-from dataeval.output import Output, set_metadata
-
-
-@dataclass(frozen=True)
-class UAPOutput(Output):
-    """
-    Output class for :func:`uap` estimator metric.
-
-    Attributes
-    ----------
-    uap : float
-        The empirical mean precision estimate
-    """
-
-    uap: float
+from dataeval.outputs import UAPOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import as_numpy
 
 
 @set_metadata
dataeval/metrics/stats/__init__.py

@@ -5,15 +5,14 @@ and label statistics against the images and labels of a dataset.
 
 __all__ = [
     "ChannelStatsOutput",
-    "DatasetStatsOutput",
+    "ImageStatsOutput",
     "DimensionStatsOutput",
     "HashStatsOutput",
     "LabelStatsOutput",
     "PixelStatsOutput",
     "VisualStatsOutput",
     "boxratiostats",
-    "channelstats",
-    "datasetstats",
+    "imagestats",
     "dimensionstats",
     "hashstats",
     "labelstats",
@@ -21,15 +20,19 @@ __all__ = [
     "visualstats",
 ]
 
-from dataeval.metrics.stats.boxratiostats import boxratiostats
-from dataeval.metrics.stats.datasetstats import (
+from dataeval.metrics.stats._boxratiostats import boxratiostats
+from dataeval.metrics.stats._dimensionstats import dimensionstats
+from dataeval.metrics.stats._hashstats import hashstats
+from dataeval.metrics.stats._imagestats import imagestats
+from dataeval.metrics.stats._labelstats import labelstats
+from dataeval.metrics.stats._pixelstats import pixelstats
+from dataeval.metrics.stats._visualstats import visualstats
+from dataeval.outputs._stats import (
     ChannelStatsOutput,
-    DatasetStatsOutput,
-    channelstats,
-    datasetstats,
+    DimensionStatsOutput,
+    HashStatsOutput,
+    ImageStatsOutput,
+    LabelStatsOutput,
+    PixelStatsOutput,
+    VisualStatsOutput,
 )
-from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
-from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
-from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
-from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
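In user code, the `datasetstats`/`channelstats` entry points are folded into a single `imagestats` function. A before/after import sketch based only on the exported names in this hunk (behavioral equivalence of the new function is an assumption):

# dataeval 0.76.1
from dataeval.metrics.stats import DatasetStatsOutput, channelstats, datasetstats

# dataeval 0.82.0 -- replaced by a single imagestats entry point
from dataeval.metrics.stats import ImageStatsOutput, imagestats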
dataeval/metrics/stats/{base.py → _base.py}

@@ -1,39 +1,27 @@
 from __future__ import annotations
 
-from dataeval.utils.plot import histogram_plot
-
 __all__ = []
 
 import re
 import warnings
+from collections import ChainMap
+from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
-from itertools import repeat
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, NamedTuple, Optional, TypeVar, Union
+from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
 
 import numpy as np
 import tqdm
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 
-from dataeval.interop import to_numpy_iter
-from dataeval.output import Output
-from dataeval.utils.image import normalize_image_shape, rescale
+from dataeval.config import get_max_processes
+from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
+from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
+from dataeval.utils._array import to_numpy
+from dataeval.utils._image import normalize_image_shape, rescale
 
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
-SOURCE_INDEX = "source_index"
-BOX_COUNT = "box_count"
-
-# TODO: Replace with global config
-DEFAULT_PROCESSES: int | None = None
-
-OptionalRange = Optional[Union[int, Iterable[int]]]
-
-
-def matches(index: int | None, opt_range: OptionalRange) -> bool:
-    if index is None or opt_range is None:
-        return True
-    return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
 
 
 def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
@@ -49,86 +37,6 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
     return bounding_box
 
 
-class SourceIndex(NamedTuple):
-    """
-    Attributes
-    ----------
-    image: int
-        Index of the source image
-    box : int | None
-        Index of the box of the source image
-    channel : int | None
-        Index of the channel of the source image
-    """
-
-    image: int
-    box: int | None
-    channel: int | None
-
-
-@dataclass(frozen=True)
-class BaseStatsOutput(Output):
-    """
-    Attributes
-    ----------
-    source_index : List[SourceIndex]
-        Mapping from statistic to source image, box and channel index
-    box_count : NDArray[np.uint16]
-    """
-
-    source_index: list[SourceIndex]
-    box_count: NDArray[np.uint16]
-
-    def get_channel_mask(
-        self,
-        channel_index: OptionalRange,
-        channel_count: OptionalRange = None,
-    ) -> list[bool]:
-        """
-        Boolean mask for results filtered to specified channel index and optionally the count
-        of the channels per image.
-
-        Parameters
-        ----------
-        channel_index : int | Iterable[int] | None
-            Index or indices of channel(s) to filter for
-        channel_count : int | Iterable[int] | None
-            Optional count(s) of channels to filter for
-        """
-        mask: list[bool] = []
-        cur_mask: list[bool] = []
-        cur_image = 0
-        cur_max_channel = 0
-        for source_index in list(self.source_index) + [None]:
-            if source_index is None or source_index.image > cur_image:
-                mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
-                if source_index is not None:
-                    cur_image = source_index.image
-                    cur_max_channel = 0
-                cur_mask.clear()
-            if source_index is not None:
-                cur_mask.append(matches(source_index.channel, channel_index))
-                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
-        return mask
-
-    def __len__(self) -> int:
-        return len(self.source_index)
-
-
-def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
-    return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
-
-
-class HistogramPlotMixin:
-    _excluded_keys: Iterable[str] = []
-
-    def dict(self) -> dict[str, Any]: ...
-
-    def plot(self, log: bool) -> None:
-        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
-        histogram_plot(data_dict, log)
-
-
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
 
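`SourceIndex` and `BaseStatsOutput` move out of this module and are now imported from `dataeval.outputs._stats`. A worked illustration of `get_channel_mask` as implemented in the removed 0.76.1 code above, assuming its behavior is unchanged after the move; `stats` is a hypothetical per-channel stats output:

# Hypothetical per-channel run over two images: image 0 has 3 channels, image 1 has 1.
# stats.source_index == [SourceIndex(0, None, 0), SourceIndex(0, None, 1),
#                        SourceIndex(0, None, 2), SourceIndex(1, None, 0)]
#
# stats.get_channel_mask(channel_index=0)                   -> [True, False, False, True]
# stats.get_channel_mask(channel_index=0, channel_count=3)  -> [True, False, False, False]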
 
@@ -193,10 +101,9 @@ class StatsProcessor(Generic[TStatsOutput]):
         cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
     ) -> TStatsOutput:
         output = {}
-        for key in source:
-            if key not in cls.output_class.__annotations__:
-                continue
-            stat_type: str = cls.output_class.__annotations__[key]
+        attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
+        for key in (key for key in source if key in attrs):
+            stat_type: str = attrs[key]
             dtype_match = re.match(DTYPE_REGEX, stat_type)
             if dtype_match is not None:
                 output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
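The `ChainMap` rewrite lets `convert_output` see annotations inherited from base output classes rather than only those declared directly on `cls.output_class` (whose stat fields are annotated with `NDArray[...]` strings matched by `DTYPE_REGEX`). A self-contained sketch of the merged lookup, using plain types for brevity:

from collections import ChainMap
from dataclasses import dataclass


@dataclass(frozen=True)
class BaseOutput:
    source_index: list


@dataclass(frozen=True)
class MeanOutput(BaseOutput):
    mean: float


# A class's own __annotations__ only lists fields declared directly on it
print(MeanOutput.__annotations__)  # {'mean': <class 'float'>}

# Merging annotations across the MRO also recovers inherited fields;
# lookups prefer the subclass when names collide.
attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in MeanOutput.__mro__)))
print(sorted(attrs))  # ['mean', 'source_index']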
@@ -205,7 +112,8 @@
         return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
 
 
-class StatsProcessorOutput(NamedTuple):
+@dataclass
+class StatsProcessorOutput:
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
@@ -214,16 +122,20 @@
 
 def process_stats(
     i: int,
-    image_boxes: tuple[NDArray[Any], NDArray[Any] | None],
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    image, boxes = image_boxes
+    data = dataset[i]
+    image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
+    target = None if not isinstance(target, ObjectDetectionTarget) else target
+    boxes = to_numpy(target.boxes) if target is not None else None
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
-    nboxes = [None] if boxes is None else normalize_box_shape(boxes)
+    nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
@@ -231,7 +143,7 @@ def process_stats(
             warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
-            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
+            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
         else:
             source_indices.append(SourceIndex(i, i_b, None))
         box_counts.append(0 if boxes is None else len(boxes))
@@ -239,16 +151,18 @@
 
 
 def process_stats_unpack(
-    args: tuple[int, tuple[NDArray[Any], NDArray[Any] | None]],
+    i: int,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
+    return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
 
 
 def run_stats(
-    images: Iterable[ArrayLike],
-    bboxes: Iterable[ArrayLike] | None,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> list[TStatsOutput]:
@@ -261,26 +175,20 @@
 
     Parameters
     ----------
-    images : Iterable[ArrayLike]
-        An iterable of images (e.g., list of arrays), where each image is represented as an
-        array-like structure (e.g., :term:`NumPy` arrays).
-    bboxes : Iterable[ArrayLike]
-        An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
-        as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
-        iterable should match the length of the input images.
+    data : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
+        A dataset of images and targets to compute statistics on.
+    per_box : bool
+        A flag which determines if the statistics should be evaluated on a per-box basis or not.
+        If the dataset does not include bounding boxes, this flag is ignored.
     per_channel : bool
        A flag which determines if the states should be evaluated on a per-channel basis or not.
     stats_processor_cls : Iterable[type[StatsProcessor]]
        An iterable of stats processor classes that calculate stats and return output classes.
-    processes : int | None, default None
-        Number of processes to use, defaults to None which uses all available CPU cores.
 
     Returns
     -------
-    dict[str, NDArray]]
-        A dictionary containing the computed statistics for each image.
-        The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
-        with the results of the computations.
+    list[TStatsOutput]
+        A list of output classes containing the computed statistics
 
     Note
     ----
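`run_stats` now indexes a `Dataset` directly instead of zipping image and bounding-box iterables. A minimal sketch of a dataset shape it could consume, based only on the accesses visible in `process_stats` above (`dataset[i]`, tuple unpacking, `target.boxes`); the `Dataset` and `ObjectDetectionTarget` protocols in `dataeval.typing` are not shown in this diff, so both classes below are assumptions:

import numpy as np


class FakeTarget:
    """Stand-in for an object detection target exposing (x0, y0, x1, y1) boxes."""

    def __init__(self, boxes: np.ndarray) -> None:
        self.boxes = boxes


class FakeDetectionDataset:
    """Index-able dataset yielding (image, target, metadata) triples."""

    def __init__(self, images: list[np.ndarray], boxes: list[np.ndarray]) -> None:
        self._images = images
        self._boxes = boxes

    def __len__(self) -> int:
        return len(self._images)

    def __getitem__(self, i: int):
        # process_stats treats the first tuple element as the image and the
        # second as the detection target; the third slot (metadata) is unused here.
        return self._images[i], FakeTarget(self._boxes[i]), {}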
@@ -294,20 +202,24 @@
     results_list: list[dict[str, NDArray[np.float64]]] = []
     source_index: list[SourceIndex] = []
     box_count: list[int] = []
-    bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
 
     warning_list = []
-    total_for_status = getattr(images, "__len__")() if hasattr(images, "__len__") else None
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
 
     # TODO: Introduce global controls for CPU job parallelism and GPU configurations
-    with Pool(processes=DEFAULT_PROCESSES) as p:
+    with Pool(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
-                partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
-                enumerate(zip(to_numpy_iter(images), bbox_iter)),
+                partial(
+                    process_stats_unpack,
+                    dataset=dataset,
+                    per_box=per_box,
+                    per_channel=per_channel,
+                    stats_processor_cls=stats_processor_cls,
+                ),
+                range(len(dataset)),
             ),
-            total=total_for_status,
+            total=len(dataset),
         ):
             results_list.extend(r.results)
             source_index.extend(r.source_indices)
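Worker-count configuration moves from the removed module-level `DEFAULT_PROCESSES` constant to the new `dataeval.config` module. A small sketch of the pass-through behavior; only `get_max_processes` is confirmed by this hunk, and its default return value is an assumption:

from multiprocessing import Pool

from dataeval.config import get_max_processes


def _square(x: int) -> int:
    return x * x


if __name__ == "__main__":
    # Pool(processes=None) falls back to os.cpu_count(), so a None return from
    # get_max_processes() would reproduce the old DEFAULT_PROCESSES = None behavior.
    with Pool(processes=get_max_processes()) as pool:
        print(pool.map(_square, range(4)))  # [0, 1, 4, 9]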
@@ -330,3 +242,40 @@
 
     outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
     return outputs
+
+
+def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
+    if type(a) is not type(b):
+        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
+
+    sum_dict = deepcopy(a.dict())
+
+    for k in sum_dict:
+        if isinstance(sum_dict[k], list):
+            sum_dict[k].extend(b.dict()[k])
+        else:
+            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
+
+    return type(a)(**sum_dict)
+
+
+def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
+    output = None
+    dataset_steps = []
+    cur_len = 0
+    for s in stats:
+        output = s if output is None else add_stats(output, s)
+        cur_len += len(s)
+        dataset_steps.append(cur_len)
+    if output is None:
+        raise TypeError("Cannot combine empty sequence of stats.")
+    return output, dataset_steps
+
+
+def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
+    last_step = 0
+    for i, step in enumerate(dataset_steps):
+        if idx < step:
+            return i, idx - last_step
+        last_step = step
+    return -1, idx
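The new helpers allow compatible stats outputs to be concatenated and rows of the combined result mapped back to their source. A sketch using the functions added above, which live in the internal `_base` module; `dataset_a`/`dataset_b` are placeholders for any dataset accepted by `dimensionstats`:

from dataeval.metrics.stats import dimensionstats
from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx

stats_a = dimensionstats(dataset_a)  # dataset_a / dataset_b: placeholder datasets
stats_b = dimensionstats(dataset_b)

combined, dataset_steps = combine_stats([stats_a, stats_b])
# dataset_steps holds cumulative lengths, e.g. [len(stats_a), len(stats_a) + len(stats_b)]

# Map row 5 of the combined output back to (which input, local index within it)
source, local_idx = get_dataset_step_from_idx(5, dataset_steps)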
dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py}

@@ -8,9 +8,8 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray
 
-from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
-from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
-from dataeval.output import set_metadata
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
 
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -50,7 +49,7 @@ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
             "depth": lambda x: x.box["depth"],
             "distance": lambda x: x.box["distance"],
         }
-    )
+    ),
 }
 
 
@@ -87,11 +86,8 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
         stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
         out_type = type(box_stats)
         use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
-        ratio = (
-            RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
-            if use_override
-            else np.nan_to_num(stats.box[key] / stats.img[key])
-        )
+        with np.errstate(divide="ignore", invalid="ignore"):
+            ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
         out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
     return out_stats
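The replaced expression changes how degenerate ratios are handled: `np.nan_to_num` coerced `nan`/`inf` results to finite values, while `np.errstate` only suppresses the warnings and leaves `nan`/`inf` in place. A standalone comparison with plain NumPy:

import numpy as np

box = np.array([1.0, 0.0])
img = np.array([0.0, 0.0])

old = np.nan_to_num(box / img)  # inf -> ~1.8e308, nan -> 0.0 (and RuntimeWarnings are emitted)

with np.errstate(divide="ignore", invalid="ignore"):
    new = box / img             # [inf, nan], warnings suppressed

print(old, new)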
 
@@ -128,27 +124,28 @@
 
     Examples
     --------
-    Calculating the box ratio statistics using the dimension stats of the boxes and images
+    Calculate the box ratio statistics using the dimension stats of the images and boxes
+    on a dataset containing 15 targets.
 
     >>> from dataeval.metrics.stats import dimensionstats
-    >>> imagestats = dimensionstats(stats_images)
-    >>> boxstats = dimensionstats(stats_images, bboxes)
+    >>> imagestats = dimensionstats(dataset, per_box=False)
+    >>> boxstats = dimensionstats(dataset, per_box=True)
     >>> ratiostats = boxratiostats(boxstats, imagestats)
     >>> print(ratiostats.aspect_ratio)
-    [ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
-    0.66650391 3.83296703 1.95018315]
+    [ 0.864 0.588 16. 0.857 1.27 0.438 0.667 3.833 1.95 0.833
+    1. 0.6 0.522 15. 3.834]
     >>> print(ratiostats.size)
-    [0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
-    0.00915527 0.03369141 0.02115885]
+    [0.026 0.01 0.001 0.018 0.023 0.007 0.009 0.034 0.021 0.007 0.001 0.008
+    0.017 0.001 0.008]
     """
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
         raise TypeError("Must provide stats outputs of the same type.")
     if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
         raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
-    if all(count == 0 for count in boxstats.box_count):
+    if any(src_idx.box is None for src_idx in boxstats.source_index):
         raise ValueError("Input for boxstats must contain box information.")
-    if any(count != 0 for count in imgstats.box_count):
+    if any(src_idx.box is not None for src_idx in imgstats.source_index):
         raise ValueError("Input for imgstats must not contain box information.")
     boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
     imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
dataeval/metrics/stats/_dimensionstats.py (new file)

@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+__all__ = []
+
+from typing import Any, Callable
+
+import numpy as np
+
+from dataeval.metrics.stats._base import StatsProcessor, run_stats
+from dataeval.outputs import DimensionStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike, Dataset
+from dataeval.utils._image import get_bitdepth
+
+
+class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
+    output_class: type = DimensionStatsOutput
+    image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
+        "left": lambda x: x.box[0],
+        "top": lambda x: x.box[1],
+        "width": lambda x: x.box[2] - x.box[0],
+        "height": lambda x: x.box[3] - x.box[1],
+        "channels": lambda x: x.shape[-3],
+        "size": lambda x: (x.box[2] - x.box[0]) * (x.box[3] - x.box[1]),
+        "aspect_ratio": lambda x: (x.box[2] - x.box[0]) / (x.box[3] - x.box[1]),
+        "depth": lambda x: get_bitdepth(x.image).depth,
+        "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
+        "distance": lambda x: np.sqrt(
+            np.square(((x.box[0] + x.box[2]) / 2) - (x.shape[-1] / 2))
+            + np.square(((x.box[1] + x.box[3]) / 2) - (x.shape[-2] / 2))
+        ),
+    }
+
+
+@set_metadata
+def dimensionstats(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
+) -> DimensionStatsOutput:
+    """
+    Calculates dimension :term:`statistics<Statistics>` for each image.
+
+    This function computes various dimensional metrics (e.g., width, height, channels)
+    on the images or individual bounding boxes for each image.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        Dataset to perform calculations on.
+    per_box : bool, default False
+        If True, perform calculations on each bounding box.
+
+    Returns
+    -------
+    DimensionStatsOutput
+        A dictionary-like object containing the computed dimension statistics for each image or bounding
+        box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
+        are lists of results for each image or :term:NumPy` arrays when the results are multi-dimensional.
+
+    See Also
+    --------
+    pixelstats, visualstats, Outliers
+
+    Examples
+    --------
+    Calculate the dimension statistics of a dataset of 8 images, whose shape is (C, H, W).
+
+    >>> results = dimensionstats(dataset)
+    >>> print(results.aspect_ratio)
+    [1. 1. 1.333 1. 0.667 1. 1. 1. ]
+    >>> print(results.channels)
+    [3 3 1 3 1 3 3 3]
+    """
+    return run_stats(dataset, per_box, False, [DimensionStatsProcessor])[0]
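As a quick check on the `image_function_map` formulas above, the arithmetic for one hypothetical box `(x0, y0, x1, y1) = (10, 20, 50, 100)` on a `(C, H, W) = (3, 128, 256)` image:

box = (10, 20, 50, 100)     # hypothetical (x0, y0, x1, y1)
shape = (3, 128, 256)       # hypothetical (C, H, W)

width = box[2] - box[0]                                   # 40
height = box[3] - box[1]                                  # 80
size = width * height                                     # 3200
aspect_ratio = width / height                             # 0.5
center = ((box[0] + box[2]) / 2, (box[1] + box[3]) / 2)   # (30.0, 60.0)
# distance of the box center from the image center (W/2, H/2) = (128, 64)
distance = ((center[0] - shape[-1] / 2) ** 2 + (center[1] - shape[-2] / 2) ** 2) ** 0.5  # ~98.08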