dataeval 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/drift/base.py +5 -6
  3. dataeval/_internal/detectors/drift/mmd.py +3 -3
  4. dataeval/_internal/detectors/duplicates.py +62 -45
  5. dataeval/_internal/detectors/merged_stats.py +23 -54
  6. dataeval/_internal/detectors/ood/ae.py +3 -3
  7. dataeval/_internal/detectors/outliers.py +133 -61
  8. dataeval/_internal/interop.py +11 -7
  9. dataeval/_internal/metrics/balance.py +9 -9
  10. dataeval/_internal/metrics/ber.py +3 -3
  11. dataeval/_internal/metrics/divergence.py +3 -3
  12. dataeval/_internal/metrics/diversity.py +6 -6
  13. dataeval/_internal/metrics/parity.py +24 -16
  14. dataeval/_internal/metrics/stats/base.py +231 -0
  15. dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
  16. dataeval/_internal/metrics/stats/datasetstats.py +97 -0
  17. dataeval/_internal/metrics/stats/dimensionstats.py +111 -0
  18. dataeval/_internal/metrics/stats/hashstats.py +73 -0
  19. dataeval/_internal/metrics/stats/labelstats.py +125 -0
  20. dataeval/_internal/metrics/stats/pixelstats.py +117 -0
  21. dataeval/_internal/metrics/stats/visualstats.py +122 -0
  22. dataeval/_internal/metrics/uap.py +2 -2
  23. dataeval/_internal/metrics/utils.py +28 -13
  24. dataeval/_internal/output.py +3 -18
  25. dataeval/_internal/workflows/sufficiency.py +123 -133
  26. dataeval/metrics/stats/__init__.py +14 -3
  27. dataeval/workflows/__init__.py +2 -2
  28. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/METADATA +3 -3
  29. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/RECORD +31 -26
  30. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/WHEEL +1 -1
  31. dataeval/_internal/flags.py +0 -77
  32. dataeval/_internal/metrics/stats.py +0 -397
  33. dataeval/flags/__init__.py +0 -3
  34. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/LICENSE.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from importlib import import_module
4
- from typing import Iterable
4
+ from typing import Any, Iterable, Iterator
5
5
 
6
6
  import numpy as np
7
7
  from numpy.typing import ArrayLike, NDArray
@@ -22,24 +22,28 @@ def try_import(module_name):
22
22
  return module
23
23
 
24
24
 
25
def as_numpy(array: ArrayLike | None) -> NDArray[Any]:
    """
    Convert an array-like object to a NumPy array without forcing a copy.

    Thin wrapper around ``to_numpy(array, copy=False)``.
    """
    return to_numpy(array, copy=False)


def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
    """
    Convert an array-like object (NumPy, TensorFlow, PyTorch or generic) to a NumPy array.

    Parameters
    ----------
    array : ArrayLike | None
        Object to convert
    copy : bool, default True
        If True the returned array is always a fresh copy. If False the existing
        data is reused where the conversion allows it.

    Returns
    -------
    NDArray[Any]
        The converted array
    """
    if array is None:
        # NOTE(review): np.ndarray([]) builds a 0-dimensional array with uninitialized
        # contents rather than an empty 1-D array; kept as-is because callers may rely
        # on the resulting shape. Confirm whether np.array([]) was intended.
        return np.ndarray([])

    if isinstance(array, np.ndarray):
        return array.copy() if copy else array

    tf = try_import("tensorflow")
    if tf and tf.is_tensor(array):
        converted = array.numpy()  # type: ignore
        return converted.copy() if copy else converted

    torch = try_import("torch")
    if torch and isinstance(array, torch.Tensor):
        converted = array.detach().cpu().numpy()  # type: ignore
        return converted.copy() if copy else converted

    # np.array(..., copy=False) raises ValueError on NumPy >= 2.0 whenever a copy cannot
    # be avoided (e.g. for a Python list). np.asarray copies only when it has to, which
    # matches the NumPy 1.x meaning of copy=False.
    return np.array(array) if copy else np.asarray(array)
41
45
 
42
46
 
43
def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
    """Lazily yield each element of ``iterable`` converted with ``to_numpy``."""
    yield from map(to_numpy, iterable)
@@ -2,10 +2,10 @@ from __future__ import annotations
2
2
 
3
3
  import warnings
4
4
  from dataclasses import dataclass
5
- from typing import Sequence
5
+ from typing import Mapping
6
6
 
7
7
  import numpy as np
8
- from numpy.typing import NDArray
8
+ from numpy.typing import ArrayLike, NDArray
9
9
  from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
10
10
 
11
11
  from dataeval._internal.metrics.utils import entropy, preprocess_metadata
@@ -51,16 +51,16 @@ def validate_num_neighbors(num_neighbors: int) -> int:
51
51
 
52
52
 
53
53
  @set_metadata("dataeval.metrics")
54
- def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: int = 5) -> BalanceOutput:
54
+ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neighbors: int = 5) -> BalanceOutput:
55
55
  """
56
56
  Mutual information (MI) between factors (class label, metadata, label/image properties)
57
57
 
58
58
  Parameters
59
59
  ----------
60
- class_labels: Sequence[int]
60
+ class_labels: ArrayLike
61
61
  List of class labels for each image
62
- metadata: List[Dict]
63
- List of metadata factors for each image
62
+ metadata: Mapping[str, ArrayLike]
63
+ Dict of lists of metadata factors for each image
64
64
  num_neighbors: int, default 5
65
65
  Number of nearest neighbors to use for computing MI between discrete
66
66
  and continuous variables.
@@ -90,9 +90,9 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
90
90
  Return intra/interfactor balance (mutual information)
91
91
 
92
92
  >>> bal.factors
93
- array([[0.99999843, 0.03510422, 0.09725766],
94
- [0.03510422, 0.08433558, 0.15621459],
95
- [0.09725766, 0.15621459, 0.99999856]])
93
+ array([[0.99999843, 0.04133555, 0.09725766],
94
+ [0.04133555, 0.08433558, 0.1301489 ],
95
+ [0.09725766, 0.1301489 , 0.99999856]])
96
96
 
97
97
  Return classwise balance (mutual information) of factors with individual class_labels
98
98
 
@@ -17,7 +17,7 @@ from numpy.typing import ArrayLike, NDArray
17
17
  from scipy.sparse import coo_matrix
18
18
  from scipy.stats import mode
19
19
 
20
- from dataeval._internal.interop import to_numpy
20
+ from dataeval._internal.interop import as_numpy
21
21
  from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
22
22
  from dataeval._internal.output import OutputMetadata, set_metadata
23
23
 
@@ -145,7 +145,7 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
145
145
  BEROutput(ber=0.04, ber_lower=0.020416847668728033)
146
146
  """
147
147
  ber_fn = get_method(BER_FN_MAP, method)
148
- X = to_numpy(images)
149
- y = to_numpy(labels)
148
+ X = as_numpy(images)
149
+ y = as_numpy(labels)
150
150
  upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
151
151
  return BEROutput(upper, lower)
@@ -9,7 +9,7 @@ from typing import Literal
9
9
  import numpy as np
10
10
  from numpy.typing import ArrayLike, NDArray
11
11
 
12
- from dataeval._internal.interop import to_numpy
12
+ from dataeval._internal.interop import as_numpy
13
13
  from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
14
14
  from dataeval._internal.output import OutputMetadata, set_metadata
15
15
 
@@ -123,8 +123,8 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
123
123
  DivergenceOutput(divergence=0.28, errors=36.0)
124
124
  """
125
125
  div_fn = get_method(DIVERGENCE_FN_MAP, method)
126
- a = to_numpy(data_a)
127
- b = to_numpy(data_b)
126
+ a = as_numpy(data_a)
127
+ b = as_numpy(data_b)
128
128
  N = a.shape[0]
129
129
  M = b.shape[0]
130
130
 
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass
4
- from typing import Literal, Sequence
4
+ from typing import Literal, Mapping
5
5
 
6
6
  import numpy as np
7
- from numpy.typing import NDArray
7
+ from numpy.typing import ArrayLike, NDArray
8
8
 
9
9
  from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
10
10
  from dataeval._internal.output import OutputMetadata, set_metadata
@@ -142,7 +142,7 @@ DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
142
142
 
143
143
  @set_metadata("dataeval.metrics")
144
144
  def diversity(
145
- class_labels: Sequence[int], metadata: list[dict], method: Literal["shannon", "simpson"] = "simpson"
145
+ class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
146
146
  ) -> DiversityOutput:
147
147
  """
148
148
  Compute diversity and classwise diversity for discrete/categorical variables and, through standard
@@ -155,10 +155,10 @@ def diversity(
155
155
 
156
156
  Parameters
157
157
  ----------
158
- class_labels: Sequence[int]
158
+ class_labels: ArrayLike
159
159
  List of class labels for each image
160
- metadata: List[Dict]
161
- List of metadata factors for each image
160
+ metadata: Mapping[str, ArrayLike]
161
+ Dict of list of metadata factors for each image
162
162
  method: Literal["shannon", "simpson"], default "simpson"
163
163
  Indicates which diversity index should be computed
164
164
 
@@ -62,8 +62,8 @@ def digitize_factor_bins(continuous_values: NDArray, bins: int, factor_name: str
62
62
 
63
63
 
64
64
  def format_discretize_factors(
65
- data_factors: dict[str, NDArray], continuous_factor_bincounts: dict[str, int]
66
- ) -> tuple[dict[str, NDArray], NDArray]:
65
+ data_factors: Mapping[str, NDArray], continuous_factor_bincounts: Mapping[str, int]
66
+ ) -> dict[str, NDArray]:
67
67
  """
68
68
  Sets up the internal list of metadata factors.
69
69
 
@@ -80,10 +80,9 @@ def format_discretize_factors(
80
80
 
81
81
  Returns
82
82
  -------
83
- Tuple[Dict[str, NDArray], NDArray]
83
+ Dict[str, NDArray]
84
84
  - Intrinsic per-image metadata information with the formatting that input data_factors uses.
85
85
  Each key is a metadata factor, whose value is the discrete per-image factor values.
86
- - Per-image labels, whose ith element is the label for the ith element of the dataset.
87
86
  """
88
87
 
89
88
  invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
@@ -103,8 +102,6 @@ def format_discretize_factors(
103
102
  if lengths[1:] != lengths[:-1]:
104
103
  raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")
105
104
 
106
- labels = data_factors["class"]
107
-
108
105
  metadata_factors = {
109
106
  name: val
110
107
  if name not in continuous_factor_bincounts
@@ -113,7 +110,7 @@ def format_discretize_factors(
113
110
  if name != "class"
114
111
  }
115
112
 
116
- return metadata_factors, labels
113
+ return metadata_factors
117
114
 
118
115
 
119
116
  def normalize_expected_dist(expected_dist: NDArray, observed_dist: NDArray) -> NDArray:
@@ -187,7 +184,8 @@ def validate_dist(label_dist: NDArray, label_name: str):
187
184
  warnings.warn(
188
185
  f"Labels {np.where(label_dist<5)[0]} in {label_name}"
189
186
  " dataset have frequencies less than 5. This may lead"
190
- " to invalid chi-squared evaluation."
187
+ " to invalid chi-squared evaluation.",
188
+ UserWarning,
191
189
  )
192
190
 
193
191
 
@@ -280,8 +278,9 @@ def label_parity(
280
278
 
281
279
  @set_metadata("dataeval.metrics")
282
280
  def parity(
281
+ class_labels: ArrayLike,
283
282
  data_factors: Mapping[str, ArrayLike],
284
- continuous_factor_bincounts: dict[str, int] | None = None,
283
+ continuous_factor_bincounts: Mapping[str, int] | None = None,
285
284
  ) -> ParityOutput[NDArray[np.float64]]:
286
285
  """
287
286
  Calculate chi-square statistics to assess the relationship between multiple factors and class labels.
@@ -292,10 +291,12 @@ def parity(
292
291
 
293
292
  Parameters
294
293
  ----------
294
+ class_labels: ArrayLike
295
+ List of class labels for each image
295
296
  data_factors: Mapping[str, ArrayLike]
296
- The dataset factors, which are per-image attributes including class label and metadata.
297
+ The dataset factors, which are per-image metadata attributes.
297
298
  Each key of dataset_factors is a factor, whose value is the per-image factor values.
298
- continuous_factor_bincounts : Dict[str, int] | None, default None
299
+ continuous_factor_bincounts : Mapping[str, int] | None, default None
299
300
  A dictionary specifying the number of bins for discretizing the continuous factors.
300
301
  The keys should correspond to the names of continuous factors in `data_factors`,
301
302
  and the values should be the number of bins to use for discretization.
@@ -329,21 +330,27 @@ def parity(
329
330
  --------
330
331
  Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
331
332
 
333
+ >>> labels = np_random_gen.choice([0, 1, 2], (100))
332
334
  >>> data_factors = {
333
335
  ... "age": np_random_gen.choice([25, 30, 35, 45], (100)),
334
336
  ... "income": np_random_gen.choice([50000, 65000, 80000], (100)),
335
337
  ... "gender": np_random_gen.choice(["M", "F"], (100)),
336
- ... "class": np_random_gen.choice([0, 1, 2], (100)),
337
338
  ... }
338
339
  >>> continuous_factor_bincounts = {"age": 4, "income": 3}
339
- >>> parity(data_factors, continuous_factor_bincounts)
340
- ParityOutput(score=array([2.82329785, 1.60625584, 1.38377236]), p_value=array([0.83067563, 0.80766733, 0.5006309 ]))
340
+ >>> parity(labels, data_factors, continuous_factor_bincounts)
341
+ ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]))
341
342
  """
343
+ if len(np.shape(class_labels)) > 1:
344
+ raise ValueError(
345
+ f"Got class labels with {len(np.shape(class_labels))}-dimensional",
346
+ f" shape {np.shape(class_labels)}, but expected a 1-dimensional array.",
347
+ )
342
348
 
343
349
  data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
344
350
  continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
345
351
 
346
- factors, labels = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
352
+ labels = to_numpy(class_labels)
353
+ factors = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
347
354
 
348
355
  chi_scores = np.zeros(len(factors))
349
356
  p_values = np.zeros(len(factors))
@@ -396,7 +403,8 @@ def parity(
396
403
  message = "\n".join(factor_msg)
397
404
 
398
405
  warnings.warn(
399
- f"The following factors did not meet the recommended 5 occurrences for each value-label combination. \nRecommend rerunning parity after adjusting the following factor-value-label combinations: \n{message}", # noqa: E501
406
+ f"The following factors did not meet the recommended 5 occurrences for each value-label combination. \n\
407
+ Recommend rerunning parity after adjusting the following factor-value-label combinations: \n{message}",
400
408
  UserWarning,
401
409
  )
402
410
 
@@ -0,0 +1,231 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import warnings
5
+ from dataclasses import dataclass
6
+ from typing import Any, Callable, Iterable, NamedTuple, Optional, Union
7
+
8
+ import numpy as np
9
+ from numpy.typing import ArrayLike, NDArray
10
+
11
+ from dataeval._internal.interop import to_numpy_iter
12
+ from dataeval._internal.metrics.utils import normalize_box_shape, normalize_image_shape, rescale
13
+ from dataeval._internal.output import OutputMetadata
14
+
15
# Captures the NumPy dtype name from an annotation string such as "NDArray[np.float64]";
# run_stats uses it to pick the dtype of each output array.
DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
# Keys under which run_stats stores the per-result source indices and the per-image box counts.
SOURCE_INDEX = "source_index"
BOX_COUNT = "box_count"

# A single int, an iterable of ints, or None.
OptionalRange = Optional[Union[int, Iterable[int]]]
20
+
21
+
22
def matches(index: int | None, opt_range: OptionalRange) -> bool:
    """
    Check whether ``index`` satisfies the optional filter ``opt_range``.

    A missing index or a missing filter always matches. An iterable filter matches
    by membership and any other filter matches by equality.
    """
    unconstrained = index is None or opt_range is None
    if unconstrained:
        return True
    if isinstance(opt_range, Iterable):
        return index in opt_range
    return index == opt_range
26
+
27
+
28
class SourceIndex(NamedTuple):
    """
    Location of a single statistic within the source data.

    Attributes
    ----------
    image : int
        Index of the source image
    box : int | None
        Index of the box within the source image, or None
    channel : int | None
        Index of the channel within the source image, or None
    """

    image: int
    box: int | None
    channel: int | None
43
+
44
+
45
@dataclass(frozen=True)
class BaseStatsOutput(OutputMetadata):
    """
    Fields and helpers shared by the stats output classes.

    Attributes
    ----------
    source_index : List[SourceIndex]
        Mapping from statistic to source image, box and channel index
    box_count : NDArray[np.uint16]
        Number of bounding boxes per image as populated by ``run_stats``
        (0 for an image that was processed without boxes)
    """

    source_index: list[SourceIndex]
    box_count: NDArray[np.uint16]

    def get_channel_mask(
        self,
        channel_index: OptionalRange,
        channel_count: OptionalRange = None,
    ) -> list[bool]:
        """
        Boolean mask for results filtered to specified channel index and optionally the count
        of the channels per image.

        Parameters
        ----------
        channel_index : int | Iterable[int] | None
            Index or indices of channel(s) to filter for
        channel_count : int | Iterable[int] | None
            Optional count(s) of channels to filter for

        Returns
        -------
        list[bool]
            One entry per element of ``source_index``
        """

        def reusable(opt_range: OptionalRange) -> OptionalRange:
            # Each filter is tested once per entry/image. A one-shot iterator (one that
            # is its own iterator, e.g. a generator) would be exhausted by the first
            # membership test, so it is materialized once. Containers are left untouched.
            if isinstance(opt_range, Iterable) and iter(opt_range) is opt_range:
                return tuple(opt_range)
            return opt_range

        channel_index = reusable(channel_index)
        channel_count = reusable(channel_count)

        mask: list[bool] = []
        cur_mask: list[bool] = []
        cur_image = 0
        cur_max_channel = 0
        # The trailing None is a sentinel that flushes the entries of the last image.
        for source_index in [*self.source_index, None]:
            if source_index is None or source_index.image > cur_image:
                # Highest channel index seen + 1 is the channel count of the finished image.
                keep = matches(cur_max_channel + 1, channel_count)
                mask.extend(cur_mask if keep else [False] * len(cur_mask))
                if source_index is None:
                    break
                cur_image = source_index.image
                cur_max_channel = 0
                cur_mask.clear()
            cur_mask.append(matches(source_index.channel, channel_index))
            cur_max_channel = max(cur_max_channel, source_index.channel or 0)
        return mask

    def __len__(self) -> int:
        """Return the number of entries in ``source_index``."""
        return len(self.source_index)
92
+
93
+
94
class StatsProcessor:
    """
    Evaluates named statistic functions for one image, optionally cropped to a box.

    Subclasses supply ``image_function_map`` / ``channel_function_map`` (name -> function
    taking the processor) and list in ``cache_keys`` the names whose results should be
    computed once per processor and reused.
    """

    cache_keys: list[str] = []
    image_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
    channel_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}

    def __init__(self, image: NDArray, box: NDArray | None, per_channel: bool):
        self.raw = image
        self.width = image.shape[-1]
        self.height = image.shape[-2]
        # Without a box the full image extent (0, 0, width, height) is used.
        self.box = np.array([0, 0, self.width, self.height]) if box is None else box
        self.per_channel = per_channel
        # Lazily computed values backing the image/shape/scaled properties.
        self._image = None
        self._shape = None
        self._scaled = None
        self.cache = {}
        self.fn_map = self.channel_function_map if per_channel else self.image_function_map
        if box is None:
            self.is_valid_slice = True
        else:
            # The box must lie within the last two dimensions of the image.
            self.is_valid_slice = bool(
                box[0] >= 0 and box[1] >= 0 and box[2] <= self.width and box[3] <= self.height
            )

    def get(self, fn_key: str) -> NDArray:
        """Return the statistic named ``fn_key``, reusing a cached value for names in ``cache_keys``."""
        if fn_key not in self.cache_keys:
            return self.fn_map[fn_key](self)
        if fn_key not in self.cache:
            self.cache[fn_key] = self.fn_map[fn_key](self)
        return self.cache[fn_key]

    @property
    def image(self) -> NDArray:
        """Shape-normalized image cropped to the box, or zeros of the box size if the box is invalid."""
        if self._image is not None:
            return self._image
        if self.is_valid_slice:
            normalized = normalize_image_shape(self.raw)
            self._image = normalized[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
        else:
            box_height = self.box[3] - self.box[1]
            box_width = self.box[2] - self.box[0]
            self._image = np.zeros((self.raw.shape[0], box_height, box_width))
        return self._image

    @property
    def shape(self) -> tuple:
        """Shape of ``image``, computed on first access."""
        if self._shape is None:
            self._shape = self.image.shape
        return self._shape

    @property
    def scaled(self) -> NDArray:
        """Result of ``rescale`` on ``image``; reshaped to one row per channel when ``per_channel`` is set."""
        if self._scaled is None:
            rescaled = rescale(self.image)
            if self.per_channel:
                rescaled = rescaled.reshape(self.image.shape[0], -1)
            self._scaled = rescaled
        return self._scaled
145
+
146
+
147
def run_stats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None,
    per_channel: bool,
    stats_processor_cls: type,
    output_cls: type,
) -> dict:
    """
    Compute specified statistics on a set of images.

    This function applies a set of statistical operations to each image in the input iterable,
    based on the specified output class. The statistics to compute are the annotated fields of
    ``output_cls``; each one is evaluated through a ``stats_processor_cls`` instance.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        An iterable of images (e.g., list of arrays), where each image is represented as an
        array-like structure (e.g., NumPy arrays).
    bboxes : Iterable[ArrayLike] | None
        An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
        as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
        iterable should match the length of the input images.
    per_channel : bool
        A flag which determines if the stats should be evaluated on a per-channel basis or not.
    stats_processor_cls : type
        The StatsProcessor subclass instantiated for every image (and box) to compute the stats.
    output_cls : type
        The output class for which stats values will be calculated.

    Returns
    -------
    dict[str, NDArray]
        A dictionary containing the computed statistics for each image.
        The dictionary keys correspond to the names of the statistics, and the values are NumPy arrays
        with the results of the computations. The source index list and the per-image box counts
        are stored under the ``SOURCE_INDEX`` and ``BOX_COUNT`` keys.

    Warns
    -----
    UserWarning
        If a bounding box is out of bounds of its image.

    Notes
    -----
    - The function performs image normalization (rescaling the image values)
      before applying some of the statistics.
    - Pixel-level statistics (e.g., brightness, entropy) are computed after
      rescaling and, optionally, flattening the images.
    - For statistics like histograms and entropy, intermediate results may
      be reused to avoid redundant computation.
    """
    from itertools import repeat

    results_list: list[dict[str, NDArray]] = []
    output_list = list(output_cls.__annotations__)
    source_index = []
    box_count = []
    # repeat(None) pairs every image with "no boxes" without iterating `images` a second
    # time; a generator over `images` would steal items from a one-shot iterator.
    bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)

    for i, (boxes, image) in enumerate(zip(bbox_iter, to_numpy_iter(images))):
        nboxes = [None] if boxes is None else normalize_box_shape(boxes)
        for i_b, box in enumerate(nboxes):
            # Whole-image results carry no box index.
            i_b = None if box is None else i_b
            processor: StatsProcessor = stats_processor_cls(image, box, per_channel)
            if not processor.is_valid_slice:
                warnings.warn(f"Bounding box {i_b}: {box} is out of bounds of image {i}: {image.shape}.")
            results_list.append({stat: processor.get(stat) for stat in output_list})
            if per_channel:
                source_index.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
            else:
                source_index.append(SourceIndex(i, i_b, None))
        box_count.append(0 if boxes is None else len(boxes))

    output = {}
    if per_channel:
        # Per-channel results hold one entry per channel, so they are flattened into the output.
        for results in results_list:
            for stat, result in results.items():
                output.setdefault(stat, []).extend(result.tolist())
    else:
        for results in results_list:
            for stat, result in results.items():
                output.setdefault(stat, []).append(result.tolist() if isinstance(result, np.ndarray) else result)

    for stat in output:
        stat_type: str = output_cls.__annotations__[stat]

        # Fields annotated as "NDArray[np.<dtype>]" are converted to arrays of that dtype.
        dtype_match = re.match(DTYPE_REGEX, stat_type)
        if dtype_match is not None:
            output[stat] = np.asarray(output[stat], dtype=np.dtype(dtype_match.group(1)))

    output[SOURCE_INDEX] = source_index
    output[BOX_COUNT] = np.asarray(box_count, dtype=np.uint16)

    return output
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from typing import Callable, Generic, TypeVar, cast
5
+
6
+ import numpy as np
7
+ from numpy.typing import NDArray
8
+
9
+ from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
10
+ from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
11
+ from dataeval._internal.output import set_metadata
12
+
13
# Type variable for any stats output class derived from BaseStatsOutput.
TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
# (start, stop) pair used to slice a stats array.
ArraySlice = tuple[int, int]
15
+
16
+
17
class BoxImageStatsOutputSlice(Generic[TStatOutput]):
    """
    Pairs a slice of box stats with the matching slice of image stats.

    ``box`` and ``img`` are indexable by stat name and return the sliced values as float64.
    """

    class StatSlicer:
        """Indexable view over one ``[start, stop)`` slice of a stats output."""

        def __init__(self, stats: TStatOutput, slice: ArraySlice, channels: int = 0) -> None:  # noqa: A002
            self._stats = stats
            self._slice = slice
            self._channels = channels

        def __getitem__(self, key: str) -> NDArray[np.float64]:
            values = cast(np.ndarray, getattr(self._stats, key)).astype(np.float64)
            # Shape of a single entry; preserved as the trailing dimensions of the result.
            entry_shape = values[0].shape
            window = values[self._slice[0] : self._slice[1]]
            if self._channels:
                return window.reshape(-1, self._channels, *entry_shape)
            return window.reshape(-1, *entry_shape)

    box: StatSlicer
    img: StatSlicer
    channels: int

    def __init__(
        self, box_stats: TStatOutput, box_slice: ArraySlice, img_stats: TStatOutput, img_slice: ArraySlice
    ) -> None:
        img_start, img_stop = img_slice[0], img_slice[1]
        # The number of image entries in the slice is used as the channel count for the box view.
        self.channels = img_stop - img_start
        self.box = self.StatSlicer(box_stats, box_slice, self.channels)
        self.img = self.StatSlicer(img_stats, img_slice)
40
+
41
+
42
# Per output type, stat-specific functions that calculate_ratios uses instead of the
# default box / image division.
RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[[BoxImageStatsOutputSlice], NDArray]]] = {
    DimensionStatsOutput: {
        # Offsets are expressed relative to the image width / height.
        "left": lambda x: x.box["left"] / x.img["width"],
        "top": lambda x: x.box["top"] / x.img["height"],
        # These stats pass the box values through unchanged.
        "channels": lambda x: x.box["channels"],
        "depth": lambda x: x.box["depth"],
        "distance": lambda x: x.box["distance"],
    }
}
51
+
52
+
53
def get_index_map(stats: BaseStatsOutput) -> list[int]:
    """
    Return the positions in ``stats.source_index`` at which a new, higher image index starts.
    """
    first_positions: list[int] = []
    highest_image = -1
    for position, entry in enumerate(stats.source_index):
        if not (entry.image > highest_image):
            continue
        first_positions.append(position)
        highest_image = entry.image
    return first_positions
61
+
62
+
63
def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsOutput) -> NDArray:
    """
    Calculate the ratio of one box statistic over the matching image statistic.

    Parameters
    ----------
    key : str
        Name of the attribute to process on both stats outputs
    box_stats : BaseStatsOutput
        Stats computed on the boxes
    img_stats : BaseStatsOutput
        Stats computed on the full images

    Returns
    -------
    NDArray
        The ratios as float64, laid out like the box statistic. For the source index
        and box count keys a copy of the box value is returned instead.

    Raises
    ------
    KeyError
        If ``key`` is not an attribute of both stats outputs
    """
    if not hasattr(box_stats, key) or not hasattr(img_stats, key):
        raise KeyError("Invalid key for provided stats output object.")

    stats = getattr(box_stats, key)

    # Copy over stats index maps and box counts.
    # Equality is required here: `key in (SOURCE_INDEX)` is a substring test on a string.
    if key == SOURCE_INDEX:
        return copy.deepcopy(stats)
    if key == BOX_COUNT:
        return np.copy(stats)

    # Calculate ratios for each stat
    out_stats: np.ndarray = np.copy(stats).astype(np.float64)

    # The override (if any) depends only on the output type and key, so look it up once.
    override = RATIOSTATS_OVERRIDE_MAP.get(type(box_stats), {}).get(key)

    box_map = get_index_map(box_stats)
    img_map = get_index_map(img_stats)
    for i, (box_i, img_i) in enumerate(zip(box_map, img_map)):
        # Each image's entries run up to the start of the next image (or the end of the stats).
        box_j = len(box_stats) if i == len(box_map) - 1 else box_map[i + 1]
        img_j = len(img_stats) if i == len(img_map) - 1 else img_map[i + 1]
        pair = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
        if override is not None:
            ratio = override(pair)
        else:
            ratio = np.nan_to_num(pair.box[key] / pair.img[key])
        out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
    return out_stats
93
+
94
+
95
@set_metadata("dataeval.metrics")
def boxratiostats(
    boxstats: TStatOutput,
    imgstats: TStatOutput,
) -> TStatOutput:
    """
    Calculates ratio statistics of box outputs over image outputs

    Parameters
    ----------
    boxstats : DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
        Box statistics outputs to perform calculations on
    imgstats : DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
        Image statistics outputs to perform calculations on

    Returns
    -------
    DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
        A dictionary-like object containing the computed ratio of the box statistics divided by the
        image statistics.

    Raises
    ------
    TypeError
        If the two outputs differ in type or channel information, if boxstats holds no
        box information, or if imgstats holds box information
    ValueError
        If the last image index of the two outputs differs

    See Also
    --------
    dimensionstats, pixelstats, visualstats

    Note
    ----
    DimensionStatsOutput values for channels, depth and distances are the original values
    provided by the box outputs

    Examples
    --------
    Calculating the box ratio statistics using the dimension stats of the boxes and images

    >>> imagestats = dimensionstats(images)
    >>> boxstats = dimensionstats(images, bboxes)
    >>> ratiostats = boxratiostats(boxstats, imagestats)
    >>> print(ratiostats.aspect_ratio)
    [ 1.15169271  0.78450521 21.33333333  1.5234375   2.25651042  0.77799479
      0.88867188  3.40625     1.73307292  1.11132812  0.75018315  0.45018315
      0.69596354 20.          5.11197917  2.33333333  0.75        0.70019531]
    >>> print(ratiostats.size)
    [0.03401693 0.01383464 0.00130208 0.01822917 0.02327474 0.00683594
     0.01220703 0.0168457  0.01057943 0.00976562 0.00130208 0.01098633
     0.02246094 0.0012207  0.01123047 0.00911458 0.02636719 0.06835938]
    """
    output_cls = type(boxstats)
    if output_cls is not type(imgstats):
        raise TypeError("Must provide stats outputs of the same type.")

    last_box_image = boxstats.source_index[-1].image
    last_img_image = imgstats.source_index[-1].image
    if last_box_image != last_img_image:
        raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")

    if not any(count != 0 for count in boxstats.box_count):
        raise TypeError("Input for boxstats must contain box information.")
    if any(count != 0 for count in imgstats.box_count):
        raise TypeError("Input for imgstats must not contain box information.")

    # True when at least one entry carries no channel index.
    box_missing_channel = any(si.channel is None for si in boxstats.source_index)
    img_missing_channel = any(si.channel is None for si in imgstats.source_index)
    if box_missing_channel != img_missing_channel:
        raise TypeError("Input for boxstats and imgstats must have matching channel information.")

    ratios = {key: calculate_ratios(key, boxstats, imgstats) for key in boxstats.dict()}
    return output_cls(**ratios)