dataeval 0.74.2__py3-none-any.whl → 0.75.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. dataeval/__init__.py +27 -23
  2. dataeval/detectors/__init__.py +2 -2
  3. dataeval/detectors/drift/__init__.py +14 -12
  4. dataeval/detectors/drift/base.py +1 -1
  5. dataeval/detectors/drift/cvm.py +1 -1
  6. dataeval/detectors/drift/ks.py +1 -1
  7. dataeval/detectors/drift/mmd.py +6 -5
  8. dataeval/detectors/drift/torch.py +12 -12
  9. dataeval/detectors/drift/uncertainty.py +3 -2
  10. dataeval/detectors/linters/__init__.py +4 -4
  11. dataeval/detectors/linters/clusterer.py +2 -7
  12. dataeval/detectors/linters/duplicates.py +6 -10
  13. dataeval/detectors/linters/outliers.py +4 -2
  14. dataeval/detectors/ood/__init__.py +3 -10
  15. dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
  16. dataeval/detectors/ood/base.py +64 -161
  17. dataeval/detectors/ood/metadata_ks_compare.py +34 -42
  18. dataeval/detectors/ood/metadata_least_likely.py +3 -3
  19. dataeval/detectors/ood/metadata_ood_mi.py +6 -5
  20. dataeval/detectors/ood/mixin.py +146 -0
  21. dataeval/detectors/ood/output.py +63 -0
  22. dataeval/interop.py +6 -5
  23. dataeval/{logging.py → log.py} +2 -0
  24. dataeval/metrics/__init__.py +2 -2
  25. dataeval/metrics/bias/__init__.py +9 -12
  26. dataeval/metrics/bias/balance.py +10 -8
  27. dataeval/metrics/bias/coverage.py +52 -4
  28. dataeval/metrics/bias/diversity.py +42 -14
  29. dataeval/metrics/bias/parity.py +15 -12
  30. dataeval/metrics/estimators/__init__.py +2 -2
  31. dataeval/metrics/estimators/ber.py +3 -1
  32. dataeval/metrics/estimators/divergence.py +1 -1
  33. dataeval/metrics/estimators/uap.py +1 -1
  34. dataeval/metrics/stats/__init__.py +18 -18
  35. dataeval/metrics/stats/base.py +4 -4
  36. dataeval/metrics/stats/boxratiostats.py +8 -9
  37. dataeval/metrics/stats/datasetstats.py +10 -14
  38. dataeval/metrics/stats/dimensionstats.py +4 -4
  39. dataeval/metrics/stats/hashstats.py +12 -8
  40. dataeval/metrics/stats/labelstats.py +5 -5
  41. dataeval/metrics/stats/pixelstats.py +4 -9
  42. dataeval/metrics/stats/visualstats.py +4 -9
  43. dataeval/utils/__init__.py +4 -13
  44. dataeval/utils/dataset/__init__.py +7 -0
  45. dataeval/utils/{torch → dataset}/datasets.py +2 -0
  46. dataeval/utils/dataset/read.py +63 -0
  47. dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
  48. dataeval/utils/image.py +2 -2
  49. dataeval/utils/metadata.py +310 -5
  50. dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +1 -104
  51. dataeval/utils/torch/__init__.py +2 -17
  52. dataeval/utils/torch/gmm.py +29 -6
  53. dataeval/utils/torch/{utils.py → internal.py} +82 -58
  54. dataeval/utils/torch/models.py +10 -8
  55. dataeval/utils/torch/trainer.py +6 -85
  56. dataeval/workflows/__init__.py +2 -5
  57. dataeval/workflows/sufficiency.py +16 -6
  58. dataeval-0.75.0.dist-info/METADATA +136 -0
  59. dataeval-0.75.0.dist-info/RECORD +67 -0
  60. dataeval/detectors/ood/base_torch.py +0 -109
  61. dataeval/metrics/bias/metadata_preprocessing.py +0 -285
  62. dataeval/utils/gmm.py +0 -26
  63. dataeval-0.74.2.dist-info/METADATA +0 -120
  64. dataeval-0.74.2.dist-info/RECORD +0 -66
  65. {dataeval-0.74.2.dist-info → dataeval-0.75.0.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.74.2.dist-info → dataeval-0.75.0.dist-info}/WHEEL +0 -0
@@ -3,22 +3,19 @@ Bias metrics check for skewed or imbalanced datasets and incomplete feature
3
3
  representation which may impact model performance.
4
4
  """
5
5
 
6
- from dataeval.metrics.bias.balance import BalanceOutput, balance
7
- from dataeval.metrics.bias.coverage import CoverageOutput, coverage
8
- from dataeval.metrics.bias.diversity import DiversityOutput, diversity
9
- from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput, metadata_preprocessing
10
- from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
11
-
12
6
  __all__ = [
7
+ "BalanceOutput",
8
+ "CoverageOutput",
9
+ "DiversityOutput",
10
+ "ParityOutput",
13
11
  "balance",
14
12
  "coverage",
15
13
  "diversity",
16
14
  "label_parity",
17
15
  "parity",
18
- "metadata_preprocessing",
19
- "BalanceOutput",
20
- "CoverageOutput",
21
- "DiversityOutput",
22
- "ParityOutput",
23
- "MetadataOutput",
24
16
  ]
17
+
18
+ from dataeval.metrics.bias.balance import BalanceOutput, balance
19
+ from dataeval.metrics.bias.coverage import CoverageOutput, coverage
20
+ from dataeval.metrics.bias.diversity import DiversityOutput, diversity
21
+ from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["BalanceOutput", "balance"]
3
+ __all__ = []
4
4
 
5
5
  import contextlib
6
6
  import warnings
@@ -12,9 +12,9 @@ import scipy as sp
12
12
  from numpy.typing import NDArray
13
13
  from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
14
14
 
15
- from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
16
- from dataeval.metrics.bias.metadata_utils import get_counts, heatmap
17
15
  from dataeval.output import Output, set_metadata
16
+ from dataeval.utils.metadata import Metadata, get_counts
17
+ from dataeval.utils.plot import heatmap
18
18
 
19
19
  with contextlib.suppress(ImportError):
20
20
  from matplotlib.figure import Figure
@@ -119,7 +119,7 @@ def _validate_num_neighbors(num_neighbors: int) -> int:
119
119
 
120
120
  @set_metadata
121
121
  def balance(
122
- metadata: MetadataOutput,
122
+ metadata: Metadata,
123
123
  num_neighbors: int = 5,
124
124
  ) -> BalanceOutput:
125
125
  """
@@ -127,14 +127,16 @@ def balance(
127
127
 
128
128
  Parameters
129
129
  ----------
130
- metadata : MetadataOutput
131
- Output after running `metadata_preprocessing`
130
+ metadata : Metadata
131
+ Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
132
+ num_neighbors : int, default 5
133
+ Number of points to consider as neighbors
132
134
 
133
135
  Returns
134
136
  -------
135
137
  BalanceOutput
136
- (num_factors+1) x (num_factors+1) estimate of mutual information
137
- between num_factors metadata factors and class label. Symmetry is enforced.
138
+ (num_factors+1) x (num_factors+1) estimate of mutual information \
139
+ between num_factors metadata factors and class label. Symmetry is enforced.
138
140
 
139
141
  Note
140
142
  ----
@@ -1,18 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["CoverageOutput", "coverage"]
3
+ __all__ = []
4
4
 
5
5
  import contextlib
6
6
  import math
7
7
  from dataclasses import dataclass
8
- from typing import Literal
8
+ from typing import Any, Literal
9
9
 
10
10
  import numpy as np
11
11
  from numpy.typing import ArrayLike, NDArray
12
12
  from scipy.spatial.distance import pdist, squareform
13
13
 
14
14
  from dataeval.interop import to_numpy
15
- from dataeval.metrics.bias.metadata_utils import coverage_plot
16
15
  from dataeval.output import Output, set_metadata
17
16
  from dataeval.utils.shared import flatten
18
17
 
@@ -20,6 +19,55 @@ with contextlib.suppress(ImportError):
20
19
  from matplotlib.figure import Figure
21
20
 
22
21
 
22
+ def _plot(images: NDArray[Any], num_images: int) -> Figure:
23
+ """
24
+ Creates a single plot of all of the provided images
25
+
26
+ Parameters
27
+ ----------
28
+ images : NDArray
29
+ Array containing only the desired images to plot
30
+
31
+ Returns
32
+ -------
33
+ matplotlib.figure.Figure
34
+ Plot of all provided images
35
+ """
36
+ import matplotlib.pyplot as plt
37
+
38
+ num_images = min(num_images, len(images))
39
+
40
+ if images.ndim == 4:
41
+ images = np.moveaxis(images, 1, -1)
42
+ elif images.ndim == 3:
43
+ images = np.repeat(images[:, :, :, np.newaxis], 3, axis=-1)
44
+ else:
45
+ raise ValueError(
46
+ f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {images.ndim}-dimensional set of images."
47
+ )
48
+
49
+ rows = int(np.ceil(num_images / 3))
50
+ fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
51
+
52
+ if rows == 1:
53
+ for j in range(3):
54
+ if j >= len(images):
55
+ continue
56
+ axs[j].imshow(images[j])
57
+ axs[j].axis("off")
58
+ else:
59
+ for i in range(rows):
60
+ for j in range(3):
61
+ i_j = i * 3 + j
62
+ if i_j >= len(images):
63
+ continue
64
+ axs[i, j].imshow(images[i_j])
65
+ axs[i, j].axis("off")
66
+
67
+ fig.tight_layout()
68
+ return fig
69
+
70
+
23
71
  @dataclass(frozen=True)
24
72
  class CoverageOutput(Output):
25
73
  """
@@ -62,7 +110,7 @@ class CoverageOutput(Output):
62
110
  selected_images = images[highest_uncovered_indices]
63
111
 
64
112
  # Plot the images
65
- fig = coverage_plot(selected_images, top_k)
113
+ fig = _plot(selected_images, top_k)
66
114
 
67
115
  return fig
68
116
 
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["DiversityOutput", "diversity"]
3
+ __all__ = []
4
4
 
5
5
  import contextlib
6
6
  from dataclasses import dataclass
@@ -10,15 +10,44 @@ import numpy as np
10
10
  import scipy as sp
11
11
  from numpy.typing import ArrayLike, NDArray
12
12
 
13
- from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
14
- from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
15
13
  from dataeval.output import Output, set_metadata
14
+ from dataeval.utils.metadata import Metadata, get_counts
15
+ from dataeval.utils.plot import heatmap
16
16
  from dataeval.utils.shared import get_method
17
17
 
18
18
  with contextlib.suppress(ImportError):
19
19
  from matplotlib.figure import Figure
20
20
 
21
21
 
22
+ def _plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
23
+ """
24
+ Plots a formatted bar plot
25
+
26
+ Parameters
27
+ ----------
28
+ labels : NDArray
29
+ Array containing the labels for each bar
30
+ bar_heights : NDArray
31
+ Array containing the values for each bar
32
+
33
+ Returns
34
+ -------
35
+ matplotlib.figure.Figure
36
+ Bar plot figure
37
+ """
38
+ import matplotlib.pyplot as plt
39
+
40
+ fig, ax = plt.subplots(figsize=(10, 10))
41
+
42
+ ax.bar(labels, bar_heights)
43
+ ax.set_xlabel("Factors")
44
+
45
+ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
46
+
47
+ fig.tight_layout()
48
+ return fig
49
+
50
+
22
51
  @dataclass(frozen=True)
23
52
  class DiversityOutput(Output):
24
53
  """
@@ -77,8 +106,7 @@ class DiversityOutput(Output):
77
106
  else:
78
107
  # Creating label array for heat map axes
79
108
  heat_labels = np.concatenate((["class"], self.factor_names))
80
-
81
- fig = diversity_bar_plot(heat_labels, self.diversity_index)
109
+ fig = _plot(heat_labels, self.diversity_index)
82
110
 
83
111
  return fig
84
112
 
@@ -165,7 +193,7 @@ def diversity_simpson(
165
193
 
166
194
  @set_metadata
167
195
  def diversity(
168
- metadata: MetadataOutput,
196
+ metadata: Metadata,
169
197
  method: Literal["simpson", "shannon"] = "simpson",
170
198
  ) -> DiversityOutput:
171
199
  """
@@ -179,8 +207,8 @@ def diversity(
179
207
 
180
208
  Parameters
181
209
  ----------
182
- metadata : MetadataOutput
183
- Output after running `metadata_preprocessing`
210
+ metadata : Metadata
211
+ Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
184
212
 
185
213
  Note
186
214
  ----
@@ -199,21 +227,21 @@ def diversity(
199
227
 
200
228
  >>> div_simp = diversity(metadata, method="simpson")
201
229
  >>> div_simp.diversity_index
202
- array([0.72413793, 0.88636364, 0.72413793])
230
+ array([0.6 , 0.80882353, 1. , 0.8 ])
203
231
 
204
232
  >>> div_simp.classwise
205
- array([[0.69230769, 0.68965517],
206
- [0.5 , 0.8 ]])
233
+ array([[0.5 , 0.8 , 0.8 ],
234
+ [0.63043478, 0.97560976, 0.52830189]])
207
235
 
208
236
  Compute Shannon diversity index of metadata and class labels
209
237
 
210
238
  >>> div_shan = diversity(metadata, method="shannon")
211
239
  >>> div_shan.diversity_index
212
- array([0.8812909 , 0.96748876, 0.8812909 ])
240
+ array([0.81127812, 0.9426312 , 1. , 0.91829583])
213
241
 
214
242
  >>> div_shan.classwise
215
- array([[0.91651644, 0.86312057],
216
- [0.68260619, 0.91829583]])
243
+ array([[0.68260619, 0.91829583, 0.91829583],
244
+ [0.81443569, 0.99107606, 0.76420451]])
217
245
 
218
246
  See Also
219
247
  --------
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["ParityOutput", "parity", "label_parity"]
3
+ __all__ = []
4
4
 
5
5
  import warnings
6
6
  from dataclasses import dataclass
@@ -12,8 +12,8 @@ from scipy.stats import chisquare
12
12
  from scipy.stats.contingency import chi2_contingency, crosstab
13
13
 
14
14
  from dataeval.interop import as_numpy, to_numpy
15
- from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
16
15
  from dataeval.output import Output, set_metadata
16
+ from dataeval.utils.metadata import Metadata
17
17
 
18
18
  TData = TypeVar("TData", np.float64, NDArray[np.float64])
19
19
 
@@ -167,8 +167,9 @@ def label_parity(
167
167
  --------
168
168
  Randomly creating some label distributions using ``np.random.default_rng``
169
169
 
170
- >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
171
- >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
170
+ >>> rng = np.random.default_rng(175)
171
+ >>> expected_labels = rng.choice([0, 1, 2, 3, 4], (100))
172
+ >>> observed_labels = rng.choice([2, 3, 0, 4, 1], (100))
172
173
  >>> label_parity(expected_labels, observed_labels)
173
174
  ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
174
175
  """
@@ -205,7 +206,7 @@ def label_parity(
205
206
 
206
207
 
207
208
  @set_metadata
208
- def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
209
+ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
209
210
  """
210
211
  Calculate chi-square statistics to assess the linear relationship between multiple factors
211
212
  and class labels.
@@ -216,8 +217,8 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
216
217
 
217
218
  Parameters
218
219
  ----------
219
- metadata : MetadataOutput
220
- Output after running `metadata_preprocessing`
220
+ metadata : Metadata
221
+ Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
221
222
 
222
223
  Returns
223
224
  -------
@@ -249,16 +250,18 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
249
250
  --------
250
251
  Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
251
252
 
252
- >>> labels = np_random_gen.choice([0, 1, 2], (100))
253
+ >>> from dataeval.utils.metadata import preprocess
254
+ >>> rng = np.random.default_rng(175)
255
+ >>> labels = rng.choice([0, 1, 2], (100))
253
256
  >>> metadata_dict = [
254
257
  ... {
255
- ... "age": list(np_random_gen.choice([25, 30, 35, 45], (100))),
256
- ... "income": list(np_random_gen.choice([50000, 65000, 80000], (100))),
257
- ... "gender": list(np_random_gen.choice(["M", "F"], (100))),
258
+ ... "age": list(rng.choice([25, 30, 35, 45], (100))),
259
+ ... "income": list(rng.choice([50000, 65000, 80000], (100))),
260
+ ... "gender": list(rng.choice(["M", "F"], (100))),
258
261
  ... }
259
262
  ... ]
260
263
  >>> continuous_factor_bincounts = {"age": 4, "income": 3}
261
- >>> metadata = metadata_preprocessing(metadata_dict, labels, continuous_factor_bincounts)
264
+ >>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
262
265
  >>> parity(metadata)
263
266
  ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
264
267
  """ # noqa: E501
@@ -2,8 +2,8 @@
2
2
  Estimators calculate performance bounds and the statistical distance between datasets.
3
3
  """
4
4
 
5
+ __all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
6
+
5
7
  from dataeval.metrics.estimators.ber import BEROutput, ber
6
8
  from dataeval.metrics.estimators.divergence import DivergenceOutput, divergence
7
9
  from dataeval.metrics.estimators.uap import UAPOutput, uap
8
-
9
- __all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
@@ -5,11 +5,12 @@ KNN based estimate for the :term:`Bayes error rate<Bayes Error Rate (BER)>`
5
5
 
6
6
  Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
7
7
  https://arxiv.org/abs/1811.06419
8
+
8
9
  """
9
10
 
10
11
  from __future__ import annotations
11
12
 
12
- __all__ = ["BEROutput", "ber"]
13
+ __all__ = []
13
14
 
14
15
  from dataclasses import dataclass
15
16
  from typing import Literal
@@ -38,6 +39,7 @@ class BEROutput(Output):
38
39
  """
39
40
 
40
41
  ber: float
42
+
41
43
  ber_lower: float
42
44
 
43
45
 
@@ -5,7 +5,7 @@ using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
- __all__ = ["DivergenceOutput", "divergence"]
8
+ __all__ = []
9
9
 
10
10
  from dataclasses import dataclass
11
11
  from typing import Literal
@@ -6,7 +6,7 @@ average precision<Upper-Bound Average Precision (UAP)>` using empirical mean pre
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- __all__ = ["UAPOutput", "uap"]
9
+ __all__ = []
10
10
 
11
11
  from dataclasses import dataclass
12
12
 
@@ -3,6 +3,24 @@ Statistics metrics calculate a variety of image properties and pixel statistics
3
3
  and label statistics against the images and labels of a dataset.
4
4
  """
5
5
 
6
+ __all__ = [
7
+ "ChannelStatsOutput",
8
+ "DatasetStatsOutput",
9
+ "DimensionStatsOutput",
10
+ "HashStatsOutput",
11
+ "LabelStatsOutput",
12
+ "PixelStatsOutput",
13
+ "VisualStatsOutput",
14
+ "boxratiostats",
15
+ "channelstats",
16
+ "datasetstats",
17
+ "dimensionstats",
18
+ "hashstats",
19
+ "labelstats",
20
+ "pixelstats",
21
+ "visualstats",
22
+ ]
23
+
6
24
  from dataeval.metrics.stats.boxratiostats import boxratiostats
7
25
  from dataeval.metrics.stats.datasetstats import (
8
26
  ChannelStatsOutput,
@@ -15,21 +33,3 @@ from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
15
33
  from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
16
34
  from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
17
35
  from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
18
-
19
- __all__ = [
20
- "boxratiostats",
21
- "channelstats",
22
- "datasetstats",
23
- "dimensionstats",
24
- "hashstats",
25
- "labelstats",
26
- "pixelstats",
27
- "visualstats",
28
- "ChannelStatsOutput",
29
- "DatasetStatsOutput",
30
- "DimensionStatsOutput",
31
- "HashStatsOutput",
32
- "LabelStatsOutput",
33
- "PixelStatsOutput",
34
- "VisualStatsOutput",
35
- ]
@@ -193,7 +193,7 @@ class StatsProcessorOutput(NamedTuple):
193
193
  results: list[dict[str, Any]]
194
194
  source_indices: list[SourceIndex]
195
195
  box_counts: list[int]
196
- warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]]
196
+ warnings_list: list[str]
197
197
 
198
198
 
199
199
  def process_stats(
@@ -206,13 +206,13 @@ def process_stats(
206
206
  results_list: list[dict[str, Any]] = []
207
207
  source_indices: list[SourceIndex] = []
208
208
  box_counts: list[int] = []
209
- warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]] = []
209
+ warnings_list: list[str] = []
210
210
  nboxes = [None] if boxes is None else normalize_box_shape(boxes)
211
211
  for i_b, box in enumerate(nboxes):
212
212
  i_b = None if box is None else i_b
213
213
  processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
214
214
  if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
215
- warnings_list.append((i, i_b, box, image.shape))
215
+ warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
216
216
  results_list.append({k: v for p in processor_list for k, v in p.process().items()})
217
217
  if per_channel:
218
218
  source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
@@ -302,7 +302,7 @@ def run_stats(
302
302
 
303
303
  # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
304
304
  for w in warning_list:
305
- warnings.warn(f"Bounding box [{w[0]}][{w[1]}]: {w[2]} is out of bounds of {w[3]}.", UserWarning)
305
+ warnings.warn(w, UserWarning)
306
306
 
307
307
  output = {}
308
308
  for results in results_list:
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["boxratiostats"]
3
+ __all__ = []
4
4
 
5
5
  import copy
6
6
  from typing import Any, Callable, Generic, TypeVar, cast
@@ -130,17 +130,16 @@ def boxratiostats(
130
130
  --------
131
131
  Calculating the box ratio statistics using the dimension stats of the boxes and images
132
132
 
133
- >>> imagestats = dimensionstats(images)
134
- >>> boxstats = dimensionstats(images, bboxes)
133
+ >>> from dataeval.metrics.stats import dimensionstats
134
+ >>> imagestats = dimensionstats(stats_images)
135
+ >>> boxstats = dimensionstats(stats_images, bboxes)
135
136
  >>> ratiostats = boxratiostats(boxstats, imagestats)
136
137
  >>> print(ratiostats.aspect_ratio)
137
- [ 1.15169271 0.78450521 21.33333333 1.5234375 2.25651042 0.77799479
138
- 0.88867188 3.40625 1.73307292 1.11132812 0.75018315 0.45018315
139
- 0.69596354 20. 5.11197917 2.33333333 0.75 0.70019531]
138
+ [ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
139
+ 0.66650391 3.83296703 1.95018315]
140
140
  >>> print(ratiostats.size)
141
- [0.03401693 0.01383464 0.00130208 0.01822917 0.02327474 0.00683594
142
- 0.01220703 0.0168457 0.01057943 0.00976562 0.00130208 0.01098633
143
- 0.02246094 0.0012207 0.01123047 0.00911458 0.02636719 0.06835938]
141
+ [0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
142
+ 0.00915527 0.03369141 0.02115885]
144
143
  """
145
144
  output_cls = type(boxstats)
146
145
  if type(boxstats) is not type(imgstats):
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["DatasetStatsOutput", "ChannelStatsOutput", "datasetstats", "channelstats"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Iterable
@@ -25,7 +25,7 @@ class DatasetStatsOutput(Output):
25
25
 
26
26
  This class represents the outputs of various stats functions against a single
27
27
  dataset, such that each index across all stat outputs are representative of
28
- the same source image. Modifying or mixing outputs will result in inaccurate
28
+ the same source image. Modifying or mixing outputs will result in inaccurate
29
29
  outlier calculations if not created correctly.
30
30
 
31
31
  Attributes
@@ -60,7 +60,7 @@ class ChannelStatsOutput(Output):
60
60
 
61
61
  This class represents the outputs of various per-channel stats functions against
62
62
  a single dataset, such that each index across all stat outputs are representative
63
- of the same source image. Modifying or mixing outputs will result in inaccurate
63
+ of the same source image. Modifying or mixing outputs will result in inaccurate
64
64
  outlier calculations if not created correctly.
65
65
 
66
66
  Attributes
@@ -119,13 +119,11 @@ def datasetstats(
119
119
  --------
120
120
  Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
121
121
 
122
- >>> stats = datasetstats(images, bboxes)
122
+ >>> stats = datasetstats(stats_images, bboxes)
123
123
  >>> print(stats.dimensionstats.aspect_ratio)
124
- [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3
125
- 0.8335 1. 0.6 0.522 15. 3.834 1.75 0.75 0.7 ]
126
- >>> print(stats.visualstats.contrast)
127
- [1.744 1.946 0.1164 0.0635 0.0633 0.06274 0.0429 0.0317 0.0317
128
- 0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
124
+ [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
125
+ >>> print(stats.visualstats.sharpness)
126
+ [4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
129
127
  """
130
128
  outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])
131
129
  return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None) # type: ignore
@@ -162,12 +160,10 @@ def channelstats(
162
160
  --------
163
161
  Calculating the per-channel pixel and visual stats for a dataset
164
162
 
165
- >>> stats = channelstats(images)
163
+ >>> stats = channelstats(stats_images)
166
164
  >>> print(stats.visualstats.darkness)
167
- [0.07495 0.1748 0.275 0.1047 0.11096 0.1172 0.2047 0.2109 0.2172
168
- 0.3047 0.311 0.3171 0.4048 0.411 0.4172 0.505 0.5107 0.517
169
- 0.6045 0.611 0.617 0.7046 0.711 0.7173 0.8047 0.811 0.8174
170
- 0.905 0.911 0.917 ]
165
+ [0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
166
+ 0.8154]
171
167
  """
172
168
  outputs = run_stats(images, bboxes, True, [PixelStatsProcessor, VisualStatsProcessor])
173
169
  return ChannelStatsOutput(*outputs) # type: ignore
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["DimensionStatsOutput", "dimensionstats"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Callable, Iterable
@@ -106,10 +106,10 @@ def dimensionstats(
106
106
  --------
107
107
  Calculating the dimension statistics on the images, whose shape is (C, H, W)
108
108
 
109
- >>> results = dimensionstats(images)
109
+ >>> results = dimensionstats(stats_images)
110
110
  >>> print(results.aspect_ratio)
111
- [0.75 0.75 0.75 0.75 0.75 0.75 1.333 0.75 0.75 1. ]
111
+ [1. 1. 1.333 1. 0.6665]
112
112
  >>> print(results.channels)
113
- [1 1 1 1 1 1 3 1 1 3]
113
+ [3 3 1 3 1]
114
114
  """
115
115
  return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["HashStatsOutput", "hashstats"]
3
+ import warnings
4
+
5
+ __all__ = []
4
6
 
5
7
  from dataclasses import dataclass
6
8
  from typing import Callable, Iterable
@@ -41,7 +43,7 @@ def pchash(image: ArrayLike) -> str:
41
43
  """
42
44
  Performs a perceptual hash on an image by resizing to a square NxN image
43
45
  using the Lanczos algorithm where N is 32x32 or the largest multiple of
44
- 8 that is smaller than the input image dimensions. The resampled image
46
+ 8 that is smaller than the input image dimensions. The resampled image
45
47
  is compressed using a discrete cosine transform and the lowest frequency
46
48
  component is encoded as a bit array of greater or less than median value
47
49
  and returned as a hex string.
@@ -54,13 +56,15 @@ def pchash(image: ArrayLike) -> str:
54
56
  Returns
55
57
  -------
56
58
  str
57
- The hex string hash of the image using perceptual hashing
59
+ The hex string hash of the image using perceptual hashing, or empty
60
+ string if the image is too small to be hashed
58
61
  """
59
62
  # Verify that the image is at least larger than an 8x8 image
60
63
  arr = as_numpy(image)
61
64
  min_dim = min(arr.shape[-2:])
62
65
  if min_dim < HASH_SIZE + 1:
63
- raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
66
+ warnings.warn(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
67
+ return ""
64
68
 
65
69
  # Calculates the dimensions of the resized square image
66
70
  resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
@@ -92,7 +96,7 @@ def pchash(image: ArrayLike) -> str:
92
96
  def xxhash(image: ArrayLike) -> str:
93
97
  """
94
98
  Performs a fast non-cryptographic hash using the xxhash algorithm
95
- (xxhash.com) against the image as a flattened bytearray. The hash
99
+ (xxhash.com) against the image as a flattened bytearray. The hash
96
100
  is returned as a hex string.
97
101
 
98
102
  Parameters
@@ -147,10 +151,10 @@ def hashstats(
147
151
  --------
148
152
  Calculating the statistics on the images, whose shape is (C, H, W)
149
153
 
150
- >>> results = hashstats(images)
154
+ >>> results = hashstats(stats_images)
151
155
  >>> print(results.xxhash)
152
- ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
156
+ ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
153
157
  >>> print(results.pchash)
154
- ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
158
+ ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
155
159
  """
156
160
  return run_stats(images, bboxes, False, [HashStatsProcessor])[0]