dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +10 -11
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +51 -102
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +9 -8
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +11 -10
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +33 -34
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +15 -13
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +12 -9
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +47 -45
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +20 -10
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +19 -26
  16. dataeval/detectors/ood/__init__.py +8 -16
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +9 -9
  18. dataeval/{_internal/detectors → detectors}/ood/aegmm.py +10 -30
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +27 -21
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +27 -23
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +11 -13
  25. dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
  26. dataeval/{_internal/interop.py → interop.py} +12 -7
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +70 -4
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +10 -8
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +54 -20
  32. dataeval/metrics/bias/metadata.py +275 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +21 -17
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +31 -28
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +15 -16
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +8 -6
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +66 -40
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +19 -15
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +19 -17
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +12 -10
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +8 -6
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +12 -11
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +14 -13
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +8 -4
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/shared.py +151 -0
  51. dataeval/utils/split_dataset.py +486 -0
  52. dataeval/utils/tensorflow/__init__.py +9 -7
  53. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +64 -68
  54. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +10 -9
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +18 -22
  56. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
  57. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +18 -18
  58. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  59. dataeval/utils/torch/__init__.py +7 -3
  60. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  61. dataeval/{_internal → utils/torch}/datasets.py +49 -43
  62. dataeval/utils/torch/models.py +138 -0
  63. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +12 -141
  64. dataeval/{_internal → utils/torch}/utils.py +3 -1
  65. dataeval/workflows/__init__.py +1 -1
  66. dataeval/{_internal/workflows → workflows}/sufficiency.py +42 -37
  67. {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA +7 -5
  68. dataeval-0.72.2.dist-info/RECORD +72 -0
  69. dataeval/_internal/detectors/__init__.py +0 -0
  70. dataeval/_internal/detectors/drift/__init__.py +0 -0
  71. dataeval/_internal/detectors/ood/__init__.py +0 -0
  72. dataeval/_internal/metrics/__init__.py +0 -0
  73. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  74. dataeval/_internal/metrics/utils.py +0 -447
  75. dataeval/_internal/models/__init__.py +0 -0
  76. dataeval/_internal/models/pytorch/__init__.py +0 -0
  77. dataeval/_internal/models/pytorch/utils.py +0 -67
  78. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  79. dataeval/_internal/workflows/__init__.py +0 -0
  80. dataeval/detectors/drift/kernels/__init__.py +0 -10
  81. dataeval/detectors/drift/updates/__init__.py +0 -7
  82. dataeval/utils/tensorflow/models/__init__.py +0 -9
  83. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  84. dataeval/utils/torch/datasets/__init__.py +0 -12
  85. dataeval/utils/torch/models/__init__.py +0 -11
  86. dataeval/utils/torch/trainer/__init__.py +0 -7
  87. dataeval-0.72.0.dist-info/RECORD +0 -80
  88. /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
  89. {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
  90. {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
dataeval/{_internal/metrics → metrics/estimators}/ber.py
@@ -1,7 +1,7 @@
 """
 This module contains the implementation of the
 FR Test Statistic based estimate and the
-KNN based estimate for the Bayes Error Rate
+KNN based estimate for the :term:`Bayes error rate<Bayes Error Rate (BER)>`
 
 Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
 https://arxiv.org/abs/1811.06419
@@ -9,6 +9,8 @@ https://arxiv.org/abs/1811.06419
 
 from __future__ import annotations
 
+__all__ = ["BEROutput", "ber"]
+
 from dataclasses import dataclass
 from typing import Literal
 
@@ -17,9 +19,9 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.sparse import coo_matrix
 from scipy.stats import mode
 
-from dataeval._internal.interop import as_numpy
-from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import as_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.shared import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
 
 
 @dataclass(frozen=True)
@@ -30,7 +32,7 @@ class BEROutput(OutputMetadata):
     Attributes
     ----------
     ber : float
-        The upper bounds of the Bayes Error Rate
+        The upper bounds of the :term:`Bayes error rate<Bayes Error Rate (BER)>`
     ber_lower : float
         The lower bounds of the Bayes Error Rate
     """
@@ -39,51 +41,55 @@ class BEROutput(OutputMetadata):
     ber_lower: float
 
 
-def ber_mst(X: NDArray, y: NDArray) -> tuple[float, float]:
-    """Calculates the Bayes Error Rate using a minimum spanning tree
+def ber_mst(images: NDArray[np.float64], labels: NDArray[np.int_], k: int = 1) -> tuple[float, float]:
+    """Calculates the :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree
 
     Parameters
     ----------
-    X : NDArray, shape - (N, ... )
+    images : NDArray, shape - (N, ... )
         n_samples containing n_features
-    y : NDArray, shape - (N, 1)
+    labels : NDArray, shape - (N, 1)
         Labels corresponding to each sample
+    k : int
+        Unused
 
     Returns
     -------
    Tuple[float, float]
        The upper and lower bounds of the bayes error rate
    """
-    M, N = get_classes_counts(y)
+    M, N = get_classes_counts(labels)
 
-    tree = coo_matrix(minimum_spanning_tree(X))
-    matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+    tree = coo_matrix(minimum_spanning_tree(images))
+    matches = np.sum([labels[tree.row[i]] != labels[tree.col[i]] for i in range(N - 1)])
     deltas = matches / (2 * N)
     upper = 2 * deltas
     lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
     return upper, lower
 
 
-def ber_knn(X: NDArray, y: NDArray, k: int) -> tuple[float, float]:
-    """Calculates the Bayes Error Rate using K-nearest neighbors
+def ber_knn(images: NDArray[np.float64], labels: NDArray[np.int_], k: int) -> tuple[float, float]:
+    """Calculates the :term:`Bayes error rate<Bayes Error Rate (BER)>` using K-nearest neighbors
 
     Parameters
     ----------
-    X : NDArray, shape - (N, ... )
+    images : NDArray, shape - (N, ... )
         n_samples containing n_features
-    y : NDArray, shape - (N, 1)
+    labels : NDArray, shape - (N, 1)
        Labels corresponding to each sample
+    k : int
+        The number of neighbors to find
 
     Returns
    -------
    Tuple[float, float]
        The upper and lower bounds of the bayes error rate
    """
-    M, N = get_classes_counts(y)
-    nn_indices = compute_neighbors(X, X, k=k)
+    M, N = get_classes_counts(labels)
+    nn_indices = compute_neighbors(images, images, k=k)
     nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
-    modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
-    upper = float(np.count_nonzero(modal_class - y) / N)
+    modal_class = mode(labels[nn_indices], axis=1, keepdims=True).mode.squeeze()
+    upper = float(np.count_nonzero(modal_class - labels) / N)
     lower = knn_lowerbound(upper, M, k)
     return upper, lower
 
@@ -108,18 +114,15 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
     return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
 
 
-BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
-
-
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
     """
-    An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis
+    An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using FR or KNN test statistic basis
 
     Parameters
     ----------
     images : ArrayLike (N, ... )
-        Array of images or image embeddings
+        Array of images or image :term:`embeddings<Embeddings>`
     labels : ArrayLike (N, 1)
        Array of labels for each image or image embedding
     k : int, default 1
@@ -146,8 +149,8 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
     >>> ber(images, labels)
     BEROutput(ber=0.04, ber_lower=0.020416847668728033)
     """
-    ber_fn = get_method(BER_FN_MAP, method)
+    ber_fn = get_method({"KNN": ber_knn, "MST": ber_mst}, method)
     X = as_numpy(images)
     y = as_numpy(labels)
-    upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
+    upper, lower = ber_fn(X, y, k)
     return BEROutput(upper, lower)
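A note on the API change above: `ber` now always forwards `k` to the selected estimator, and `ber_mst` simply ignores it. A minimal usage sketch, assuming `ber` is re-exported from `dataeval.metrics.estimators` as the file move suggests (the synthetic embeddings and labels below are illustrative, not from the package's doctests):

    import numpy as np
    from dataeval.metrics.estimators import ber

    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(100, 16))  # (N, ...) image embeddings
    labels = rng.integers(0, 2, size=100)    # (N,) class labels

    # Both methods accept k after this change; the MST estimator ignores it.
    print(ber(embeddings, labels, k=1, method="KNN"))
    print(ber(embeddings, labels, method="MST"))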
dataeval/{_internal/metrics → metrics/estimators}/divergence.py
@@ -1,19 +1,21 @@
 """
-This module contains the implementation of HP Divergence
+This module contains the implementation of HP :term:`divergence<Divergence>`
 using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
 """
 
 from __future__ import annotations
 
+__all__ = ["DivergenceOutput", "divergence"]
+
 from dataclasses import dataclass
 from typing import Literal
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval._internal.interop import as_numpy
-from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import as_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.shared import compute_neighbors, get_method, minimum_spanning_tree
 
 
 @dataclass(frozen=True)
@@ -24,7 +26,7 @@ class DivergenceOutput(OutputMetadata):
     Attributes
     ----------
     divergence : float
-        Divergence value calculated between 2 datasets ranging between 0.0 and 1.0
+        :term:`Divergence` value calculated between 2 datasets ranging between 0.0 and 1.0
     errors : int
         The number of differing edges between the datasets
     """
@@ -33,7 +35,7 @@ class DivergenceOutput(OutputMetadata):
     errors: int
 
 
-def divergence_mst(data: NDArray, labels: NDArray) -> int:
+def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     """
     Calculates the estimated label errors based on the minimum spanning tree
 
@@ -55,7 +57,7 @@ def divergence_mst(data: NDArray, labels: NDArray) -> int:
     return errors
 
 
-def divergence_fnn(data: NDArray, labels: NDArray) -> int:
+def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     """
     Calculates the estimated label errors based on their nearest neighbors
 
@@ -76,13 +78,10 @@ def divergence_fnn(data: NDArray, labels: NDArray) -> int:
     return errors
 
 
-DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
-
-
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
     """
-    Calculates the divergence and any errors between the datasets
+    Calculates the :term`divergence` and any errors between the datasets
 
     Parameters
     ----------
@@ -93,7 +92,7 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
        A dataset in an ArrayLike format to compare.
        Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
     method : Literal["MST, "FNN"], default "FNN"
-        Method used to estimate dataset divergence
+        Method used to estimate dataset :term:`divergence<Divergence>`
 
     Returns
     -------
@@ -124,16 +123,16 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
     Evaluate the datasets:
 
     >>> divergence(datasetA, datasetB)
-    DivergenceOutput(divergence=0.28, errors=36.0)
+    DivergenceOutput(divergence=0.28, errors=36)
     """
-    div_fn = get_method(DIVERGENCE_FN_MAP, method)
+    div_fn = get_method({"FNN": divergence_fnn, "MST": divergence_mst}, method)
     a = as_numpy(data_a)
     b = as_numpy(data_b)
    N = a.shape[0]
    M = b.shape[0]
 
     stacked_data = np.vstack((a, b))
-    labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
+    labels = np.vstack([np.zeros([N, 1], dtype=np.int_), np.ones([M, 1], dtype=np.int_)])
 
     errors = div_fn(stacked_data, labels)
     dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
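The closing line of this hunk is the HP (Henze-Penrose) divergence estimate itself: with datasets of size N and M and `errors` cross-dataset edges, the statistic is dp = max(0, 1 - (M + N) / (2 * M * N) * errors). A quick arithmetic check against the doctest value above (the dataset sizes are an assumption, since the doctest inputs are not shown in this hunk):

    # Reproduce the DivergenceOutput(divergence=0.28, errors=36) doctest value.
    N, M = 50, 50   # assumed sizes of datasetA and datasetB
    errors = 36     # cross-dataset edges counted by divergence_fnn / divergence_mst
    dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
    print(dp)       # 0.28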
dataeval/{_internal/metrics → metrics/estimators}/uap.py
@@ -1,18 +1,20 @@
 """
 This module contains the implementation of the
-FR Test Statistic based estimate for the upperbound
-average precision using empirical mean precision
+FR Test Statistic based estimate for the :term:`upper-bound
+average precision<Upper-Bound Average Precision (UAP)>` using empirical mean precision
 """
 
 from __future__ import annotations
 
+__all__ = ["UAPOutput", "uap"]
+
 from dataclasses import dataclass
 
 from numpy.typing import ArrayLike
 from sklearn.metrics import average_precision_score
 
-from dataeval._internal.interop import as_numpy
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import as_numpy
+from dataeval.output import OutputMetadata, set_metadata
 
 
 @dataclass(frozen=True)
@@ -29,7 +31,7 @@ class UAPOutput(OutputMetadata):
     uap: float
 
 
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
     """
     FR Test Statistic based estimate of the empirical mean precision for
@@ -38,7 +40,7 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
     Parameters
     ----------
     labels : ArrayLike
-        A numpy array of n_samples of class labels with M unique classes.
+        A term:`NumPy` array of n_samples of class labels with M unique classes.
     scores : ArrayLike
        A 2D array of class probabilities per image
 
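For completeness, a hedged usage sketch of the relocated `uap` metric (assuming it is re-exported from `dataeval.metrics.estimators` after the move; the synthetic labels and per-class probability scores below follow the docstring shapes, not a package doctest):

    import numpy as np
    from dataeval.metrics.estimators import uap

    rng = np.random.default_rng(0)
    labels = rng.integers(0, 3, size=100)          # n_samples class labels, M=3 classes
    scores = rng.dirichlet(np.ones(3), size=100)   # 2D array of class probabilities per image
    print(uap(labels, scores).uap)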
dataeval/metrics/stats/__init__.py
@@ -3,18 +3,18 @@ Statistics metrics calculate a variety of image properties and pixel statistics
 and label statistics against the images and labels of a dataset.
 """
 
-from dataeval._internal.metrics.stats.boxratiostats import boxratiostats
-from dataeval._internal.metrics.stats.datasetstats import (
+from dataeval.metrics.stats.boxratiostats import boxratiostats
+from dataeval.metrics.stats.datasetstats import (
     ChannelStatsOutput,
     DatasetStatsOutput,
     channelstats,
     datasetstats,
 )
-from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
-from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval._internal.metrics.stats.labelstats import LabelStatsOutput, labelstats
-from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
-from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput, visualstats
+from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
+from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
+from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
+from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
+from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
 
 __all__ = [
     "boxratiostats",
dataeval/{_internal/metrics → metrics}/stats/base.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+__all__ = []
+
 import re
 import warnings
 from dataclasses import dataclass
@@ -12,14 +14,17 @@ import numpy as np
 import tqdm
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval._internal.interop import to_numpy_iter
-from dataeval._internal.metrics.utils import normalize_box_shape, normalize_image_shape, rescale
-from dataeval._internal.output import OutputMetadata
+from dataeval.interop import to_numpy_iter
+from dataeval.output import OutputMetadata
+from dataeval.utils.image import normalize_image_shape, rescale
 
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
 SOURCE_INDEX = "source_index"
 BOX_COUNT = "box_count"
 
+# TODO: Replace with global config
+DEFAULT_PROCESSES: int | None = None
+
 OptionalRange = Optional[Union[int, Iterable[int]]]
 
 
@@ -29,6 +34,19 @@ def matches(index: int | None, opt_range: OptionalRange) -> bool:
     return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
 
 
+def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
+    """
+    Normalizes the bounding box shape into (N,4).
+    """
+    ndim = bounding_box.ndim
+    if ndim == 1:
+        return np.expand_dims(bounding_box, axis=0)
+    elif ndim > 2:
+        raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X,Y,W,H]) -> (N,4)")
+    else:
+        return bounding_box
+
+
 class SourceIndex(NamedTuple):
     """
     Attributes
@@ -101,39 +119,39 @@ TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
 class StatsProcessor(Generic[TStatsOutput]):
     output_class: type[TStatsOutput]
     cache_keys: list[str] = []
-    image_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
-    channel_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
+    image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
+    channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
 
-    def __init__(self, image: NDArray, box: NDArray | None, per_channel: bool):
+    def __init__(self, image: NDArray[Any], box: NDArray[Any] | None, per_channel: bool) -> None:
         self.raw = image
-        self.width = image.shape[-1]
-        self.height = image.shape[-2]
-        self.box = np.array([0, 0, self.width, self.height]) if box is None else box
-        self.per_channel = per_channel
+        self.width: int = image.shape[-1]
+        self.height: int = image.shape[-2]
+        self.box: NDArray[Any] = np.array([0, 0, self.width, self.height]) if box is None else box
+        self._per_channel = per_channel
         self._image = None
         self._shape = None
         self._scaled = None
-        self.cache = {}
-        self.fn_map = self.channel_function_map if per_channel else self.image_function_map
-        self.is_valid_slice = box is None or bool(
+        self._cache = {}
+        self._fn_map = self.channel_function_map if per_channel else self.image_function_map
+        self._is_valid_slice = box is None or bool(
            box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
        )
 
-    def get(self, fn_key: str) -> NDArray:
+    def get(self, fn_key: str) -> NDArray[Any]:
         if fn_key in self.cache_keys:
-            if fn_key not in self.cache:
-                self.cache[fn_key] = self.fn_map[fn_key](self)
-            return self.cache[fn_key]
+            if fn_key not in self._cache:
+                self._cache[fn_key] = self._fn_map[fn_key](self)
+            return self._cache[fn_key]
         else:
-            return self.fn_map[fn_key](self)
+            return self._fn_map[fn_key](self)
 
-    def process(self) -> dict:
-        return {k: self.fn_map[k](self) for k in self.fn_map}
+    def process(self) -> dict[str, Any]:
+        return {k: self._fn_map[k](self) for k in self._fn_map}
 
     @property
-    def image(self) -> NDArray:
+    def image(self) -> NDArray[Any]:
         if self._image is None:
-            if self.is_valid_slice:
+            if self._is_valid_slice:
                 norm = normalize_image_shape(self.raw)
                 self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
             else:
@@ -141,16 +159,16 @@ class StatsProcessor(Generic[TStatsOutput]):
         return self._image
 
     @property
-    def shape(self) -> tuple:
+    def shape(self) -> tuple[int, ...]:
         if self._shape is None:
             self._shape = self.image.shape
         return self._shape
 
     @property
-    def scaled(self) -> NDArray:
+    def scaled(self) -> NDArray[Any]:
         if self._scaled is None:
             self._scaled = rescale(self.image)
-            if self.per_channel:
+            if self._per_channel:
                 self._scaled = self._scaled.reshape(self.image.shape[0], -1)
         return self._scaled
 
@@ -175,25 +193,25 @@ class StatsProcessorOutput(NamedTuple):
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
-    warnings_list: list[tuple[int, int, NDArray, tuple[int, ...]]]
+    warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]]
 
 
 def process_stats(
     i: int,
-    image_boxes: tuple[NDArray, NDArray | None],
+    image_boxes: tuple[NDArray[Any], NDArray[Any] | None],
     per_channel: bool,
-    stats_processor_cls: Iterable[type[StatsProcessor]],
+    stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
     image, boxes = image_boxes
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
-    warnings_list: list[tuple[int, int, NDArray, tuple[int, ...]]] = []
+    warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]] = []
     nboxes = [None] if boxes is None else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
-        if any(not p.is_valid_slice for p in processor_list) and i_b is not None and box is not None:
+        if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
             warnings_list.append((i, i_b, box, image.shape))
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
@@ -204,7 +222,11 @@ def process_stats(
     return StatsProcessorOutput(results_list, source_indices, box_counts, warnings_list)
 
 
-def process_stats_unpack(args, per_channel: bool, stats_processor_cls: Iterable[type[StatsProcessor]]):
+def process_stats_unpack(
+    args: tuple[int, tuple[NDArray[Any], NDArray[Any] | None]],
+    per_channel: bool,
+    stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
+) -> StatsProcessorOutput:
     return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
 
 
@@ -215,7 +237,7 @@ def run_stats(
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> list[TStatsOutput]:
     """
-    Compute specified statistics on a set of images.
+    Compute specified :term:`statistics<Statistics>` on a set of images.
 
     This function applies a set of statistical operations to each image in the input iterable,
     based on the specified output class. The function determines which statistics to apply
@@ -225,7 +247,7 @@ def run_stats(
     ----------
     images : Iterable[ArrayLike]
         An iterable of images (e.g., list of arrays), where each image is represented as an
-        array-like structure (e.g., NumPy arrays).
+        array-like structure (e.g., :term:`NumPy` arrays).
     bboxes : Iterable[ArrayLike]
        An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
        as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
@@ -234,24 +256,28 @@ def run_stats(
        A flag which determines if the states should be evaluated on a per-channel basis or not.
     stats_processor_cls : Iterable[type[StatsProcessor]]
        An iterable of stats processor classes that calculate stats and return output classes.
+    processes : int | None, default None
+        Number of processes to use, defaults to None which uses all available CPU cores.
 
     Returns
     -------
-    list[TStatsOutput]
-        A list of output classes corresponding to the input processor types.
+    dict[str, NDArray]]
+        A dictionary containing the computed statistics for each image.
+        The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
+        with the results of the computations.
 
     Note
     ----
     - The function performs image normalization (rescaling the image values)
      before applying some of the statistics.
-    - Pixel-level statistics (e.g., brightness, entropy) are computed after
+    - Pixel-level statistics (e.g., :term:`brightness<Brightness>`, entropy) are computed after
      rescaling and, optionally, flattening the images.
    - For statistics like histograms and entropy, intermediate results may
      be reused to avoid redundant computation.
    """
-    results_list: list[dict[str, NDArray]] = []
-    source_index = []
-    box_count = []
+    results_list: list[dict[str, NDArray[np.float64]]] = []
+    source_index: list[SourceIndex] = []
+    box_count: list[int] = []
     bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
 
     warning_list = []
@@ -259,7 +285,7 @@ def run_stats(
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
 
     # TODO: Introduce global controls for CPU job parallelism and GPU configurations
-    with Pool(16) as p:
+    with Pool(processes=DEFAULT_PROCESSES) as p:
         for r in tqdm.tqdm(
             p.imap(
                 partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
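The `Pool(16)` to `Pool(processes=DEFAULT_PROCESSES)` change leans on standard-library behavior: `multiprocessing.Pool(processes=None)` sizes the pool to `os.cpu_count()`, so the worker count now tracks the host instead of being hard-coded at 16. A minimal standalone illustration of that default (not DataEval code):

    import os
    from multiprocessing import Pool

    def square(x: int) -> int:
        return x * x

    if __name__ == "__main__":
        with Pool(processes=None) as p:  # None -> os.cpu_count() workers
            print(os.cpu_count(), p.map(square, range(8)))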
dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py
@@ -1,14 +1,16 @@
 from __future__ import annotations
 
+__all__ = ["boxratiostats"]
+
 import copy
-from typing import Callable, Generic, TypeVar, cast
+from typing import Any, Callable, Generic, TypeVar, cast
 
 import numpy as np
 from numpy.typing import NDArray
 
-from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
-from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
-from dataeval._internal.output import set_metadata
+from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
+from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval.output import set_metadata
 
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -39,14 +41,16 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
         self.img = self.StatSlicer(img_stats, img_slice)
 
 
-RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[[BoxImageStatsOutputSlice], NDArray]]] = {
-    DimensionStatsOutput: {
-        "left": lambda x: x.box["left"] / x.img["width"],
-        "top": lambda x: x.box["top"] / x.img["height"],
-        "channels": lambda x: x.box["channels"],
-        "depth": lambda x: x.box["depth"],
-        "distance": lambda x: x.box["distance"],
-    }
+RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
+    DimensionStatsOutput: dict[str, Callable[[BoxImageStatsOutputSlice[DimensionStatsOutput]], NDArray[Any]]](
+        {
+            "left": lambda x: x.box["left"] / x.img["width"],
+            "top": lambda x: x.box["top"] / x.img["height"],
+            "channels": lambda x: x.box["channels"],
+            "depth": lambda x: x.box["depth"],
+            "distance": lambda x: x.box["distance"],
+        }
+    )
 }
 
 
@@ -60,7 +64,7 @@ def get_index_map(stats: BaseStatsOutput) -> list[int]:
     return index_map
 
 
-def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsOutput) -> NDArray:
+def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsOutput) -> NDArray[np.float64]:
     if not hasattr(box_stats, key) or not hasattr(img_stats, key):
         raise KeyError("Invalid key for provided stats output object.")
 
@@ -92,13 +96,13 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
     return out_stats
 
 
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def boxratiostats(
     boxstats: TStatOutput,
     imgstats: TStatOutput,
 ) -> TStatOutput:
     """
-    Calculates ratio statistics of box outputs over image outputs
+    Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs
 
     Parameters
     ----------