dataeval 0.76.1__py3-none-any.whl → 0.81.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/{output.py → _output.py} +14 -0
  3. dataeval/config.py +77 -0
  4. dataeval/detectors/__init__.py +1 -1
  5. dataeval/detectors/drift/__init__.py +6 -6
  6. dataeval/detectors/drift/{base.py → _base.py} +41 -30
  7. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  8. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  9. dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
  10. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  11. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
  12. dataeval/detectors/drift/updates.py +1 -1
  13. dataeval/detectors/linters/__init__.py +0 -3
  14. dataeval/detectors/linters/duplicates.py +17 -8
  15. dataeval/detectors/linters/outliers.py +23 -14
  16. dataeval/detectors/ood/ae.py +29 -8
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/metadata_ks_compare.py +1 -1
  19. dataeval/detectors/ood/mixin.py +20 -5
  20. dataeval/detectors/ood/output.py +1 -1
  21. dataeval/detectors/ood/vae.py +73 -0
  22. dataeval/metadata/__init__.py +5 -0
  23. dataeval/metadata/_ood.py +238 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +5 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
  27. dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
  29. dataeval/metrics/bias/{parity.py → _parity.py} +89 -61
  30. dataeval/metrics/estimators/__init__.py +14 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
  32. dataeval/metrics/estimators/_clusterer.py +104 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
  35. dataeval/metrics/stats/__init__.py +7 -7
  36. dataeval/metrics/stats/{base.py → _base.py} +52 -16
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
  38. dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
  39. dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
  40. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
  41. dataeval/metrics/stats/{labelstats.py → _labelstats.py} +4 -4
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
  44. dataeval/typing.py +54 -0
  45. dataeval/utils/__init__.py +2 -2
  46. dataeval/utils/_array.py +169 -0
  47. dataeval/utils/_bin.py +199 -0
  48. dataeval/utils/_clusterer.py +144 -0
  49. dataeval/utils/_fast_mst.py +189 -0
  50. dataeval/utils/{image.py → _image.py} +6 -4
  51. dataeval/utils/_method.py +18 -0
  52. dataeval/utils/{shared.py → _mst.py} +3 -65
  53. dataeval/utils/{plot.py → _plot.py} +4 -4
  54. dataeval/utils/data/__init__.py +22 -0
  55. dataeval/utils/data/_embeddings.py +105 -0
  56. dataeval/utils/data/_images.py +65 -0
  57. dataeval/utils/data/_metadata.py +352 -0
  58. dataeval/utils/data/_selection.py +119 -0
  59. dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
  60. dataeval/utils/data/_targets.py +73 -0
  61. dataeval/utils/data/_types.py +58 -0
  62. dataeval/utils/data/collate.py +103 -0
  63. dataeval/utils/data/datasets/__init__.py +17 -0
  64. dataeval/utils/data/datasets/_base.py +254 -0
  65. dataeval/utils/data/datasets/_cifar10.py +134 -0
  66. dataeval/utils/data/datasets/_fileio.py +168 -0
  67. dataeval/utils/data/datasets/_milco.py +153 -0
  68. dataeval/utils/data/datasets/_mixin.py +56 -0
  69. dataeval/utils/data/datasets/_mnist.py +183 -0
  70. dataeval/utils/data/datasets/_ships.py +123 -0
  71. dataeval/utils/data/datasets/_voc.py +352 -0
  72. dataeval/utils/data/selections/__init__.py +15 -0
  73. dataeval/utils/data/selections/_classfilter.py +60 -0
  74. dataeval/utils/data/selections/_indices.py +26 -0
  75. dataeval/utils/data/selections/_limit.py +26 -0
  76. dataeval/utils/data/selections/_reverse.py +18 -0
  77. dataeval/utils/data/selections/_shuffle.py +29 -0
  78. dataeval/utils/metadata.py +51 -376
  79. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  80. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  81. dataeval/utils/torch/models.py +43 -2
  82. dataeval/workflows/sufficiency.py +10 -9
  83. {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/METADATA +4 -1
  84. dataeval-0.81.0.dist-info/RECORD +94 -0
  85. dataeval/detectors/linters/clusterer.py +0 -512
  86. dataeval/detectors/linters/merged_stats.py +0 -49
  87. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  88. dataeval/interop.py +0 -69
  89. dataeval/utils/dataset/__init__.py +0 -7
  90. dataeval/utils/dataset/datasets.py +0 -412
  91. dataeval/utils/dataset/read.py +0 -63
  92. dataeval-0.76.1.dist-info/RECORD +0 -67
  93. /dataeval/{log.py → _log.py} +0 -0
  94. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  95. {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
  96. {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
@@ -2,8 +2,18 @@
2
2
  Estimators calculate performance bounds and the statistical distance between datasets.
3
3
  """
4
4
 
5
- __all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
5
+ __all__ = [
6
+ "ber",
7
+ "clusterer",
8
+ "divergence",
9
+ "uap",
10
+ "BEROutput",
11
+ "ClustererOutput",
12
+ "DivergenceOutput",
13
+ "UAPOutput",
14
+ ]
6
15
 
7
- from dataeval.metrics.estimators.ber import BEROutput, ber
8
- from dataeval.metrics.estimators.divergence import DivergenceOutput, divergence
9
- from dataeval.metrics.estimators.uap import UAPOutput, uap
16
+ from dataeval.metrics.estimators._ber import BEROutput, ber
17
+ from dataeval.metrics.estimators._clusterer import ClustererOutput, clusterer
18
+ from dataeval.metrics.estimators._divergence import DivergenceOutput, divergence
19
+ from dataeval.metrics.estimators._uap import UAPOutput, uap
@@ -16,19 +16,21 @@ from dataclasses import dataclass
16
16
  from typing import Literal
17
17
 
18
18
  import numpy as np
19
- from numpy.typing import ArrayLike, NDArray
19
+ from numpy.typing import NDArray
20
20
  from scipy.sparse import coo_matrix
21
21
  from scipy.stats import mode
22
22
 
23
- from dataeval.interop import as_numpy
24
- from dataeval.output import Output, set_metadata
25
- from dataeval.utils.shared import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
23
+ from dataeval._output import Output, set_metadata
24
+ from dataeval.typing import ArrayLike
25
+ from dataeval.utils._array import as_numpy, ensure_embeddings
26
+ from dataeval.utils._method import get_method
27
+ from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
26
28
 
27
29
 
28
30
  @dataclass(frozen=True)
29
31
  class BEROutput(Output):
30
32
  """
31
- Output class for :func:`ber` estimator metric.
33
+ Output class for :func:`.ber` estimator metric.
32
34
 
33
35
  Attributes
34
36
  ----------
@@ -116,18 +118,21 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
116
118
  return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
117
119
 
118
120
 
121
+ _BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
122
+
123
+
119
124
  @set_metadata
120
- def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
125
+ def ber(embeddings: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
121
126
  """
122
127
  An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
123
128
  using FR or KNN test statistic basis.
124
129
 
125
130
  Parameters
126
131
  ----------
127
- images : ArrayLike (N, ... )
128
- Array of images or image :term:`embeddings<Embeddings>`
132
+ embeddings : ArrayLike (N, ... )
133
+ Array of image :term:`embeddings<Embeddings>`
129
134
  labels : ArrayLike (N, 1)
130
- Array of labels for each image or image embedding
135
+ Array of labels for each image
131
136
  k : int, default 1
132
137
  Number of nearest neighbors for KNN estimator -- ignored by MST estimator
133
138
  method : Literal["KNN", "MST"], default "KNN"
@@ -152,8 +157,34 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
152
157
  >>> ber(images, labels)
153
158
  BEROutput(ber=0.04, ber_lower=0.020416847668728033)
154
159
  """
155
- ber_fn = get_method({"KNN": ber_knn, "MST": ber_mst}, method)
156
- X = as_numpy(images)
160
+ ber_fn = get_method(_BER_FN_MAP, method)
161
+ X = ensure_embeddings(embeddings, dtype=np.float64)
157
162
  y = as_numpy(labels)
158
163
  upper, lower = ber_fn(X, y, k)
159
164
  return BEROutput(upper, lower)
165
+
166
+
167
+ def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
168
+ """
169
+ Returns the number of unique classes and the total label count from an array of labels
170
+
171
+ Parameters
172
+ ----------
173
+ labels : NDArray
174
+ Numpy labels array
175
+
176
+ Returns
177
+ -------
178
+ Classes and counts
179
+
180
+ Raises
181
+ ------
182
+ ValueError
183
+ If the number of unique classes is less than 2
184
+ """
185
+ classes, counts = np.unique(labels, return_counts=True)
186
+ M = len(classes)
187
+ if M < 2:
188
+ raise ValueError("Label vector contains less than 2 classes!")
189
+ N = int(np.sum(counts))
190
+ return M, N
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import numpy as np
8
+ from numpy.typing import NDArray
9
+
10
+ from dataeval._output import Output
11
+ from dataeval.typing import ArrayLike
12
+ from dataeval.utils._array import as_numpy
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class ClustererOutput(Output):
17
+ """
18
+ Output class for :func:`.clusterer`.
19
+
20
+ Attributes
21
+ ----------
22
+ clusters : NDArray[int]
23
+ Assigned clusters
24
+ mst : NDArray[int]
25
+ The minimum spanning tree of the data
26
+ linkage_tree : NDArray[float]
27
+ The linkage array of the data
28
+ condensed_tree : NDArray[float]
29
+ The condensed tree of the data
30
+ membership_strengths : NDArray[float]
31
+ The strength of the data point belonging to the assigned cluster
32
+ """
33
+
34
+ clusters: NDArray[np.int_]
35
+ mst: NDArray[np.double]
36
+ linkage_tree: NDArray[np.double]
37
+ condensed_tree: NDArray[np.double]
38
+ membership_strengths: NDArray[np.double]
39
+
40
+ def find_outliers(self) -> NDArray[np.int_]:
41
+ """
42
+ Retrieves Outliers based on when the sample was added to the cluster
43
+ and how far it was from the cluster when it was added
44
+
45
+ Returns
46
+ -------
47
+ NDArray[int]
48
+ A numpy array of the outlier indices
49
+ """
50
+ return np.nonzero(self.clusters == -1)[0]
51
+
52
+ def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
53
+ """
54
+ Finds duplicate and near duplicate data based on cluster average distance
55
+
56
+ Returns
57
+ -------
58
+ Tuple[List[List[int]], List[List[int]]]
59
+ The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
60
+ """
61
+ # Delay load numba compiled functions
62
+ from dataeval.utils._clusterer import compare_links_to_cluster_std, sorted_union_find
63
+
64
+ exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)
65
+ exact_dupes = sorted_union_find(exact_indices)
66
+ near_dupes = sorted_union_find(near_indices)
67
+
68
+ return [[int(ii) for ii in il] for il in exact_dupes], [[int(ii) for ii in il] for il in near_dupes]
69
+
70
+
71
+ def clusterer(data: ArrayLike) -> ClustererOutput:
72
+ """
73
+ Uses hierarchical clustering on the flattened data and returns clustering
74
+ information.
75
+
76
+ Parameters
77
+ ----------
78
+ data : ArrayLike, shape - (N, ...)
79
+ A dataset in an ArrayLike format. Function expects the data to have 2
80
+ or more dimensions which will flatten to (N, P) where N is the number of
81
+ observations in a P-dimensional space.
82
+
83
+ Returns
84
+ -------
85
+ :class:`.ClustererOutput`
86
+
87
+ Note
88
+ ----
89
+ The clusterer works best when the length of the feature dimension, P, is
90
+ less than 500. If flattening a CxHxW image results in a dimension larger
91
+ than 500, then it is recommended to reduce the dimensions.
92
+
93
+ Example
94
+ -------
95
+ >>> clusterer(clusterer_images).clusters
96
+ array([ 2, 0, 0, 0, 0, 0, 4, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0,
97
+ 4, 2, 0, 0, 1, 2, 0, 1, 3, 0, 3, 3, 4, 0, 0, 3, 0,
98
+ 3, -1, 0, 0, 2, 4, 3, 4, 0, 1, 0, -1, 3, 0, 0, 0])
99
+ """
100
+ # Delay load numba compiled functions
101
+ from dataeval.utils._clusterer import cluster
102
+
103
+ c = cluster(data)
104
+ return ClustererOutput(c.clusters, c.mst, c.linkage_tree, as_numpy(c.condensed_tree), c.membership_strengths)
@@ -11,17 +11,19 @@ from dataclasses import dataclass
11
11
  from typing import Literal
12
12
 
13
13
  import numpy as np
14
- from numpy.typing import ArrayLike, NDArray
14
+ from numpy.typing import NDArray
15
15
 
16
- from dataeval.interop import as_numpy
17
- from dataeval.output import Output, set_metadata
18
- from dataeval.utils.shared import compute_neighbors, get_method, minimum_spanning_tree
16
+ from dataeval._output import Output, set_metadata
17
+ from dataeval.typing import ArrayLike
18
+ from dataeval.utils._array import ensure_embeddings
19
+ from dataeval.utils._method import get_method
20
+ from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
19
21
 
20
22
 
21
23
  @dataclass(frozen=True)
22
24
  class DivergenceOutput(Output):
23
25
  """
24
- Output class for :func:`divergence` estimator metric.
26
+ Output class for :func:`.divergence` estimator metric.
25
27
 
26
28
  Attributes
27
29
  ----------
@@ -78,18 +80,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
78
80
  return errors
79
81
 
80
82
 
83
+ _DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
84
+
85
+
81
86
  @set_metadata
82
- def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
87
+ def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
83
88
  """
84
89
  Calculates the :term:`divergence` and any errors between the datasets.
85
90
 
86
91
  Parameters
87
92
  ----------
88
- data_a : ArrayLike, shape - (N, P)
89
- A dataset in an ArrayLike format to compare.
93
+ emb_a : ArrayLike, shape - (N, P)
94
+ Image embeddings in an ArrayLike format to compare.
90
95
  Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
91
- data_b : ArrayLike, shape - (N, P)
92
- A dataset in an ArrayLike format to compare.
96
+ emb_b : ArrayLike, shape - (N, P)
97
+ Image embeddings in an ArrayLike format to compare.
93
98
  Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
94
99
  method : Literal["MST", "FNN"], default "FNN"
95
100
  Method used to estimate dataset :term:`divergence<Divergence>`
@@ -125,9 +130,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
125
130
  >>> divergence(datasetA, datasetB)
126
131
  DivergenceOutput(divergence=0.28, errors=36)
127
132
  """
128
- div_fn = get_method({"FNN": divergence_fnn, "MST": divergence_mst}, method)
129
- a = as_numpy(data_a)
130
- b = as_numpy(data_b)
133
+ div_fn = get_method(_DIVERGENCE_FN_MAP, method)
134
+ a = ensure_embeddings(emb_a, dtype=np.float64)
135
+ b = ensure_embeddings(emb_b, dtype=np.float64)
131
136
  N = a.shape[0]
132
137
  M = b.shape[0]
133
138
 
@@ -10,17 +10,17 @@ __all__ = []
10
10
 
11
11
  from dataclasses import dataclass
12
12
 
13
- from numpy.typing import ArrayLike
14
13
  from sklearn.metrics import average_precision_score
15
14
 
16
- from dataeval.interop import as_numpy
17
- from dataeval.output import Output, set_metadata
15
+ from dataeval._output import Output, set_metadata
16
+ from dataeval.typing import ArrayLike
17
+ from dataeval.utils._array import as_numpy
18
18
 
19
19
 
20
20
  @dataclass(frozen=True)
21
21
  class UAPOutput(Output):
22
22
  """
23
- Output class for :func:`uap` estimator metric.
23
+ Output class for :func:`.uap` estimator metric.
24
24
 
25
25
  Attributes
26
26
  ----------
@@ -21,15 +21,15 @@ __all__ = [
21
21
  "visualstats",
22
22
  ]
23
23
 
24
- from dataeval.metrics.stats.boxratiostats import boxratiostats
25
- from dataeval.metrics.stats.datasetstats import (
24
+ from dataeval.metrics.stats._boxratiostats import boxratiostats
25
+ from dataeval.metrics.stats._datasetstats import (
26
26
  ChannelStatsOutput,
27
27
  DatasetStatsOutput,
28
28
  channelstats,
29
29
  datasetstats,
30
30
  )
31
- from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
32
- from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
33
- from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
34
- from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
35
- from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
31
+ from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, dimensionstats
32
+ from dataeval.metrics.stats._hashstats import HashStatsOutput, hashstats
33
+ from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
34
+ from dataeval.metrics.stats._pixelstats import PixelStatsOutput, pixelstats
35
+ from dataeval.metrics.stats._visualstats import VisualStatsOutput, visualstats
@@ -1,32 +1,31 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataeval.utils.plot import histogram_plot
4
-
5
3
  __all__ = []
6
4
 
7
5
  import re
8
6
  import warnings
7
+ from copy import deepcopy
9
8
  from dataclasses import dataclass
10
9
  from functools import partial
11
10
  from itertools import repeat
12
11
  from multiprocessing import Pool
13
- from typing import Any, Callable, Generic, Iterable, NamedTuple, Optional, TypeVar, Union
12
+ from typing import Any, Callable, Generic, Iterable, Optional, Sequence, Sized, TypeVar, Union
14
13
 
15
14
  import numpy as np
16
15
  import tqdm
17
- from numpy.typing import ArrayLike, NDArray
16
+ from numpy.typing import NDArray
18
17
 
19
- from dataeval.interop import to_numpy_iter
20
- from dataeval.output import Output
21
- from dataeval.utils.image import normalize_image_shape, rescale
18
+ from dataeval._output import Output
19
+ from dataeval.config import get_max_processes
20
+ from dataeval.typing import ArrayLike
21
+ from dataeval.utils._array import to_numpy_iter
22
+ from dataeval.utils._image import normalize_image_shape, rescale
23
+ from dataeval.utils._plot import histogram_plot
22
24
 
23
25
  DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
24
26
  SOURCE_INDEX = "source_index"
25
27
  BOX_COUNT = "box_count"
26
28
 
27
- # TODO: Replace with global config
28
- DEFAULT_PROCESSES: int | None = None
29
-
30
29
  OptionalRange = Optional[Union[int, Iterable[int]]]
31
30
 
32
31
 
@@ -49,7 +48,8 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
49
48
  return bounding_box
50
49
 
51
50
 
52
- class SourceIndex(NamedTuple):
51
+ @dataclass
52
+ class SourceIndex:
53
53
  """
54
54
  Attributes
55
55
  ----------
@@ -205,7 +205,8 @@ class StatsProcessor(Generic[TStatsOutput]):
205
205
  return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
206
206
 
207
207
 
208
- class StatsProcessorOutput(NamedTuple):
208
+ @dataclass
209
+ class StatsProcessorOutput:
209
210
  results: list[dict[str, Any]]
210
211
  source_indices: list[SourceIndex]
211
212
  box_counts: list[int]
@@ -272,8 +273,6 @@ def run_stats(
272
273
  A flag which determines if the stats should be evaluated on a per-channel basis or not.
273
274
  stats_processor_cls : Iterable[type[StatsProcessor]]
274
275
  An iterable of stats processor classes that calculate stats and return output classes.
275
- processes : int | None, default None
276
- Number of processes to use, defaults to None which uses all available CPU cores.
277
276
 
278
277
  Returns
279
278
  -------
@@ -297,11 +296,11 @@ def run_stats(
297
296
  bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
298
297
 
299
298
  warning_list = []
300
- total_for_status = getattr(images, "__len__")() if hasattr(images, "__len__") else None
299
+ total_for_status = len(images) if isinstance(images, Sized) else None
301
300
  stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
302
301
 
303
302
  # TODO: Introduce global controls for CPU job parallelism and GPU configurations
304
- with Pool(processes=DEFAULT_PROCESSES) as p:
303
+ with Pool(processes=get_max_processes()) as p:
305
304
  for r in tqdm.tqdm(
306
305
  p.imap(
307
306
  partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
@@ -330,3 +329,40 @@ def run_stats(
330
329
 
331
330
  outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
332
331
  return outputs
332
+
333
+
334
+ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
335
+ if type(a) is not type(b):
336
+ raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
337
+
338
+ sum_dict = deepcopy(a.dict())
339
+
340
+ for k in sum_dict:
341
+ if isinstance(sum_dict[k], list):
342
+ sum_dict[k].extend(b.dict()[k])
343
+ else:
344
+ sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
345
+
346
+ return type(a)(**sum_dict)
347
+
348
+
349
+ def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
350
+ output = None
351
+ dataset_steps = []
352
+ cur_len = 0
353
+ for s in stats:
354
+ output = s if output is None else add_stats(output, s)
355
+ cur_len += len(s)
356
+ dataset_steps.append(cur_len)
357
+ if output is None:
358
+ raise TypeError("Cannot combine empty sequence of stats.")
359
+ return output, dataset_steps
360
+
361
+
362
+ def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
363
+ last_step = 0
364
+ for i, step in enumerate(dataset_steps):
365
+ if idx < step:
366
+ return i, idx - last_step
367
+ last_step = step
368
+ return -1, idx
@@ -8,9 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
- from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
12
- from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
13
- from dataeval.output import set_metadata
11
+ from dataeval._output import set_metadata
12
+ from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
13
+ from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
14
14
 
15
15
  TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
16
16
  ArraySlice = tuple[int, int]
@@ -50,7 +50,7 @@ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
50
50
  "depth": lambda x: x.box["depth"],
51
51
  "distance": lambda x: x.box["distance"],
52
52
  }
53
- )
53
+ ),
54
54
  }
55
55
 
56
56
 
@@ -87,11 +87,8 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
87
87
  stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
88
88
  out_type = type(box_stats)
89
89
  use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
90
- ratio = (
91
- RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
92
- if use_override
93
- else np.nan_to_num(stats.box[key] / stats.img[key])
94
- )
90
+ with np.errstate(divide="ignore", invalid="ignore"):
91
+ ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
95
92
  out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
96
93
  return out_stats
97
94
 
@@ -5,24 +5,20 @@ __all__ = []
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Iterable
7
7
 
8
- from numpy.typing import ArrayLike
9
-
10
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
11
- from dataeval.metrics.stats.dimensionstats import (
12
- DimensionStatsOutput,
13
- DimensionStatsProcessor,
14
- )
15
- from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
16
- from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
17
- from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
18
- from dataeval.output import Output, set_metadata
19
- from dataeval.utils.plot import channel_histogram_plot
8
+ from dataeval._output import Output, set_metadata
9
+ from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
10
+ from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, DimensionStatsProcessor
11
+ from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
12
+ from dataeval.metrics.stats._pixelstats import PixelStatsOutput, PixelStatsProcessor
13
+ from dataeval.metrics.stats._visualstats import VisualStatsOutput, VisualStatsProcessor
14
+ from dataeval.typing import ArrayLike
15
+ from dataeval.utils._plot import channel_histogram_plot
20
16
 
21
17
 
22
18
  @dataclass(frozen=True)
23
19
  class DatasetStatsOutput(Output, HistogramPlotMixin):
24
20
  """
25
- Output class for :func:`datasetstats` stats metric.
21
+ Output class for :func:`.datasetstats` stats metric.
26
22
 
27
23
  This class represents the outputs of various stats functions against a single
28
24
  dataset, such that each index across all stat outputs are representative of
@@ -82,7 +78,7 @@ def _get_channels(cls, channel_limit: int | None = None, channel_index: int | It
82
78
  @dataclass(frozen=True)
83
79
  class ChannelStatsOutput(Output):
84
80
  """
85
- Output class for :func:`channelstats` stats metric.
81
+ Output class for :func:`.channelstats` stats metric.
86
82
 
87
83
  This class represents the outputs of various per-channel stats functions against
88
84
  a single dataset, such that each index across all stat outputs are representative
@@ -6,17 +6,18 @@ from dataclasses import dataclass
6
6
  from typing import Any, Callable, Iterable
7
7
 
8
8
  import numpy as np
9
- from numpy.typing import ArrayLike, NDArray
9
+ from numpy.typing import NDArray
10
10
 
11
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
12
- from dataeval.output import set_metadata
13
- from dataeval.utils.image import get_bitdepth
11
+ from dataeval._output import set_metadata
12
+ from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
13
+ from dataeval.typing import ArrayLike
14
+ from dataeval.utils._image import get_bitdepth
14
15
 
15
16
 
16
17
  @dataclass(frozen=True)
17
18
  class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
18
19
  """
19
- Output class for :func:`dimensionstats` stats metric.
20
+ Output class for :func:`.dimensionstats` stats metric.
20
21
 
21
22
  Attributes
22
23
  ----------
@@ -9,14 +9,14 @@ from typing import Callable, Iterable
9
9
 
10
10
  import numpy as np
11
11
  import xxhash as xxh
12
- from numpy.typing import ArrayLike
13
12
  from PIL import Image
14
13
  from scipy.fftpack import dct
15
14
 
16
- from dataeval.interop import as_numpy
17
- from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
18
- from dataeval.output import set_metadata
19
- from dataeval.utils.image import normalize_image_shape, rescale
15
+ from dataeval._output import set_metadata
16
+ from dataeval.metrics.stats._base import BaseStatsOutput, StatsProcessor, run_stats
17
+ from dataeval.typing import ArrayLike
18
+ from dataeval.utils._array import as_numpy
19
+ from dataeval.utils._image import normalize_image_shape, rescale
20
20
 
21
21
  HASH_SIZE = 8
22
22
  MAX_FACTOR = 4
@@ -25,7 +25,7 @@ MAX_FACTOR = 4
25
25
  @dataclass(frozen=True)
26
26
  class HashStatsOutput(BaseStatsOutput):
27
27
  """
28
- Output class for :func:`hashstats` stats metric.
28
+ Output class for :func:`.hashstats` stats metric.
29
29
 
30
30
  Attributes
31
31
  ----------
@@ -8,10 +8,10 @@ from dataclasses import dataclass
8
8
  from typing import Any, Iterable, Mapping, TypeVar
9
9
 
10
10
  import numpy as np
11
- from numpy.typing import ArrayLike
12
11
 
13
- from dataeval.interop import as_numpy
14
- from dataeval.output import Output, set_metadata
12
+ from dataeval._output import Output, set_metadata
13
+ from dataeval.typing import ArrayLike
14
+ from dataeval.utils._array import as_numpy
15
15
 
16
16
  with contextlib.suppress(ImportError):
17
17
  import pandas as pd
@@ -20,7 +20,7 @@ with contextlib.suppress(ImportError):
20
20
  @dataclass(frozen=True)
21
21
  class LabelStatsOutput(Output):
22
22
  """
23
- Output class for :func:`labelstats` stats metric.
23
+ Output class for :func:`.labelstats` stats metric.
24
24
 
25
25
  Attributes
26
26
  ----------
@@ -6,17 +6,18 @@ from dataclasses import dataclass
6
6
  from typing import Any, Callable, Iterable
7
7
 
8
8
  import numpy as np
9
- from numpy.typing import ArrayLike, NDArray
9
+ from numpy.typing import NDArray
10
10
  from scipy.stats import entropy, kurtosis, skew
11
11
 
12
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
13
- from dataeval.output import set_metadata
12
+ from dataeval._output import set_metadata
13
+ from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
14
+ from dataeval.typing import ArrayLike
14
15
 
15
16
 
16
17
  @dataclass(frozen=True)
17
18
  class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
18
19
  """
19
- Output class for :func:`pixelstats` stats metric.
20
+ Output class for :func:`.pixelstats` stats metric.
20
21
 
21
22
  Attributes
22
23
  ----------