dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +10 -11
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +51 -102
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +9 -8
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +11 -10
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +33 -34
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +15 -13
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +12 -9
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +47 -45
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +20 -10
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +19 -26
  16. dataeval/detectors/ood/__init__.py +8 -16
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +9 -9
  18. dataeval/{_internal/detectors → detectors}/ood/aegmm.py +10 -30
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +27 -21
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +27 -23
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +11 -13
  25. dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
  26. dataeval/{_internal/interop.py → interop.py} +12 -7
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +70 -4
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +10 -8
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +54 -20
  32. dataeval/metrics/bias/metadata.py +275 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +21 -17
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +31 -28
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +15 -16
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +8 -6
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +66 -40
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +19 -15
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +19 -17
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +12 -10
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +8 -6
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +12 -11
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +14 -13
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +8 -4
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/shared.py +151 -0
  51. dataeval/utils/split_dataset.py +486 -0
  52. dataeval/utils/tensorflow/__init__.py +9 -7
  53. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +64 -68
  54. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +10 -9
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +18 -22
  56. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
  57. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +18 -18
  58. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  59. dataeval/utils/torch/__init__.py +7 -3
  60. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  61. dataeval/{_internal → utils/torch}/datasets.py +49 -43
  62. dataeval/utils/torch/models.py +138 -0
  63. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +12 -141
  64. dataeval/{_internal → utils/torch}/utils.py +3 -1
  65. dataeval/workflows/__init__.py +1 -1
  66. dataeval/{_internal/workflows → workflows}/sufficiency.py +42 -37
  67. {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA +7 -5
  68. dataeval-0.72.2.dist-info/RECORD +72 -0
  69. dataeval/_internal/detectors/__init__.py +0 -0
  70. dataeval/_internal/detectors/drift/__init__.py +0 -0
  71. dataeval/_internal/detectors/ood/__init__.py +0 -0
  72. dataeval/_internal/metrics/__init__.py +0 -0
  73. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  74. dataeval/_internal/metrics/utils.py +0 -447
  75. dataeval/_internal/models/__init__.py +0 -0
  76. dataeval/_internal/models/pytorch/__init__.py +0 -0
  77. dataeval/_internal/models/pytorch/utils.py +0 -67
  78. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  79. dataeval/_internal/workflows/__init__.py +0 -0
  80. dataeval/detectors/drift/kernels/__init__.py +0 -10
  81. dataeval/detectors/drift/updates/__init__.py +0 -7
  82. dataeval/utils/tensorflow/models/__init__.py +0 -9
  83. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  84. dataeval/utils/torch/datasets/__init__.py +0 -12
  85. dataeval/utils/torch/models/__init__.py +0 -11
  86. dataeval/utils/torch/trainer/__init__.py +0 -7
  87. dataeval-0.72.0.dist-info/RECORD +0 -80
  88. /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
  89. {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
  90. {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
@@ -1,19 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
+ __all__ = ["DatasetStatsOutput", "ChannelStatsOutput", "datasetstats", "channelstats"]
4
+
3
5
  from dataclasses import dataclass
4
6
  from typing import Any, Iterable
5
7
 
6
8
  from numpy.typing import ArrayLike
7
9
 
8
- from dataeval._internal.metrics.stats.base import BaseStatsOutput, run_stats
9
- from dataeval._internal.metrics.stats.dimensionstats import (
10
+ from dataeval.metrics.stats.base import BaseStatsOutput, run_stats
11
+ from dataeval.metrics.stats.dimensionstats import (
10
12
  DimensionStatsOutput,
11
13
  DimensionStatsProcessor,
12
14
  )
13
- from dataeval._internal.metrics.stats.labelstats import LabelStatsOutput, labelstats
14
- from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
15
- from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
16
- from dataeval._internal.output import OutputMetadata, set_metadata
15
+ from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
16
+ from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
17
+ from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
18
+ from dataeval.output import OutputMetadata, set_metadata
17
19
 
18
20
 
19
21
  @dataclass(frozen=True)
@@ -39,14 +41,14 @@ class DatasetStatsOutput(OutputMetadata):
39
41
  visualstats: VisualStatsOutput
40
42
  labelstats: LabelStatsOutput | None = None
41
43
 
42
- def outputs(self) -> list[OutputMetadata]:
44
+ def _outputs(self) -> list[OutputMetadata]:
43
45
  return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats) if s is not None]
44
46
 
45
47
  def dict(self) -> dict[str, Any]:
46
- return {k: v for o in self.outputs() for k, v in o.dict().items()}
48
+ return {k: v for o in self._outputs() for k, v in o.dict().items()}
47
49
 
48
- def __post_init__(self):
49
- lengths = [len(s) for s in self.outputs() if isinstance(s, BaseStatsOutput)]
50
+ def __post_init__(self) -> None:
51
+ lengths = [len(s) for s in self._outputs() if isinstance(s, BaseStatsOutput)]
50
52
  if not all(length == lengths[0] for length in lengths):
51
53
  raise ValueError("All StatsOutput classes must contain the same number of image sources.")
52
54
 
@@ -70,26 +72,26 @@ class ChannelStatsOutput(OutputMetadata):
70
72
  pixelstats: PixelStatsOutput
71
73
  visualstats: VisualStatsOutput
72
74
 
73
- def outputs(self) -> list[BaseStatsOutput]:
74
- return [self.pixelstats, self.visualstats]
75
+ def _outputs(self) -> tuple[PixelStatsOutput, VisualStatsOutput]:
76
+ return (self.pixelstats, self.visualstats)
75
77
 
76
78
  def dict(self) -> dict[str, Any]:
77
79
  return {**self.pixelstats.dict(), **self.visualstats.dict()}
78
80
 
79
- def __post_init__(self):
80
- lengths = [len(s) for s in self.outputs()]
81
+ def __post_init__(self) -> None:
82
+ lengths = [len(s) for s in self._outputs()]
81
83
  if not all(length == lengths[0] for length in lengths):
82
84
  raise ValueError("All StatsOutput classes must contain the same number of image sources.")
83
85
 
84
86
 
85
- @set_metadata("dataeval.metrics")
87
+ @set_metadata()
86
88
  def datasetstats(
87
89
  images: Iterable[ArrayLike],
88
90
  bboxes: Iterable[ArrayLike] | None = None,
89
91
  labels: Iterable[ArrayLike] | None = None,
90
92
  ) -> DatasetStatsOutput:
91
93
  """
92
- Calculates various statistics for each image
94
+ Calculates various :term:`statistics<Statistics>` for each image
93
95
 
94
96
  This function computes dimension, pixel and visual metrics
95
97
  on the images or individual bounding boxes for each image as
@@ -129,7 +131,7 @@ def datasetstats(
129
131
  return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None) # type: ignore
130
132
 
131
133
 
132
- @set_metadata("dataeval.metrics")
134
+ @set_metadata()
133
135
  def channelstats(
134
136
  images: Iterable[ArrayLike],
135
137
  bboxes: Iterable[ArrayLike] | None = None,
@@ -1,14 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
+ __all__ = ["DimensionStatsOutput", "dimensionstats"]
4
+
3
5
  from dataclasses import dataclass
4
- from typing import Iterable
6
+ from typing import Any, Callable, Iterable
5
7
 
6
8
  import numpy as np
7
9
  from numpy.typing import ArrayLike, NDArray
8
10
 
9
- from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
10
- from dataeval._internal.metrics.utils import get_bitdepth
11
- from dataeval._internal.output import set_metadata
11
+ from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
12
+ from dataeval.output import set_metadata
13
+ from dataeval.utils.image import get_bitdepth
12
14
 
13
15
 
14
16
  @dataclass(frozen=True)
@@ -31,7 +33,7 @@ class DimensionStatsOutput(BaseStatsOutput):
31
33
  size : NDArray[np.uint32]
32
34
  Size of the images in pixels
33
35
  aspect_ratio : NDArray[np.float16]
34
- Aspect ratio of the images (width/height)
36
+ :term:`Aspect Ratio<Aspect Ratio>` of the images (width/height)
35
37
  depth : NDArray[np.uint8]
36
38
  Color depth of the images in bits
37
39
  center : NDArray[np.uint16]
@@ -53,8 +55,8 @@ class DimensionStatsOutput(BaseStatsOutput):
53
55
 
54
56
 
55
57
  class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
56
- output_class = DimensionStatsOutput
57
- image_function_map = {
58
+ output_class: type = DimensionStatsOutput
59
+ image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
58
60
  "left": lambda x: x.box[0],
59
61
  "top": lambda x: x.box[1],
60
62
  "width": lambda x: x.box[2] - x.box[0],
@@ -71,13 +73,13 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
71
73
  }
72
74
 
73
75
 
74
- @set_metadata("dataeval.metrics")
76
+ @set_metadata()
75
77
  def dimensionstats(
76
78
  images: Iterable[ArrayLike],
77
79
  bboxes: Iterable[ArrayLike] | None = None,
78
80
  ) -> DimensionStatsOutput:
79
81
  """
80
- Calculates dimension statistics for each image
82
+ Calculates dimension :term:`statistics<Statistics>` for each image
81
83
 
82
84
  This function computes various dimensional metrics (e.g., width, height, channels)
83
85
  on the images or individual bounding boxes for each image.
@@ -94,7 +96,7 @@ def dimensionstats(
94
96
  DimensionStatsOutput
95
97
  A dictionary-like object containing the computed dimension statistics for each image or bounding
96
98
  box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
97
- are lists of results for each image or numpy arrays when the results are multi-dimensional.
99
+ are lists of results for each image or :term:`NumPy` arrays when the results are multi-dimensional.
98
100
 
99
101
  See Also
100
102
  --------
@@ -0,0 +1,156 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = ["HashStatsOutput", "hashstats"]
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Callable, Iterable
7
+
8
+ import numpy as np
9
+ import xxhash as xxh
10
+ from numpy.typing import ArrayLike
11
+ from PIL import Image
12
+ from scipy.fftpack import dct
13
+
14
+ from dataeval.interop import as_numpy
15
+ from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
16
+ from dataeval.output import set_metadata
17
+ from dataeval.utils.image import normalize_image_shape, rescale
18
+
19
+ HASH_SIZE = 8
20
+ MAX_FACTOR = 4
21
+
22
+
23
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`hashstats` stats metric

    Attributes
    ----------
    xxhash : list[str]
        Hex-string xxHash digest computed for each image
    pchash : list[str]
        Hex-string :term:`Perception-based Hash` computed for each image
    """

    xxhash: list[str]
    pchash: list[str]
38
+
39
+
40
def pchash(image: ArrayLike) -> str:
    """
    Computes a perceptual hash of an image and returns it as a hex string.

    The image is averaged over its channels, rescaled to 8-bit values and
    resized with the Lanczos filter to a square of side
    ``HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)``, where
    ``min_dim`` is the smaller of the last two input dimensions. A 2D
    discrete cosine transform is applied and only the top-left
    ``HASH_SIZE x HASH_SIZE`` (lowest frequency) coefficients are kept. Each
    coefficient contributes one bit: set when it exceeds the median of the
    kept coefficients.

    Parameters
    ----------
    image : ArrayLike
        An image as a numpy array in CxHxW format

    Returns
    -------
    str
        The hex string hash of the image, with leading zeros stripped
        ("0" when every digit is zero)

    Raises
    ------
    ValueError
        If the smaller of the last two dimensions is not greater than ``HASH_SIZE``
    """
    pixels = as_numpy(image)

    # The shorter spatial side decides whether hashing is possible at all
    shortest_side = min(pixels.shape[-2:])
    if shortest_side <= HASH_SIZE:
        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")

    # Side length of the square the image is resampled to
    factor = min((shortest_side - 1) // HASH_SIZE, MAX_FACTOR)
    side = HASH_SIZE * factor

    # Collapse to a single plane by averaging over the channel axis of the CxHxW image
    grayscale = np.mean(normalize_image_shape(pixels), axis=0).squeeze()

    # Bring pixel values into the 8-bit range expected for the resize step
    eight_bit = rescale(grayscale, 8).astype(np.uint8)

    # Lanczos resampling down (or up) to the square working size
    resized = Image.fromarray(eight_bit).resize((side, side), Image.Resampling.LANCZOS)
    sample = np.array(resized)

    # DCT along both axes, keeping only the lowest-frequency corner
    low_freq = dct(dct(sample.T).T)[:HASH_SIZE, :HASH_SIZE]

    # One bit per coefficient: above the median or not
    bits = low_freq > np.median(low_freq)

    # Left-pad with False so the bit count is a multiple of 8 before packing
    padded_length = int(np.ceil(bits.size / 8) * 8)
    padded = np.full(padded_length, False)
    padded[-bits.size :] = bits.ravel()

    # Pack into bytes, render as hex and drop leading zero digits
    digest = np.packbits(padded).tobytes().hex().lstrip("0")
    return digest or "0"
90
+
91
+
92
def xxhash(image: ArrayLike) -> str:
    """
    Computes a fast non-cryptographic hash of an image with the xxHash
    algorithm (xxhash.com).

    The image is converted to a numpy array, flattened, and its raw bytes
    are hashed with the 64-bit XXH3 variant.

    Parameters
    ----------
    image : ArrayLike
        An image as a numpy array

    Returns
    -------
    str
        The hex string hash of the image using the xxHash algorithm
    """
    flattened = as_numpy(image).ravel()
    return xxh.xxh3_64_hexdigest(flattened.tobytes())
109
+
110
+
111
class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
    """Stats processor that produces the fields of :class:`HashStatsOutput`."""

    # Output dataclass associated with this processor
    output_class: type = HashStatsOutput
    # Output field name -> callable deriving that field from the processor's ``image``
    image_function_map: dict[str, Callable[[StatsProcessor[HashStatsOutput]], str]] = {
        "xxhash": lambda proc: xxhash(proc.image),
        "pchash": lambda proc: pchash(proc.image),
    }
117
+
118
+
119
@set_metadata()
def hashstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> HashStatsOutput:
    """
    Calculates hashes for each image

    Both an exact hash (xxHash) and a perception-based hash are computed for
    every image. The resulting values can be compared to find images that are
    exact or near matches of one another.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to hash
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image

    Returns
    -------
    HashStatsOutput
        A dictionary-like object containing the computed hashes for each image.

    See Also
    --------
    Duplicates

    Examples
    --------
    Calculating the statistics on the images, whose shape is (C, H, W)

    >>> results = hashstats(images)
    >>> print(results.xxhash)
    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
    >>> print(results.pchash)
    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
    """
    # A single processor is requested, so the first entry of the result is returned
    outputs = run_stats(images, bboxes, False, [HashStatsProcessor])
    return outputs[0]
@@ -1,13 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
+ __all__ = ["LabelStatsOutput", "labelstats"]
4
+
3
5
  from collections import Counter, defaultdict
4
6
  from dataclasses import dataclass
5
7
  from typing import Any, Iterable, Mapping, TypeVar
6
8
 
7
9
  from numpy.typing import ArrayLike
8
10
 
9
- from dataeval._internal.interop import to_numpy
10
- from dataeval._internal.output import OutputMetadata, set_metadata
11
+ from dataeval.interop import to_numpy
12
+ from dataeval.output import OutputMetadata, set_metadata
11
13
 
12
14
 
13
15
  @dataclass(frozen=True)
@@ -55,12 +57,12 @@ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
55
57
  return dict(sorted(d.items(), key=lambda x: x[0]))
56
58
 
57
59
 
58
- @set_metadata("dataeval.metrics")
60
+ @set_metadata()
59
61
  def labelstats(
60
62
  labels: Iterable[ArrayLike],
61
63
  ) -> LabelStatsOutput:
62
64
  """
63
- Calculates statistics for data labels
65
+ Calculates :term:`statistics<Statistics>` for data labels
64
66
 
65
67
  This function computes counting metrics (e.g., total per class, total per image)
66
68
  on the labels.
@@ -68,7 +70,7 @@ def labelstats(
68
70
  Parameters
69
71
  ----------
70
72
  labels : ArrayLike, shape - [label] | [[label]] or (N,M) | (N,)
71
- Lists or numpy array of labels.
73
+ Lists or :term:`NumPy` array of labels.
72
74
  A set of lists where each list contains all labels per image -
73
75
  (e.g. [[label1, label2], [label2], [label1, label3]] or [label1, label2, label1, label3]).
74
76
  If a numpy array, N is the number of images, M is the number of labels per image.
@@ -80,7 +82,7 @@ def labelstats(
80
82
 
81
83
  Examples
82
84
  --------
83
- Calculating the statistics on labels for a set of data
85
+ Calculating the :term:`statistics<Statistics>` on labels for a set of data
84
86
 
85
87
  >>> stats = labelstats(labels)
86
88
  >>> stats.label_counts_per_class
@@ -1,14 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
+ __all__ = ["PixelStatsOutput", "pixelstats"]
4
+
3
5
  from dataclasses import dataclass
4
- from typing import Iterable
6
+ from typing import Any, Callable, Iterable
5
7
 
6
8
  import numpy as np
7
9
  from numpy.typing import ArrayLike, NDArray
8
10
  from scipy.stats import entropy, kurtosis, skew
9
11
 
10
- from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
11
- from dataeval._internal.output import set_metadata
12
+ from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
13
+ from dataeval.output import set_metadata
12
14
 
13
15
 
14
16
  @dataclass(frozen=True)
@@ -23,7 +25,7 @@ class PixelStatsOutput(BaseStatsOutput):
23
25
  std : NDArray[np.float16]
24
26
  Standard deviation of the pixel values of the images
25
27
  var : NDArray[np.float16]
26
- Variance of the pixel values of the images
28
+ :term:`Variance` of the pixel values of the images
27
29
  skew : NDArray[np.float16]
28
30
  Skew of the pixel values of the images
29
31
  kurtosis : NDArray[np.float16]
@@ -44,9 +46,8 @@ class PixelStatsOutput(BaseStatsOutput):
44
46
 
45
47
 
46
48
  class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
47
- output_class = PixelStatsOutput
48
- cache_keys = ["histogram"]
49
- image_function_map = {
49
+ output_class: type = PixelStatsOutput
50
+ image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
50
51
  "mean": lambda self: np.mean(self.scaled),
51
52
  "std": lambda x: np.std(x.scaled),
52
53
  "var": lambda x: np.var(x.scaled),
@@ -55,7 +56,7 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
55
56
  "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
56
57
  "entropy": lambda x: entropy(x.get("histogram")),
57
58
  }
58
- channel_function_map = {
59
+ channel_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
59
60
  "mean": lambda x: np.mean(x.scaled, axis=1),
60
61
  "std": lambda x: np.std(x.scaled, axis=1),
61
62
  "var": lambda x: np.var(x.scaled, axis=1),
@@ -66,14 +67,14 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
66
67
  }
67
68
 
68
69
 
69
- @set_metadata("dataeval.metrics")
70
+ @set_metadata()
70
71
  def pixelstats(
71
72
  images: Iterable[ArrayLike],
72
73
  bboxes: Iterable[ArrayLike] | None = None,
73
74
  per_channel: bool = False,
74
75
  ) -> PixelStatsOutput:
75
76
  """
76
- Calculates pixel statistics for each image
77
+ Calculates pixel :term:`statistics<Statistics>` for each image
77
78
 
78
79
  This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
79
80
  on the images as a whole.
@@ -90,7 +91,7 @@ def pixelstats(
90
91
  PixelStatsOutput
91
92
  A dictionary-like object containing the computed statistics for each image. The keys correspond
92
93
  to the names of the statistics (e.g., 'mean', 'std'), and the values are lists of results for
93
- each image or numpy arrays when the results are multi-dimensional.
94
+ each image or :term:`NumPy` arrays when the results are multi-dimensional.
94
95
 
95
96
  See Also
96
97
  --------
@@ -1,14 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
+ __all__ = ["VisualStatsOutput", "visualstats"]
4
+
3
5
  from dataclasses import dataclass
4
- from typing import Iterable
6
+ from typing import Any, Callable, Iterable
5
7
 
6
8
  import numpy as np
7
9
  from numpy.typing import ArrayLike, NDArray
8
10
 
9
- from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
10
- from dataeval._internal.metrics.utils import edge_filter
11
- from dataeval._internal.output import set_metadata
11
+ from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
12
+ from dataeval.output import set_metadata
13
+ from dataeval.utils.image import edge_filter
12
14
 
13
15
  QUARTILES = (0, 25, 50, 75, 100)
14
16
 
@@ -46,9 +48,8 @@ class VisualStatsOutput(BaseStatsOutput):
46
48
 
47
49
 
48
50
  class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
49
- output_class = VisualStatsOutput
50
- cache_keys = ["percentiles"]
51
- image_function_map = {
51
+ output_class: type = VisualStatsOutput
52
+ image_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
52
53
  "brightness": lambda x: x.get("percentiles")[1],
53
54
  "contrast": lambda x: np.nan_to_num(
54
55
  (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
@@ -59,7 +60,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
59
60
  "zeros": lambda x: np.count_nonzero(np.sum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
60
61
  "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
61
62
  }
62
- channel_function_map = {
63
+ channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
63
64
  "brightness": lambda x: x.get("percentiles")[:, 1],
64
65
  "contrast": lambda x: np.nan_to_num(
65
66
  (np.max(x.get("percentiles"), axis=1) - np.min(x.get("percentiles"), axis=1))
@@ -73,7 +74,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
73
74
  }
74
75
 
75
76
 
76
- @set_metadata("dataeval.metrics")
77
+ @set_metadata()
77
78
  def visualstats(
78
79
  images: Iterable[ArrayLike],
79
80
  bboxes: Iterable[ArrayLike] | None = None,
@@ -82,7 +83,7 @@ def visualstats(
82
83
  """
83
84
  Calculates visual statistics for each image
84
85
 
85
- This function computes various visual metrics (e.g., brightness, darkness, contrast, sharpness)
86
+ This function computes various visual metrics (e.g., :term:`brightness<Brightness>`, darkness, contrast, blurriness)
86
87
  on the images as a whole.
87
88
 
88
89
  Parameters
@@ -96,8 +97,8 @@ def visualstats(
96
97
  -------
97
98
  VisualStatsOutput
98
99
  A dictionary-like object containing the computed visual statistics for each image. The keys correspond
99
- to the names of the statistics (e.g., 'brightness', 'sharpness'), and the values are lists of results for
100
- each image or numpy arrays when the results are multi-dimensional.
100
+ to the names of the statistics (e.g., 'brightness', 'blurriness'), and the values are lists of results for
101
+ each image or :term:`NumPy` arrays when the results are multi-dimensional.
101
102
 
102
103
  See Also
103
104
  --------
@@ -109,7 +110,7 @@ def visualstats(
109
110
 
110
111
  Examples
111
112
  --------
112
- Calculating the statistics on the images, whose shape is (C, H, W)
113
+ Calculating the :term:`statistics<Statistics>` on the images, whose shape is (C, H, W)
113
114
 
114
115
  >>> results = visualstats(images)
115
116
  >>> print(results.brightness)
@@ -1,12 +1,20 @@
1
1
  from __future__ import annotations
2
2
 
3
+ __all__ = []
4
+
3
5
  import inspect
6
+ import sys
4
7
  from datetime import datetime, timezone
5
8
  from functools import wraps
6
- from typing import Any
9
+ from typing import Any, Callable, Iterable, TypeVar
7
10
 
8
11
  import numpy as np
9
12
 
13
+ if sys.version_info >= (3, 10):
14
+ from typing import ParamSpec
15
+ else:
16
+ from typing_extensions import ParamSpec
17
+
10
18
  from dataeval import __version__
11
19
 
12
20
 
@@ -25,10 +33,18 @@ class OutputMetadata:
25
33
  return {k.removeprefix("_"): v for k, v in self.__dict__.items() if k.startswith("_")}
26
34
 
27
35
 
28
- def set_metadata(module_name: str = "", state_attr: list[str] | None = None):
29
- def decorator(fn):
36
+ P = ParamSpec("P")
37
+ R = TypeVar("R", bound=OutputMetadata)
38
+
39
+
40
+ def set_metadata(
41
+ state_attr: Iterable[str] | None = None,
42
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]:
43
+ """Decorator to stamp OutputMetadata classes with runtime metadata"""
44
+
45
+ def decorator(fn: Callable[P, R]) -> Callable[P, R]:
30
46
  @wraps(fn)
31
- def wrapper(*args, **kwargs):
47
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
32
48
  def fmt(v):
33
49
  if np.isscalar(v):
34
50
  return v
@@ -52,9 +68,13 @@ def set_metadata(module_name: str = "", state_attr: list[str] | None = None):
52
68
  if "self" in arguments and state_attr
53
69
  else {}
54
70
  )
55
- name = args[0].__class__.__name__ if "self" in arguments else fn.__name__
71
+ name = (
72
+ f"{args[0].__class__.__module__}.{args[0].__class__.__name__}.{fn.__name__}"
73
+ if "self" in arguments
74
+ else f"{fn.__module__}.{fn.__qualname__}"
75
+ )
56
76
  metadata = {
57
- "_name": f"{module_name}.{name}",
77
+ "_name": name,
58
78
  "_execution_time": time,
59
79
  "_execution_duration": duration,
60
80
  "_arguments": {k: v for k, v in arguments.items() if k != "self"},
@@ -1,19 +1,23 @@
1
1
  """
2
2
  The utility classes and functions are provided by DataEval to assist users
3
3
  in setting up architectures that are guaranteed to work with applicable DataEval
4
- metrics. Currently DataEval supports both Tensorflow and PyTorch backends.
4
+ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backends.
5
5
  """
6
6
 
7
7
  from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
8
+ from dataeval.utils.split_dataset import split_dataset
8
9
 
9
- __all__ = []
10
+ __all__ = ["split_dataset"]
10
11
 
11
12
  if _IS_TORCH_AVAILABLE: # pragma: no cover
12
- from . import torch
13
+ from dataeval.utils import torch
13
14
 
14
15
  __all__ += ["torch"]
15
16
 
16
17
  if _IS_TENSORFLOW_AVAILABLE: # pragma: no cover
17
- from . import tensorflow
18
+ from dataeval.utils import tensorflow
18
19
 
19
20
  __all__ += ["tensorflow"]
21
+
22
+ del _IS_TENSORFLOW_AVAILABLE
23
+ del _IS_TORCH_AVAILABLE
@@ -0,0 +1,71 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from typing import Any, NamedTuple
6
+
7
+ import numpy as np
8
+ from numpy.typing import ArrayLike, NDArray
9
+ from scipy.signal import convolve2d
10
+
11
+ EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
12
+ BIT_DEPTH = (1, 8, 12, 16, 32)
13
+
14
+
15
class BitDepth(NamedTuple):
    """Bit depth of an image together with the pixel value range that goes with it."""

    # Number of bits per pixel value
    depth: int
    # Lower bound of the pixel value range
    pmin: float | int
    # Upper bound of the pixel value range
    pmax: float | int
19
+
20
+
21
def get_bitdepth(image: NDArray[Any]) -> BitDepth:
    """
    Approximates the bit depth of the image from its minimum and maximum
    pixel values.

    When the minimum is negative, a depth of 0 is reported along with the
    observed minimum and maximum. Otherwise the depth is the first entry of
    ``BIT_DEPTH`` for which ``2**depth`` exceeds the maximum (falling back to
    the largest entry), and the reported range is ``0`` to ``2**depth - 1``.
    """
    lowest, highest = np.min(image), np.max(image)

    # Negative values do not fit an unsigned depth; report the raw range instead
    if lowest < 0:
        return BitDepth(0, lowest, highest)

    bits = next((candidate for candidate in BIT_DEPTH if 2**candidate > highest), max(BIT_DEPTH))
    return BitDepth(bits, 0, 2**bits - 1)
32
+
33
+
34
def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
    """
    Rescales the image using the bit depth provided.

    The current value range of the image is estimated with ``get_bitdepth``
    and mapped linearly onto ``[0, 2**depth - 1]``.

    Parameters
    ----------
    image : NDArray
        Pixel values to rescale
    depth : int, default 1
        Target bit depth; rescaled values span ``0`` to ``2**depth - 1``

    Returns
    -------
    NDArray
        The input array itself when its estimated bit depth already equals
        ``depth``, otherwise a new array holding the rescaled values
    """
    bitdepth = get_bitdepth(image)
    if bitdepth.depth == depth:
        return image

    span = bitdepth.pmax - bitdepth.pmin
    if span == 0:
        # Only reachable for a constant negative-valued image: there is no
        # dynamic range to stretch, so map everything to 0 instead of
        # dividing by zero.
        return np.zeros_like(image, dtype=np.float64)

    # Min-max normalization shifts by *subtracting* the minimum. pmin is 0
    # for non-negative images, so this only changes the result for images
    # with negative values, which previously landed in [-(2**depth - 1), 0].
    normalized = (image - bitdepth.pmin) / span
    return normalized * (2**depth - 1)
44
+
45
+
46
def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
    """
    Normalizes the image shape into (C,H,W).

    A 2-dimensional array gains a leading axis of length 1, a 3-dimensional
    array is returned as-is, and for more than 3 dimensions index 0 is taken
    along every leading axis until 3 dimensions remain.

    Raises
    ------
    ValueError
        If the array has fewer than 2 dimensions
    """
    rank = image.ndim
    if rank < 2:
        raise ValueError("Images must have 2 or more dimensions.")
    if rank == 2:
        return np.expand_dims(image, axis=0)
    if rank == 3:
        return image
    # Keep only the first entry along each extra leading axis
    leading_index = (0,) * (rank - 3)
    return image[leading_index]
60
+
61
+
62
def edge_filter(image: ArrayLike, offset: float = 0.5) -> NDArray[np.uint8]:
    """
    Returns the image filtered using a 3x3 edge detection kernel:
    [[ -1, -1, -1 ],
     [ -1,  8, -1 ],
     [ -1, -1, -1 ]]

    The image is convolved with the kernel (output the same size as the
    input, symmetric boundary handling), ``offset`` is added to every value,
    and the result is clipped in place to the range 0 to 255. No explicit
    dtype cast is applied to the result.
    """
    response = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm")
    shifted = response + offset
    # Clip into the 0-255 range, writing back into the same array
    np.clip(shifted, 0, 255, out=shifted)
    return shifted