dataeval 0.69.4__py3-none-any.whl → 0.70.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (66)
  1. dataeval/__init__.py +8 -8
  2. dataeval/_internal/datasets.py +235 -131
  3. dataeval/_internal/detectors/clusterer.py +2 -0
  4. dataeval/_internal/detectors/drift/base.py +7 -8
  5. dataeval/_internal/detectors/drift/mmd.py +4 -4
  6. dataeval/_internal/detectors/duplicates.py +64 -45
  7. dataeval/_internal/detectors/merged_stats.py +23 -54
  8. dataeval/_internal/detectors/ood/ae.py +8 -6
  9. dataeval/_internal/detectors/ood/aegmm.py +6 -4
  10. dataeval/_internal/detectors/ood/base.py +12 -7
  11. dataeval/_internal/detectors/ood/llr.py +6 -4
  12. dataeval/_internal/detectors/ood/vae.py +5 -3
  13. dataeval/_internal/detectors/ood/vaegmm.py +6 -4
  14. dataeval/_internal/detectors/outliers.py +137 -63
  15. dataeval/_internal/interop.py +11 -7
  16. dataeval/_internal/metrics/balance.py +13 -11
  17. dataeval/_internal/metrics/ber.py +5 -3
  18. dataeval/_internal/metrics/coverage.py +4 -0
  19. dataeval/_internal/metrics/divergence.py +9 -5
  20. dataeval/_internal/metrics/diversity.py +14 -12
  21. dataeval/_internal/metrics/parity.py +32 -22
  22. dataeval/_internal/metrics/stats/base.py +231 -0
  23. dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
  24. dataeval/_internal/metrics/stats/datasetstats.py +99 -0
  25. dataeval/_internal/metrics/stats/dimensionstats.py +113 -0
  26. dataeval/_internal/metrics/stats/hashstats.py +75 -0
  27. dataeval/_internal/metrics/stats/labelstats.py +125 -0
  28. dataeval/_internal/metrics/stats/pixelstats.py +119 -0
  29. dataeval/_internal/metrics/stats/visualstats.py +124 -0
  30. dataeval/_internal/metrics/uap.py +8 -4
  31. dataeval/_internal/metrics/utils.py +30 -15
  32. dataeval/_internal/models/pytorch/autoencoder.py +5 -5
  33. dataeval/_internal/models/tensorflow/pixelcnn.py +1 -4
  34. dataeval/_internal/output.py +3 -18
  35. dataeval/_internal/utils.py +11 -16
  36. dataeval/_internal/workflows/sufficiency.py +152 -151
  37. dataeval/detectors/__init__.py +4 -0
  38. dataeval/detectors/drift/__init__.py +8 -3
  39. dataeval/detectors/drift/kernels/__init__.py +4 -0
  40. dataeval/detectors/drift/updates/__init__.py +4 -0
  41. dataeval/detectors/linters/__init__.py +15 -4
  42. dataeval/detectors/ood/__init__.py +14 -2
  43. dataeval/metrics/__init__.py +5 -0
  44. dataeval/metrics/bias/__init__.py +13 -4
  45. dataeval/metrics/estimators/__init__.py +8 -8
  46. dataeval/metrics/stats/__init__.py +25 -3
  47. dataeval/utils/__init__.py +16 -3
  48. dataeval/utils/tensorflow/__init__.py +11 -0
  49. dataeval/utils/torch/__init__.py +12 -0
  50. dataeval/utils/torch/datasets/__init__.py +7 -0
  51. dataeval/workflows/__init__.py +6 -2
  52. {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/METADATA +12 -4
  53. dataeval-0.70.1.dist-info/RECORD +80 -0
  54. {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/WHEEL +1 -1
  55. dataeval/_internal/flags.py +0 -77
  56. dataeval/_internal/metrics/stats.py +0 -397
  57. dataeval/flags/__init__.py +0 -3
  58. dataeval/tensorflow/__init__.py +0 -3
  59. dataeval/torch/__init__.py +0 -3
  60. dataeval-0.69.4.dist-info/RECORD +0 -74
  61. /dataeval/{tensorflow → utils/tensorflow}/loss/__init__.py +0 -0
  62. /dataeval/{tensorflow → utils/tensorflow}/models/__init__.py +0 -0
  63. /dataeval/{tensorflow → utils/tensorflow}/recon/__init__.py +0 -0
  64. /dataeval/{torch → utils/torch}/models/__init__.py +0 -0
  65. /dataeval/{torch → utils/torch}/trainer/__init__.py +0 -0
  66. {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/LICENSE.txt +0 -0
--- /dev/null
+++ b/dataeval/_internal/metrics/stats/dimensionstats.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+
+from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval._internal.metrics.utils import get_bitdepth
+from dataeval._internal.output import set_metadata
+
+
+class DimensionStatsProcessor(StatsProcessor):
+    image_function_map = {
+        "left": lambda x: x.box[0],
+        "top": lambda x: x.box[1],
+        "width": lambda x: x.shape[-1],
+        "height": lambda x: x.shape[-2],
+        "channels": lambda x: x.shape[-3],
+        "size": lambda x: np.prod(x.shape[-2:]),
+        "aspect_ratio": lambda x: x.shape[-1] / x.shape[-2],
+        "depth": lambda x: get_bitdepth(x.image).depth,
+        "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
+        "distance": lambda x: np.sqrt(
+            np.square(((x.box[0] + x.box[2]) / 2) - (x.width / 2))
+            + np.square(((x.box[1] + x.box[3]) / 2) - (x.height / 2))
+        ),
+    }
+
+
+@dataclass(frozen=True)
+class DimensionStatsOutput(BaseStatsOutput):
+    """
+    Output class for :func:`dimensionstats` stats metric
+
+    Attributes
+    ----------
+    left : NDArray[np.int32]
+        Offsets from the left edge of images in pixels
+    top : NDArray[np.int32]
+        Offsets from the top edge of images in pixels
+    width : NDArray[np.uint32]
+        Width of the images in pixels
+    height : NDArray[np.uint32]
+        Height of the images in pixels
+    channels : NDArray[np.uint8]
+        Channel count of the images
+    size : NDArray[np.uint32]
+        Size of the images in pixels
+    aspect_ratio : NDArray[np.float16]
+        Aspect ratio of the images (width/height)
+    depth : NDArray[np.uint8]
+        Color depth of the images in bits
+    center : NDArray[np.int16]
+        Offset from center in [x,y] coordinates of the images in pixels
+    distance : NDArray[np.float16]
+        Distance in pixels from center
+    """
+
+    left: NDArray[np.int32]
+    top: NDArray[np.int32]
+    width: NDArray[np.uint32]
+    height: NDArray[np.uint32]
+    channels: NDArray[np.uint8]
+    size: NDArray[np.uint32]
+    aspect_ratio: NDArray[np.float16]
+    depth: NDArray[np.uint8]
+    center: NDArray[np.int16]
+    distance: NDArray[np.float16]
+
+
+@set_metadata("dataeval.metrics")
+def dimensionstats(
+    images: Iterable[ArrayLike],
+    bboxes: Iterable[ArrayLike] | None = None,
+) -> DimensionStatsOutput:
+    """
+    Calculates dimension statistics for each image
+
+    This function computes various dimensional metrics (e.g., width, height, channels)
+    on the images or individual bounding boxes for each image.
+
+    Parameters
+    ----------
+    images : Iterable[ArrayLike]
+        Images to perform calculations on
+    bboxes : Iterable[ArrayLike] or None
+        Bounding boxes in `xyxy` format for each image to perform calculations on
+
+    Returns
+    -------
+    DimensionStatsOutput
+        A dictionary-like object containing the computed dimension statistics for each image or bounding
+        box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
+        are lists of results for each image or numpy arrays when the results are multi-dimensional.
+
+    See Also
+    --------
+    pixelstats, visualstats, Outliers
+
+    Examples
+    --------
+    Calculating the dimension statistics on the images, whose shape is (C, H, W)
+
+    >>> results = dimensionstats(images)
+    >>> print(results.aspect_ratio)
+    [0.75 0.75 0.75 0.75 0.75 0.75 1.333 0.75 0.75 1. ]
+    >>> print(results.channels)
+    [1 1 1 1 1 1 3 1 1 3]
+    """
+    output = run_stats(images, bboxes, False, DimensionStatsProcessor, DimensionStatsOutput)
+    return DimensionStatsOutput(**output)
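The bboxes parameter is the notable addition here: when boxes are supplied, each statistic is computed per bounding box instead of per image. Below is a minimal usage sketch, not part of the diff, which assumes dimensionstats is re-exported from dataeval.metrics.stats (per the __init__.py changes in the file list) and that images follow the (C, H, W) layout used in the docstring.

    import numpy as np
    from dataeval.metrics.stats import dimensionstats  # re-export assumed from the file list above

    rng = np.random.default_rng(0)
    images = [rng.random((1, 64, 48)) for _ in range(4)]       # four grayscale (C, H, W) images
    bboxes = [np.array([[0, 0, 24, 32], [8, 8, 40, 48]])] * 4  # two xyxy boxes per image

    per_image = dimensionstats(images)
    print(per_image.aspect_ratio)         # one value per image

    per_box = dimensionstats(images, bboxes)
    print(per_box.width, per_box.height)  # one value per bounding box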
--- /dev/null
+++ b/dataeval/_internal/metrics/stats/hashstats.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+from numpy.typing import ArrayLike
+
+from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval._internal.metrics.utils import pchash, xxhash
+from dataeval._internal.output import set_metadata
+
+
+class HashStatsProcessor(StatsProcessor):
+    image_function_map = {
+        "xxhash": lambda x: xxhash(x.image),
+        "pchash": lambda x: pchash(x.image),
+    }
+
+
+@dataclass(frozen=True)
+class HashStatsOutput(BaseStatsOutput):
+    """
+    Output class for :func:`hashstats` stats metric
+
+    Attributes
+    ----------
+    xxhash : List[str]
+        xxHash hash of the images as a hex string
+    pchash : List[str]
+        Perception hash of the images as a hex string
+    """
+
+    xxhash: list[str]
+    pchash: list[str]
+
+
+@set_metadata("dataeval.metrics")
+def hashstats(
+    images: Iterable[ArrayLike],
+    bboxes: Iterable[ArrayLike] | None = None,
+) -> HashStatsOutput:
+    """
+    Calculates hashes for each image
+
+    This function computes hashes from the images including exact hashes and perception-based
+    hashes. These hash values can be used to determine if images are exact or near matches.
+
+    Parameters
+    ----------
+    images : Iterable[ArrayLike]
+        Images to hash
+    bboxes : Iterable[ArrayLike] or None
+        Bounding boxes in `xyxy` format for each image
+
+    Returns
+    -------
+    HashStatsOutput
+        A dictionary-like object containing the computed hashes for each image.
+
+    See Also
+    --------
+    Duplicates
+
+    Examples
+    --------
+    Calculating the statistics on the images, whose shape is (C, H, W)
+
+    >>> results = hashstats(images)
+    >>> print(results.xxhash)
+    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
+    >>> print(results.pchash)
+    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
+    """
+    output = run_stats(images, bboxes, False, HashStatsProcessor, HashStatsOutput)
+    return HashStatsOutput(**output)
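A short usage sketch, not part of the diff, pairing hashstats with a simple exact-duplicate check; it assumes the same dataeval.metrics.stats re-export as above. Identical images share an xxhash value, while pchash is the perception hash intended for near-match comparison (see the Duplicates detector referenced in the docstring).

    import numpy as np
    from dataeval.metrics.stats import hashstats  # re-export assumed

    rng = np.random.default_rng(0)
    base = rng.integers(0, 256, (1, 32, 32), dtype=np.uint8)
    images = [base, base.copy(), rng.integers(0, 256, (1, 32, 32), dtype=np.uint8)]

    results = hashstats(images)

    # Group image indices by exact hash; any group larger than one is an exact duplicate set
    groups: dict[str, list[int]] = {}
    for i, h in enumerate(results.xxhash):
        groups.setdefault(h, []).append(i)
    print([idx for idx in groups.values() if len(idx) > 1])  # expected: [[0, 1]]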
--- /dev/null
+++ b/dataeval/_internal/metrics/stats/labelstats.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from typing import Any, Iterable, Mapping, TypeVar
+
+from numpy.typing import ArrayLike
+
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+@dataclass(frozen=True)
+class LabelStatsOutput(OutputMetadata):
+    """
+    Output class for :func:`labelstats` stats metric
+
+    Attributes
+    ----------
+    label_counts_per_class : dict[str | int, int]
+        Dictionary whose keys are the different label classes and
+        values are total counts of each class
+    label_counts_per_image : list[int]
+        Number of labels per image
+    image_counts_per_label : dict[str | int, int]
+        Dictionary whose keys are the different label classes and
+        values are total counts of each image the class is present in
+    image_indices_per_label : dict[str | int, list]
+        Dictionary whose keys are the different label classes and
+        values are lists containing the images that have that label
+    image_count : int
+        Total number of images present
+    class_count : int
+        Total number of classes present
+    label_count : int
+        Total number of labels present
+    """
+
+    label_counts_per_class: dict[str | int, int]
+    label_counts_per_image: list[int]
+    image_counts_per_label: dict[str | int, int]
+    image_indices_per_label: dict[str | int, list[int]]
+    image_count: int
+    class_count: int
+    label_count: int
+
+
+TKey = TypeVar("TKey", int, str)
+
+
+def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
+    """
+    Sort mappings by key in increasing order
+    """
+    return dict(sorted(d.items(), key=lambda x: x[0]))
+
+
+@set_metadata("dataeval.metrics")
+def labelstats(
+    labels: Iterable[ArrayLike],
+) -> LabelStatsOutput:
+    """
+    Calculates statistics for data labels
+
+    This function computes counting metrics (e.g., total per class, total per image)
+    on the labels.
+
+    Parameters
+    ----------
+    labels : ArrayLike, shape - [label] | [[label]] or (N,M) | (N,)
+        Lists or numpy array of labels.
+        A set of lists where each list contains all labels per image -
+        (e.g. [[label1, label2], [label2], [label1, label3]] or [label1, label2, label1, label3]).
+        If a numpy array, N is the number of images, M is the number of labels per image.
+
+    Returns
+    -------
+    LabelStatsOutput
+        A dictionary-like object containing the computed counting metrics for the labels.
+
+    Examples
+    --------
+    Calculating the statistics on labels for a set of data
+
+    >>> stats = labelstats(labels)
+    >>> stats.label_counts_per_class
+    {'chicken': 3, 'cow': 8, 'horse': 9, 'pig': 7, 'sheep': 7}
+    >>> stats.label_counts_per_image
+    [3, 2, 3, 4, 1, 5, 4, 4, 4, 4]
+    >>> stats.image_counts_per_label
+    {'chicken': 2, 'cow': 6, 'horse': 7, 'pig': 5, 'sheep': 7}
+    >>> (stats.image_count, stats.class_count, stats.label_count)
+    (10, 5, 34)
+    """
+    label_counts = Counter()
+    image_counts = Counter()
+    index_location = defaultdict(list[int])
+    label_per_image: list[int] = []
+
+    for i, group in enumerate(labels):
+        # Count occurrences of each label in all sublists
+        group = to_numpy(group)
+
+        label_counts.update(group)
+
+        # Get the number of labels per image
+        label_per_image.append(len(group))
+
+        # Create a set of unique items in the current sublist
+        unique_items: set[int] = set(group)
+
+        # Update image counts and index locations
+        image_counts.update(unique_items)
+        for item in unique_items:
+            index_location[item].append(i)
+
+    return LabelStatsOutput(
+        label_counts_per_class=sort(label_counts),
+        label_counts_per_image=label_per_image,
+        image_counts_per_label=sort(image_counts),
+        image_indices_per_label=sort(index_location),
+        image_count=len(label_per_image),
+        class_count=len(label_counts),
+        label_count=sum(label_counts.values()),
+    )
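A small usage sketch, not part of the diff, showing the nested per-image label format accepted by labelstats; the dataeval.metrics.stats re-export is again an assumption.

    from dataeval.metrics.stats import labelstats  # re-export assumed

    labels = [["cat", "dog"], ["dog"], ["cat", "bird", "cat"]]
    stats = labelstats(labels)

    print(stats.class_count, stats.label_count)  # 3 classes, 6 labels in total
    print(stats.label_counts_per_image)          # [2, 1, 3]
    print(stats.image_count)                     # 3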
--- /dev/null
+++ b/dataeval/_internal/metrics/stats/pixelstats.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.stats import entropy, kurtosis, skew
+
+from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval._internal.output import set_metadata
+
+
+class PixelStatsProcessor(StatsProcessor):
+    cache_keys = ["histogram"]
+    image_function_map = {
+        "mean": lambda x: np.mean(x.scaled),
+        "std": lambda x: np.std(x.scaled),
+        "var": lambda x: np.var(x.scaled),
+        "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
+        "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled.ravel())),
+        "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
+        "entropy": lambda x: entropy(x.get("histogram")),
+    }
+    channel_function_map = {
+        "mean": lambda x: np.mean(x.scaled, axis=1),
+        "std": lambda x: np.std(x.scaled, axis=1),
+        "var": lambda x: np.var(x.scaled, axis=1),
+        "skew": lambda x: np.nan_to_num(skew(x.scaled, axis=1)),
+        "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled, axis=1)),
+        "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),
+        "entropy": lambda x: entropy(x.get("histogram"), axis=1),
+    }
+
+
+@dataclass(frozen=True)
+class PixelStatsOutput(BaseStatsOutput):
+    """
+    Output class for :func:`pixelstats` stats metric
+
+    Attributes
+    ----------
+    mean : NDArray[np.float16]
+        Mean of the pixel values of the images
+    std : NDArray[np.float16]
+        Standard deviation of the pixel values of the images
+    var : NDArray[np.float16]
+        Variance of the pixel values of the images
+    skew : NDArray[np.float16]
+        Skew of the pixel values of the images
+    kurtosis : NDArray[np.float16]
+        Kurtosis of the pixel values of the images
+    histogram : NDArray[np.uint32]
+        Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
+    entropy : NDArray[np.float16]
+        Entropy of the pixel values of the images
+    """
+
+    mean: NDArray[np.float16]
+    std: NDArray[np.float16]
+    var: NDArray[np.float16]
+    skew: NDArray[np.float16]
+    kurtosis: NDArray[np.float16]
+    histogram: NDArray[np.uint32]
+    entropy: NDArray[np.float16]
+
+
+@set_metadata("dataeval.metrics")
+def pixelstats(
+    images: Iterable[ArrayLike],
+    bboxes: Iterable[ArrayLike] | None = None,
+    per_channel: bool = False,
+) -> PixelStatsOutput:
+    """
+    Calculates pixel statistics for each image
+
+    This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
+    on the images as a whole.
+
+    Parameters
+    ----------
+    images : Iterable[ArrayLike]
+        Images to perform calculations on
+    bboxes : Iterable[ArrayLike] or None
+        Bounding boxes in `xyxy` format for each image to perform calculations on
+
+    Returns
+    -------
+    PixelStatsOutput
+        A dictionary-like object containing the computed statistics for each image. The keys correspond
+        to the names of the statistics (e.g., 'mean', 'std'), and the values are lists of results for
+        each image or numpy arrays when the results are multi-dimensional.
+
+    See Also
+    --------
+    dimensionstats, visualstats, Outliers
+
+    Note
+    ----
+    - All metrics are scaled based on the perceived bit depth (which is derived from the largest pixel value)
+      to allow for better comparison between images stored in different formats and different resolutions.
+
+    Examples
+    --------
+    Calculating the statistics on the images, whose shape is (C, H, W)
+
+    >>> results = pixelstats(images)
+    >>> print(results.mean)
+    [0.04828 0.562 0.06726 0.09937 0.1315 0.1636 0.1957 0.2278 0.26
+    0.292 0.3242 0.3562 0.3884 0.4204 0.4526 0.4846 0.5166 0.549
+    0.581 0.6133 0.6455 0.6772 0.7095 0.7417 0.774 0.8057 0.838
+    0.87 0.9023 0.934 ]
+    >>> print(results.entropy)
+    [3.238 3.303 0.8125 1.028 0.8223 1.046 0.8247 1.041 0.8203 1.012
+    0.812 0.9883 0.795 0.9243 0.9243 0.795 0.9907 0.8125 1.028 0.8223
+    1.046 0.8247 1.041 0.8203 1.012 0.812 0.9883 0.795 0.9243 0.9243]
+    """
+    output = run_stats(images, bboxes, per_channel, PixelStatsProcessor, PixelStatsOutput)
+    return PixelStatsOutput(**output)
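The per_channel flag is the option to note: when it is True the per-channel function map above appears to be used in place of the per-image map, so each statistic is reported once per image channel rather than once per image. A minimal sketch, not part of the diff, with the dataeval.metrics.stats re-export assumed as above.

    import numpy as np
    from dataeval.metrics.stats import pixelstats  # re-export assumed

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 256, (3, 16, 16), dtype=np.uint8) for _ in range(5)]  # RGB, (C, H, W)

    per_image = pixelstats(images)
    per_channel = pixelstats(images, per_channel=True)

    print(len(per_image.mean))    # one mean per image
    print(len(per_channel.mean))  # one mean per image-channel combination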
--- /dev/null
+++ b/dataeval/_internal/metrics/stats/visualstats.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+
+from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval._internal.metrics.utils import edge_filter
+from dataeval._internal.output import set_metadata
+
+QUARTILES = (0, 25, 50, 75, 100)
+
+
+class VisualStatsProcessor(StatsProcessor):
+    cache_keys = ["percentiles"]
+    image_function_map = {
+        "brightness": lambda x: x.get("percentiles")[-2],
+        "blurriness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
+        "contrast": lambda x: np.nan_to_num(
+            (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
+        ),
+        "darkness": lambda x: x.get("percentiles")[1],
+        "missing": lambda x: np.sum(np.isnan(x.image)) / np.prod(x.shape[-2:]),
+        "zeros": lambda x: np.count_nonzero(x.image == 0) / np.prod(x.shape[-2:]),
+        "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
+    }
+    channel_function_map = {
+        "brightness": lambda x: x.get("percentiles")[:, -2],
+        "blurriness": lambda x: np.std(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
+        "contrast": lambda x: np.nan_to_num(
+            (np.max(x.get("percentiles"), axis=1) - np.min(x.get("percentiles"), axis=1))
+            / np.mean(x.get("percentiles"), axis=1)
+        ),
+        "darkness": lambda x: x.get("percentiles")[:, 1],
+        "missing": lambda x: np.sum(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
+        "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
+        "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
+    }
+
+
+@dataclass(frozen=True)
+class VisualStatsOutput(BaseStatsOutput):
+    """
+    Output class for :func:`visualstats` stats metric
+
+    Attributes
+    ----------
+    brightness : NDArray[np.float16]
+        Brightness of the images
+    blurriness : NDArray[np.float16]
+        Blurriness of the images
+    contrast : NDArray[np.float16]
+        Image contrast ratio
+    darkness : NDArray[np.float16]
+        Darkness of the images
+    missing : NDArray[np.float16]
+        Percentage of the images with missing pixels
+    zeros : NDArray[np.float16]
+        Percentage of the images with zero value pixels
+    percentiles : NDArray[np.float16]
+        Percentiles of the pixel values of the images with quartiles of (0, 25, 50, 75, 100)
+    """
+
+    brightness: NDArray[np.float16]
+    blurriness: NDArray[np.float16]
+    contrast: NDArray[np.float16]
+    darkness: NDArray[np.float16]
+    missing: NDArray[np.float16]
+    zeros: NDArray[np.float16]
+    percentiles: NDArray[np.float16]
+
+
+@set_metadata("dataeval.metrics")
+def visualstats(
+    images: Iterable[ArrayLike],
+    bboxes: Iterable[ArrayLike] | None = None,
+    per_channel: bool = False,
+) -> VisualStatsOutput:
+    """
+    Calculates visual statistics for each image
+
+    This function computes various visual metrics (e.g., brightness, darkness, contrast, blurriness)
+    on the images as a whole.
+
+    Parameters
+    ----------
+    images : Iterable[ArrayLike]
+        Images to perform calculations on
+    bboxes : Iterable[ArrayLike] or None
+        Bounding boxes in `xyxy` format for each image to perform calculations on
+
+    Returns
+    -------
+    VisualStatsOutput
+        A dictionary-like object containing the computed visual statistics for each image. The keys correspond
+        to the names of the statistics (e.g., 'brightness', 'blurriness'), and the values are lists of results for
+        each image or numpy arrays when the results are multi-dimensional.
+
+    See Also
+    --------
+    dimensionstats, pixelstats, Outliers
+
+    Note
+    ----
+    - `zeros` and `missing` are presented as a percentage of total pixel counts
+
+    Examples
+    --------
+    Calculating the statistics on the images, whose shape is (C, H, W)
+
+    >>> results = visualstats(images)
+    >>> print(results.brightness)
+    [0.0737 0.607 0.0713 0.1046 0.138 0.1713 0.2046 0.2379 0.2712 0.3047
+    0.338 0.3713 0.4045 0.438 0.4712 0.5044 0.538 0.5713 0.6045 0.638
+    0.6714 0.7046 0.738 0.7715 0.8047 0.838 0.871 0.905 0.938 0.971 ]
+    >>> print(results.contrast)
+    [2.041 1.332 1.293 1.279 1.272 1.268 1.265 1.263 1.261 1.26 1.259 1.258
+    1.258 1.257 1.257 1.256 1.256 1.255 1.255 1.255 1.255 1.254 1.254 1.254
+    1.254 1.254 1.254 1.253 1.253 1.253]
+    """
+    output = run_stats(images, bboxes, per_channel, VisualStatsProcessor, VisualStatsOutput)
+    return VisualStatsOutput(**output)
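A usage sketch, not part of the diff, showing that the per-image values can be thresholded directly, for example to flag unusually dark images before a closer look with the Outliers detector; the dataeval.metrics.stats re-export and the bit-depth scaling described for pixelstats are assumptions carried over from above.

    import numpy as np
    from dataeval.metrics.stats import visualstats  # re-export assumed

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 256, (3, 32, 32), dtype=np.uint8) for _ in range(4)]
    images.append(np.full((3, 32, 32), 5, dtype=np.uint8))  # a nearly black image

    results = visualstats(images)
    print(np.where(results.brightness < 0.1)[0])  # expected to flag only the dark image (index 4)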
--- a/dataeval/_internal/metrics/uap.py
+++ b/dataeval/_internal/metrics/uap.py
@@ -4,18 +4,22 @@ FR Test Statistic based estimate for the upperbound
 average precision using empirical mean precision
 """
 
+from __future__ import annotations
+
 from dataclasses import dataclass
 
 from numpy.typing import ArrayLike
 from sklearn.metrics import average_precision_score
 
-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.output import OutputMetadata, set_metadata
 
 
 @dataclass(frozen=True)
 class UAPOutput(OutputMetadata):
     """
+    Output class for :func:`uap` estimator metric
+
     Attributes
     ----------
     uap : float
@@ -48,8 +52,8 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
     ValueError
         If unique classes M < 2
 
-    Notes
-    -----
+    Note
+    ----
     This function calculates the empirical mean precision using the
     ``average_precision_score`` from scikit-learn, weighted by the class distribution.
 
@@ -75,5 +79,5 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
     UAPOutput(uap=0.7777777777777777)
     """
 
-    precision = float(average_precision_score(to_numpy(labels), to_numpy(scores), average="weighted"))
+    precision = float(average_precision_score(as_numpy(labels), as_numpy(scores), average="weighted"))
     return UAPOutput(precision)
--- a/dataeval/_internal/metrics/utils.py
+++ b/dataeval/_internal/metrics/utils.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
-from typing import Any, Callable, Literal, NamedTuple, Sequence
+from typing import Any, Callable, Literal, Mapping, NamedTuple
 
 import numpy as np
 import xxhash as xxh
-from numpy.typing import NDArray
+from numpy.typing import ArrayLike, NDArray
 from PIL import Image
 from scipy.fftpack import dct
 from scipy.signal import convolve2d
@@ -14,6 +14,8 @@ from scipy.spatial.distance import pdist, squareform
 from scipy.stats import entropy as sp_entropy
 from sklearn.neighbors import NearestNeighbors
 
+from dataeval._internal.interop import to_numpy
+
 EPSILON = 1e-5
 EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
 BIT_DEPTH = (1, 8, 12, 16, 32)
@@ -89,8 +91,8 @@ def entropy(
     subset_mask: NDArray[np.bool_] | None
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
-    Notes
-    -----
+    Note
+    ----
     For continuous variables, histogram bins are chosen automatically. See
     numpy.histogram for details.
 
@@ -162,26 +164,26 @@ def infer_categorical(X: NDArray, threshold: float = 0.2) -> NDArray:
 
 
 def preprocess_metadata(
-    class_labels: Sequence[int], metadata: list[dict], cat_thresh: float = 0.2
+    class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], cat_thresh: float = 0.2
 ) -> tuple[NDArray, list[str], list[bool]]:
-    # convert class_labels and list of metadata dicts to dict of ndarrays
-    metadata_dict: dict[str, NDArray] = {
-        "class_label": np.asarray(class_labels, dtype=int),
-        **{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
-    }
+    # convert class_labels and dict of lists to matrix of metadata values
+    preprocessed_metadata = {"class_label": np.asarray(class_labels, dtype=int)}
 
     # map columns of dict that are not numeric (e.g. string) to numeric values
     # that mutual information and diversity functions can accommodate. Each
     # unique string receives a unique integer value.
-    for k, v in metadata_dict.items():
+    for k, v in metadata.items():
         # if not numeric
+        v = to_numpy(v)
         if not np.issubdtype(v.dtype, np.number):
             _, mapped_vals = np.unique(v, return_inverse=True)
-            metadata_dict[k] = mapped_vals
+            preprocessed_metadata[k] = mapped_vals
+        else:
+            preprocessed_metadata[k] = v
 
-    data = np.stack(list(metadata_dict.values()), axis=-1)
-    names = list(metadata_dict.keys())
-    is_categorical = [infer_categorical(metadata_dict[var], cat_thresh)[0] for var in names]
+    data = np.stack(list(preprocessed_metadata.values()), axis=-1)
+    names = list(preprocessed_metadata.keys())
+    is_categorical = [infer_categorical(preprocessed_metadata[var], cat_thresh)[0] for var in names]
 
     return data, names, is_categorical
 
@@ -350,6 +352,19 @@ def normalize_image_shape(image: NDArray) -> NDArray:
     raise ValueError("Images must have 2 or more dimensions.")
 
 
+def normalize_box_shape(bounding_box: NDArray) -> NDArray:
+    """
+    Normalizes the bounding box shape into (N,4).
+    """
+    ndim = bounding_box.ndim
+    if ndim == 1:
+        return np.expand_dims(bounding_box, axis=0)
+    elif ndim > 2:
+        raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X,Y,W,H]) -> (N,4)")
+    else:
+        return bounding_box
+
+
 def edge_filter(image: NDArray, offset: float = 0.5) -> NDArray:
     """
     Returns the image filtered using a 3x3 edge detection kernel:
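The preprocess_metadata change above replaces the old list-of-dicts metadata layout with a mapping of factor name to per-image values (anything to_numpy can convert). An illustrative sketch of the two layouts, not part of the diff; the printed names follow from the new code above, with factor order matching dict insertion order.

    import numpy as np
    from dataeval._internal.metrics.utils import preprocess_metadata  # internal function shown in this diff

    # Old layout (before this change): one dict per image; kept here only for comparison
    old_style = [{"weather": "rain", "hour": 9}, {"weather": "sun", "hour": 14}]

    # New layout: one array-like of per-image values for each metadata factor
    new_style = {
        "weather": np.array(["rain", "sun"]),
        "hour": np.array([9, 14]),
    }
    class_labels = [0, 1]

    data, names, is_categorical = preprocess_metadata(class_labels, new_style)
    print(names)       # ['class_label', 'weather', 'hour']
    print(data.shape)  # (2, 3): one row per image, one column per factor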