dataeval 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/drift/base.py +5 -6
  3. dataeval/_internal/detectors/drift/mmd.py +3 -3
  4. dataeval/_internal/detectors/duplicates.py +62 -45
  5. dataeval/_internal/detectors/merged_stats.py +23 -54
  6. dataeval/_internal/detectors/ood/ae.py +3 -3
  7. dataeval/_internal/detectors/outliers.py +133 -61
  8. dataeval/_internal/interop.py +11 -7
  9. dataeval/_internal/metrics/balance.py +9 -9
  10. dataeval/_internal/metrics/ber.py +3 -3
  11. dataeval/_internal/metrics/divergence.py +3 -3
  12. dataeval/_internal/metrics/diversity.py +6 -6
  13. dataeval/_internal/metrics/parity.py +24 -16
  14. dataeval/_internal/metrics/stats/base.py +231 -0
  15. dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
  16. dataeval/_internal/metrics/stats/datasetstats.py +97 -0
  17. dataeval/_internal/metrics/stats/dimensionstats.py +111 -0
  18. dataeval/_internal/metrics/stats/hashstats.py +73 -0
  19. dataeval/_internal/metrics/stats/labelstats.py +125 -0
  20. dataeval/_internal/metrics/stats/pixelstats.py +117 -0
  21. dataeval/_internal/metrics/stats/visualstats.py +122 -0
  22. dataeval/_internal/metrics/uap.py +2 -2
  23. dataeval/_internal/metrics/utils.py +28 -13
  24. dataeval/_internal/output.py +3 -18
  25. dataeval/_internal/workflows/sufficiency.py +123 -133
  26. dataeval/metrics/stats/__init__.py +14 -3
  27. dataeval/workflows/__init__.py +2 -2
  28. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/METADATA +3 -3
  29. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/RECORD +31 -26
  30. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/WHEEL +1 -1
  31. dataeval/_internal/flags.py +0 -77
  32. dataeval/_internal/metrics/stats.py +0 -397
  33. dataeval/flags/__init__.py +0 -3
  34. {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/LICENSE.txt +0 -0
dataeval/_internal/metrics/stats/datasetstats.py
@@ -0,0 +1,97 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable
+
+ from numpy.typing import ArrayLike
+
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput
+ from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
+ from dataeval._internal.metrics.stats.labelstats import LabelStatsOutput, labelstats
+ from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
+ from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput, visualstats
+ from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+ @dataclass(frozen=True)
+ class DatasetStatsOutput(OutputMetadata):
+     """
+     This class represents the outputs of various stats functions against a single
+     dataset, such that each index across all stat outputs is representative of
+     the same source image. Modifying or mixing outputs will result in inaccurate
+     outlier calculations if not created correctly.
+
+     Attributes
+     ----------
+     dimensionstats : DimensionStatsOutput or None
+     pixelstats : PixelStatsOutput or None
+     visualstats : VisualStatsOutput or None
+     labelstats : LabelStatsOutput or None, default None
+     """
+
+     dimensionstats: DimensionStatsOutput | None
+     pixelstats: PixelStatsOutput | None
+     visualstats: VisualStatsOutput | None
+     labelstats: LabelStatsOutput | None = None
+
+     def outputs(self) -> list[BaseStatsOutput]:
+         return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats) if s is not None]
+
+     def __post_init__(self):
+         lengths = [len(s) for s in self.outputs()]
+         if not all(length == lengths[0] for length in lengths):
+             raise ValueError("All StatsOutput classes must contain the same number of image sources.")
+
+
+ @set_metadata("dataeval.metrics")
+ def datasetstats(
+     images: Iterable[ArrayLike],
+     bboxes: Iterable[ArrayLike] | None = None,
+     labels: Iterable[ArrayLike] | None = None,
+     use_dimension: bool = True,
+     use_pixel: bool = True,
+     use_visual: bool = True,
+ ) -> DatasetStatsOutput:
+     """
+     Calculates various statistics for each image
+
+     This function computes dimension, pixel and visual metrics
+     on the images or individual bounding boxes for each image, as
+     well as label statistics if labels are provided.
+
+     Parameters
+     ----------
+     images : Iterable[ArrayLike]
+         Images to perform calculations on
+     bboxes : Iterable[ArrayLike] or None
+         Bounding boxes in `xyxy` format for each image to perform calculations on
+     labels : Iterable[ArrayLike] or None
+         Labels of images or boxes to perform calculations on
+
+     Returns
+     -------
+     DatasetStatsOutput
+         Output class containing the outputs of various stats functions
+
+     See Also
+     --------
+     dimensionstats, labelstats, pixelstats, visualstats, Outliers
+
+     Examples
+     --------
+     Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
+
+     >>> stats = datasetstats(images, bboxes)
+     >>> print(stats.dimensionstats.aspect_ratio)
+     [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3
+       0.8335 1. 0.6 0.522 15. 3.834 1.75 0.75 0.7 ]
+     >>> print(stats.visualstats.contrast)
+     [1.744 1.946 0.1164 0.0635 0.0633 0.06274 0.0429 0.0317 0.0317
+      0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
+     """
+     return DatasetStatsOutput(
+         dimensionstats(images, bboxes) if use_dimension else None,
+         pixelstats(images, bboxes) if use_pixel else None,
+         visualstats(images, bboxes) if use_visual else None,
+         labelstats(labels) if labels else None,
+     )
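
A minimal usage sketch (not part of the diff) for the new `datasetstats` entry point. The synthetic images, shapes, and seed are illustrative assumptions; the import uses the internal module path shown above rather than the public `dataeval.metrics.stats` re-export, whose exact contents are not displayed here.

    # Illustrative only: synthetic (C, H, W) images, not from the dataeval test suite.
    import numpy as np
    from dataeval._internal.metrics.stats.datasetstats import datasetstats

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 256, size=(3, 64, 64), dtype=np.uint8) for _ in range(4)]

    stats = datasetstats(images)              # dimension, pixel and visual stats; no labels given
    print(stats.dimensionstats.aspect_ratio)  # one entry per image
    print(stats.pixelstats.mean)
    print(len(stats.outputs()))               # 3 non-None stat outputs of equal length
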
dataeval/_internal/metrics/stats/dimensionstats.py
@@ -0,0 +1,111 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable
+
+ import numpy as np
+ from numpy.typing import ArrayLike, NDArray
+
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+ from dataeval._internal.metrics.utils import get_bitdepth
+ from dataeval._internal.output import set_metadata
+
+
+ class DimensionStatsProcessor(StatsProcessor):
+     image_function_map = {
+         "left": lambda x: x.box[0],
+         "top": lambda x: x.box[1],
+         "width": lambda x: x.shape[-1],
+         "height": lambda x: x.shape[-2],
+         "channels": lambda x: x.shape[-3],
+         "size": lambda x: np.prod(x.shape[-2:]),
+         "aspect_ratio": lambda x: x.shape[-1] / x.shape[-2],
+         "depth": lambda x: get_bitdepth(x.image).depth,
+         "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
+         "distance": lambda x: np.sqrt(
+             np.square(((x.box[0] + x.box[2]) / 2) - (x.width / 2))
+             + np.square(((x.box[1] + x.box[3]) / 2) - (x.height / 2))
+         ),
+     }
+
+
+ @dataclass(frozen=True)
+ class DimensionStatsOutput(BaseStatsOutput):
+     """
+     Attributes
+     ----------
+     left : NDArray[np.int32]
+         Offsets from the left edge of images in pixels
+     top : NDArray[np.int32]
+         Offsets from the top edge of images in pixels
+     width : NDArray[np.uint32]
+         Width of the images in pixels
+     height : NDArray[np.uint32]
+         Height of the images in pixels
+     channels : NDArray[np.uint8]
+         Channel count of the images
+     size : NDArray[np.uint32]
+         Size of the images in pixels
+     aspect_ratio : NDArray[np.float16]
+         Aspect ratio of the images (width/height)
+     depth : NDArray[np.uint8]
+         Color depth of the images in bits
+     center : NDArray[np.int16]
+         Center [x, y] coordinates of the images or boxes in pixels
+     distance : NDArray[np.float16]
+         Distance in pixels from the image center
+     """
+
+     left: NDArray[np.int32]
+     top: NDArray[np.int32]
+     width: NDArray[np.uint32]
+     height: NDArray[np.uint32]
+     channels: NDArray[np.uint8]
+     size: NDArray[np.uint32]
+     aspect_ratio: NDArray[np.float16]
+     depth: NDArray[np.uint8]
+     center: NDArray[np.int16]
+     distance: NDArray[np.float16]
+
+
+ @set_metadata("dataeval.metrics")
+ def dimensionstats(
+     images: Iterable[ArrayLike],
+     bboxes: Iterable[ArrayLike] | None = None,
+ ) -> DimensionStatsOutput:
+     """
+     Calculates dimension statistics for each image
+
+     This function computes various dimensional metrics (e.g., width, height, channels)
+     on the images or individual bounding boxes for each image.
+
+     Parameters
+     ----------
+     images : Iterable[ArrayLike]
+         Images to perform calculations on
+     bboxes : Iterable[ArrayLike] or None
+         Bounding boxes in `xyxy` format for each image to perform calculations on
+
+     Returns
+     -------
+     DimensionStatsOutput
+         A dictionary-like object containing the computed dimension statistics for each image or bounding
+         box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
+         are lists of results for each image or numpy arrays when the results are multi-dimensional.
+
+     See Also
+     --------
+     pixelstats, visualstats, Outliers
+
+     Examples
+     --------
+     Calculating the dimension statistics on the images, whose shape is (C, H, W)
+
+     >>> results = dimensionstats(images)
+     >>> print(results.aspect_ratio)
+     [0.75 0.75 0.75 0.75 0.75 0.75 1.333 0.75 0.75 1. ]
+     >>> print(results.channels)
+     [1 1 1 1 1 1 3 1 1 3]
+     """
+     output = run_stats(images, bboxes, False, DimensionStatsProcessor, DimensionStatsOutput)
+     return DimensionStatsOutput(**output)
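
A short sketch of `dimensionstats` on images of mixed shapes; the two zero-filled arrays below are illustrative assumptions, not package data.

    # Illustrative inputs: two images with different sizes and channel counts.
    import numpy as np
    from dataeval._internal.metrics.stats.dimensionstats import dimensionstats

    images = [
        np.zeros((1, 32, 24), dtype=np.uint8),  # grayscale, 32 high by 24 wide
        np.zeros((3, 16, 64), dtype=np.uint8),  # RGB, 16 high by 64 wide
    ]
    results = dimensionstats(images)
    print(results.width)         # expected widths: 24 and 64
    print(results.channels)      # expected channel counts: 1 and 3
    print(results.aspect_ratio)  # width / height per image
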
dataeval/_internal/metrics/stats/hashstats.py
@@ -0,0 +1,73 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable
+
+ from numpy.typing import ArrayLike
+
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+ from dataeval._internal.metrics.utils import pchash, xxhash
+ from dataeval._internal.output import set_metadata
+
+
+ class HashStatsProcessor(StatsProcessor):
+     image_function_map = {
+         "xxhash": lambda x: xxhash(x.image),
+         "pchash": lambda x: pchash(x.image),
+     }
+
+
+ @dataclass(frozen=True)
+ class HashStatsOutput(BaseStatsOutput):
+     """
+     Attributes
+     ----------
+     xxhash : List[str]
+         xxHash hash of the images as a hex string
+     pchash : List[str]
+         Perception hash of the images as a hex string
+     """
+
+     xxhash: list[str]
+     pchash: list[str]
+
+
+ @set_metadata("dataeval.metrics")
+ def hashstats(
+     images: Iterable[ArrayLike],
+     bboxes: Iterable[ArrayLike] | None = None,
+ ) -> HashStatsOutput:
+     """
+     Calculates hashes for each image
+
+     This function computes hashes from the images, including exact hashes and perception-based
+     hashes. These hash values can be used to determine if images are exact or near matches.
+
+     Parameters
+     ----------
+     images : Iterable[ArrayLike]
+         Images to hash
+     bboxes : Iterable[ArrayLike] or None
+         Bounding boxes in `xyxy` format for each image
+
+     Returns
+     -------
+     HashStatsOutput
+         A dictionary-like object containing the computed hashes for each image.
+
+     See Also
+     --------
+     Duplicates
+
+     Examples
+     --------
+     Calculating the statistics on the images, whose shape is (C, H, W)
+
+     >>> results = hashstats(images)
+     >>> print(results.xxhash)
+     ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
+     >>> print(results.pchash)
+     ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
+     """
+     output = run_stats(images, bboxes, False, HashStatsProcessor, HashStatsOutput)
+     return HashStatsOutput(**output)
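
A sketch of how the exact hash from `hashstats` can flag duplicates; the duplicated random image is an illustrative assumption, and the Duplicates detector referenced above is the supported workflow built on these hashes.

    # Illustrative only: an exact duplicate should produce the same xxhash value.
    import numpy as np
    from dataeval._internal.metrics.stats.hashstats import hashstats

    rng = np.random.default_rng(0)
    base = rng.integers(0, 256, size=(1, 32, 32), dtype=np.uint8)
    images = [base, base.copy(), rng.integers(0, 256, size=(1, 32, 32), dtype=np.uint8)]

    results = hashstats(images)
    print(results.xxhash[0] == results.xxhash[1])  # identical content -> same hash (True)
    print(results.xxhash[0] == results.xxhash[2])  # different content -> expected False
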
dataeval/_internal/metrics/stats/labelstats.py
@@ -0,0 +1,125 @@
+ from __future__ import annotations
+
+ from collections import Counter, defaultdict
+ from dataclasses import dataclass
+ from typing import Any, Iterable, Mapping, TypeVar
+
+ from numpy.typing import ArrayLike
+
+ from dataeval._internal.interop import to_numpy
+ from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+ @dataclass(frozen=True)
+ class LabelStatsOutput(OutputMetadata):
+     """
+     Output class for `labelstats` metrics function
+
+     Attributes
+     ----------
+     label_counts_per_class : dict[str | int, int]
+         Dictionary whose keys are the different label classes and
+         values are total counts of each class
+     label_counts_per_image : list[int]
+         Number of labels per image
+     image_counts_per_label : dict[str | int, int]
+         Dictionary whose keys are the different label classes and
+         values are total counts of each image the class is present in
+     image_indices_per_label : dict[str | int, list[int]]
+         Dictionary whose keys are the different label classes and
+         values are lists containing the images that have that label
+     image_count : int
+         Total number of images present
+     class_count : int
+         Total number of classes present
+     label_count : int
+         Total number of labels present
+     """
+
+     label_counts_per_class: dict[str | int, int]
+     label_counts_per_image: list[int]
+     image_counts_per_label: dict[str | int, int]
+     image_indices_per_label: dict[str | int, list[int]]
+     image_count: int
+     class_count: int
+     label_count: int
+
+
+ TKey = TypeVar("TKey", int, str)
+
+
+ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
+     """
+     Sort mappings by key in increasing order
+     """
+     return dict(sorted(d.items(), key=lambda x: x[0]))
+
+
+ @set_metadata("dataeval.metrics")
+ def labelstats(
+     labels: Iterable[ArrayLike],
+ ) -> LabelStatsOutput:
+     """
+     Calculates statistics for data labels
+
+     This function computes counting metrics (e.g., total per class, total per image)
+     on the labels.
+
+     Parameters
+     ----------
+     labels : Iterable[ArrayLike], shape - [label] | [[label]] or (N,M) | (N,)
+         Lists or numpy array of labels.
+         A set of lists where each list contains all labels per image -
+         (e.g. [[label1, label2], [label2], [label1, label3]] or [label1, label2, label1, label3]).
+         If a numpy array, N is the number of images, M is the number of labels per image.
+
+     Returns
+     -------
+     LabelStatsOutput
+         A dictionary-like object containing the computed counting metrics for the labels.
+
+     Examples
+     --------
+     Calculating the statistics on labels for a set of data
+
+     >>> stats = labelstats(labels)
+     >>> stats.label_counts_per_class
+     {'chicken': 3, 'cow': 8, 'horse': 9, 'pig': 7, 'sheep': 7}
+     >>> stats.label_counts_per_image
+     [3, 2, 3, 4, 1, 5, 4, 4, 4, 4]
+     >>> stats.image_counts_per_label
+     {'chicken': 2, 'cow': 6, 'horse': 7, 'pig': 5, 'sheep': 7}
+     >>> (stats.image_count, stats.class_count, stats.label_count)
+     (10, 5, 34)
+     """
+     label_counts = Counter()
+     image_counts = Counter()
+     index_location = defaultdict(list[int])
+     label_per_image: list[int] = []
+
+     for i, group in enumerate(labels):
+         # Count occurrences of each label in all sublists
+         group = to_numpy(group)
+
+         label_counts.update(group)
+
+         # Get the number of labels per image
+         label_per_image.append(len(group))
+
+         # Create a set of unique items in the current sublist
+         unique_items: set[int] = set(group)
+
+         # Update image counts and index locations
+         image_counts.update(unique_items)
+         for item in unique_items:
+             index_location[item].append(i)
+
+     return LabelStatsOutput(
+         label_counts_per_class=sort(label_counts),
+         label_counts_per_image=label_per_image,
+         image_counts_per_label=sort(image_counts),
+         image_indices_per_label=sort(index_location),
+         image_count=len(label_per_image),
+         class_count=len(label_counts),
+         label_count=sum(label_counts.values()),
+     )
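
A sketch of `labelstats` on a made-up nested label list; the class names and counts below are illustrative, not the doctest data used above.

    # Illustrative labels: one inner list per image.
    from dataeval._internal.metrics.stats.labelstats import labelstats

    labels = [["cat", "dog"], ["dog"], ["cat", "bird", "dog"]]
    stats = labelstats(labels)
    print(stats.label_counts_per_class)   # total occurrences of each class
    print(stats.label_counts_per_image)   # labels per image: [2, 1, 3]
    print(stats.image_indices_per_label)  # which images contain each class
    print(stats.image_count, stats.class_count, stats.label_count)  # 3, 3, 6
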
dataeval/_internal/metrics/stats/pixelstats.py
@@ -0,0 +1,117 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable
+
+ import numpy as np
+ from numpy.typing import ArrayLike, NDArray
+ from scipy.stats import entropy, kurtosis, skew
+
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+ from dataeval._internal.output import set_metadata
+
+
+ class PixelStatsProcessor(StatsProcessor):
+     cache_keys = ["histogram"]
+     image_function_map = {
+         "mean": lambda x: np.mean(x.scaled),
+         "std": lambda x: np.std(x.scaled),
+         "var": lambda x: np.var(x.scaled),
+         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
+         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled.ravel())),
+         "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
+         "entropy": lambda x: entropy(x.get("histogram")),
+     }
+     channel_function_map = {
+         "mean": lambda x: np.mean(x.scaled, axis=1),
+         "std": lambda x: np.std(x.scaled, axis=1),
+         "var": lambda x: np.var(x.scaled, axis=1),
+         "skew": lambda x: np.nan_to_num(skew(x.scaled, axis=1)),
+         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled, axis=1)),
+         "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),
+         "entropy": lambda x: entropy(x.get("histogram"), axis=1),
+     }
+
+
+ @dataclass(frozen=True)
+ class PixelStatsOutput(BaseStatsOutput):
+     """
+     Attributes
+     ----------
+     mean : NDArray[np.float16]
+         Mean of the pixel values of the images
+     std : NDArray[np.float16]
+         Standard deviation of the pixel values of the images
+     var : NDArray[np.float16]
+         Variance of the pixel values of the images
+     skew : NDArray[np.float16]
+         Skew of the pixel values of the images
+     kurtosis : NDArray[np.float16]
+         Kurtosis of the pixel values of the images
+     histogram : NDArray[np.uint32]
+         Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
+     entropy : NDArray[np.float16]
+         Entropy of the pixel values of the images
+     """
+
+     mean: NDArray[np.float16]
+     std: NDArray[np.float16]
+     var: NDArray[np.float16]
+     skew: NDArray[np.float16]
+     kurtosis: NDArray[np.float16]
+     histogram: NDArray[np.uint32]
+     entropy: NDArray[np.float16]
+
+
+ @set_metadata("dataeval.metrics")
+ def pixelstats(
+     images: Iterable[ArrayLike],
+     bboxes: Iterable[ArrayLike] | None = None,
+     per_channel: bool = False,
+ ) -> PixelStatsOutput:
+     """
+     Calculates pixel statistics for each image
+
+     This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
+     on the images as a whole.
+
+     Parameters
+     ----------
+     images : Iterable[ArrayLike]
+         Images to perform calculations on
+     bboxes : Iterable[ArrayLike] or None
+         Bounding boxes in `xyxy` format for each image to perform calculations on
+
+     Returns
+     -------
+     PixelStatsOutput
+         A dictionary-like object containing the computed statistics for each image. The keys correspond
+         to the names of the statistics (e.g., 'mean', 'std'), and the values are lists of results for
+         each image or numpy arrays when the results are multi-dimensional.
+
+     See Also
+     --------
+     dimensionstats, visualstats, Outliers
+
+     Notes
+     -----
+     - All metrics are scaled based on the perceived bit depth (which is derived from the largest pixel value)
+       to allow for better comparison between images stored in different formats and different resolutions.
+
+     Examples
+     --------
+     Calculating the statistics on the images, whose shape is (C, H, W)
+
+     >>> results = pixelstats(images)
+     >>> print(results.mean)
+     [0.04828 0.562 0.06726 0.09937 0.1315 0.1636 0.1957 0.2278 0.26
+      0.292 0.3242 0.3562 0.3884 0.4204 0.4526 0.4846 0.5166 0.549
+      0.581 0.6133 0.6455 0.6772 0.7095 0.7417 0.774 0.8057 0.838
+      0.87 0.9023 0.934 ]
+     >>> print(results.entropy)
+     [3.238 3.303 0.8125 1.028 0.8223 1.046 0.8247 1.041 0.8203 1.012
+      0.812 0.9883 0.795 0.9243 0.9243 0.795 0.9907 0.8125 1.028 0.8223
+      1.046 0.8247 1.041 0.8203 1.012 0.812 0.9883 0.795 0.9243 0.9243]
+     """
+     output = run_stats(images, bboxes, per_channel, PixelStatsProcessor, PixelStatsOutput)
+     return PixelStatsOutput(**output)
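
A sketch contrasting per-image and per-channel pixel statistics; the random RGB images are illustrative, and the exact layout of the per-channel output is determined by `run_stats` in `base.py`, which is not shown in this excerpt.

    # Illustrative only: per-image vs. per-channel statistics on random RGB images.
    import numpy as np
    from dataeval._internal.metrics.stats.pixelstats import pixelstats

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 256, size=(3, 32, 32), dtype=np.uint8) for _ in range(4)]

    per_image = pixelstats(images)                      # one mean/std/var/... per image
    per_channel = pixelstats(images, per_channel=True)  # statistics per image channel
    print(per_image.mean)
    print(per_channel.mean)  # channel-wise means; layout handled by run_stats
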
dataeval/_internal/metrics/stats/visualstats.py
@@ -0,0 +1,122 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable
+
+ import numpy as np
+ from numpy.typing import ArrayLike, NDArray
+
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+ from dataeval._internal.metrics.utils import edge_filter
+ from dataeval._internal.output import set_metadata
+
+ QUARTILES = (0, 25, 50, 75, 100)
+
+
+ class VisualStatsProcessor(StatsProcessor):
+     cache_keys = ["percentiles"]
+     image_function_map = {
+         "brightness": lambda x: x.get("percentiles")[-2],
+         "blurriness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
+         "contrast": lambda x: np.nan_to_num(
+             (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
+         ),
+         "darkness": lambda x: x.get("percentiles")[1],
+         "missing": lambda x: np.sum(np.isnan(x.image)) / np.prod(x.shape[-2:]),
+         "zeros": lambda x: np.count_nonzero(x.image == 0) / np.prod(x.shape[-2:]),
+         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
+     }
+     channel_function_map = {
+         "brightness": lambda x: x.get("percentiles")[:, -2],
+         "blurriness": lambda x: np.std(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
+         "contrast": lambda x: np.nan_to_num(
+             (np.max(x.get("percentiles"), axis=1) - np.min(x.get("percentiles"), axis=1))
+             / np.mean(x.get("percentiles"), axis=1)
+         ),
+         "darkness": lambda x: x.get("percentiles")[:, 1],
+         "missing": lambda x: np.sum(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
+         "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
+         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
+     }
+
+
+ @dataclass(frozen=True)
+ class VisualStatsOutput(BaseStatsOutput):
+     """
+     Attributes
+     ----------
+     brightness : NDArray[np.float16]
+         Brightness of the images
+     blurriness : NDArray[np.float16]
+         Blurriness of the images
+     contrast : NDArray[np.float16]
+         Image contrast ratio
+     darkness : NDArray[np.float16]
+         Darkness of the images
+     missing : NDArray[np.float16]
+         Percentage of missing pixels in the images
+     zeros : NDArray[np.float16]
+         Percentage of zero-value pixels in the images
+     percentiles : NDArray[np.float16]
+         Percentiles of the pixel values of the images with quartiles of (0, 25, 50, 75, 100)
+     """
+
+     brightness: NDArray[np.float16]
+     blurriness: NDArray[np.float16]
+     contrast: NDArray[np.float16]
+     darkness: NDArray[np.float16]
+     missing: NDArray[np.float16]
+     zeros: NDArray[np.float16]
+     percentiles: NDArray[np.float16]
+
+
+ @set_metadata("dataeval.metrics")
+ def visualstats(
+     images: Iterable[ArrayLike],
+     bboxes: Iterable[ArrayLike] | None = None,
+     per_channel: bool = False,
+ ) -> VisualStatsOutput:
+     """
+     Calculates visual statistics for each image
+
+     This function computes various visual metrics (e.g., brightness, darkness, contrast, blurriness)
+     on the images as a whole.
+
+     Parameters
+     ----------
+     images : Iterable[ArrayLike]
+         Images to perform calculations on
+     bboxes : Iterable[ArrayLike] or None
+         Bounding boxes in `xyxy` format for each image to perform calculations on
+
+     Returns
+     -------
+     VisualStatsOutput
+         A dictionary-like object containing the computed visual statistics for each image. The keys correspond
+         to the names of the statistics (e.g., 'brightness', 'blurriness'), and the values are lists of results for
+         each image or numpy arrays when the results are multi-dimensional.
+
+     See Also
+     --------
+     dimensionstats, pixelstats, Outliers
+
+     Notes
+     -----
+     - `zeros` and `missing` are presented as a percentage of total pixel counts
+
+     Examples
+     --------
+     Calculating the statistics on the images, whose shape is (C, H, W)
+
+     >>> results = visualstats(images)
+     >>> print(results.brightness)
+     [0.0737 0.607 0.0713 0.1046 0.138 0.1713 0.2046 0.2379 0.2712 0.3047
+      0.338 0.3713 0.4045 0.438 0.4712 0.5044 0.538 0.5713 0.6045 0.638
+      0.6714 0.7046 0.738 0.7715 0.8047 0.838 0.871 0.905 0.938 0.971 ]
+     >>> print(results.contrast)
+     [2.041 1.332 1.293 1.279 1.272 1.268 1.265 1.263 1.261 1.26 1.259 1.258
+      1.258 1.257 1.257 1.256 1.256 1.255 1.255 1.255 1.255 1.254 1.254 1.254
+      1.254 1.254 1.254 1.253 1.253 1.253]
+     """
+     output = run_stats(images, bboxes, per_channel, VisualStatsProcessor, VisualStatsOutput)
+     return VisualStatsOutput(**output)
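
A sketch using `visualstats` to surface images with large zero-valued regions; the zero-padded image below is an illustrative assumption.

    # Illustrative only: the last image is zero-padded, so its `zeros` value is the highest.
    import numpy as np
    from dataeval._internal.metrics.stats.visualstats import visualstats

    rng = np.random.default_rng(0)
    images = [rng.integers(1, 256, size=(1, 32, 32), dtype=np.uint8) for _ in range(3)]
    images.append(np.pad(images[0][:, 8:-8, 8:-8], ((0, 0), (8, 8), (8, 8))))  # zero border

    results = visualstats(images)
    print(results.brightness)  # 75th percentile of the scaled pixel values per image
    print(results.zeros)       # fraction of zero-valued pixels; largest for the padded image
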
dataeval/_internal/metrics/uap.py
@@ -9,7 +9,7 @@ from dataclasses import dataclass
  from numpy.typing import ArrayLike
  from sklearn.metrics import average_precision_score
 
- from dataeval._internal.interop import to_numpy
+ from dataeval._internal.interop import as_numpy
  from dataeval._internal.output import OutputMetadata, set_metadata
 
 
@@ -75,5 +75,5 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
      UAPOutput(uap=0.7777777777777777)
      """
 
-     precision = float(average_precision_score(to_numpy(labels), to_numpy(scores), average="weighted"))
+     precision = float(average_precision_score(as_numpy(labels), as_numpy(scores), average="weighted"))
      return UAPOutput(precision)