dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/config.py +68 -11
  3. dataeval/detectors/drift/__init__.py +2 -2
  4. dataeval/detectors/drift/_base.py +8 -64
  5. dataeval/detectors/drift/_mmd.py +12 -38
  6. dataeval/detectors/drift/_torch.py +7 -7
  7. dataeval/detectors/drift/_uncertainty.py +6 -5
  8. dataeval/detectors/drift/updates.py +20 -3
  9. dataeval/detectors/linters/__init__.py +3 -2
  10. dataeval/detectors/linters/duplicates.py +14 -46
  11. dataeval/detectors/linters/outliers.py +25 -159
  12. dataeval/detectors/ood/__init__.py +1 -1
  13. dataeval/detectors/ood/ae.py +6 -5
  14. dataeval/detectors/ood/base.py +2 -2
  15. dataeval/detectors/ood/metadata_ood_mi.py +4 -6
  16. dataeval/detectors/ood/mixin.py +3 -4
  17. dataeval/detectors/ood/vae.py +3 -2
  18. dataeval/metadata/__init__.py +2 -1
  19. dataeval/metadata/_distance.py +134 -0
  20. dataeval/metadata/_ood.py +30 -49
  21. dataeval/metadata/_utils.py +44 -0
  22. dataeval/metrics/bias/__init__.py +5 -4
  23. dataeval/metrics/bias/_balance.py +17 -149
  24. dataeval/metrics/bias/_coverage.py +4 -106
  25. dataeval/metrics/bias/_diversity.py +12 -107
  26. dataeval/metrics/bias/_parity.py +7 -71
  27. dataeval/metrics/estimators/__init__.py +5 -4
  28. dataeval/metrics/estimators/_ber.py +2 -20
  29. dataeval/metrics/estimators/_clusterer.py +1 -61
  30. dataeval/metrics/estimators/_divergence.py +2 -19
  31. dataeval/metrics/estimators/_uap.py +2 -16
  32. dataeval/metrics/stats/__init__.py +15 -12
  33. dataeval/metrics/stats/_base.py +41 -128
  34. dataeval/metrics/stats/_boxratiostats.py +13 -13
  35. dataeval/metrics/stats/_dimensionstats.py +17 -58
  36. dataeval/metrics/stats/_hashstats.py +19 -35
  37. dataeval/metrics/stats/_imagestats.py +94 -0
  38. dataeval/metrics/stats/_labelstats.py +42 -121
  39. dataeval/metrics/stats/_pixelstats.py +19 -51
  40. dataeval/metrics/stats/_visualstats.py +19 -51
  41. dataeval/outputs/__init__.py +57 -0
  42. dataeval/outputs/_base.py +182 -0
  43. dataeval/outputs/_bias.py +381 -0
  44. dataeval/outputs/_drift.py +83 -0
  45. dataeval/outputs/_estimators.py +114 -0
  46. dataeval/outputs/_linters.py +186 -0
  47. dataeval/outputs/_metadata.py +54 -0
  48. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  49. dataeval/outputs/_stats.py +393 -0
  50. dataeval/outputs/_utils.py +44 -0
  51. dataeval/outputs/_workflows.py +364 -0
  52. dataeval/typing.py +187 -7
  53. dataeval/utils/_method.py +1 -5
  54. dataeval/utils/_plot.py +2 -2
  55. dataeval/utils/data/__init__.py +5 -1
  56. dataeval/utils/data/_dataset.py +217 -0
  57. dataeval/utils/data/_embeddings.py +12 -14
  58. dataeval/utils/data/_images.py +30 -27
  59. dataeval/utils/data/_metadata.py +28 -11
  60. dataeval/utils/data/_selection.py +25 -22
  61. dataeval/utils/data/_split.py +5 -29
  62. dataeval/utils/data/_targets.py +14 -2
  63. dataeval/utils/data/datasets/_base.py +5 -5
  64. dataeval/utils/data/datasets/_cifar10.py +1 -1
  65. dataeval/utils/data/datasets/_milco.py +1 -1
  66. dataeval/utils/data/datasets/_mnist.py +1 -1
  67. dataeval/utils/data/datasets/_ships.py +1 -1
  68. dataeval/utils/data/{_types.py → datasets/_types.py} +10 -16
  69. dataeval/utils/data/datasets/_voc.py +1 -1
  70. dataeval/utils/data/selections/_classfilter.py +4 -5
  71. dataeval/utils/data/selections/_indices.py +2 -2
  72. dataeval/utils/data/selections/_limit.py +2 -2
  73. dataeval/utils/data/selections/_reverse.py +2 -2
  74. dataeval/utils/data/selections/_shuffle.py +2 -2
  75. dataeval/utils/torch/_internal.py +5 -5
  76. dataeval/utils/torch/trainer.py +8 -8
  77. dataeval/workflows/__init__.py +2 -1
  78. dataeval/workflows/sufficiency.py +6 -342
  79. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/METADATA +2 -2
  80. dataeval-0.82.1.dist-info/RECORD +105 -0
  81. dataeval/_output.py +0 -137
  82. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  83. dataeval/metrics/stats/_datasetstats.py +0 -198
  84. dataeval-0.81.0.dist-info/RECORD +0 -94
  85. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/LICENSE.txt +0 -0
  86. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,186 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ import contextlib
6
+ from dataclasses import dataclass
7
+ from typing import Generic, TypeVar, Union
8
+
9
+ from typing_extensions import TypeAlias
10
+
11
+ with contextlib.suppress(ImportError):
12
+ import pandas as pd
13
+
14
+ from dataeval.outputs._base import Output
15
+ from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
16
+
17
# A single group of image indices flagged as duplicates of one another.
DuplicateGroup: TypeAlias = list[int]
# Duplicate groups spanning datasets: dataset index -> indices within that dataset.
DatasetDuplicateGroupMap: TypeAlias = dict[int, DuplicateGroup]
# Constrained TypeVar: a DuplicatesOutput holds either single-dataset or multi-dataset groups.
TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)

# Image index -> {issue (stat) name -> calculated value}.
IndexIssueMap: TypeAlias = dict[int, dict[str, float]]
# Stats outputs named as outlier inputs by this alias.
OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
# Constrained TypeVar: an OutliersOutput holds either one issue map or a list of them.
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
24
+
25
+
26
@dataclass(frozen=True)
class DuplicatesOutput(Output, Generic[TIndexCollection]):
    """
    Output class for :class:`.Duplicates` lint detector.

    Attributes
    ----------
    exact : list[list[int] | dict[int, list[int]]]
        Indices of images that are exact matches
    near : list[list[int] | dict[int, list[int]]]
        Indices of images that are near matches

    Notes
    -----
    - For a single dataset, indices are returned as a list of index groups.
    - For multiple datasets, each group is a dictionary where the key is the
      index of the dataset, and the value is the list of matching indices
      from that dataset.
    """

    # Field order is part of the dataclass constructor signature; keep it stable.
    exact: list[TIndexCollection]
    near: list[TIndexCollection]
47
+
48
+
49
+ def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOutput):
50
+ """Flip result from grouping by image to grouping by class and metric"""
51
+ metrics = {}
52
+ class_wise = {label: {} for label in lstats.class_names}
53
+
54
+ # Group metrics and calculate class-wise counts
55
+ for img, group in result.items():
56
+ for extreme in group:
57
+ metrics.setdefault(extreme, []).append(img)
58
+ for i, images in enumerate(lstats.image_indices_per_class):
59
+ if img in images:
60
+ class_wise[lstats.class_names[i]][extreme] = class_wise[lstats.class_names[i]].get(extreme, 0) + 1
61
+
62
+ return metrics, class_wise
63
+
64
+
65
+ def _create_table(metrics, class_wise):
66
+ """Create table for displaying the results"""
67
+ max_class_length = max(len(str(label)) for label in class_wise) + 2
68
+ max_total = max(len(metrics[group]) for group in metrics) + 2
69
+
70
+ table_header = " | ".join(
71
+ [f"{'Class':>{max_class_length}}"]
72
+ + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
73
+ + [f"{'Total':<{max_total}}"]
74
+ )
75
+ table_rows = []
76
+
77
+ for class_cat, results in class_wise.items():
78
+ table_value = [f"{class_cat:>{max_class_length}}"]
79
+ total = 0
80
+ for group in sorted(metrics.keys()):
81
+ count = results.get(group, 0)
82
+ table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
83
+ total += count
84
+ table_value.append(f"{total:^{max_total}}")
85
+ table_rows.append(" | ".join(table_value))
86
+
87
+ table = [table_header] + table_rows
88
+ return table
89
+
90
+
91
+ def _create_pandas_dataframe(class_wise):
92
+ """Create data for pandas dataframe"""
93
+ data = []
94
+ for label, metrics_dict in class_wise.items():
95
+ row = {"Class": label}
96
+ total = sum(metrics_dict.values())
97
+ row.update(metrics_dict) # Add metric counts
98
+ row["Total"] = total
99
+ data.append(row)
100
+ return data
101
+
102
+
103
@dataclass(frozen=True)
class OutliersOutput(Output, Generic[TIndexIssueMap]):
    """
    Output class for :class:`.Outliers` lint detector.

    Attributes
    ----------
    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
        Indices of image Outliers with their associated issue type and calculated values.

    - For a single dataset, a dictionary containing the indices of outliers and
      a dictionary showing the issues and calculated values for the given index.
    - For multiple stats outputs, a list of dictionaries containing the indices of
      outliers and their associated issues and calculated values.
    """

    issues: TIndexIssueMap

    def __len__(self) -> int:
        """Return the number of outlier entries, summed across all issue maps."""
        if isinstance(self.issues, dict):
            return len(self.issues)
        return sum(len(d) for d in self.issues)

    def to_table(self, labelstats: LabelStatsOutput) -> str:
        """
        Formats the outlier output results as a table.

        Parameters
        ----------
        labelstats : LabelStatsOutput
            Output of :func:`.labelstats`

        Returns
        -------
        str
            One table for a single issue map; for a list of issue maps, one
            table per map separated by a blank line.
        """
        # Treat the single-dataset case as a one-element list to share the logic.
        issue_maps = [self.issues] if isinstance(self.issues, dict) else self.issues
        tables = []
        for d in issue_maps:
            metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
            tables.append("\n".join(_create_table(metrics, classwise)))
        return "\n\n".join(tables)

    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
        """
        Exports the outliers output results to a pandas DataFrame.

        Parameters
        ----------
        labelstats : LabelStatsOutput
            Output of :func:`.labelstats`

        Returns
        -------
        pd.DataFrame
            For a list of issue maps, the per-map frames are concatenated and a
            ``Dataset`` column holds the position of the map in the list. An
            empty list yields an empty DataFrame.

        Notes
        -----
        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
        """
        import pandas as pd

        if isinstance(self.issues, dict):
            _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
            return pd.DataFrame(_create_pandas_dataframe(classwise))

        df_list = []
        for i, d in enumerate(self.issues):
            _, classwise = _reorganize_by_class_and_metric(d, labelstats)
            single_df = pd.DataFrame(_create_pandas_dataframe(classwise))
            single_df["Dataset"] = i
            df_list.append(single_df)

        # pd.concat raises ValueError when given no objects to concatenate.
        return pd.concat(df_list) if df_list else pd.DataFrame()
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from typing import NamedTuple
6
+
7
+ from dataeval.outputs._base import MappingOutput, SequenceOutput
8
+
9
+
10
class MostDeviatedFactorsOutput(SequenceOutput[tuple[str, float]]):
    """
    Output class for results of :func:`.most_deviated_factors` for OOD samples with metadata.

    Attributes
    ----------
    value : tuple[str, float]
        A tuple of the factor name and the deviation value for the metadata
        factor with the highest deviation
    """
19
+
20
+
21
class MetadataDistanceValues(NamedTuple):
    """
    Statistics comparing metadata distance.

    Attributes
    ----------
    statistic : float
        The KS statistic
    location : float
        The value at which the KS statistic has its maximum, measured in IQR-normalized units relative
        to the median of the reference distribution.
    dist : float
        The Earth Mover's Distance normalized by the interquartile range (IQR) of the reference
        distribution
    pvalue : float
        The p-value from the KS two-sample test
    """

    # Field order defines the tuple layout; keep it stable.
    statistic: float
    location: float
    dist: float
    pvalue: float
42
+
43
+
44
class MetadataDistanceOutput(MappingOutput[str, MetadataDistanceValues]):
    """
    Output class for results of ks_2samp feature-wise comparisons of new metadata to reference metadata.

    Attributes
    ----------
    key : str
        Metadata feature names
    value : :class:`.MetadataDistanceValues`
        Output per feature name containing the statistic, statistic location, distance, and pvalue.
    """
@@ -8,27 +8,7 @@ from typing import Literal
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
- from dataeval._output import Output
12
-
13
-
14
- @dataclass(frozen=True)
15
- class OODOutput(Output):
16
- """
17
- Output class for predictions from out-of-distribution detectors.
18
-
19
- Attributes
20
- ----------
21
- is_ood : NDArray
22
- Array of images that are detected as :term:Out-of-Distribution (OOD)`
23
- instance_score : NDArray
24
- Instance score of the evaluated dataset
25
- feature_score : NDArray | None
26
- Feature score, if available, of the evaluated dataset
27
- """
28
-
29
- is_ood: NDArray[np.bool_]
30
- instance_score: NDArray[np.float32]
31
- feature_score: NDArray[np.float32] | None
11
+ from dataeval.outputs._base import Output
32
12
 
33
13
 
34
14
  @dataclass(frozen=True)
@@ -49,7 +29,7 @@ class OODScoreOutput(Output):
49
29
 
50
30
  def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
51
31
  """
52
- Returns either the instance or feature score
32
+ Returns either the instance or feature score.
53
33
 
54
34
  Parameters
55
35
  ----------
@@ -61,3 +41,23 @@ class OODScoreOutput(Output):
61
41
  Either the instance or feature score based on input selection
62
42
  """
63
43
  return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
44
+
45
+
46
@dataclass(frozen=True)
class OODOutput(Output):
    """
    Output class for predictions from out-of-distribution detectors.

    Attributes
    ----------
    is_ood : NDArray
        Array of images that are detected as :term:`Out-of-Distribution (OOD)`
    instance_score : NDArray
        Instance score of the evaluated dataset
    feature_score : NDArray | None
        Feature score, if available, of the evaluated dataset
    """

    is_ood: NDArray[np.bool_]
    instance_score: NDArray[np.float32]
    feature_score: NDArray[np.float32] | None
@@ -0,0 +1,393 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ import contextlib
6
+ from dataclasses import dataclass
7
+ from typing import Any, Iterable, Optional, Union
8
+
9
+ import numpy as np
10
+ from numpy.typing import NDArray
11
+ from typing_extensions import TypeAlias
12
+
13
+ with contextlib.suppress(ImportError):
14
+ import pandas as pd
15
+
16
+ from dataeval.outputs._base import Output
17
+ from dataeval.utils._plot import channel_histogram_plot, histogram_plot
18
+
19
# A filter value: a single int, an iterable of ints, or None (meaning "no filter").
OptionalRange: TypeAlias = Optional[Union[int, Iterable[int]]]

# Names of the bookkeeping fields on BaseStatsOutput; they are excluded from
# the length validation and from the plotted factors.
SOURCE_INDEX = "source_index"
BOX_COUNT = "box_count"
23
+
24
+
25
@dataclass(frozen=True)
class SourceIndex:
    """
    The indices of the source image, box and channel.

    Attributes
    ----------
    image : int
        Index of the source image
    box : int | None
        Index of the box of the source image (if applicable)
    channel : int | None
        Index of the channel of the source image (if applicable)
    """

    image: int
    box: int | None
    channel: int | None
43
+
44
+
45
def matches(index: int | None, opt_range: OptionalRange) -> bool:
    """
    Check whether an index passes an optional filter.

    A missing index or a missing filter always matches. An iterable filter
    matches by membership, any other filter matches by equality.
    """
    if index is None or opt_range is None:
        return True
    if isinstance(opt_range, Iterable):
        return index in opt_range
    return index == opt_range
49
+
50
+
51
@dataclass(frozen=True)
class BaseStatsOutput(Output):
    """
    Base output class for stats metrics indexed by source image, box and channel.

    Attributes
    ----------
    source_index : list[SourceIndex]
        Mapping from statistic to source image, box and channel index
    box_count : NDArray[np.uint16]
        NOTE(review): presumably the number of boxes per image - confirm against the producer
    """

    source_index: list[SourceIndex]
    box_count: NDArray[np.uint16]

    def __post_init__(self) -> None:
        """Validate that every stat has one entry per ``source_index`` entry."""
        length = len(self.source_index)
        bad = {k: len(v) for k, v in self.data().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
        if bad:
            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(bad)}.")

    def get_channel_mask(
        self,
        channel_index: OptionalRange,
        channel_count: OptionalRange = None,
    ) -> list[bool]:
        """
        Boolean mask for results filtered to specified channel index and optionally the count
        of the channels per image.

        Parameters
        ----------
        channel_index : int | Iterable[int] | None
            Index or indices of channel(s) to filter for
        channel_count : int | Iterable[int] | None
            Optional count(s) of channels to filter for

        Returns
        -------
        list[bool]
            One flag per ``source_index`` entry.
        """
        # Both filters are tested once per entry, so one-shot iterables
        # (generators, iterators) must be materialized before the loop.
        if isinstance(channel_index, Iterable):
            channel_index = tuple(channel_index)
        if isinstance(channel_count, Iterable):
            channel_count = tuple(channel_count)

        mask: list[bool] = []
        cur_mask: list[bool] = []
        cur_image = 0
        cur_max_channel = 0
        # The trailing None is a sentinel that flushes the final image's entries.
        for source_index in list(self.source_index) + [None]:
            if source_index is None or source_index.image > cur_image:
                # The image is complete: keep its flags only if its channel
                # count (highest channel index + 1) passes the count filter.
                mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
                if source_index is not None:
                    cur_image = source_index.image
                    cur_max_channel = 0
                    cur_mask.clear()
            if source_index is not None:
                cur_mask.append(matches(source_index.channel, channel_index))
                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
        return mask

    def __len__(self) -> int:
        """Return the number of ``source_index`` entries."""
        return len(self.source_index)

    def _get_channels(
        self, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> tuple[int, list[bool] | None]:
        """
        Resolve the number of channels to plot and the matching channel mask.

        ``channel_index`` takes precedence over ``channel_limit``. The mask is
        ``None`` when no filter applies or when the filter matches nothing.
        """
        source_index = self.data()[SOURCE_INDEX]
        # ``default=0`` avoids a ValueError from max() on an empty output.
        raw_channels = int(max((si.channel or 0 for si in source_index), default=0)) + 1

        # Materialize once: the values are inspected, counted and then used for
        # filtering, which would exhaust a generator after the first pass.
        if isinstance(channel_index, Iterable):
            channel_index = list(channel_index)

        if isinstance(channel_index, int):
            max_channels = 1 if channel_index < raw_channels else raw_channels
            ch_mask = self.get_channel_mask(channel_index)
        elif isinstance(channel_index, list) and all(isinstance(val, int) for val in channel_index):
            max_channels = len(channel_index)
            ch_mask = self.get_channel_mask(channel_index)
        elif isinstance(channel_limit, int):
            max_channels = channel_limit
            ch_mask = self.get_channel_mask(None, channel_limit)
        else:
            max_channels = raw_channels
            ch_mask = None

        if max_channels > raw_channels:
            max_channels = raw_channels
        if ch_mask is not None and not any(ch_mask):
            ch_mask = None

        return max_channels, ch_mask

    def factors(self) -> dict[str, NDArray[Any]]:
        """Return the one-dimensional stat arrays that contain at least one non-zero value."""
        return {
            k: v
            for k, v in self.data().items()
            if k not in (SOURCE_INDEX, BOX_COUNT) and isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1
        }

    def plot(
        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> None:
        """
        Plot histograms of the stat factors.

        Parameters
        ----------
        log : bool
            Passed through to the histogram plotting helpers
        channel_limit : int | None
            Optional channel count used to filter the plotted results
        channel_index : int | Iterable[int] | None
            Optional channel index or indices to plot
        """
        max_channels, ch_mask = self._get_channels(channel_limit, channel_index)
        if max_channels == 1:
            histogram_plot(self.factors(), log)
        else:
            channel_histogram_plot(self.factors(), log, max_channels, ch_mask)
145
+
146
+
147
@dataclass(frozen=True)
class DimensionStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.dimensionstats` stats metric.

    Attributes
    ----------
    left : NDArray[np.int32]
        Offsets from the left edge of images in pixels
    top : NDArray[np.int32]
        Offsets from the top edge of images in pixels
    width : NDArray[np.uint32]
        Width of the images in pixels
    height : NDArray[np.uint32]
        Height of the images in pixels
    channels : NDArray[np.uint8]
        Channel count of the images
    size : NDArray[np.uint32]
        Size of the images in pixels
    aspect_ratio : NDArray[np.float16]
        :term:`Aspect Ratio` of the images (width/height)
    depth : NDArray[np.uint8]
        Color depth of the images in bits
    center : NDArray[np.int16]
        Offset from center in [x,y] coordinates of the images in pixels
    distance : NDArray[np.float16]
        Distance in pixels from center
    """

    left: NDArray[np.int32]
    top: NDArray[np.int32]
    width: NDArray[np.uint32]
    height: NDArray[np.uint32]
    channels: NDArray[np.uint8]
    size: NDArray[np.uint32]
    aspect_ratio: NDArray[np.float16]
    depth: NDArray[np.uint8]
    center: NDArray[np.int16]
    distance: NDArray[np.float16]
186
+
187
+
188
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.hashstats` stats metric.

    Attributes
    ----------
    xxhash : list[str]
        xxHash hash of the images as a hex string
    pchash : list[str]
        :term:`Perception-based Hash` of the images as a hex string
    """

    xxhash: list[str]
    pchash: list[str]
203
+
204
+
205
@dataclass(frozen=True)
class LabelStatsOutput(Output):
    """
    Output class for :func:`.labelstats` stats metric.

    Attributes
    ----------
    label_counts_per_class : list[int]
        Total label count for each class, indexed by class
    label_counts_per_image : list[int]
        Number of labels per image
    image_counts_per_class : list[int]
        Number of images each class is present in, indexed by class
    image_indices_per_class : list[list[int]]
        For each class, the indices of the images that have that label
    image_count : int
        Total number of images present
    class_count : int
        Total number of classes present
    label_count : int
        Total number of labels present
    class_names : list[str]
        Names of the classes, indexed by class
    """

    label_counts_per_class: list[int]
    label_counts_per_image: list[int]
    image_counts_per_class: list[int]
    image_indices_per_class: list[list[int]]
    image_count: int
    class_count: int
    label_count: int
    class_names: list[str]

    def to_table(self) -> str:
        """
        Formats the label statistics output results as a table.

        Returns
        -------
        str
        """
        # Column widths come from the printed length of the values. The
        # ``default`` arguments keep empty outputs from raising, and counting
        # characters (rather than taking log10) also handles all-zero counts.
        max_char = max((len(str(name)) for name in self.class_names), default=0)
        max_char = max(max_char, 5)
        max_count = max([*self.label_counts_per_class, *self.image_counts_per_class], default=0)
        max_num = max(len(str(max_count)), 11)

        # Display basic counts
        table_str = [f"Class Count: {self.class_count}"]
        table_str += [f"Label Count: {self.label_count}"]
        table_str += [f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}"]
        table_str += ["--------------------------------------"]

        # Display counts per class
        table_str += [f"{'Label':>{max_char}}: Total Count - Image Count"]
        for cls, name in enumerate(self.class_names):
            table_str += [
                f"{name:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}}"
                + " - "
                + f"{self.image_counts_per_class[cls]:^{max_num}}".rstrip()
            ]

        return "\n".join(table_str)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Exports the label statistics output results to a pandas DataFrame.

        Notes
        -----
        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.

        Returns
        -------
        pd.DataFrame
            One row per class with ``Label``, ``Total Count`` and ``Image Count`` columns.
        """
        import pandas as pd

        class_indices = range(len(self.class_names))
        total_count = [self.label_counts_per_class[cls] for cls in class_indices]
        image_count = [self.image_counts_per_class[cls] for cls in class_indices]

        return pd.DataFrame(
            {
                "Label": self.class_names,
                "Total Count": total_count,
                "Image Count": image_count,
            }
        )
299
+ )
300
+
301
+
302
@dataclass(frozen=True)
class PixelStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.pixelstats` stats metric.

    Attributes
    ----------
    mean : NDArray[np.float16]
        Mean of the pixel values of the images
    std : NDArray[np.float16]
        Standard deviation of the pixel values of the images
    var : NDArray[np.float16]
        :term:`Variance` of the pixel values of the images
    skew : NDArray[np.float16]
        Skew of the pixel values of the images
    kurtosis : NDArray[np.float16]
        Kurtosis of the pixel values of the images
    histogram : NDArray[np.uint32]
        Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
    entropy : NDArray[np.float16]
        Entropy of the pixel values of the images
    """

    mean: NDArray[np.float16]
    std: NDArray[np.float16]
    var: NDArray[np.float16]
    skew: NDArray[np.float16]
    kurtosis: NDArray[np.float16]
    histogram: NDArray[np.uint32]
    entropy: NDArray[np.float16]
332
+
333
+
334
@dataclass(frozen=True)
class VisualStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.visualstats` stats metric.

    Attributes
    ----------
    brightness : NDArray[np.float16]
        Brightness of the images
    contrast : NDArray[np.float16]
        Image contrast ratio
    darkness : NDArray[np.float16]
        Darkness of the images
    missing : NDArray[np.float16]
        Percentage of the images with missing pixels
    sharpness : NDArray[np.float16]
        Sharpness of the images
    zeros : NDArray[np.float16]
        Percentage of the images with zero value pixels
    percentiles : NDArray[np.float16]
        Percentiles of the pixel values of the images at (0, 25, 50, 75, 100)
    """

    brightness: NDArray[np.float16]
    contrast: NDArray[np.float16]
    darkness: NDArray[np.float16]
    missing: NDArray[np.float16]
    sharpness: NDArray[np.float16]
    zeros: NDArray[np.float16]
    percentiles: NDArray[np.float16]
364
+
365
+
366
@dataclass(frozen=True)
class ImageStatsOutput(DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput):
    """
    Output class for :func:`.imagestats` stats metric with `per_channel=False`.

    This class represents the combined outputs of various stats functions against a
    single dataset, such that each index across all stat outputs is representative
    of the same source image. Modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    The attributes and methods are a superset of :class:`.DimensionStatsOutput`,
    :class:`.PixelStatsOutput` and :class:`.VisualStatsOutput`.
    """
379
+
380
+
381
@dataclass(frozen=True)
class ChannelStatsOutput(PixelStatsOutput, VisualStatsOutput):
    """
    Output class for :func:`.imagestats` stats metric with `per_channel=True`.

    This class represents the outputs of various per-channel stats functions against
    a single dataset, such that each index across all stat outputs is representative
    of the same source image. Modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    The attributes and methods are a superset of :class:`.PixelStatsOutput` and
    :class:`.VisualStatsOutput`.
    """