dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,184 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ import contextlib
6
+ from dataclasses import dataclass
7
+ from typing import Generic, TypeVar, Union
8
+
9
+ from typing_extensions import TypeAlias
10
+
11
+ with contextlib.suppress(ImportError):
12
+ import pandas as pd
13
+
14
+ from dataeval.outputs._base import Output
15
+ from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
16
+
17
+ DuplicateGroup: TypeAlias = list[int]
18
+ DatasetDuplicateGroupMap: TypeAlias = dict[int, DuplicateGroup]
19
+ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
20
+
21
+ IndexIssueMap: TypeAlias = dict[int, dict[str, float]]
22
+ OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
23
+ TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
24
+
25
+
26
@dataclass(frozen=True)
class DuplicatesOutput(Generic[TIndexCollection], Output):
    """
    Output class for :class:`.Duplicates` lint detector.

    Attributes
    ----------
    exact : list[list[int] | dict[int, list[int]]]
        Indices of images that are exact matches
    near : list[list[int] | dict[int, list[int]]]
        Indices of images that are near matches

    - For a single dataset, indices are returned as a list of index groups.
    - For multiple datasets, indices are returned as dictionaries where the key is the
      index of the dataset, and the value is the group of indices from that dataset.
    """

    # Field order defines the positional order of the generated __init__.
    exact: list[TIndexCollection]
    near: list[TIndexCollection]
45
+
46
+
47
+ def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOutput):
48
+ """Flip result from grouping by image to grouping by class and metric"""
49
+ metrics = {}
50
+ class_wise = {label: {} for label in lstats.class_names}
51
+
52
+ # Group metrics and calculate class-wise counts
53
+ for img, group in result.items():
54
+ for extreme in group:
55
+ metrics.setdefault(extreme, []).append(img)
56
+ for i, images in enumerate(lstats.image_indices_per_class):
57
+ if img in images:
58
+ class_wise[lstats.class_names[i]][extreme] = class_wise[lstats.class_names[i]].get(extreme, 0) + 1
59
+
60
+ return metrics, class_wise
61
+
62
+
63
+ def _create_table(metrics, class_wise):
64
+ """Create table for displaying the results"""
65
+ max_class_length = max(len(str(label)) for label in class_wise) + 2
66
+ max_total = max(len(metrics[group]) for group in metrics) + 2
67
+
68
+ table_header = " | ".join(
69
+ [f"{'Class':>{max_class_length}}"]
70
+ + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
71
+ + [f"{'Total':<{max_total}}"]
72
+ )
73
+ table_rows = []
74
+
75
+ for class_cat, results in class_wise.items():
76
+ table_value = [f"{class_cat:>{max_class_length}}"]
77
+ total = 0
78
+ for group in sorted(metrics.keys()):
79
+ count = results.get(group, 0)
80
+ table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
81
+ total += count
82
+ table_value.append(f"{total:^{max_total}}")
83
+ table_rows.append(" | ".join(table_value))
84
+
85
+ table = [table_header] + table_rows
86
+ return table
87
+
88
+
89
+ def _create_pandas_dataframe(class_wise):
90
+ """Create data for pandas dataframe"""
91
+ data = []
92
+ for label, metrics_dict in class_wise.items():
93
+ row = {"Class": label}
94
+ total = sum(metrics_dict.values())
95
+ row.update(metrics_dict) # Add metric counts
96
+ row["Total"] = total
97
+ data.append(row)
98
+ return data
99
+
100
+
101
@dataclass(frozen=True)
class OutliersOutput(Generic[TIndexIssueMap], Output):
    """
    Output class for :class:`.Outliers` lint detector.

    Attributes
    ----------
    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
        Indices of image Outliers with their associated issue type and calculated values.

    - For a single dataset, a dictionary containing the indices of outliers and
      a dictionary showing the issues and calculated values for the given index.
    - For multiple stats outputs, a list of dictionaries containing the indices of
      outliers and their associated issues and calculated values.
    """

    issues: TIndexIssueMap

    def __len__(self) -> int:
        # A single mapping counts its own entries; a list counts entries across all mappings.
        found = self.issues
        return len(found) if isinstance(found, dict) else sum(map(len, found))

    def to_table(self, labelstats: LabelStatsOutput) -> str:
        """
        Formats the outlier output results as a table.

        Parameters
        ----------
        labelstats : LabelStatsOutput
            Output of :func:`.labelstats`

        Returns
        -------
        str
            One table for a single issue mapping; for a list of mappings, one
            table per mapping separated by a blank line.
        """

        def render(issue_map) -> str:
            by_metric, by_class = _reorganize_by_class_and_metric(issue_map, labelstats)
            return "\n".join(_create_table(by_metric, by_class))

        if isinstance(self.issues, dict):
            return render(self.issues)
        return "\n\n".join([render(issue_map) for issue_map in self.issues])

    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
        """
        Exports the outliers output results to a pandas DataFrame.

        Parameters
        ----------
        labelstats : LabelStatsOutput
            Output of :func:`.labelstats`

        Returns
        -------
        pd.DataFrame
            Per-class counts; for a list of issue mappings the frames are
            concatenated and a ``Dataset`` column holds each mapping's position.

        Notes
        -----
        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
        """
        import pandas as pd

        def build(issue_map):
            _, by_class = _reorganize_by_class_and_metric(issue_map, labelstats)
            return pd.DataFrame(_create_pandas_dataframe(by_class))

        if isinstance(self.issues, dict):
            return build(self.issues)

        frames = []
        for position, issue_map in enumerate(self.issues):
            frame = build(issue_map)
            frame["Dataset"] = position
            frames.append(frame)
        return pd.concat(frames)
@@ -8,27 +8,7 @@ from typing import Literal
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
- from dataeval.output import Output
12
-
13
-
14
- @dataclass(frozen=True)
15
- class OODOutput(Output):
16
- """
17
- Output class for predictions from out-of-distribution detectors.
18
-
19
- Attributes
20
- ----------
21
- is_ood : NDArray
22
- Array of images that are detected as :term:Out-of-Distribution (OOD)`
23
- instance_score : NDArray
24
- Instance score of the evaluated dataset
25
- feature_score : NDArray | None
26
- Feature score, if available, of the evaluated dataset
27
- """
28
-
29
- is_ood: NDArray[np.bool_]
30
- instance_score: NDArray[np.float32]
31
- feature_score: NDArray[np.float32] | None
11
+ from dataeval.outputs._base import Output
32
12
 
33
13
 
34
14
  @dataclass(frozen=True)
@@ -49,7 +29,7 @@ class OODScoreOutput(Output):
49
29
 
50
30
  def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
51
31
  """
52
- Returns either the instance or feature score
32
+ Returns either the instance or feature score.
53
33
 
54
34
  Parameters
55
35
  ----------
@@ -61,3 +41,23 @@ class OODScoreOutput(Output):
61
41
  Either the instance or feature score based on input selection
62
42
  """
63
43
  return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
44
+
45
+
46
@dataclass(frozen=True)
class OODOutput(Output):
    """
    Output class for predictions from out-of-distribution detectors.

    Attributes
    ----------
    is_ood : NDArray[np.bool_]
        Array of images that are detected as :term:`Out-of-Distribution (OOD)`
    instance_score : NDArray[np.float32]
        Instance score of the evaluated dataset
    feature_score : NDArray[np.float32] | None
        Feature score, if available, of the evaluated dataset
    """

    is_ood: NDArray[np.bool_]
    instance_score: NDArray[np.float32]
    feature_score: NDArray[np.float32] | None
@@ -0,0 +1,387 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ import contextlib
6
+ from dataclasses import dataclass
7
+ from typing import Iterable, Optional, Union
8
+
9
+ import numpy as np
10
+ from numpy.typing import NDArray
11
+ from typing_extensions import TypeAlias
12
+
13
+ with contextlib.suppress(ImportError):
14
+ import pandas as pd
15
+
16
+ from dataeval.outputs._base import Output
17
+ from dataeval.utils._plot import channel_histogram_plot, histogram_plot
18
+
19
+ OptionalRange: TypeAlias = Optional[Union[int, Iterable[int]]]
20
+
21
+ SOURCE_INDEX = "source_index"
22
+ BOX_COUNT = "box_count"
23
+
24
+
25
@dataclass(frozen=True)
class SourceIndex:
    """
    The indices of the source image, box and channel.

    Attributes
    ----------
    image : int
        Index of the source image
    box : int | None
        Index of the box of the source image (if applicable)
    channel : int | None
        Index of the channel of the source image (if applicable)
    """

    # All three fields are required; box and channel are None when not applicable.
    image: int
    box: int | None
    channel: int | None
43
+
44
+
45
def matches(index: int | None, opt_range: OptionalRange) -> bool:
    """
    Check whether an index satisfies an optional filter.

    Parameters
    ----------
    index : int | None
        Index to test; ``None`` always matches
    opt_range : int | Iterable[int] | None
        A single value, a collection of accepted values, or ``None`` for no filtering

    Returns
    -------
    bool
        ``True`` when nothing is filtered, or when ``index`` equals ``opt_range``
        or is contained in it.
    """
    if index is None or opt_range is None:
        return True
    if isinstance(opt_range, Iterable):
        return index in opt_range
    return index == opt_range
49
+
50
+
51
@dataclass(frozen=True)
class BaseStatsOutput(Output):
    """
    Base output class for statistics whose values are aligned with ``source_index``.

    Attributes
    ----------
    source_index : list[SourceIndex]
        Mapping from statistic to source image, box and channel index
    box_count : NDArray[np.uint16]
    """

    source_index: list[SourceIndex]
    box_count: NDArray[np.uint16]

    def __post_init__(self) -> None:
        # Every value other than source_index and box_count must have one entry per source index.
        length = len(self.source_index)
        bad = {k: len(v) for k, v in self.dict().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
        if bad:
            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(bad)}.")

    @staticmethod
    def _reusable(opt_range: OptionalRange) -> OptionalRange:
        """Snapshot a one-shot iterator so it can be tested for membership repeatedly."""
        from collections.abc import Iterator

        # Containers (list, range, set, ...) are passed through untouched; only
        # iterators/generators are exhausted by their first `in` test or `list()` call.
        return tuple(opt_range) if isinstance(opt_range, Iterator) else opt_range

    def get_channel_mask(
        self,
        channel_index: OptionalRange,
        channel_count: OptionalRange = None,
    ) -> list[bool]:
        """
        Boolean mask for results filtered to specified channel index and optionally the count
        of the channels per image.

        Parameters
        ----------
        channel_index : int | Iterable[int] | None
            Index or indices of channel(s) to filter for
        channel_count : int | Iterable[int] | None
            Optional count(s) of channels to filter for

        Returns
        -------
        list[bool]
            One flag per entry of ``source_index``. An entry is kept when its channel
            matches ``channel_index`` and its image's channel count (highest channel
            index seen for that image, plus one) matches ``channel_count``.
        """
        channel_index = self._reusable(channel_index)
        channel_count = self._reusable(channel_count)

        mask: list[bool] = []
        cur_mask: list[bool] = []
        cur_image = 0
        cur_max_channel = 0
        # The trailing None is a sentinel that flushes the final image's entries.
        for source_index in list(self.source_index) + [None]:
            # A larger image index marks the start of the next image's entries.
            if source_index is None or source_index.image > cur_image:
                keep_image = matches(cur_max_channel + 1, channel_count)
                mask.extend(cur_mask if keep_image else [False] * len(cur_mask))
                if source_index is not None:
                    cur_image = source_index.image
                    cur_max_channel = 0
                    cur_mask.clear()
            if source_index is not None:
                cur_mask.append(matches(source_index.channel, channel_index))
                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
        return mask

    def __len__(self) -> int:
        return len(self.source_index)

    def _get_channels(
        self, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> tuple[int, list[bool] | None]:
        """
        Resolve how many channels to display and which entries to include.

        ``channel_index`` takes precedence over ``channel_limit``. The returned
        channel count never exceeds the number of channels present, and the mask
        is ``None`` when no filtering applies or when the filter matched nothing.
        """
        # Materialize once: the value is iterated here and again inside get_channel_mask.
        channel_index = self._reusable(channel_index)

        source_index = self.dict()[SOURCE_INDEX]
        raw_channels = int(max([si.channel or 0 for si in source_index])) + 1
        if isinstance(channel_index, int):
            max_channels = 1 if channel_index < raw_channels else raw_channels
            ch_mask = self.get_channel_mask(channel_index)
        elif isinstance(channel_index, Iterable) and all(isinstance(val, int) for val in channel_index):
            max_channels = len(list(channel_index))
            ch_mask = self.get_channel_mask(channel_index)
        elif isinstance(channel_limit, int):
            max_channels = channel_limit
            ch_mask = self.get_channel_mask(None, channel_limit)
        else:
            max_channels = raw_channels
            ch_mask = None

        if max_channels > raw_channels:
            max_channels = raw_channels
        if ch_mask is not None and not any(ch_mask):
            ch_mask = None

        return max_channels, ch_mask

    def plot(
        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> None:
        """
        Plot histograms of the one-dimensional array statistics.

        Parameters
        ----------
        log : bool
            Passed through to the histogram plotting helper
        channel_limit : int | None
            Optional channel count to filter images by
        channel_index : int | Iterable[int] | None
            Optional channel index or indices to filter for
        """
        max_channels, ch_mask = self._get_channels(channel_limit, channel_index)
        # Only 1-D arrays with at least one non-zero value are plotted.
        d = {k: v for k, v in self.dict().items() if isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1}
        if max_channels == 1:
            histogram_plot(d, log)
        else:
            channel_histogram_plot(d, log, max_channels, ch_mask)
139
+
140
+
141
@dataclass(frozen=True)
class DimensionStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.dimensionstats` stats metric.

    Attributes
    ----------
    left : NDArray[np.int32]
        Offsets from the left edge of images in pixels
    top : NDArray[np.int32]
        Offsets from the top edge of images in pixels
    width : NDArray[np.uint32]
        Width of the images in pixels
    height : NDArray[np.uint32]
        Height of the images in pixels
    channels : NDArray[np.uint8]
        Channel count of the images
    size : NDArray[np.uint32]
        Size of the images in pixels
    aspect_ratio : NDArray[np.float16]
        :term:`Aspect Ratio` of the images (width/height)
    depth : NDArray[np.uint8]
        Color depth of the images in bits
    center : NDArray[np.int16]
        Offset from center in [x,y] coordinates of the images in pixels
    distance : NDArray[np.float16]
        Distance in pixels from center
    """

    left: NDArray[np.int32]
    top: NDArray[np.int32]
    width: NDArray[np.uint32]
    height: NDArray[np.uint32]
    channels: NDArray[np.uint8]
    size: NDArray[np.uint32]
    aspect_ratio: NDArray[np.float16]
    depth: NDArray[np.uint8]
    center: NDArray[np.int16]
    distance: NDArray[np.float16]
180
+
181
+
182
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.hashstats` stats metric.

    Attributes
    ----------
    xxhash : list[str]
        xxHash hash of the images as a hex string
    pchash : list[str]
        :term:`Perception-based Hash` of the images as a hex string
    """

    xxhash: list[str]
    pchash: list[str]
197
+
198
+
199
@dataclass(frozen=True)
class LabelStatsOutput(Output):
    """
    Output class for :func:`.labelstats` stats metric.

    Attributes
    ----------
    label_counts_per_class : list[int]
        Total label count of each class, in the same order as ``class_names``
    label_counts_per_image : list[int]
        Number of labels per image
    image_counts_per_class : list[int]
        Number of images each class is present in, in the same order as ``class_names``
    image_indices_per_class : list[list[int]]
        For each class, the list of images that have that label
    image_count : int
        Total number of images present
    class_count : int
        Total number of classes present
    label_count : int
        Total number of labels present
    class_names : list[str]
        Names of the classes
    """

    label_counts_per_class: list[int]
    label_counts_per_image: list[int]
    image_counts_per_class: list[int]
    image_indices_per_class: list[list[int]]
    image_count: int
    class_count: int
    label_count: int
    class_names: list[str]

    def to_table(self) -> str:
        """
        Formats the label statistics output results as a table.

        Returns
        -------
        str
            Summary counts followed by one line per class with its total label
            count and image count.
        """
        # Column widths come from the printed length of the values; default=0 keeps
        # empty statistics from raising, and avoids taking log10 of a zero count.
        max_char = max((len(str(name)) for name in self.class_names), default=0)
        max_char = max(max_char, 5)
        max_label = max(self.label_counts_per_class, default=0)
        max_img = max(self.image_counts_per_class, default=0)
        max_num = max(len(str(max(max_label, max_img))), 11)

        # Display basic counts
        table_str = [f"Class Count: {self.class_count}"]
        table_str += [f"Label Count: {self.label_count}"]
        table_str += [f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}"]
        table_str += ["--------------------------------------"]

        # Display counts per class
        table_str += [f"{'Label':>{max_char}}: Total Count - Image Count"]
        for cls, name in enumerate(self.class_names):
            table_str += [
                f"{name:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}}"
                + " - "
                # Only the final column is stripped so lines carry no trailing padding.
                + f"{self.image_counts_per_class[cls]:^{max_num}}".rstrip()
            ]

        return "\n".join(table_str)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Exports the label statistics output results to a pandas DataFrame.

        Notes
        -----
        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.

        Returns
        -------
        pd.DataFrame
            One row per class with ``Label``, ``Total Count`` and ``Image Count`` columns.
        """
        import pandas as pd

        class_range = range(len(self.class_names))
        return pd.DataFrame(
            {
                "Label": self.class_names,
                "Total Count": [self.label_counts_per_class[cls] for cls in class_range],
                "Image Count": [self.image_counts_per_class[cls] for cls in class_range],
            }
        )
294
+
295
+
296
@dataclass(frozen=True)
class PixelStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.pixelstats` stats metric.

    Attributes
    ----------
    mean : NDArray[np.float16]
        Mean of the pixel values of the images
    std : NDArray[np.float16]
        Standard deviation of the pixel values of the images
    var : NDArray[np.float16]
        :term:`Variance` of the pixel values of the images
    skew : NDArray[np.float16]
        Skew of the pixel values of the images
    kurtosis : NDArray[np.float16]
        Kurtosis of the pixel values of the images
    histogram : NDArray[np.uint32]
        Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
    entropy : NDArray[np.float16]
        Entropy of the pixel values of the images
    """

    # These fields follow the ones inherited from BaseStatsOutput.
    mean: NDArray[np.float16]
    std: NDArray[np.float16]
    var: NDArray[np.float16]
    skew: NDArray[np.float16]
    kurtosis: NDArray[np.float16]
    histogram: NDArray[np.uint32]
    entropy: NDArray[np.float16]
326
+
327
+
328
@dataclass(frozen=True)
class VisualStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.visualstats` stats metric.

    Attributes
    ----------
    brightness : NDArray[np.float16]
        Brightness of the images
    contrast : NDArray[np.float16]
        Image contrast ratio
    darkness : NDArray[np.float16]
        Darkness of the images
    missing : NDArray[np.float16]
        Percentage of the images with missing pixels
    sharpness : NDArray[np.float16]
        Sharpness of the images
    zeros : NDArray[np.float16]
        Percentage of the images with zero value pixels
    percentiles : NDArray[np.float16]
        Percentiles of the pixel values of the images with quartiles of (0, 25, 50, 75, 100)
    """

    # These fields follow the ones inherited from BaseStatsOutput.
    brightness: NDArray[np.float16]
    contrast: NDArray[np.float16]
    darkness: NDArray[np.float16]
    missing: NDArray[np.float16]
    sharpness: NDArray[np.float16]
    zeros: NDArray[np.float16]
    percentiles: NDArray[np.float16]
358
+
359
+
360
@dataclass(frozen=True)
class ImageStatsOutput(DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput):
    """
    Output class for :func:`.imagestats` stats metric with `per_channel=False`.

    This class represents the combined outputs of various stats functions against a
    single dataset, such that each index across all stat outputs is representative
    of the same source image. Modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    The attributes and methods are a superset of :class:`.DimensionStatsOutput`,
    :class:`.PixelStatsOutput` and :class:`.VisualStatsOutput`.
    """
373
+
374
+
375
@dataclass(frozen=True)
class ChannelStatsOutput(PixelStatsOutput, VisualStatsOutput):
    """
    Output class for :func:`.imagestats` stats metric with `per_channel=True`.

    This class represents the outputs of various per-channel stats functions against
    a single dataset, such that each index across all stat outputs is representative
    of the same source image. Modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    The attributes and methods are a superset of :class:`.PixelStatsOutput` and
    :class:`.VisualStatsOutput`.
    """
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import numpy as np
8
+ from numpy.typing import NDArray
9
+
10
+ from dataeval.outputs._base import Output
11
+
12
+
13
@dataclass(frozen=True)
class TrainValSplit:
    """
    Dataclass containing train and validation indices.

    Attributes
    ----------
    train : NDArray[np.intp]
        Indices for the training set
    val : NDArray[np.intp]
        Indices for the validation set
    """

    train: NDArray[np.intp]
    val: NDArray[np.intp]
28
+
29
+
30
@dataclass(frozen=True)
class SplitDatasetOutput(Output):
    """
    Output class containing test indices and a list of TrainValSplits.

    Attributes
    ----------
    test : NDArray[np.intp]
        Indices for the test set
    folds : list[TrainValSplit]
        List of train and validation split indices
    """

    test: NDArray[np.intp]
    folds: list[TrainValSplit]