dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +40 -85
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
- dataeval/detectors/drift/updates.py +20 -3
- dataeval/detectors/linters/__init__.py +3 -5
- dataeval/detectors/linters/duplicates.py +13 -36
- dataeval/detectors/linters/outliers.py +23 -148
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +30 -9
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/mixin.py +21 -7
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +6 -0
- dataeval/metadata/_distance.py +167 -0
- dataeval/metadata/_ood.py +217 -0
- dataeval/metadata/_utils.py +44 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +6 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
- dataeval/metrics/bias/_coverage.py +98 -0
- dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
- dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
- dataeval/metrics/estimators/__init__.py +15 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
- dataeval/metrics/estimators/_clusterer.py +44 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
- dataeval/metrics/stats/__init__.py +16 -13
- dataeval/metrics/stats/{base.py → _base.py} +82 -133
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
- dataeval/metrics/stats/_dimensionstats.py +75 -0
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
- dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval/metrics/stats/_labelstats.py +131 -0
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
- dataeval/outputs/__init__.py +53 -0
- dataeval/{output.py → outputs/_base.py} +55 -25
- dataeval/outputs/_bias.py +381 -0
- dataeval/outputs/_drift.py +83 -0
- dataeval/outputs/_estimators.py +114 -0
- dataeval/outputs/_linters.py +184 -0
- dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
- dataeval/outputs/_stats.py +387 -0
- dataeval/outputs/_utils.py +44 -0
- dataeval/outputs/_workflows.py +364 -0
- dataeval/typing.py +234 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +14 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +6 -6
- dataeval/utils/data/__init__.py +26 -0
- dataeval/utils/data/_dataset.py +217 -0
- dataeval/utils/data/_embeddings.py +104 -0
- dataeval/utils/data/_images.py +68 -0
- dataeval/utils/data/_metadata.py +360 -0
- dataeval/utils/data/_selection.py +126 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
- dataeval/utils/data/_targets.py +85 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_types.py +52 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +57 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/__init__.py +2 -1
- dataeval/workflows/sufficiency.py +11 -346
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
- dataeval-0.82.0.dist-info/RECORD +104 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/metrics/bias/coverage.py +0 -194
- dataeval/metrics/stats/datasetstats.py +0 -202
- dataeval/metrics/stats/dimensionstats.py +0 -115
- dataeval/metrics/stats/labelstats.py +0 -210
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,184 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
import contextlib
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from typing import Generic, TypeVar, Union
|
8
|
+
|
9
|
+
from typing_extensions import TypeAlias
|
10
|
+
|
11
|
+
with contextlib.suppress(ImportError):
|
12
|
+
import pandas as pd
|
13
|
+
|
14
|
+
from dataeval.outputs._base import Output
|
15
|
+
from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
|
16
|
+
|
17
|
+
DuplicateGroup: TypeAlias = list[int]
|
18
|
+
DatasetDuplicateGroupMap: TypeAlias = dict[int, DuplicateGroup]
|
19
|
+
TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
|
20
|
+
|
21
|
+
IndexIssueMap: TypeAlias = dict[int, dict[str, float]]
|
22
|
+
OutlierStatsOutput: TypeAlias = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
|
23
|
+
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass(frozen=True)
class DuplicatesOutput(Generic[TIndexCollection], Output):
    """
    Output class for :class:`.Duplicates` lint detector.

    Attributes
    ----------
    exact : list[list[int] | dict[int, list[int]]]
        Indices of images that are exact matches
    near : list[list[int] | dict[int, list[int]]]
        Indices of images that are near matches

    Notes
    -----
    - For a single dataset, indices are returned as a list of index groups.
    - For multiple datasets, indices are returned as dictionaries where the key is the
      index of the dataset, and the value is the list of index groups from that dataset.
    """

    # Both fields share the same element type (TIndexCollection), so a single output
    # is either entirely list-based or entirely dictionary-based.
    exact: list[TIndexCollection]
    near: list[TIndexCollection]
|
45
|
+
|
46
|
+
|
47
|
+
def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOutput):
    """
    Flip outlier results from grouping by image to grouping by metric and by class.

    Parameters
    ----------
    result : IndexIssueMap
        Mapping of image index to a mapping of flagged metric name to its value.
    lstats : LabelStatsOutput
        Label statistics; only ``class_names`` and ``image_indices_per_class``
        are read, and they are treated as parallel sequences.

    Returns
    -------
    tuple[dict, dict]
        ``metrics`` maps each metric name to the list of image indices flagged for it
        (in iteration order of ``result``). ``class_wise`` maps every class name to a
        mapping of metric name to the number of flagged images belonging to that class.
    """
    metrics = {}
    class_wise = {label: {} for label in lstats.class_names}

    # Build the membership sets once so each lookup is O(1) instead of a list scan
    # repeated for every (image, metric, class) combination.
    class_members = [set(indices) for indices in lstats.image_indices_per_class]

    # Group metrics and calculate class-wise counts
    for img, group in result.items():
        if not group:
            continue
        # The classes an image belongs to do not depend on the metric; resolve once.
        labels = [lstats.class_names[i] for i, members in enumerate(class_members) if img in members]
        for extreme in group:
            metrics.setdefault(extreme, []).append(img)
            for label in labels:
                counts = class_wise[label]
                counts[extreme] = counts.get(extreme, 0) + 1

    return metrics, class_wise
|
61
|
+
|
62
|
+
|
63
|
+
def _create_table(metrics, class_wise):
    """
    Create the text table used to display outlier counts per class and metric.

    Parameters
    ----------
    metrics : dict
        Mapping of metric name to the list of image indices flagged for it. Only the
        keys are used; they become the (sorted) metric columns.
    class_wise : dict
        Mapping of class name to a mapping of metric name to flagged image count.

    Returns
    -------
    list[str]
        The header line followed by one line per class. Empty ``metrics`` and/or
        ``class_wise`` yield a table with no metric columns and/or no class rows
        instead of raising.
    """
    groups = sorted(metrics)
    group_widths = {group: max(5, len(str(group))) + 2 for group in groups}

    # Resolve every row's counts up front so the Total column can be sized from the
    # text that is actually rendered rather than from the number of flagged images.
    rows = []
    for class_cat, results in class_wise.items():
        counts = [results.get(group, 0) for group in groups]
        rows.append((class_cat, counts, sum(counts)))

    # ``default=0`` keeps an empty class mapping from raising in ``max``.
    max_class_length = max((len(str(label)) for label in class_wise), default=0) + 2
    max_total = max([len("Total")] + [len(str(total)) for _, _, total in rows]) + 2

    table_header = " | ".join(
        [f"{'Class':>{max_class_length}}"]
        + [f"{group:^{group_widths[group]}}" for group in groups]
        + [f"{'Total':^{max_total}}"]  # centred to match the centred row values
    )

    table = [table_header]
    for class_cat, counts, total in rows:
        table_value = [f"{class_cat:>{max_class_length}}"]
        table_value += [f"{count:^{group_widths[group]}}" for group, count in zip(groups, counts)]
        table_value.append(f"{total:^{max_total}}")
        table.append(" | ".join(table_value))

    return table
|
87
|
+
|
88
|
+
|
89
|
+
def _create_pandas_dataframe(class_wise):
    """
    Build the row records used to construct the outlier DataFrame.

    Each class becomes one record holding its name under ``"Class"``, its per-metric
    counts, and the sum of those counts under ``"Total"``.
    """
    return [
        {"Class": label, **metric_counts, "Total": sum(metric_counts.values())}
        for label, metric_counts in class_wise.items()
    ]
|
99
|
+
|
100
|
+
|
101
|
+
@dataclass(frozen=True)
class OutliersOutput(Generic[TIndexIssueMap], Output):
    """
    Output class for :class:`.Outliers` lint detector.

    Attributes
    ----------
    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
        Indices of image Outliers with their associated issue type and calculated values.

        - For a single dataset, a dictionary containing the indices of outliers and
          a dictionary showing the issues and calculated values for the given index.
        - For multiple stats outputs, a list of dictionaries containing the indices of
          outliers and their associated issues and calculated values.
    """

    issues: TIndexIssueMap

    def __len__(self) -> int:
        # A single mapping counts its own entries; a list sums the entries of each mapping.
        return len(self.issues) if isinstance(self.issues, dict) else sum(map(len, self.issues))

    def to_table(self, labelstats: LabelStatsOutput) -> str:
        """
        Formats the outlier output results as a table.

        Parameters
        ----------
        labelstats : LabelStatsOutput
            Output of :func:`.labelstats`

        Returns
        -------
        str
            One table per issue mapping; multiple tables are separated by a blank line.
        """
        # Treat the single-dataset case as a one-element list so both shapes share a path.
        issue_maps = [self.issues] if isinstance(self.issues, dict) else self.issues
        rendered = []
        for issue_map in issue_maps:
            by_metric, by_class = _reorganize_by_class_and_metric(issue_map, labelstats)
            rendered.append("\n".join(_create_table(by_metric, by_class)))
        return "\n\n".join(rendered)

    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
        """
        Exports the outliers output results to a pandas DataFrame.

        Parameters
        ----------
        labelstats : LabelStatsOutput
            Output of :func:`.labelstats`

        Returns
        -------
        pd.DataFrame
            One row per class. When ``issues`` is a list, the frames of each entry are
            concatenated and carry an additional ``"Dataset"`` column with the entry's
            position in the list.

        Notes
        -----
        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
        """
        import pandas as pd

        if isinstance(self.issues, dict):
            by_class = _reorganize_by_class_and_metric(self.issues, labelstats)[1]
            return pd.DataFrame(_create_pandas_dataframe(by_class))

        frames = []
        for position, issue_map in enumerate(self.issues):
            by_class = _reorganize_by_class_and_metric(issue_map, labelstats)[1]
            frame = pd.DataFrame(_create_pandas_dataframe(by_class))
            frame["Dataset"] = position
            frames.append(frame)
        return pd.concat(frames)
|
@@ -8,27 +8,7 @@ from typing import Literal
|
|
8
8
|
import numpy as np
|
9
9
|
from numpy.typing import NDArray
|
10
10
|
|
11
|
-
from dataeval.
|
12
|
-
|
13
|
-
|
14
|
-
@dataclass(frozen=True)
|
15
|
-
class OODOutput(Output):
|
16
|
-
"""
|
17
|
-
Output class for predictions from out-of-distribution detectors.
|
18
|
-
|
19
|
-
Attributes
|
20
|
-
----------
|
21
|
-
is_ood : NDArray
|
22
|
-
Array of images that are detected as :term:Out-of-Distribution (OOD)`
|
23
|
-
instance_score : NDArray
|
24
|
-
Instance score of the evaluated dataset
|
25
|
-
feature_score : NDArray | None
|
26
|
-
Feature score, if available, of the evaluated dataset
|
27
|
-
"""
|
28
|
-
|
29
|
-
is_ood: NDArray[np.bool_]
|
30
|
-
instance_score: NDArray[np.float32]
|
31
|
-
feature_score: NDArray[np.float32] | None
|
11
|
+
from dataeval.outputs._base import Output
|
32
12
|
|
33
13
|
|
34
14
|
@dataclass(frozen=True)
|
@@ -49,7 +29,7 @@ class OODScoreOutput(Output):
|
|
49
29
|
|
50
30
|
def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
|
51
31
|
"""
|
52
|
-
Returns either the instance or feature score
|
32
|
+
Returns either the instance or feature score.
|
53
33
|
|
54
34
|
Parameters
|
55
35
|
----------
|
@@ -61,3 +41,23 @@ class OODScoreOutput(Output):
|
|
61
41
|
Either the instance or feature score based on input selection
|
62
42
|
"""
|
63
43
|
return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
|
44
|
+
|
45
|
+
|
46
|
+
@dataclass(frozen=True)
class OODOutput(Output):
    """
    Output class for predictions from out-of-distribution detectors.

    Attributes
    ----------
    is_ood : NDArray[np.bool_]
        Array of images that are detected as :term:`Out-of-Distribution (OOD)`
    instance_score : NDArray[np.float32]
        Instance score of the evaluated dataset
    feature_score : NDArray[np.float32] | None
        Feature score, if available, of the evaluated dataset
    """

    is_ood: NDArray[np.bool_]
    instance_score: NDArray[np.float32]
    feature_score: NDArray[np.float32] | None
|
@@ -0,0 +1,387 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
import contextlib
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from typing import Iterable, Optional, Union
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
from numpy.typing import NDArray
|
11
|
+
from typing_extensions import TypeAlias
|
12
|
+
|
13
|
+
with contextlib.suppress(ImportError):
|
14
|
+
import pandas as pd
|
15
|
+
|
16
|
+
from dataeval.outputs._base import Output
|
17
|
+
from dataeval.utils._plot import channel_histogram_plot, histogram_plot
|
18
|
+
|
19
|
+
OptionalRange: TypeAlias = Optional[Union[int, Iterable[int]]]
|
20
|
+
|
21
|
+
SOURCE_INDEX = "source_index"
|
22
|
+
BOX_COUNT = "box_count"
|
23
|
+
|
24
|
+
|
25
|
+
@dataclass(frozen=True)
class SourceIndex:
    """
    The indices of the source image, box and channel.

    Attributes
    ----------
    image : int
        Index of the source image
    box : int | None
        Index of the box of the source image (if applicable)
    channel : int | None
        Index of the channel of the source image (if applicable)
    """

    image: int
    box: int | None
    channel: int | None
|
43
|
+
|
44
|
+
|
45
|
+
def matches(index: int | None, opt_range: OptionalRange) -> bool:
    """
    Check whether an index satisfies an optional filter.

    A missing index or a missing filter always matches. Otherwise the index must be
    contained in the filter when it is iterable, or equal to it when it is a scalar.
    """
    if index is None or opt_range is None:
        return True
    if isinstance(opt_range, Iterable):
        return index in opt_range
    return index == opt_range
|
49
|
+
|
50
|
+
|
51
|
+
@dataclass(frozen=True)
class BaseStatsOutput(Output):
    """
    Base output class for per-image statistics.

    Attributes
    ----------
    source_index : list[SourceIndex]
        Mapping from statistic to source image, box and channel index
    box_count : NDArray[np.uint16]
        Excluded from the length validation performed on the other attributes.
        NOTE(review): presumably the number of boxes per image - confirm with producer.
    """

    source_index: list[SourceIndex]
    box_count: NDArray[np.uint16]

    def __post_init__(self) -> None:
        # Every statistic must line up one-to-one with source_index; source_index
        # itself and box_count are exempt from the check.
        length = len(self.source_index)
        bad = {k: len(v) for k, v in self.dict().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
        if bad:
            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(bad)}.")

    def get_channel_mask(
        self,
        channel_index: OptionalRange,
        channel_count: OptionalRange = None,
    ) -> list[bool]:
        """
        Boolean mask for results filtered to specified channel index and optionally the count
        of the channels per image.

        Parameters
        ----------
        channel_index : int | Iterable[int] | None
            Index or indices of channel(s) to filter for
        channel_count : int | Iterable[int] | None
            Optional count(s) of channels to filter for

        Returns
        -------
        list[bool]
            One entry per ``source_index`` entry. Entries are grouped into images
            whenever the image index increases, so ``source_index`` is expected to be
            ordered by image.
        """
        # The filters are tested once per source index; materialize iterables up front
        # so that one-shot iterators (e.g. generators) are not exhausted after the
        # first membership test.
        if isinstance(channel_index, Iterable):
            channel_index = tuple(channel_index)
        if isinstance(channel_count, Iterable):
            channel_count = tuple(channel_count)

        mask: list[bool] = []
        cur_mask: list[bool] = []
        cur_image = 0
        cur_max_channel = 0
        # The trailing None is a sentinel that flushes the final image's entries.
        for source_index in list(self.source_index) + [None]:
            if source_index is None or source_index.image > cur_image:
                # Image boundary: keep the collected entries only if the image's channel
                # count (highest channel index + 1) passes the channel_count filter.
                mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
                if source_index is not None:
                    cur_image = source_index.image
                    cur_max_channel = 0
                    cur_mask.clear()
            if source_index is not None:
                cur_mask.append(matches(source_index.channel, channel_index))
                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
        return mask

    def __len__(self) -> int:
        return len(self.source_index)

    def _get_channels(
        self, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> tuple[int, list[bool] | None]:
        """
        Resolve the number of channels to display and the matching channel mask.

        ``channel_index`` takes precedence over ``channel_limit``. The returned channel
        count is capped at the number of channels found in ``source_index``, and the
        mask is ``None`` when no filter applies or when the filter matches nothing.
        """
        source_index = self.dict()[SOURCE_INDEX]
        raw_channels = int(max([si.channel or 0 for si in source_index])) + 1

        # channel_index is inspected, counted and then used for masking; materialize it
        # once so a one-shot iterator is not consumed by the first pass.
        if isinstance(channel_index, Iterable):
            channel_index = list(channel_index)

        if isinstance(channel_index, int):
            max_channels = 1 if channel_index < raw_channels else raw_channels
            ch_mask = self.get_channel_mask(channel_index)
        elif isinstance(channel_index, list) and all(isinstance(val, int) for val in channel_index):
            max_channels = len(channel_index)
            ch_mask = self.get_channel_mask(channel_index)
        elif isinstance(channel_limit, int):
            max_channels = channel_limit
            ch_mask = self.get_channel_mask(None, channel_limit)
        else:
            max_channels = raw_channels
            ch_mask = None

        if max_channels > raw_channels:
            max_channels = raw_channels
        if ch_mask is not None and not any(ch_mask):
            ch_mask = None

        return max_channels, ch_mask

    def plot(
        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> None:
        """
        Plot histograms of the one-dimensional statistics held by this output.

        Parameters
        ----------
        log : bool
            Forwarded to the plotting helper.
        channel_limit : int | None
            Optional channel count used to filter images when no ``channel_index`` is given
        channel_index : int | Iterable[int] | None
            Optional index or indices of channel(s) to plot
        """
        max_channels, ch_mask = self._get_channels(channel_limit, channel_index)
        # Only 1-D arrays that contain at least one non-zero value are plotted.
        d = {k: v for k, v in self.dict().items() if isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1}
        if max_channels == 1:
            histogram_plot(d, log)
        else:
            channel_histogram_plot(d, log, max_channels, ch_mask)
|
139
|
+
|
140
|
+
|
141
|
+
@dataclass(frozen=True)
class DimensionStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.dimensionstats` stats metric.

    Attributes
    ----------
    left : NDArray[np.int32]
        Offsets from the left edge of images in pixels
    top : NDArray[np.int32]
        Offsets from the top edge of images in pixels
    width : NDArray[np.uint32]
        Width of the images in pixels
    height : NDArray[np.uint32]
        Height of the images in pixels
    channels : NDArray[np.uint8]
        Channel count of the images
    size : NDArray[np.uint32]
        Size of the images in pixels
    aspect_ratio : NDArray[np.float16]
        :term:`Aspect Ratio` of the images (width/height)
    depth : NDArray[np.uint8]
        Color depth of the images in bits
    center : NDArray[np.int16]
        Offset from center in [x,y] coordinates of the images in pixels
    distance : NDArray[np.float16]
        Distance in pixels from center
    """

    left: NDArray[np.int32]
    top: NDArray[np.int32]
    width: NDArray[np.uint32]
    height: NDArray[np.uint32]
    channels: NDArray[np.uint8]
    size: NDArray[np.uint32]
    aspect_ratio: NDArray[np.float16]
    depth: NDArray[np.uint8]
    center: NDArray[np.int16]
    distance: NDArray[np.float16]
|
180
|
+
|
181
|
+
|
182
|
+
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.hashstats` stats metric.

    Attributes
    ----------
    xxhash : list[str]
        xxHash hash of the images as a hex string
    pchash : list[str]
        :term:`Perception-based Hash` of the images as a hex string
    """

    xxhash: list[str]
    pchash: list[str]
|
197
|
+
|
198
|
+
|
199
|
+
@dataclass(frozen=True)
class LabelStatsOutput(Output):
    """
    Output class for :func:`.labelstats` stats metric.

    The per-class lists are parallel to ``class_names``: position ``i`` in each of
    them describes the class named ``class_names[i]``.

    Attributes
    ----------
    label_counts_per_class : list[int]
        Total count of labels for each class
    label_counts_per_image : list[int]
        Number of labels per image
    image_counts_per_class : list[int]
        Count of images each class is present in
    image_indices_per_class : list[list[int]]
        For each class, the indices of the images that have that label
    image_count : int
        Total number of images present
    class_count : int
        Total number of classes present
    label_count : int
        Total number of labels present
    class_names : list[str]
        Names of the classes
    """

    label_counts_per_class: list[int]
    label_counts_per_image: list[int]
    image_counts_per_class: list[int]
    image_indices_per_class: list[list[int]]
    image_count: int
    class_count: int
    label_count: int
    class_names: list[str]

    def to_table(self) -> str:
        """
        Formats the label statistics output results as a table.

        Returns
        -------
        str
            Summary counts followed by one line per class with its total label count
            and the number of images it appears in.
        """
        # Size the columns from the rendered text. Using the string length handles
        # non-string names, zero counts and exact powers of ten, and the explicit
        # minimums keep the computation valid when there are no classes at all.
        max_char = max([len(str(name)) for name in self.class_names] + [5])
        largest = max([*self.label_counts_per_class, *self.image_counts_per_class, 0])
        max_num = max(len(str(largest)), 11)

        # Display basic counts
        table_str = [
            f"Class Count: {self.class_count}",
            f"Label Count: {self.label_count}",
            f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}",
            "--------------------------------------",
        ]

        # Display counts per class
        table_str.append(f"{'Label':>{max_char}}: Total Count - Image Count")
        for cls, name in enumerate(self.class_names):
            table_str.append(
                f"{name:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}}"
                + " - "
                # Only the final column is stripped so lines carry no trailing padding.
                + f"{self.image_counts_per_class[cls]:^{max_num}}".rstrip()
            )

        return "\n".join(table_str)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Exports the label statistics output results to a pandas DataFrame.

        Notes
        -----
        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.

        Returns
        -------
        pd.DataFrame
            One row per class with ``Label``, ``Total Count`` and ``Image Count`` columns.
        """
        import pandas as pd

        class_positions = range(len(self.class_names))
        return pd.DataFrame(
            {
                "Label": self.class_names,
                "Total Count": [self.label_counts_per_class[cls] for cls in class_positions],
                "Image Count": [self.image_counts_per_class[cls] for cls in class_positions],
            }
        )
|
294
|
+
|
295
|
+
|
296
|
+
@dataclass(frozen=True)
class PixelStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.pixelstats` stats metric.

    Attributes
    ----------
    mean : NDArray[np.float16]
        Mean of the pixel values of the images
    std : NDArray[np.float16]
        Standard deviation of the pixel values of the images
    var : NDArray[np.float16]
        :term:`Variance` of the pixel values of the images
    skew : NDArray[np.float16]
        Skew of the pixel values of the images
    kurtosis : NDArray[np.float16]
        Kurtosis of the pixel values of the images
    histogram : NDArray[np.uint32]
        Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
    entropy : NDArray[np.float16]
        Entropy of the pixel values of the images
    """

    mean: NDArray[np.float16]
    std: NDArray[np.float16]
    var: NDArray[np.float16]
    skew: NDArray[np.float16]
    kurtosis: NDArray[np.float16]
    # Not 1-D per the description above (one histogram per entry), so it is skipped by
    # the 1-D filter in BaseStatsOutput.plot - NOTE(review): confirm shape with producer.
    histogram: NDArray[np.uint32]
    entropy: NDArray[np.float16]
|
326
|
+
|
327
|
+
|
328
|
+
@dataclass(frozen=True)
class VisualStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`.visualstats` stats metric.

    Attributes
    ----------
    brightness : NDArray[np.float16]
        Brightness of the images
    contrast : NDArray[np.float16]
        Image contrast ratio
    darkness : NDArray[np.float16]
        Darkness of the images
    missing : NDArray[np.float16]
        Percentage of the images with missing pixels
    sharpness : NDArray[np.float16]
        Sharpness of the images
    zeros : NDArray[np.float16]
        Percentage of the images with zero value pixels
    percentiles : NDArray[np.float16]
        Percentiles of the pixel values of the images at (0, 25, 50, 75, 100)
    """

    brightness: NDArray[np.float16]
    contrast: NDArray[np.float16]
    darkness: NDArray[np.float16]
    missing: NDArray[np.float16]
    sharpness: NDArray[np.float16]
    zeros: NDArray[np.float16]
    percentiles: NDArray[np.float16]
|
358
|
+
|
359
|
+
|
360
|
+
@dataclass(frozen=True)
class ImageStatsOutput(DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput):
    """
    Output class for :func:`.imagestats` stats metric with `per_channel=False`.

    This class represents the combined outputs of various stats functions against a
    single dataset, such that each index across all stat outputs is representative
    of the same source image. Modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    The attributes and methods are a superset of :class:`.DimensionStatsOutput`,
    :class:`.PixelStatsOutput` and :class:`.VisualStatsOutput`.
    """
|
373
|
+
|
374
|
+
|
375
|
+
@dataclass(frozen=True)
class ChannelStatsOutput(PixelStatsOutput, VisualStatsOutput):
    """
    Output class for :func:`.imagestats` stats metric with `per_channel=True`.

    This class represents the outputs of various per-channel stats functions against
    a single dataset, such that each index across all stat outputs is representative
    of the same source image. Modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    The attributes and methods are a superset of :class:`.PixelStatsOutput` and
    :class:`.VisualStatsOutput`.
    """
|
@@ -0,0 +1,44 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
from numpy.typing import NDArray
|
9
|
+
|
10
|
+
from dataeval.outputs._base import Output
|
11
|
+
|
12
|
+
|
13
|
+
@dataclass(frozen=True)
class TrainValSplit:
    """
    Dataclass containing train and validation indices.

    Attributes
    ----------
    train : NDArray[np.intp]
        Indices for the training set
    val : NDArray[np.intp]
        Indices for the validation set
    """

    train: NDArray[np.intp]
    val: NDArray[np.intp]
|
28
|
+
|
29
|
+
|
30
|
+
@dataclass(frozen=True)
class SplitDatasetOutput(Output):
    """
    Output class containing test indices and a list of :class:`TrainValSplit` folds.

    Attributes
    ----------
    test : NDArray[np.intp]
        Indices for the test set
    folds : list[TrainValSplit]
        List of train and validation split indices
    """

    test: NDArray[np.intp]
    folds: list[TrainValSplit]
|