dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +40 -85
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
- dataeval/detectors/drift/updates.py +20 -3
- dataeval/detectors/linters/__init__.py +3 -5
- dataeval/detectors/linters/duplicates.py +13 -36
- dataeval/detectors/linters/outliers.py +23 -148
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +30 -9
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/mixin.py +21 -7
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +6 -0
- dataeval/metadata/_distance.py +167 -0
- dataeval/metadata/_ood.py +217 -0
- dataeval/metadata/_utils.py +44 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +6 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
- dataeval/metrics/bias/_coverage.py +98 -0
- dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
- dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
- dataeval/metrics/estimators/__init__.py +15 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
- dataeval/metrics/estimators/_clusterer.py +44 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
- dataeval/metrics/stats/__init__.py +16 -13
- dataeval/metrics/stats/{base.py → _base.py} +82 -133
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
- dataeval/metrics/stats/_dimensionstats.py +75 -0
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
- dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval/metrics/stats/_labelstats.py +131 -0
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
- dataeval/outputs/__init__.py +53 -0
- dataeval/{output.py → outputs/_base.py} +55 -25
- dataeval/outputs/_bias.py +381 -0
- dataeval/outputs/_drift.py +83 -0
- dataeval/outputs/_estimators.py +114 -0
- dataeval/outputs/_linters.py +184 -0
- dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
- dataeval/outputs/_stats.py +387 -0
- dataeval/outputs/_utils.py +44 -0
- dataeval/outputs/_workflows.py +364 -0
- dataeval/typing.py +234 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +14 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +6 -6
- dataeval/utils/data/__init__.py +26 -0
- dataeval/utils/data/_dataset.py +217 -0
- dataeval/utils/data/_embeddings.py +104 -0
- dataeval/utils/data/_images.py +68 -0
- dataeval/utils/data/_metadata.py +360 -0
- dataeval/utils/data/_selection.py +126 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
- dataeval/utils/data/_targets.py +85 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_types.py +52 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +57 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/__init__.py +2 -1
- dataeval/workflows/sufficiency.py +11 -346
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
- dataeval-0.82.0.dist-info/RECORD +104 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/metrics/bias/coverage.py +0 -194
- dataeval/metrics/stats/datasetstats.py +0 -202
- dataeval/metrics/stats/dimensionstats.py +0 -115
- dataeval/metrics/stats/labelstats.py +0 -210
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -7,32 +7,17 @@ from __future__ import annotations
|
|
7
7
|
|
8
8
|
__all__ = []
|
9
9
|
|
10
|
-
from dataclasses import dataclass
|
11
10
|
from typing import Literal
|
12
11
|
|
13
12
|
import numpy as np
|
14
|
-
from numpy.typing import
|
13
|
+
from numpy.typing import NDArray
|
15
14
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
18
|
-
from dataeval.
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
class DivergenceOutput(Output):
|
23
|
-
"""
|
24
|
-
Output class for :func:`divergence` estimator metric.
|
25
|
-
|
26
|
-
Attributes
|
27
|
-
----------
|
28
|
-
divergence : float
|
29
|
-
:term:`Divergence` value calculated between 2 datasets ranging between 0.0 and 1.0
|
30
|
-
errors : int
|
31
|
-
The number of differing edges between the datasets
|
32
|
-
"""
|
33
|
-
|
34
|
-
divergence: float
|
35
|
-
errors: int
|
15
|
+
from dataeval.outputs import DivergenceOutput
|
16
|
+
from dataeval.outputs._base import set_metadata
|
17
|
+
from dataeval.typing import ArrayLike
|
18
|
+
from dataeval.utils._array import ensure_embeddings
|
19
|
+
from dataeval.utils._method import get_method
|
20
|
+
from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
|
36
21
|
|
37
22
|
|
38
23
|
def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
|
@@ -78,18 +63,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
|
|
78
63
|
return errors
|
79
64
|
|
80
65
|
|
66
|
+
_DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
|
67
|
+
|
68
|
+
|
81
69
|
@set_metadata
|
82
|
-
def divergence(
|
70
|
+
def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
|
83
71
|
"""
|
84
72
|
Calculates the :term:`divergence` and any errors between the datasets.
|
85
73
|
|
86
74
|
Parameters
|
87
75
|
----------
|
88
|
-
|
89
|
-
|
76
|
+
emb_a : ArrayLike, shape - (N, P)
|
77
|
+
Image embeddings in an ArrayLike format to compare.
|
90
78
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
91
|
-
|
92
|
-
|
79
|
+
emb_b : ArrayLike, shape - (N, P)
|
80
|
+
Image embeddings in an ArrayLike format to compare.
|
93
81
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
94
82
|
method : Literal["MST", "FNN"], default "FNN"
|
95
83
|
Method used to estimate dataset :term:`divergence<Divergence>`
|
@@ -125,9 +113,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
|
|
125
113
|
>>> divergence(datasetA, datasetB)
|
126
114
|
DivergenceOutput(divergence=0.28, errors=36)
|
127
115
|
"""
|
128
|
-
div_fn = get_method(
|
129
|
-
a =
|
130
|
-
b =
|
116
|
+
div_fn = get_method(_DIVERGENCE_FN_MAP, method)
|
117
|
+
a = ensure_embeddings(emb_a, dtype=np.float64)
|
118
|
+
b = ensure_embeddings(emb_b, dtype=np.float64)
|
131
119
|
N = a.shape[0]
|
132
120
|
M = b.shape[0]
|
133
121
|
|
@@ -8,27 +8,13 @@ from __future__ import annotations
|
|
8
8
|
|
9
9
|
__all__ = []
|
10
10
|
|
11
|
-
from dataclasses import dataclass
|
12
11
|
|
13
|
-
from numpy.typing import ArrayLike
|
14
12
|
from sklearn.metrics import average_precision_score
|
15
13
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
18
|
-
|
19
|
-
|
20
|
-
@dataclass(frozen=True)
|
21
|
-
class UAPOutput(Output):
|
22
|
-
"""
|
23
|
-
Output class for :func:`uap` estimator metric.
|
24
|
-
|
25
|
-
Attributes
|
26
|
-
----------
|
27
|
-
uap : float
|
28
|
-
The empirical mean precision estimate
|
29
|
-
"""
|
30
|
-
|
31
|
-
uap: float
|
14
|
+
from dataeval.outputs import UAPOutput
|
15
|
+
from dataeval.outputs._base import set_metadata
|
16
|
+
from dataeval.typing import ArrayLike
|
17
|
+
from dataeval.utils._array import as_numpy
|
32
18
|
|
33
19
|
|
34
20
|
@set_metadata
|
@@ -5,15 +5,14 @@ and label statistics against the images and labels of a dataset.
|
|
5
5
|
|
6
6
|
__all__ = [
|
7
7
|
"ChannelStatsOutput",
|
8
|
-
"
|
8
|
+
"ImageStatsOutput",
|
9
9
|
"DimensionStatsOutput",
|
10
10
|
"HashStatsOutput",
|
11
11
|
"LabelStatsOutput",
|
12
12
|
"PixelStatsOutput",
|
13
13
|
"VisualStatsOutput",
|
14
14
|
"boxratiostats",
|
15
|
-
"
|
16
|
-
"datasetstats",
|
15
|
+
"imagestats",
|
17
16
|
"dimensionstats",
|
18
17
|
"hashstats",
|
19
18
|
"labelstats",
|
@@ -21,15 +20,19 @@ __all__ = [
|
|
21
20
|
"visualstats",
|
22
21
|
]
|
23
22
|
|
24
|
-
from dataeval.metrics.stats.
|
25
|
-
from dataeval.metrics.stats.
|
23
|
+
from dataeval.metrics.stats._boxratiostats import boxratiostats
|
24
|
+
from dataeval.metrics.stats._dimensionstats import dimensionstats
|
25
|
+
from dataeval.metrics.stats._hashstats import hashstats
|
26
|
+
from dataeval.metrics.stats._imagestats import imagestats
|
27
|
+
from dataeval.metrics.stats._labelstats import labelstats
|
28
|
+
from dataeval.metrics.stats._pixelstats import pixelstats
|
29
|
+
from dataeval.metrics.stats._visualstats import visualstats
|
30
|
+
from dataeval.outputs._stats import (
|
26
31
|
ChannelStatsOutput,
|
27
|
-
|
28
|
-
|
29
|
-
|
32
|
+
DimensionStatsOutput,
|
33
|
+
HashStatsOutput,
|
34
|
+
ImageStatsOutput,
|
35
|
+
LabelStatsOutput,
|
36
|
+
PixelStatsOutput,
|
37
|
+
VisualStatsOutput,
|
30
38
|
)
|
31
|
-
from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
|
32
|
-
from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
|
33
|
-
from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
|
34
|
-
from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
|
35
|
-
from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
|
@@ -1,39 +1,27 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from dataeval.utils.plot import histogram_plot
|
4
|
-
|
5
3
|
__all__ = []
|
6
4
|
|
7
5
|
import re
|
8
6
|
import warnings
|
7
|
+
from collections import ChainMap
|
8
|
+
from copy import deepcopy
|
9
9
|
from dataclasses import dataclass
|
10
10
|
from functools import partial
|
11
|
-
from itertools import repeat
|
12
11
|
from multiprocessing import Pool
|
13
|
-
from typing import Any, Callable, Generic, Iterable,
|
12
|
+
from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
|
14
13
|
|
15
14
|
import numpy as np
|
16
15
|
import tqdm
|
17
|
-
from numpy.typing import
|
16
|
+
from numpy.typing import NDArray
|
18
17
|
|
19
|
-
from dataeval.
|
20
|
-
from dataeval.
|
21
|
-
from dataeval.
|
18
|
+
from dataeval.config import get_max_processes
|
19
|
+
from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
|
20
|
+
from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
|
21
|
+
from dataeval.utils._array import to_numpy
|
22
|
+
from dataeval.utils._image import normalize_image_shape, rescale
|
22
23
|
|
23
24
|
DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
|
24
|
-
SOURCE_INDEX = "source_index"
|
25
|
-
BOX_COUNT = "box_count"
|
26
|
-
|
27
|
-
# TODO: Replace with global config
|
28
|
-
DEFAULT_PROCESSES: int | None = None
|
29
|
-
|
30
|
-
OptionalRange = Optional[Union[int, Iterable[int]]]
|
31
|
-
|
32
|
-
|
33
|
-
def matches(index: int | None, opt_range: OptionalRange) -> bool:
|
34
|
-
if index is None or opt_range is None:
|
35
|
-
return True
|
36
|
-
return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
|
37
25
|
|
38
26
|
|
39
27
|
def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
|
@@ -49,86 +37,6 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
|
|
49
37
|
return bounding_box
|
50
38
|
|
51
39
|
|
52
|
-
class SourceIndex(NamedTuple):
|
53
|
-
"""
|
54
|
-
Attributes
|
55
|
-
----------
|
56
|
-
image: int
|
57
|
-
Index of the source image
|
58
|
-
box : int | None
|
59
|
-
Index of the box of the source image
|
60
|
-
channel : int | None
|
61
|
-
Index of the channel of the source image
|
62
|
-
"""
|
63
|
-
|
64
|
-
image: int
|
65
|
-
box: int | None
|
66
|
-
channel: int | None
|
67
|
-
|
68
|
-
|
69
|
-
@dataclass(frozen=True)
|
70
|
-
class BaseStatsOutput(Output):
|
71
|
-
"""
|
72
|
-
Attributes
|
73
|
-
----------
|
74
|
-
source_index : List[SourceIndex]
|
75
|
-
Mapping from statistic to source image, box and channel index
|
76
|
-
box_count : NDArray[np.uint16]
|
77
|
-
"""
|
78
|
-
|
79
|
-
source_index: list[SourceIndex]
|
80
|
-
box_count: NDArray[np.uint16]
|
81
|
-
|
82
|
-
def get_channel_mask(
|
83
|
-
self,
|
84
|
-
channel_index: OptionalRange,
|
85
|
-
channel_count: OptionalRange = None,
|
86
|
-
) -> list[bool]:
|
87
|
-
"""
|
88
|
-
Boolean mask for results filtered to specified channel index and optionally the count
|
89
|
-
of the channels per image.
|
90
|
-
|
91
|
-
Parameters
|
92
|
-
----------
|
93
|
-
channel_index : int | Iterable[int] | None
|
94
|
-
Index or indices of channel(s) to filter for
|
95
|
-
channel_count : int | Iterable[int] | None
|
96
|
-
Optional count(s) of channels to filter for
|
97
|
-
"""
|
98
|
-
mask: list[bool] = []
|
99
|
-
cur_mask: list[bool] = []
|
100
|
-
cur_image = 0
|
101
|
-
cur_max_channel = 0
|
102
|
-
for source_index in list(self.source_index) + [None]:
|
103
|
-
if source_index is None or source_index.image > cur_image:
|
104
|
-
mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
|
105
|
-
if source_index is not None:
|
106
|
-
cur_image = source_index.image
|
107
|
-
cur_max_channel = 0
|
108
|
-
cur_mask.clear()
|
109
|
-
if source_index is not None:
|
110
|
-
cur_mask.append(matches(source_index.channel, channel_index))
|
111
|
-
cur_max_channel = max(cur_max_channel, source_index.channel or 0)
|
112
|
-
return mask
|
113
|
-
|
114
|
-
def __len__(self) -> int:
|
115
|
-
return len(self.source_index)
|
116
|
-
|
117
|
-
|
118
|
-
def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
|
119
|
-
return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
|
120
|
-
|
121
|
-
|
122
|
-
class HistogramPlotMixin:
|
123
|
-
_excluded_keys: Iterable[str] = []
|
124
|
-
|
125
|
-
def dict(self) -> dict[str, Any]: ...
|
126
|
-
|
127
|
-
def plot(self, log: bool) -> None:
|
128
|
-
data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
|
129
|
-
histogram_plot(data_dict, log)
|
130
|
-
|
131
|
-
|
132
40
|
TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
|
133
41
|
|
134
42
|
|
@@ -193,10 +101,9 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
193
101
|
cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
|
194
102
|
) -> TStatsOutput:
|
195
103
|
output = {}
|
196
|
-
for
|
197
|
-
|
198
|
-
|
199
|
-
stat_type: str = cls.output_class.__annotations__[key]
|
104
|
+
attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
|
105
|
+
for key in (key for key in source if key in attrs):
|
106
|
+
stat_type: str = attrs[key]
|
200
107
|
dtype_match = re.match(DTYPE_REGEX, stat_type)
|
201
108
|
if dtype_match is not None:
|
202
109
|
output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
|
@@ -205,7 +112,8 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
205
112
|
return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
|
206
113
|
|
207
114
|
|
208
|
-
|
115
|
+
@dataclass
|
116
|
+
class StatsProcessorOutput:
|
209
117
|
results: list[dict[str, Any]]
|
210
118
|
source_indices: list[SourceIndex]
|
211
119
|
box_counts: list[int]
|
@@ -214,16 +122,20 @@ class StatsProcessorOutput(NamedTuple):
|
|
214
122
|
|
215
123
|
def process_stats(
|
216
124
|
i: int,
|
217
|
-
|
125
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
126
|
+
per_box: bool,
|
218
127
|
per_channel: bool,
|
219
128
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
220
129
|
) -> StatsProcessorOutput:
|
221
|
-
|
130
|
+
data = dataset[i]
|
131
|
+
image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
|
132
|
+
target = None if not isinstance(target, ObjectDetectionTarget) else target
|
133
|
+
boxes = to_numpy(target.boxes) if target is not None else None
|
222
134
|
results_list: list[dict[str, Any]] = []
|
223
135
|
source_indices: list[SourceIndex] = []
|
224
136
|
box_counts: list[int] = []
|
225
137
|
warnings_list: list[str] = []
|
226
|
-
nboxes = [None] if boxes is None else normalize_box_shape(boxes)
|
138
|
+
nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
|
227
139
|
for i_b, box in enumerate(nboxes):
|
228
140
|
i_b = None if box is None else i_b
|
229
141
|
processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
|
@@ -231,7 +143,7 @@ def process_stats(
|
|
231
143
|
warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
|
232
144
|
results_list.append({k: v for p in processor_list for k, v in p.process().items()})
|
233
145
|
if per_channel:
|
234
|
-
source_indices.extend([SourceIndex(i, i_b, c) for c in range(
|
146
|
+
source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
|
235
147
|
else:
|
236
148
|
source_indices.append(SourceIndex(i, i_b, None))
|
237
149
|
box_counts.append(0 if boxes is None else len(boxes))
|
@@ -239,16 +151,18 @@ def process_stats(
|
|
239
151
|
|
240
152
|
|
241
153
|
def process_stats_unpack(
|
242
|
-
|
154
|
+
i: int,
|
155
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
156
|
+
per_box: bool,
|
243
157
|
per_channel: bool,
|
244
158
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
245
159
|
) -> StatsProcessorOutput:
|
246
|
-
return process_stats(
|
160
|
+
return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
|
247
161
|
|
248
162
|
|
249
163
|
def run_stats(
|
250
|
-
|
251
|
-
|
164
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
165
|
+
per_box: bool,
|
252
166
|
per_channel: bool,
|
253
167
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
254
168
|
) -> list[TStatsOutput]:
|
@@ -261,26 +175,20 @@ def run_stats(
|
|
261
175
|
|
262
176
|
Parameters
|
263
177
|
----------
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
|
270
|
-
iterable should match the length of the input images.
|
178
|
+
dataset : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
|
179
|
+
A dataset of images and targets to compute statistics on.
|
180
|
+
per_box : bool
|
181
|
+
A flag which determines if the statistics should be evaluated on a per-box basis or not.
|
182
|
+
If the dataset does not include bounding boxes, this flag is ignored.
|
271
183
|
per_channel : bool
|
272
184
|
A flag which determines if the stats should be evaluated on a per-channel basis or not.
|
273
185
|
stats_processor_cls : Iterable[type[StatsProcessor]]
|
274
186
|
An iterable of stats processor classes that calculate stats and return output classes.
|
275
|
-
processes : int | None, default None
|
276
|
-
Number of processes to use, defaults to None which uses all available CPU cores.
|
277
187
|
|
278
188
|
Returns
|
279
189
|
-------
|
280
|
-
|
281
|
-
A
|
282
|
-
The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
|
283
|
-
with the results of the computations.
|
190
|
+
list[TStatsOutput]
|
191
|
+
A list of output classes containing the computed statistics
|
284
192
|
|
285
193
|
Note
|
286
194
|
----
|
@@ -294,20 +202,24 @@ def run_stats(
|
|
294
202
|
results_list: list[dict[str, NDArray[np.float64]]] = []
|
295
203
|
source_index: list[SourceIndex] = []
|
296
204
|
box_count: list[int] = []
|
297
|
-
bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
|
298
205
|
|
299
206
|
warning_list = []
|
300
|
-
total_for_status = getattr(images, "__len__")() if hasattr(images, "__len__") else None
|
301
207
|
stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
|
302
208
|
|
303
209
|
# TODO: Introduce global controls for CPU job parallelism and GPU configurations
|
304
|
-
with Pool(processes=
|
210
|
+
with Pool(processes=get_max_processes()) as p:
|
305
211
|
for r in tqdm.tqdm(
|
306
212
|
p.imap(
|
307
|
-
partial(
|
308
|
-
|
213
|
+
partial(
|
214
|
+
process_stats_unpack,
|
215
|
+
dataset=dataset,
|
216
|
+
per_box=per_box,
|
217
|
+
per_channel=per_channel,
|
218
|
+
stats_processor_cls=stats_processor_cls,
|
219
|
+
),
|
220
|
+
range(len(dataset)),
|
309
221
|
),
|
310
|
-
total=
|
222
|
+
total=len(dataset),
|
311
223
|
):
|
312
224
|
results_list.extend(r.results)
|
313
225
|
source_index.extend(r.source_indices)
|
@@ -330,3 +242,40 @@ def run_stats(
|
|
330
242
|
|
331
243
|
outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
|
332
244
|
return outputs
|
245
|
+
|
246
|
+
|
247
|
+
def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
|
248
|
+
if type(a) is not type(b):
|
249
|
+
raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
|
250
|
+
|
251
|
+
sum_dict = deepcopy(a.dict())
|
252
|
+
|
253
|
+
for k in sum_dict:
|
254
|
+
if isinstance(sum_dict[k], list):
|
255
|
+
sum_dict[k].extend(b.dict()[k])
|
256
|
+
else:
|
257
|
+
sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
|
258
|
+
|
259
|
+
return type(a)(**sum_dict)
|
260
|
+
|
261
|
+
|
262
|
+
def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
|
263
|
+
output = None
|
264
|
+
dataset_steps = []
|
265
|
+
cur_len = 0
|
266
|
+
for s in stats:
|
267
|
+
output = s if output is None else add_stats(output, s)
|
268
|
+
cur_len += len(s)
|
269
|
+
dataset_steps.append(cur_len)
|
270
|
+
if output is None:
|
271
|
+
raise TypeError("Cannot combine empty sequence of stats.")
|
272
|
+
return output, dataset_steps
|
273
|
+
|
274
|
+
|
275
|
+
def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
|
276
|
+
last_step = 0
|
277
|
+
for i, step in enumerate(dataset_steps):
|
278
|
+
if idx < step:
|
279
|
+
return i, idx - last_step
|
280
|
+
last_step = step
|
281
|
+
return -1, idx
|
@@ -8,9 +8,8 @@ from typing import Any, Callable, Generic, TypeVar, cast
|
|
8
8
|
import numpy as np
|
9
9
|
from numpy.typing import NDArray
|
10
10
|
|
11
|
-
from dataeval.
|
12
|
-
from dataeval.
|
13
|
-
from dataeval.output import set_metadata
|
11
|
+
from dataeval.outputs._base import set_metadata
|
12
|
+
from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
|
14
13
|
|
15
14
|
TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
|
16
15
|
ArraySlice = tuple[int, int]
|
@@ -50,7 +49,7 @@ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
|
|
50
49
|
"depth": lambda x: x.box["depth"],
|
51
50
|
"distance": lambda x: x.box["distance"],
|
52
51
|
}
|
53
|
-
)
|
52
|
+
),
|
54
53
|
}
|
55
54
|
|
56
55
|
|
@@ -87,11 +86,8 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
|
|
87
86
|
stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
|
88
87
|
out_type = type(box_stats)
|
89
88
|
use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
|
90
|
-
|
91
|
-
RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
|
92
|
-
if use_override
|
93
|
-
else np.nan_to_num(stats.box[key] / stats.img[key])
|
94
|
-
)
|
89
|
+
with np.errstate(divide="ignore", invalid="ignore"):
|
90
|
+
ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
|
95
91
|
out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
|
96
92
|
return out_stats
|
97
93
|
|
@@ -128,27 +124,28 @@ def boxratiostats(
|
|
128
124
|
|
129
125
|
Examples
|
130
126
|
--------
|
131
|
-
|
127
|
+
Calculate the box ratio statistics using the dimension stats of the images and boxes
|
128
|
+
on a dataset containing 15 targets.
|
132
129
|
|
133
130
|
>>> from dataeval.metrics.stats import dimensionstats
|
134
|
-
>>> imagestats = dimensionstats(
|
135
|
-
>>> boxstats = dimensionstats(
|
131
|
+
>>> imagestats = dimensionstats(dataset, per_box=False)
|
132
|
+
>>> boxstats = dimensionstats(dataset, per_box=True)
|
136
133
|
>>> ratiostats = boxratiostats(boxstats, imagestats)
|
137
134
|
>>> print(ratiostats.aspect_ratio)
|
138
|
-
[ 0.
|
139
|
-
0.
|
135
|
+
[ 0.864 0.588 16. 0.857 1.27 0.438 0.667 3.833 1.95 0.833
|
136
|
+
1. 0.6 0.522 15. 3.834]
|
140
137
|
>>> print(ratiostats.size)
|
141
|
-
[0.
|
142
|
-
0.
|
138
|
+
[0.026 0.01 0.001 0.018 0.023 0.007 0.009 0.034 0.021 0.007 0.001 0.008
|
139
|
+
0.017 0.001 0.008]
|
143
140
|
"""
|
144
141
|
output_cls = type(boxstats)
|
145
142
|
if type(boxstats) is not type(imgstats):
|
146
143
|
raise TypeError("Must provide stats outputs of the same type.")
|
147
144
|
if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
|
148
145
|
raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
|
149
|
-
if
|
146
|
+
if any(src_idx.box is None for src_idx in boxstats.source_index):
|
150
147
|
raise ValueError("Input for boxstats must contain box information.")
|
151
|
-
if any(
|
148
|
+
if any(src_idx.box is not None for src_idx in imgstats.source_index):
|
152
149
|
raise ValueError("Input for imgstats must not contain box information.")
|
153
150
|
boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
|
154
151
|
imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
|
@@ -0,0 +1,75 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from typing import Any, Callable
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
|
9
|
+
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
10
|
+
from dataeval.outputs import DimensionStatsOutput
|
11
|
+
from dataeval.outputs._base import set_metadata
|
12
|
+
from dataeval.typing import ArrayLike, Dataset
|
13
|
+
from dataeval.utils._image import get_bitdepth
|
14
|
+
|
15
|
+
|
16
|
+
class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
|
17
|
+
output_class: type = DimensionStatsOutput
|
18
|
+
image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
|
19
|
+
"left": lambda x: x.box[0],
|
20
|
+
"top": lambda x: x.box[1],
|
21
|
+
"width": lambda x: x.box[2] - x.box[0],
|
22
|
+
"height": lambda x: x.box[3] - x.box[1],
|
23
|
+
"channels": lambda x: x.shape[-3],
|
24
|
+
"size": lambda x: (x.box[2] - x.box[0]) * (x.box[3] - x.box[1]),
|
25
|
+
"aspect_ratio": lambda x: (x.box[2] - x.box[0]) / (x.box[3] - x.box[1]),
|
26
|
+
"depth": lambda x: get_bitdepth(x.image).depth,
|
27
|
+
"center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
|
28
|
+
"distance": lambda x: np.sqrt(
|
29
|
+
np.square(((x.box[0] + x.box[2]) / 2) - (x.shape[-1] / 2))
|
30
|
+
+ np.square(((x.box[1] + x.box[3]) / 2) - (x.shape[-2] / 2))
|
31
|
+
),
|
32
|
+
}
|
33
|
+
|
34
|
+
|
35
|
+
@set_metadata
|
36
|
+
def dimensionstats(
|
37
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
38
|
+
*,
|
39
|
+
per_box: bool = False,
|
40
|
+
) -> DimensionStatsOutput:
|
41
|
+
"""
|
42
|
+
Calculates dimension :term:`statistics<Statistics>` for each image.
|
43
|
+
|
44
|
+
This function computes various dimensional metrics (e.g., width, height, channels)
|
45
|
+
on the images or individual bounding boxes for each image.
|
46
|
+
|
47
|
+
Parameters
|
48
|
+
----------
|
49
|
+
dataset : Dataset
|
50
|
+
Dataset to perform calculations on.
|
51
|
+
per_box : bool, default False
|
52
|
+
If True, perform calculations on each bounding box.
|
53
|
+
|
54
|
+
Returns
|
55
|
+
-------
|
56
|
+
DimensionStatsOutput
|
57
|
+
A dictionary-like object containing the computed dimension statistics for each image or bounding
|
58
|
+
box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
|
59
|
+
are lists of results for each image or :term:`NumPy` arrays when the results are multi-dimensional.
|
60
|
+
|
61
|
+
See Also
|
62
|
+
--------
|
63
|
+
pixelstats, visualstats, Outliers
|
64
|
+
|
65
|
+
Examples
|
66
|
+
--------
|
67
|
+
Calculate the dimension statistics of a dataset of 8 images, whose shape is (C, H, W).
|
68
|
+
|
69
|
+
>>> results = dimensionstats(dataset)
|
70
|
+
>>> print(results.aspect_ratio)
|
71
|
+
[1. 1. 1.333 1. 0.667 1. 1. 1. ]
|
72
|
+
>>> print(results.channels)
|
73
|
+
[3 3 1 3 1 3 3 3]
|
74
|
+
"""
|
75
|
+
return run_stats(dataset, per_box, False, [DimensionStatsProcessor])[0]
|