dataeval 0.86.4__py3-none-any.whl → 0.86.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/config.py +1 -1
- dataeval/data/_metadata.py +5 -3
- dataeval/detectors/drift/_nml/_result.py +2 -2
- dataeval/detectors/linters/outliers.py +56 -23
- dataeval/metrics/stats/_base.py +4 -4
- dataeval/metrics/stats/_labelstats.py +5 -1
- dataeval/metrics/stats/_pixelstats.py +5 -4
- dataeval/metrics/stats/_visualstats.py +7 -8
- dataeval/outputs/_drift.py +1 -1
- dataeval/outputs/_stats.py +60 -15
- dataeval/utils/data/_dataset.py +2 -3
- dataeval/utils/datasets/_fileio.py +1 -1
- dataeval/utils/torch/_internal.py +1 -1
- {dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/METADATA +1 -1
- {dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/RECORD +18 -18
- {dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
dataeval/config.py
CHANGED
dataeval/data/_metadata.py
CHANGED
@@ -4,7 +4,7 @@ __all__ = []
 
 import warnings
 from dataclasses import dataclass
-from typing import Any, Iterable, Literal, Mapping, Sequence
+from typing import Any, Iterable, Literal, Mapping, Sequence, Sized
 
 import numpy as np
 import polars as pl
@@ -69,6 +69,7 @@ class Metadata:
         self._is_binned = False
 
         self._dataset = dataset
+        self._count = len(dataset) if isinstance(dataset, Sized) else 0
         self._continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else {}
         self._auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = auto_bin_method
 
@@ -198,8 +199,9 @@ class Metadata:
 
     @property
     def image_count(self) -> int:
-        self.
-
+        if self._count == 0:
+            self._structure()
+        return self._count
 
     def _filter(self, factor: str | tuple[str, Any]) -> bool:
         factor = factor[0] if isinstance(factor, tuple) else factor
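Note: the new _count field makes image_count lazy — the length is captured up front when the dataset is Sized, and the full _structure() pass only runs when it is not. A minimal standalone sketch of that pattern (the _structure body below is a hypothetical stand-in, not the dataeval implementation):

    from typing import Sized


    class LazyCount:
        """Caches a count up front when available, computes it on first access otherwise."""

        def __init__(self, dataset) -> None:
            self._dataset = dataset
            # len() is only safe when the dataset advertises Sized; otherwise defer
            self._count = len(dataset) if isinstance(dataset, Sized) else 0

        @property
        def image_count(self) -> int:
            if self._count == 0:
                self._structure()  # expensive full pass, done at most once
            return self._count

        def _structure(self) -> None:
            # hypothetical stand-in: walk the iterable and count items
            self._count = sum(1 for _ in self._dataset)


    print(LazyCount([1, 2, 3]).image_count)          # 3, taken from len()
    print(LazyCount(x for x in "abcd").image_count)  # 4, counted on first access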
dataeval/detectors/drift/_nml/_result.py
CHANGED
@@ -29,7 +29,7 @@ class AbstractResult(GenericOutput[pd.DataFrame]):
         self._data = results_data.copy(deep=True)
 
     def data(self) -> pd.DataFrame:
-        return self.
+        return self.to_dataframe()
 
     @property
     def empty(self) -> bool:
@@ -38,7 +38,7 @@ class AbstractResult(GenericOutput[pd.DataFrame]):
     def __len__(self) -> int:
         return 0 if self.empty else len(self._data)
 
-    def
+    def to_dataframe(self, multilevel: bool = True) -> pd.DataFrame:
         """Export results to pandas dataframe."""
         if multilevel:
             return self._data
dataeval/detectors/linters/outliers.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Literal, Sequence, overload
 import numpy as np
 from numpy.typing import NDArray
 
+from dataeval.config import EPSILON
 from dataeval.data._images import Images
 from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
 from dataeval.metrics.stats._imagestats import imagestats
@@ -17,27 +18,59 @@ from dataeval.outputs._stats import BASE_ATTRS
 from dataeval.typing import ArrayLike, Dataset
 
 
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
+def _get_zscore_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
+    threshold = threshold if threshold is not None else 3.0
+    std_val = np.nanstd(values)
+    if std_val > EPSILON:
+        mean_val = np.nanmean(values)
+        abs_diff = np.abs(values - mean_val)
+        return (abs_diff / std_val) > threshold
+    return None
+
+
+def _get_modzscore_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
+    threshold = threshold if threshold is not None else 3.5
+    median_val = np.nanmedian(values)
+    abs_diff = np.abs(values - median_val)
+    m_abs_diff = np.nanmedian(abs_diff)
+    m_abs_diff = np.nanmean(abs_diff) if m_abs_diff <= EPSILON else m_abs_diff
+    if m_abs_diff > EPSILON:
+        mod_z_score = 0.6745 * abs_diff / m_abs_diff
         return mod_z_score > threshold
-
-
-
-
-
-
+    return None
+
+
+def _get_iqr_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
+    threshold = threshold if threshold is not None else 1.5
+    qrt = np.nanpercentile(values, q=(25, 75), method="midpoint")
+    iqr_val = qrt[1] - qrt[0]
+    if iqr_val > EPSILON:
+        iqr_threshold = iqr_val * threshold
+        return (values < (qrt[0] - iqr_threshold)) | (values > (qrt[1] + iqr_threshold))
+    return None
+
+
+def _get_outlier_mask(
+    values: NDArray[Any], method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
+) -> NDArray[np.bool_]:
+    if len(values) == 0:
+        return np.array([], dtype=bool)
+
+    nan_mask = np.isnan(values)
+
+    if np.all(nan_mask):
+        outliers = None
+    elif method == "zscore":
+        outliers = _get_zscore_mask(values.astype(np.float64), threshold)
+    elif method == "modzscore":
+        outliers = _get_modzscore_mask(values.astype(np.float64), threshold)
+    elif method == "iqr":
+        outliers = _get_iqr_mask(values.astype(np.float64), threshold)
+    else:
+        raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
+
+    # If outliers were found, return the mask with NaN values set to False, otherwise return all False
+    return outliers & ~nan_mask if outliers is not None else np.full(values.shape, False, dtype=bool)
 
 
 class Outliers:
@@ -164,10 +197,10 @@ class Outliers:
         >>> len(results)
         2
         >>> results.issues[0]
-        {10: {'
+        {10: {'entropy': 0.2128}, 12: {'std': 0.00536, 'var': 2.87e-05, 'entropy': 0.2128}}
         >>> results.issues[1]
         {}
-        """
+        """
         if isinstance(stats, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
             return OutliersOutput(self._get_outliers(stats.data()))
 
@@ -221,7 +254,7 @@ class Outliers:
        >>> list(results.issues)
        [10, 12]
        >>> results.issues[10]
-        {'contrast': 1.25, 'zeros': 0.05493, '
+        {'contrast': 1.25, 'zeros': 0.05493, 'entropy': 0.2128}
        """
        images = Images(data) if isinstance(data, Dataset) else data
        self.stats = imagestats(images)
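Note: the helpers added above implement the standard z-score, modified z-score (median-absolute-deviation based), and IQR outlier rules with default thresholds of 3.0, 3.5 and 1.5 respectively, each guarded by EPSILON against a zero spread. A NumPy-only sketch of the same math on a hypothetical array of per-image factor values (eps stands in for dataeval.config.EPSILON):

    import numpy as np

    values = np.array([0.1, 0.12, 0.11, 0.13, 0.9])  # hypothetical factor values
    eps = 1e-10  # stand-in for dataeval.config.EPSILON

    # z-score: |x - mean| / std > 3.0
    z_mask = np.abs(values - np.nanmean(values)) / np.nanstd(values) > 3.0

    # modified z-score: 0.6745 * |x - median| / MAD > 3.5 (fall back to mean absolute deviation if MAD ~ 0)
    abs_diff = np.abs(values - np.nanmedian(values))
    mad = np.nanmedian(abs_diff)
    mad = np.nanmean(abs_diff) if mad <= eps else mad
    modz_mask = 0.6745 * abs_diff / mad > 3.5

    # IQR: outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
    q1, q3 = np.nanpercentile(values, q=(25, 75), method="midpoint")
    iqr_mask = (values < q1 - 1.5 * (q3 - q1)) | (values > q3 + 1.5 * (q3 - q1))

    print(z_mask, modz_mask, iqr_mask)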
dataeval/metrics/stats/_base.py
CHANGED
@@ -13,8 +13,8 @@ from multiprocessing import Pool
 from typing import Any, Callable, Generic, Iterable, Iterator, Sequence, TypeVar
 
 import numpy as np
-import tqdm
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
 from dataeval.config import get_max_processes
 from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput, SourceIndex
@@ -77,7 +77,7 @@ class PoolWrapper:
     """
 
    def __init__(self, processes: int | None) -> None:
-        self.pool = Pool(processes) if processes is
+        self.pool = Pool(processes) if processes is None or processes > 1 else None
 
    def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
        return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)
@@ -93,7 +93,7 @@ class PoolWrapper:
 
 class StatsProcessor(Generic[TStatsOutput]):
     output_class: type[TStatsOutput]
-    cache_keys:
+    cache_keys: set[str] = set()
     image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
     channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
 
@@ -267,7 +267,7 @@ def run_stats(
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
 
     with PoolWrapper(processes=get_max_processes()) as p:
-        for r in tqdm
+        for r in tqdm(
             p.imap(
                 partial(
                     process_stats_unpack,
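Note: the tqdm import now comes from tqdm.auto, which selects the notebook or console progress bar automatically. With the PoolWrapper change, a worker pool is created when processes is None (auto-sized) or greater than one; requesting a single process runs the plain serial map and skips pool startup entirely. A rough standalone sketch of that dispatch, assuming this is the intended semantics:

    from multiprocessing import Pool


    def square(x: int) -> int:
        return x * x


    def imap_maybe_parallel(func, iterable, processes: int | None):
        # Pool only when parallelism is actually wanted: None (auto) or more than one worker
        pool = Pool(processes) if processes is None or processes > 1 else None
        if pool is None:
            yield from map(func, iterable)  # serial fallback, no worker startup cost
        else:
            with pool:
                yield from pool.imap(func, iterable)


    if __name__ == "__main__":
        print(list(imap_maybe_parallel(square, range(5), processes=1)))  # serial path
        print(list(imap_maybe_parallel(square, range(5), processes=2)))  # two workers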
dataeval/metrics/stats/_labelstats.py
CHANGED
@@ -68,7 +68,11 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
 
     # Count labels per image
     label_per_image_df = metadata_df.group_by("image_index").agg(pl.len().alias("label_count"))
-
+
+    # Join with all indices to include missing ones with 0 count
+    all_indices = pl.DataFrame({"image_index": range(metadata.image_count)})
+    complete_label_df = all_indices.join(label_per_image_df, on="image_index", how="left").fill_null(0)
+    label_per_image = complete_label_df.sort("image_index")["label_count"].to_list()
 
     return LabelStatsOutput(
         label_counts_per_class=label_counts,
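Note: the left join above is what guarantees that images with zero labels still get an entry of 0 in the per-image label counts instead of being dropped by the group-by. A small self-contained polars sketch of the same idea with hypothetical indices (image 2 has no labels):

    import polars as pl

    # one row per label annotation; image 2 never appears
    metadata_df = pl.DataFrame({"image_index": [0, 0, 1, 3, 3, 3]})
    image_count = 4

    # count labels per image, then left-join against every index so the gaps become 0
    label_per_image_df = metadata_df.group_by("image_index").agg(pl.len().alias("label_count"))
    all_indices = pl.DataFrame({"image_index": range(image_count)})
    complete = all_indices.join(label_per_image_df, on="image_index", how="left").fill_null(0)

    print(complete.sort("image_index")["label_count"].to_list())  # [2, 1, 0, 3]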
dataeval/metrics/stats/_pixelstats.py
CHANGED
@@ -15,12 +15,13 @@ from dataeval.typing import ArrayLike, Dataset
 
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
+    cache_keys = {"histogram"}
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
         "mean": lambda x: np.nanmean(x.scaled),
         "std": lambda x: np.nanstd(x.scaled),
         "var": lambda x: np.nanvar(x.scaled),
-        "skew": lambda x:
-        "kurtosis": lambda x:
+        "skew": lambda x: skew(x.scaled.ravel(), nan_policy="omit"),
+        "kurtosis": lambda x: kurtosis(x.scaled.ravel(), nan_policy="omit"),
         "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
         "entropy": lambda x: entropy(x.get("histogram")),
     }
@@ -28,8 +29,8 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
         "mean": lambda x: np.nanmean(x.scaled, axis=1),
         "std": lambda x: np.nanstd(x.scaled, axis=1),
         "var": lambda x: np.nanvar(x.scaled, axis=1),
-        "skew": lambda x:
-        "kurtosis": lambda x:
+        "skew": lambda x: skew(x.scaled, axis=1, nan_policy="omit"),
+        "kurtosis": lambda x: kurtosis(x.scaled, axis=1, nan_policy="omit"),
         "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),
         "entropy": lambda x: entropy(x.get("histogram"), axis=1),
     }
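Note: skew and kurtosis (presumably scipy.stats' functions, given the nan_policy keyword) are now called with nan_policy="omit", so NaN pixels are dropped rather than turning the whole statistic into NaN. A short scipy.stats illustration with hypothetical pixel values:

    import numpy as np
    from scipy.stats import kurtosis, skew

    scaled = np.array([0.1, 0.2, 0.35, 0.9, np.nan])

    print(skew(scaled))                     # nan: the default policy propagates NaN
    print(skew(scaled, nan_policy="omit"))  # computed over the four finite values
    print(kurtosis(scaled, nan_policy="omit"))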
dataeval/metrics/stats/_visualstats.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Callable
 
 import numpy as np
 
+from dataeval.config import EPSILON
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import VisualStatsOutput
 from dataeval.outputs._base import set_metadata
@@ -17,23 +18,21 @@ QUARTILES = (0, 25, 50, 75, 100)
 
 class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
     output_class: type = VisualStatsOutput
+    cache_keys: set[str] = {"percentiles"}
     image_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
         "brightness": lambda x: x.get("percentiles")[1],
-        "contrast": lambda x:
-
-        else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
+        "contrast": lambda x: (np.max(x.get("percentiles")) - np.min(x.get("percentiles")))
+        / (np.mean(x.get("percentiles")) + EPSILON),
         "darkness": lambda x: x.get("percentiles")[-2],
         "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
         "sharpness": lambda x: np.nanstd(edge_filter(np.mean(x.image, axis=0))),
-        "zeros": lambda x: np.count_nonzero(np.
+        "zeros": lambda x: np.count_nonzero(np.sum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
         "brightness": lambda x: x.get("percentiles")[:, 1],
-        "contrast": lambda x: np.
-
-        / np.mean(x.get("percentiles"), axis=1)
-        ),
+        "contrast": lambda x: (np.max(x.get("percentiles"), axis=1) - np.min(x.get("percentiles"), axis=1))
+        / (np.mean(x.get("percentiles"), axis=1) + EPSILON),
         "darkness": lambda x: x.get("percentiles")[:, -2],
         "missing": lambda x: np.count_nonzero(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
         "sharpness": lambda x: np.nanstd(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
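Note: adding EPSILON to the denominator keeps the contrast ratio finite for an all-black image, where every percentile (and so the mean) is 0, without the conditional guard the previous expression used. A quick sketch of the guarded formula, assuming EPSILON is a tiny positive constant such as 1e-10:

    import numpy as np

    EPSILON = 1e-10  # stand-in for dataeval.config.EPSILON
    QUARTILES = (0, 25, 50, 75, 100)


    def contrast(scaled: np.ndarray) -> float:
        percentiles = np.nanpercentile(scaled, q=QUARTILES)
        return float((np.max(percentiles) - np.min(percentiles)) / (np.mean(percentiles) + EPSILON))


    print(contrast(np.random.rand(3, 32, 32)))  # typical image: roughly (max - min) / mean
    print(contrast(np.zeros((3, 32, 32))))      # all-black image: 0.0 instead of nan from 0 / 0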
dataeval/outputs/_drift.py
CHANGED
@@ -114,7 +114,7 @@ class DriftMVDCOutput(PerMetricResult):
         import matplotlib.pyplot as plt
 
         fig, ax = plt.subplots(dpi=300)
-        resdf = self.
+        resdf = self.to_dataframe()
         xticks = np.arange(resdf.shape[0])
         trndf = resdf[resdf["chunk"]["period"] == "reference"]
         tstdf = resdf[resdf["chunk"]["period"] == "analysis"]
dataeval/outputs/_stats.py
CHANGED
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional, Sequence, Union
 
 import numpy as np
-import
+import polars as pl
 from numpy.typing import NDArray
 from typing_extensions import TypeAlias
 
@@ -22,7 +22,7 @@ SOURCE_INDEX = "source_index"
 OBJECT_COUNT = "object_count"
 IMAGE_COUNT = "image_count"
 
-BASE_ATTRS =
+BASE_ATTRS = [SOURCE_INDEX, OBJECT_COUNT, IMAGE_COUNT]
 
 
 class SourceIndex(NamedTuple):
@@ -156,14 +156,21 @@ class BaseStatsOutput(Output):
         Mapping[str, NDArray[Any]]
         """
         filter_ = [filter] if isinstance(filter, str) else filter
+
+        """
+        Performs validation checks to ensure selected keys and constant or 1-D values
+        Each set of checks returns True if a valid value.
+        Only one set of final checks needs to be True to allow the value through
+        """
         return {
             k: v
             for k, v in self.data().items()
-            if
-
-
-
-
+            if (
+                k not in BASE_ATTRS  # Ignore BaseStatsOutput attributes
+                and (filter_ is None or k in filter_)  # Key is selected
+                and (isinstance(v, np.ndarray) and v.ndim == 1)  # Check valid array
+                and (not exclude_constant or len(np.unique(v)) > 1)  # Check valid numpy "constant"
+            )
         }
 
     def plot(
@@ -195,6 +202,11 @@ class BaseStatsOutput(Output):
             return histogram_plot(factors, log)
         return channel_histogram_plot(factors, log, max_channels, ch_mask)
 
+    def to_dataframe(self) -> pl.DataFrame:
+        """Returns the processed factors a polars dataframe of shape (factors, samples)"""
+
+        return pl.DataFrame(self.factors())
+
 
 @dataclass(frozen=True)
 class DimensionStatsOutput(BaseStatsOutput):
@@ -256,6 +268,43 @@ class HashStatsOutput(BaseStatsOutput):
     xxhash: Sequence[str]
     pchash: Sequence[str]
 
+    def to_dataframe(self) -> pl.DataFrame:
+        """
+        Returns a polars dataframe for the xxhash and pchash attributes of each sample
+
+        Note
+        ----
+        xxhash and pchash do not follow the normal definition of factors but are
+        helpful attributes of the data
+
+        Examples
+        --------
+        Display the hashes of a dataset of images, whose shape is (C, H, W),
+        as a polars DataFrame
+
+        >>> from dataeval.metrics.stats import hashstats
+        >>> results = hashstats(dataset)
+        >>> print(results.to_dataframe())
+        shape: (8, 2)
+        ┌──────────────────┬──────────────────┐
+        │ xxhash           ┆ pchash           │
+        │ ---              ┆ ---              │
+        │ str              ┆ str              │
+        ╞══════════════════╪══════════════════╡
+        │ 69b50a5f06af238c ┆ e666999999266666 │
+        │ 5a861d7a23d1afe7 ┆ e666999999266666 │
+        │ 7ffdb4990ad44ac6 ┆ e666999966666299 │
+        │ 4f0c366a3298ceac ┆ e666999999266666 │
+        │ c5519e36ac1f8839 ┆ 96e91656e91616e9 │
+        │ e7e92346159a4567 ┆ e666999999266666 │
+        │ 9a538f797a5ba8ee ┆ e666999999266666 │
+        │ 1a658bd2a1baee25 ┆ e666999999266666 │
+        └──────────────────┴──────────────────┘
+        """
+        data = {"xxhash": self.xxhash, "pchash": self.pchash}
+        schema = {"xxhash": str, "pchash": str}
+        return pl.DataFrame(data=data, schema=schema)
+
 
 @dataclass(frozen=True)
 class LabelStatsOutput(Output):
@@ -325,17 +374,13 @@ class LabelStatsOutput(Output):
 
         return "\n".join(table_str)
 
-    def to_dataframe(self) ->
+    def to_dataframe(self) -> pl.DataFrame:
         """
-        Exports the label statistics output results to a
-
-        Notes
-        -----
-        This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
+        Exports the label statistics output results to a polars DataFrame.
 
         Returns
         -------
-
+        pl.DataFrame
         """
         total_count = []
         image_count = []
@@ -343,7 +388,7 @@ class LabelStatsOutput(Output):
             total_count.append(self.label_counts_per_class[cls])
             image_count.append(self.image_counts_per_class[cls])
 
-        return
+        return pl.DataFrame(
             {
                 "Label": self.class_names,
                 "Total Count": total_count,
dataeval/utils/data/_dataset.py
CHANGED
@@ -72,9 +72,8 @@ def _listify_metadata(
 
 def _find_max(arr: ArrayLike) -> Any:
     if not isinstance(arr, (bytes, str)) and isinstance(arr, (Iterable, Sequence, Array)):
-
-
-        return max(arr)
+        nested = [x for x in [_find_max(x) for x in arr] if x is not None]
+        return max(nested) if len(nested) > 0 else None
     return arr
 
 
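Note: _find_max now recurses into nested sequences and ignores branches that bottom out empty (returning None), instead of calling max() on the top-level argument directly. A standalone sketch of that behavior, simplified to plain Python types (the real function also accepts dataeval's Array protocol inputs):

    from collections.abc import Iterable, Sequence
    from typing import Any


    def find_max(arr: Any) -> Any:
        if not isinstance(arr, (bytes, str)) and isinstance(arr, (Iterable, Sequence)):
            # recurse into children, dropping empty branches that yield None
            nested = [x for x in [find_max(x) for x in arr] if x is not None]
            return max(nested) if len(nested) > 0 else None
        return arr  # scalar (or string/bytes) leaf


    print(find_max([[1, 5], [3, [7, 2]], []]))  # 7 -- nested lists, empty branch ignored
    print(find_max(4))                          # 4 -- scalar passthrough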
dataeval/utils/torch/_internal.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
 import torch
 from numpy.typing import NDArray
 from torch.utils.data import DataLoader, TensorDataset
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 from dataeval.config import DeviceLike, get_device
 from dataeval.typing import Array
{dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.86.4
+Version: 0.86.6
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
{dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
-dataeval/__init__.py,sha256=
+dataeval/__init__.py,sha256=9M6Th_pJ371mFO5oLUC0UZJmDclHa8SbNJse71-T84I,1636
 dataeval/_log.py,sha256=C7AGkIRzymvYJ0LQXtnShiy3i5Xrp8T58JzIHHguk_Q,365
-dataeval/config.py,sha256=
+dataeval/config.py,sha256=bHa8np4FCtLLv8_xlfdDC4lb1InJ_kT0vXDO5P42rvk,4082
 dataeval/data/__init__.py,sha256=wzQ6uUFLNB3VJR0a2QnRBYwEmwXT93q0WpHu7FmFW1E,486
 dataeval/data/_embeddings.py,sha256=PFjpdV9bfusCB4taTIYSzx1hP8nJb_KCkZTN8kMw-Hs,12885
 dataeval/data/_images.py,sha256=Rc_59CuU4zfN7Xm7an1XUx8ZghQg6a56VJWMZD9edRw,2654
-dataeval/data/_metadata.py,sha256=
+dataeval/data/_metadata.py,sha256=OTda9V7DA5Ejxip_NR16LCK2C8HMtpjWHHiFoW3LrLY,14364
 dataeval/data/_selection.py,sha256=r06xeiyK8nTWPLyItkoPQRWZI1i6LATSue_cuEbCdc4,4463
 dataeval/data/_split.py,sha256=nQABR05vxil2Qx7-uX4Fm0_DWpibskBGDJOYj_b1u3I,16737
 dataeval/data/selections/__init__.py,sha256=2m8ZB53wXzqLcqmc6p5atO6graB6ZyiRSNJFxf11X_g,613
@@ -26,13 +26,13 @@ dataeval/detectors/drift/_nml/__init__.py,sha256=MNyKyZlfTjr5uQql2uBBfRkUdsuduie
 dataeval/detectors/drift/_nml/_base.py,sha256=o34LcCsD9p1A6u8UdQn-dxIVwC2CMr6uCpC0vq16JX0,2663
 dataeval/detectors/drift/_nml/_chunk.py,sha256=t12eouanRNiu5DJXOaYDZXUvFMqfcp1BETLOufdV79M,13567
 dataeval/detectors/drift/_nml/_domainclassifier.py,sha256=n7Ttq5Ej7sAY9Jn2iagaGj4IIWiG8gmA3wwFizlBqes,7292
-dataeval/detectors/drift/_nml/_result.py,sha256=
+dataeval/detectors/drift/_nml/_result.py,sha256=TMK17bnlgSdL0MCRHtQZJO8YoWWe4C2kh_akESrlP1g,3269
 dataeval/detectors/drift/_nml/_thresholds.py,sha256=WGdkLei9w_EvvsRHQzWdDyFVoZHIwM78k_aB3eoh31Q,12060
 dataeval/detectors/drift/_uncertainty.py,sha256=BHlykJ-r7TGLJxdPfoazXnoAJ1qVDzbk5HjAMdsnHz8,5847
 dataeval/detectors/drift/updates.py,sha256=L1PnrPlIE1x6ujCc5mCwjcAZwadVTn-Zjb6MnTDvzJQ,2251
 dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
 dataeval/detectors/linters/duplicates.py,sha256=X5WSEvI_BHkLoXjkaHK6wTnSkx4IjpO_exMRjSlhc70,4963
-dataeval/detectors/linters/outliers.py,sha256=
+dataeval/detectors/linters/outliers.py,sha256=GaM9n8yPgBPzVOL_bxJCj0eCwobEEP4JHKHD9liRdlw,10130
 dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
 dataeval/detectors/ood/ae.py,sha256=fTrUfFxv6xUqzKpwMC8rW3JrizA16M_bgzqLuBKMrS0,2944
 dataeval/detectors/ood/base.py,sha256=9b-Ljznf0lB1SXF4F_Aj3eJ4Y3ijGEDPMjucUsWOGJM,3051
@@ -54,23 +54,23 @@ dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1R
 dataeval/metrics/estimators/_divergence.py,sha256=-np4nWNtRrHnvo4xdWuTzkyJJmobyjDnVDBOMjtBS1Y,4003
 dataeval/metrics/estimators/_uap.py,sha256=BULEBbJ9BQ1IcTeZf0x7iI60QHAWCccBOM97FIu9VXA,1928
 dataeval/metrics/stats/__init__.py,sha256=6tA_9nbbM5ObJ6cds8Y1VBtTQiTOxrpGQSFLu_lWGGA,1098
-dataeval/metrics/stats/_base.py,sha256
+dataeval/metrics/stats/_base.py,sha256=R-hxoEPLreZcxYxBfyjbKfdoGMMTPiqJ5g2zSO-1UYM,12541
 dataeval/metrics/stats/_boxratiostats.py,sha256=ROZrlqgbowkGfCR5PJ5TL7Og40iMOdUqJnsCtaz_Xek,6450
 dataeval/metrics/stats/_dimensionstats.py,sha256=EVO-BlxrZl8qrP09lwPbyWdrG1ZeDtgj4LiswDwEZ1I,2896
 dataeval/metrics/stats/_hashstats.py,sha256=qa1CYRgOebkxqkALfffaPM-kJ074ZbyfpWbfOfuObSs,4758
 dataeval/metrics/stats/_imagestats.py,sha256=gUPNgN5Zwzdr7WnSwbve1NXNsyxd5dy3cSnlR_7guCg,3007
-dataeval/metrics/stats/_labelstats.py,sha256=
-dataeval/metrics/stats/_pixelstats.py,sha256=
-dataeval/metrics/stats/_visualstats.py,sha256=
+dataeval/metrics/stats/_labelstats.py,sha256=_dXt3p8_-SHEtHvJWbL0rnQvO2g30zxX42mG2LGJepU,3195
+dataeval/metrics/stats/_pixelstats.py,sha256=N9e7RXuzSHtlJtWU7l5IcTTIXe2kOmWiuj6lnJpZWq0,3312
+dataeval/metrics/stats/_visualstats.py,sha256=b6jMq36_UlKduMrkwfq2i0fXNalDEcMdqPgoynXl5hI,3713
 dataeval/outputs/__init__.py,sha256=geHB5M3QOiFFaQGV4ZwDTTKpqZPvPePbqG7lzaPhaXQ,1741
 dataeval/outputs/_base.py,sha256=-Wa0gFcBVLbfWPMZyCql7x4vGsnkLP4pecsQIeUZ2_Y,5904
 dataeval/outputs/_bias.py,sha256=1OZpKncYTryjPLRHb4d6NlhE27uPT57gCob_5jtjKDI,10456
-dataeval/outputs/_drift.py,sha256=
+dataeval/outputs/_drift.py,sha256=hXILED_soY8ppIQZgftQvmumtwDrTnABbYl-flIGEU4,4588
 dataeval/outputs/_estimators.py,sha256=mh-R08CgYtmq9ffANDMYR-V4vrZnSjOjEyOMiMDZ2Ic,3091
 dataeval/outputs/_linters.py,sha256=k8lkd8EZ23q0m-HOD-FgqMcLQFy1UH7vws2ucLPyn08,6697
 dataeval/outputs/_metadata.py,sha256=ffZgpX8KWURPHXpOWjbvJ2KRqWQkS2nWuIjKUzoHhMI,1710
 dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
-dataeval/outputs/_stats.py,sha256=
+dataeval/outputs/_stats.py,sha256=_ItGjs9YaMHqjivkR1YBcSErD5ICfa_-iV9nq0l8bTM,17451
 dataeval/outputs/_utils.py,sha256=NfhYaGT2PZlhIs8ICKUsPWHZXjhWYDkEJqBDdqMeaOM,929
 dataeval/outputs/_workflows.py,sha256=K786mOgegxVi81diUA-qpbwGEkwa8YA7Fk4ttgjJeaY,10831
 dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -85,14 +85,14 @@ dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
 dataeval/utils/_mst.py,sha256=bLmJmu_1Dtj3hC5gQp3oAiJ_7TKtEjahTqusVRRU4eI,2168
 dataeval/utils/_plot.py,sha256=1rnMkBRvTFLoTAHqXwF7c7GJ5_5iqlgarZKAzmYciLk,7225
 dataeval/utils/data/__init__.py,sha256=xGzrjrOxOP2DP1tU84AWMKPnSxFvSjM81CTlDg4rNM8,331
-dataeval/utils/data/_dataset.py,sha256=
+dataeval/utils/data/_dataset.py,sha256=tC_vqgWnmojAoAANo5BUVfEUYXl7GzOBSeYjR9olbDk,9506
 dataeval/utils/data/collate.py,sha256=5egEEKhNNCGeNLChO1p6dZ4Wg6x51VEaMNHz7hEZUxI,3936
 dataeval/utils/data/metadata.py,sha256=L1c2bCiMj0aR0QCoKkjwBujIftJDEMgW_3ZbgeS8WHo,14703
 dataeval/utils/datasets/__init__.py,sha256=pAXqHX76yAoBI8XB3m6zGuW-u3s3PCoIXG5GDzxH7Zs,572
 dataeval/utils/datasets/_antiuav.py,sha256=kA_ia1fYNcJiz9SpCvh-Z8iSc7iJrdogjBI3soyaa7A,8304
 dataeval/utils/datasets/_base.py,sha256=pyfpJda3ku469M3TFRsJn9S2oAiQODOGTlLcdcoEW9U,9031
 dataeval/utils/datasets/_cifar10.py,sha256=hZc_A30yKYBbv2kvVdEkZ9egyEe6XBUnmksoIAoJ-5Y,8265
-dataeval/utils/datasets/_fileio.py,sha256=
+dataeval/utils/datasets/_fileio.py,sha256=LEoFVNdryRdi7mKpWw-9D8lA6XMa-Jaszd85bv93POo,5454
 dataeval/utils/datasets/_milco.py,sha256=iXf4C1I3Eg_3gHKUe4XPi21yFMBO51zxTIqAkGf9bYg,7869
 dataeval/utils/datasets/_mixin.py,sha256=S8iii-SoYUsFFYNXjw2thlZkpBvRLnZ4XI8wTqOKXgU,1729
 dataeval/utils/datasets/_mnist.py,sha256=uz46sE1Go3TgGjG6x2cXckSVQ0mSg2mhgk8BUvLWjb0,8149
@@ -102,12 +102,12 @@ dataeval/utils/datasets/_voc.py,sha256=pafY112O80isYkrdy7Quie9SBm_TmYhREuyl8Sxts
 dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
 dataeval/utils/torch/_blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
 dataeval/utils/torch/_gmm.py,sha256=XM68GNEP97EjaB1U49-ZXRb81d0CEFnPS910alrcB3g,3740
-dataeval/utils/torch/_internal.py,sha256=
+dataeval/utils/torch/_internal.py,sha256=HuyBB7NWFI9sUrRbOCZFxOfZjRGPdqr5iF7_DT2S0wo,4159
 dataeval/utils/torch/models.py,sha256=1idpXyjrYcCBSsbxxRUOto8xr4MJNjDEqQHiIXVU5Zc,9700
 dataeval/utils/torch/trainer.py,sha256=Oc2lK13uPGhmLYbmAqlPWyKxgG4YJFlnSXCqFHUZbdA,5528
 dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
 dataeval/workflows/sufficiency.py,sha256=j-R8dg4XE6a66p_oTXG2GNzgg3vGk85CTblxhFXaxog,8513
-dataeval-0.86.
-dataeval-0.86.
-dataeval-0.86.
-dataeval-0.86.
+dataeval-0.86.6.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
+dataeval-0.86.6.dist-info/METADATA,sha256=pHhjYhbE3BlgvxtINd333FwljVfELIKQnplaAwLNZVg,5353
+dataeval-0.86.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+dataeval-0.86.6.dist-info/RECORD,,
{dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/LICENSE.txt
File without changes
{dataeval-0.86.4.dist-info → dataeval-0.86.6.dist-info}/WHEEL
File without changes