dataeval 0.86.1__tar.gz → 0.86.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {dataeval-0.86.1 → dataeval-0.86.2}/PKG-INFO +2 -1
- {dataeval-0.86.1 → dataeval-0.86.2}/pyproject.toml +2 -1
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/__init__.py +1 -1
- dataeval-0.86.2/src/dataeval/data/_metadata.py +392 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_split.py +2 -2
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_distance.py +10 -7
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_ood.py +11 -103
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_balance.py +23 -33
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_diversity.py +16 -14
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_parity.py +9 -6
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_bias.py +7 -51
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/_dataset.py +22 -8
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_milco.py +3 -1
- dataeval-0.86.1/src/dataeval/data/_metadata.py +0 -393
- {dataeval-0.86.1 → dataeval-0.86.2}/LICENSE.txt +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/README.md +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/_log.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/config.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_embeddings.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_images.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_selection.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_targets.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_classbalance.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_classfilter.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_indices.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_limit.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_prioritize.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_reverse.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/selections/_shuffle.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_base.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_cvm.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_ks.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_mmd.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_mvdc.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/drift/updates.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/linters/duplicates.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/linters/outliers.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/ae.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/base.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/detectors/ood/mixin.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_utils.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_completeness.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/bias/_coverage.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_ber.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_divergence.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/estimators/_uap.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_base.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_hashstats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_imagestats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_labelstats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metrics/stats/_visualstats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_base.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_drift.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_estimators.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_linters.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_metadata.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_ood.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_stats.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_utils.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/outputs/_workflows.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/py.typed +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/typing.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_array.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_bin.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_clusterer.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_fast_mst.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_image.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_method.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_mst.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/_plot.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/collate.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/data/metadata.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_antiuav.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_base.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_cifar10.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_fileio.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_mixin.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_mnist.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_ships.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_types.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/datasets/_voc.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/_blocks.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/_gmm.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/_internal.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/workflows/__init__.py +0 -0
- {dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/workflows/sufficiency.py +0 -0
{dataeval-0.86.1 → dataeval-0.86.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.86.1
+Version: 0.86.2
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -29,6 +29,7 @@ Requires-Dist: numba (>=0.59.1)
 Requires-Dist: numpy (>=1.24.2)
 Requires-Dist: pandas (>=2.0)
 Requires-Dist: pillow (>=10.3.0)
+Requires-Dist: polars (>=1.0.0)
 Requires-Dist: requests
 Requires-Dist: scikit-learn (>=1.5.0)
 Requires-Dist: scipy (>=1.10)
{dataeval-0.86.1 → dataeval-0.86.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.86.1" # dynamic
+version = "0.86.2" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -49,6 +49,7 @@ numba = {version = ">=0.59.1"}
 numpy = {version = ">=1.24.2"}
 pandas = {version = ">=2.0"}
 pillow = {version = ">=10.3.0"}
+polars = {version = ">=1.0.0"}
 requests = {version = "*"}
 scipy = {version = ">=1.10"}
 scikit-learn = {version = ">=1.5.0"}
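The only dependency change in 0.86.2 is the new polars floor, which backs the rewritten Metadata class shown next. A quick environment check, illustrative only and not part of the package:

```python
# Confirm an environment satisfies the new dataeval 0.86.2 dependency floor.
import polars as pl

major, minor, *_ = (int(part) for part in pl.__version__.split("."))
assert (major, minor) >= (1, 0), f"polars {pl.__version__} is older than the required 1.0.0"
```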
dataeval-0.86.2/src/dataeval/data/_metadata.py (new file)

@@ -0,0 +1,392 @@
+from __future__ import annotations
+
+__all__ = []
+
+import warnings
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Iterable, Literal, Mapping, Sequence, Sized
+
+import numpy as np
+import polars as pl
+from numpy.typing import NDArray
+
+from dataeval.typing import (
+    AnnotatedDataset,
+    Array,
+    ObjectDetectionTarget,
+)
+from dataeval.utils._array import as_numpy
+from dataeval.utils._bin import bin_data, digitize_data
+from dataeval.utils.data.metadata import merge
+
+if TYPE_CHECKING:
+    from dataeval.data import Targets
+else:
+    from dataeval.data._targets import Targets
+
+
+@dataclass
+class FactorInfo:
+    factor_type: Literal["categorical", "continuous", "discrete"] | None = None
+    discretized_col: str | None = None
+
+
+class Metadata:
+    """
+    Class containing binned metadata using Polars DataFrames.
+
+    Parameters
+    ----------
+    dataset : ImageClassificationDataset or ObjectDetectionDataset
+        Dataset to access original targets and metadata from.
+    continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None
+        Mapping from continuous factor name to the number of bins or bin edges
+    auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"
+        Method for automatically determining the number of bins for continuous factors
+    exclude : Sequence[str] | None, default None
+        Filter metadata factors to exclude the specified factors, cannot be set with `include`
+    include : Sequence[str] | None, default None
+        Filter metadata factors to include the specified factors, cannot be set with `exclude`
+    """
+
+    def __init__(
+        self,
+        dataset: AnnotatedDataset[tuple[Any, Any, dict[str, Any]]],
+        *,
+        continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
+        auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
+        exclude: Sequence[str] | None = None,
+        include: Sequence[str] | None = None,
+    ) -> None:
+        self._targets: Targets
+        self._class_labels: NDArray[np.intp]
+        self._class_names: list[str]
+        self._image_indices: NDArray[np.intp]
+        self._factors: dict[str, FactorInfo]
+        self._dropped_factors: dict[str, list[str]]
+        self._dataframe: pl.DataFrame
+
+        self._is_structured = False
+        self._is_binned = False
+
+        self._dataset = dataset
+        self._continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else {}
+        self._auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = auto_bin_method
+
+        if exclude is not None and include is not None:
+            raise ValueError("Filters for `exclude` and `include` are mutually exclusive.")
+
+        self._exclude = set(exclude or ())
+        self._include = set(include or ())
+
+    @property
+    def targets(self) -> Targets:
+        """Target information for the dataset."""
+        self._structure()
+        return self._targets
+
+    @property
+    def raw(self) -> list[dict[str, Any]]:
+        """The raw list of metadata dictionaries for the dataset."""
+        self._structure()
+        return self._raw
+
+    @property
+    def exclude(self) -> set[str]:
+        """Factors to exclude from the metadata."""
+        return self._exclude
+
+    @exclude.setter
+    def exclude(self, value: Sequence[str]) -> None:
+        exclude = set(value)
+        if self._exclude != exclude:
+            self._exclude = exclude
+            self._include = set()
+            self._is_binned = False
+
+    @property
+    def include(self) -> set[str]:
+        """Factors to include from the metadata."""
+        return self._include
+
+    @include.setter
+    def include(self, value: Sequence[str]) -> None:
+        include = set(value)
+        if self._include != include:
+            self._include = include
+            self._exclude = set()
+            self._is_binned = False
+
+    @property
+    def continuous_factor_bins(self) -> Mapping[str, int | Sequence[float]]:
+        """Map of factor names to bin counts or bin edges."""
+        return self._continuous_factor_bins
+
+    @continuous_factor_bins.setter
+    def continuous_factor_bins(self, bins: Mapping[str, int | Sequence[float]]) -> None:
+        if self._continuous_factor_bins != bins:
+            self._continuous_factor_bins = dict(bins)
+            self._reset_bins(bins)
+
+    @property
+    def auto_bin_method(self) -> Literal["uniform_width", "uniform_count", "clusters"]:
+        """Binning method to use when continuous_factor_bins is not defined."""
+        return self._auto_bin_method
+
+    @auto_bin_method.setter
+    def auto_bin_method(self, method: Literal["uniform_width", "uniform_count", "clusters"]) -> None:
+        if self._auto_bin_method != method:
+            self._auto_bin_method = method
+            self._reset_bins()
+
+    @property
+    def dataframe(self) -> pl.DataFrame:
+        """Dataframe containing target information and metadata factors."""
+        self._structure()
+        return self._dataframe
+
+    @property
+    def dropped_factors(self) -> dict[str, list[str]]:
+        """Factors that were dropped during preprocessing and the reasons why they were dropped."""
+        self._structure()
+        return self._dropped_factors
+
+    @property
+    def discretized_data(self) -> NDArray[np.int64]:
+        """Factor data with continuous data discretized."""
+        if not self.factor_names:
+            return np.array([], dtype=np.int64)
+
+        self._bin()
+        return (
+            self.dataframe.select([info.discretized_col or name for name, info in self.factor_info.items()])
+            .to_numpy()
+            .astype(np.int64)
+        )
+
+    @property
+    def factor_names(self) -> list[str]:
+        """Factor names of the metadata."""
+        self._structure()
+        return list(self._factors)
+
+    @property
+    def factor_info(self) -> dict[str, FactorInfo]:
+        """Factor types of the metadata."""
+        self._bin()
+        return self._factors
+
+    @property
+    def factor_data(self) -> NDArray[Any]:
+        """Factor data as a NumPy array."""
+        if not self.factor_names:
+            return np.array([], dtype=np.float64)
+
+        # Extract continuous columns and convert to NumPy array
+        return self.dataframe.select(self.factor_names).to_numpy()
+
+    @property
+    def class_labels(self) -> NDArray[np.intp]:
+        """Class labels as a NumPy array."""
+        self._structure()
+        return self._class_labels
+
+    @property
+    def class_names(self) -> list[str]:
+        """Class names as a list of strings."""
+        self._structure()
+        return self._class_names
+
+    @property
+    def image_indices(self) -> NDArray[np.intp]:
+        """Indices of images as a NumPy array."""
+        self._bin()
+        return self._image_indices
+
+    @property
+    def image_count(self) -> int:
+        self._bin()
+        return int(self._image_indices.max() + 1)
+
+    def _reset_bins(self, cols: Iterable[str] | None = None) -> None:
+        if self._is_binned:
+            columns = self._dataframe.columns
+            for col in (col for col in cols or columns if f"{col}[|]" in columns):
+                self._dataframe.drop_in_place(f"{col}[|]")
+                self._factors[col] = FactorInfo()
+            self._is_binned = False
+
+    def _structure(self) -> None:
+        if self._is_structured:
+            return
+
+        raw: list[dict[str, Any]] = []
+
+        labels = []
+        bboxes = []
+        scores = []
+        srcidx = []
+        is_od = None
+        for i in range(len(self._dataset)):
+            _, target, metadata = self._dataset[i]
+
+            raw.append(metadata)
+
+            if is_od_target := isinstance(target, ObjectDetectionTarget):
+                target_labels = as_numpy(target.labels)
+                target_len = len(target_labels)
+                labels.extend(target_labels.tolist())
+                bboxes.extend(as_numpy(target.boxes).tolist())
+                scores.extend(as_numpy(target.scores).tolist())
+                srcidx.extend([i] * target_len)
+            elif isinstance(target, Array):
+                target_len = 1
+                labels.append(int(np.argmax(as_numpy(target))))
+                scores.append(target)
+            else:
+                raise TypeError("Encountered unsupported target type in dataset")
+
+            is_od = is_od_target if is_od is None else is_od
+            if is_od != is_od_target:
+                raise ValueError("Encountered unexpected target type in dataset")
+
+        labels = as_numpy(labels).astype(np.intp)
+        scores = as_numpy(scores).astype(np.float32)
+        bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
+        srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
+
+        target_dict = {
+            "image_index": srcidx if srcidx is not None else np.arange(len(labels)),
+            "class_label": labels,
+            "score": scores,
+            "box": bboxes if bboxes is not None else [None] * len(labels),
+        }
+
+        self._targets = Targets(labels, scores, bboxes, srcidx)
+        self._raw = raw
+
+        index2label = self._dataset.metadata.get("index2label", {})
+        self._class_labels = labels
+        self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
+        self._image_indices = target_dict["image_index"]
+
+        targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
+        merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
+
+        reserved = ["image_index", "class_label", "score", "box"]
+        factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
+
+        self._factors = dict.fromkeys(factor_dict, FactorInfo())
+        self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
+        self._dropped_factors = merged[1]
+        self._is_structured = True
+
+    def _bin(self) -> None:
+        """Populate factor info and bin non-categorical factors."""
+        if self._is_binned:
+            return
+
+        # Start with an empty set of factor info
+        factor_info: dict[str, FactorInfo] = {}
+
+        # Create a mutable DataFrame for updates
+        df = self.dataframe.clone()
+        factor_bins = self.continuous_factor_bins
+
+        # Check for invalid keys
+        invalid_keys = set(factor_bins.keys()) - set(df.columns)
+        if invalid_keys:
+            warnings.warn(
+                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
+                "but are not columns in the metadata DataFrame. Unknown keys will be ignored."
+            )
+
+        column_set = set(df.columns)
+        for col in (col for col in self.factor_names if f"{col}[|]" not in column_set):
+            # Get data as numpy array for processing
+            data = df[col].to_numpy()
+            col_dz = f"{col}[|]"
+            if col in factor_bins:
+                # User provided binning
+                bins = factor_bins[col]
+                df = df.with_columns(pl.Series(name=col_dz, values=digitize_data(data, bins).astype(np.int64)))
+                factor_info[col] = FactorInfo("continuous", col_dz)
+            else:
+                # Check if data is numeric
+                unique, ordinal = np.unique(data, return_inverse=True)
+                if not np.issubdtype(data.dtype, np.number) or unique.size <= max(20, data.size * 0.01):
+                    # Non-numeric data or small number of unique values - convert to categorical
+                    df = df.with_columns(pl.Series(name=col_dz, values=ordinal.astype(np.int64)))
+                    factor_info[col] = FactorInfo("categorical", col_dz)
+                elif data.dtype == float:
+                    # Many unique values - discretize by binning
+                    warnings.warn(
+                        f"A user defined binning was not provided for {col}. "
+                        f"Using the {self.auto_bin_method} method to discretize the data. "
+                        "It is recommended that the user rerun and supply the desired "
+                        "bins using the continuous_factor_bins parameter.",
+                        UserWarning,
+                    )
+                    # Create binned version
+                    binned_data = bin_data(data, self.auto_bin_method)
+                    df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
+                    factor_info[col] = FactorInfo("continuous", col_dz)
+                else:
+                    factor_info[col] = FactorInfo("discrete", col_dz)
+
+        # Store the results
+        self._dataframe = df
+        self._factors.update(factor_info)
+        self._is_binned = True
+
+    def get_factors_by_type(self, factor_type: Literal["categorical", "continuous", "discrete"]) -> list[str]:
+        """
+        Get the names of factors of a specific type.
+
+        Parameters
+        ----------
+        factor_type : Literal["categorical", "continuous", "discrete"]
+            The type of factors to retrieve.
+
+        Returns
+        -------
+        list[str]
+            List of factor names of the specified type.
+        """
+        self._bin()
+        return [name for name, info in self.factor_info.items() if info.factor_type == factor_type]
+
+    def add_factors(self, factors: Mapping[str, Any]) -> None:
+        """
+        Add additional factors to the metadata.
+
+        The number of measures per factor must match the number of images
+        in the dataset or the number of detections in the dataset.
+
+        Parameters
+        ----------
+        factors : Mapping[str, ArrayLike]
+            Dictionary of factors to add to the metadata.
+        """
+        self._structure()
+
+        targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
+        images = self.image_count
+        lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
+        targets_match = all(f == targets for f in lengths.values())
+        images_match = targets_match if images == targets else all(f == images for f in lengths.values())
+        if not targets_match and not images_match:
+            raise ValueError(
+                "The lists/arrays in the provided factors have a different length than the current metadata factors."
+            )
+
+        new_columns = []
+        for k, v in factors.items():
+            v = as_numpy(v)
+            data = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
+            new_columns.append(pl.Series(name=k, values=data))
+            self._factors[k] = FactorInfo()
+
+        if new_columns:
+            self._dataframe = self.dataframe.with_columns(new_columns)
+            self._is_binned = False
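To make the new API concrete, a minimal usage sketch follows. `DummyDataset` is a hypothetical stand-in for an annotated image-classification dataset yielding `(image, one_hot_target, metadata_dict)` tuples, and the import path assumes `Metadata` is re-exported from `dataeval.data` as in prior releases; only the `Metadata` calls themselves come from the diff above.

```python
# Hypothetical illustration of the new Metadata class; DummyDataset is a
# stand-in, not a dataeval API.
import numpy as np
from dataeval.data import Metadata  # assumes the re-export from dataeval.data


class DummyDataset:
    metadata = {"id": "dummy", "index2label": {0: "cat", 1: "dog"}}

    def __init__(self, n: int = 100) -> None:
        rng = np.random.default_rng(0)
        self._labels = rng.integers(0, 2, n)
        self._altitude = rng.normal(1000.0, 100.0, n)

    def __len__(self) -> int:
        return len(self._labels)

    def __getitem__(self, i: int):
        one_hot = np.eye(2)[self._labels[i]]  # classification target as an Array
        return np.zeros((3, 16, 16)), one_hot, {"altitude": float(self._altitude[i])}


md = Metadata(DummyDataset(), continuous_factor_bins={"altitude": 5})
print(md.factor_names)                       # ['altitude']
print(md.get_factors_by_type("continuous"))  # ['altitude']
print(md.discretized_data.shape)             # (100, 1) of int64 bin indices
md.add_factors({"weather": ["rain"] * 100})  # length must match image or target count
```

Note the lazy construction: reading `factor_names` triggers `_structure()`, while `discretized_data` and `factor_info` additionally trigger `_bin()`, so the Polars dataframe is built and binned on first access rather than in `__init__`.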
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/data/_split.py

@@ -207,8 +207,8 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
         return None
 
     split_set = set(split_on)
-    indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
-    binned_features = metadata.discrete_data[:, indices]
+    indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
+    binned_features = metadata.discretized_data[:, indices]
     return np.unique(binned_features, axis=0, return_inverse=True)[1]
 
 
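The replacement lines select columns from the new `factor_names`/`discretized_data` pair; the final `np.unique` call is unchanged and still collapses each row of binned factors into a single group id. A standalone illustration of that pattern:

```python
# Each unique row of binned factor values becomes one group; return_inverse
# maps every sample back to the index of its unique row.
import numpy as np

binned_features = np.array([[0, 1], [2, 1], [0, 1], [2, 0]])
group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
print(group_ids)  # [0 2 0 1] -> samples 0 and 2 share a factor combination
```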
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_distance.py

@@ -80,14 +80,17 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDista
     MetadataDistanceValues(statistic=1.0, location=0.44354838709677413, dist=2.7, pvalue=0.0)
     """
 
-    _compare_keys(metadata1.continuous_factor_names, metadata2.continuous_factor_names)
-
+    _compare_keys(metadata1.factor_names, metadata2.factor_names)
+    cont_fnames = metadata1.get_factors_by_type("continuous")
 
-
-
+    if not cont_fnames:
+        return MetadataDistanceOutput({})
 
-
-
+    cont1 = np.atleast_2d(metadata1.dataframe[cont_fnames].to_numpy())  # (S, F)
+    cont2 = np.atleast_2d(metadata2.dataframe[cont_fnames].to_numpy())  # (S, F)
+
+    _validate_factors_and_data(cont_fnames, cont1)
+    _validate_factors_and_data(cont_fnames, cont2)
 
     N = len(cont1)
     M = len(cont2)
@@ -104,7 +107,7 @@ def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> MetadataDista
     results: dict[str, MetadataDistanceValues] = {}
 
     # Per factor
-    for i, fname in enumerate(
+    for i, fname in enumerate(cont_fnames):
         fdata1 = cont1[:, i]  # (S, 1)
         fdata2 = cont2[:, i]  # (S, 1)
 
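`metadata_distance` now reads continuous factors directly from the Polars dataframe via `get_factors_by_type("continuous")` instead of a dedicated continuous-data array, and returns an empty output early when no continuous factors exist. The column-selection pattern in isolation, with a synthetic frame and hypothetical factor names:

```python
# Selecting continuous factor columns from a Polars frame as an (S, F) array.
import numpy as np
import polars as pl

df = pl.DataFrame({"altitude": [1.0, 2.5, 3.0], "sensor": ["a", "b", "a"]})
cont_fnames = ["altitude"]  # e.g. metadata1.get_factors_by_type("continuous")
cont = np.atleast_2d(df[cont_fnames].to_numpy())  # shape (3, 1): samples x factors
print(cont.shape)
```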
{dataeval-0.86.1 → dataeval-0.86.2}/src/dataeval/metadata/_ood.py

@@ -15,95 +15,6 @@ from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorO
 from dataeval.outputs._base import set_metadata
 
 
-def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
-    """Combines the discrete and continuous data of a :class:`Metadata` object
-
-    Returns
-    -------
-    Tuple[list[str], NDArray]
-        The combined list of factors names and the combined discrete and continuous data
-
-    Note
-    ----
-    Discrete and continuous data must have the same number of samples
-    """
-    names = []
-    data = []
-
-    if metadata.discrete_factor_names and metadata.discrete_data.size != 0:
-        names.extend(metadata.discrete_factor_names)
-        data.append(metadata.discrete_data)
-
-    if metadata.continuous_factor_names and metadata.continuous_data.size != 0:
-        names.extend(metadata.continuous_factor_names)
-        data.append(metadata.continuous_data)
-
-    return names, np.hstack(data, dtype=np.float64) if data else np.array([], dtype=np.float64)
-
-
-def _combine_metadata(
-    metadata_1: Metadata, metadata_2: Metadata
-) -> tuple[list[str], list[NDArray[np.float64 | np.int64]], list[NDArray[np.int64 | np.float64]]]:
-    """
-    Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
-    match exactly and data has the same number of columns (factors).
-
-    Parameters
-    ----------
-    metadata_1 : Metadata
-        The set of factor names used as reference to determine the correct factor names and length of data
-    metadata_2 : Metadata
-        The compared set of factor names and data that must match metadata_1
-
-    Returns
-    -------
-    list[str]
-        The combined discrete and continuous factor names in that order.
-    list[NDArray]
-        Combined discrete and continuous data of metadata_1
-    list[NDArray]
-        Combined discrete and continuous data of metadata_2
-
-    Raises
-    ------
-    ValueError
-        If keys do not match in metadata_1 and metadata_2
-    ValueError
-        If the length of keys do not match the length of the data
-    """
-    factor_names: list[str] = []
-    m1_data: list[NDArray[np.int64 | np.float64]] = []
-    m2_data: list[NDArray[np.int64 | np.float64]] = []
-
-    # Both metadata must have the same number of factors (cols), but not necessarily samples (row)
-    if metadata_1.total_num_factors != metadata_2.total_num_factors:
-        raise ValueError(
-            f"Number of factors differs between metadata_1 ({metadata_1.total_num_factors}) "
-            f"and metadata_2 ({metadata_2.total_num_factors})"
-        )
-
-    # Validate and attach discrete data
-    if metadata_1.discrete_factor_names:
-        _compare_keys(metadata_1.discrete_factor_names, metadata_2.discrete_factor_names)
-        _validate_factors_and_data(metadata_1.discrete_factor_names, metadata_1.discrete_data)
-
-        factor_names.extend(metadata_1.discrete_factor_names)
-        m1_data.append(metadata_1.discrete_data)
-        m2_data.append(metadata_2.discrete_data)
-
-    # Validate and attach continuous data
-    if metadata_1.continuous_factor_names:
-        _compare_keys(metadata_1.continuous_factor_names, metadata_2.continuous_factor_names)
-        _validate_factors_and_data(metadata_1.continuous_factor_names, metadata_1.continuous_data)
-
-        factor_names.extend(metadata_1.continuous_factor_names)
-        m1_data.append(metadata_1.continuous_data)
-        m2_data.append(metadata_2.continuous_data)
-
-    # Turns list of discrete and continuous into one array
-    return factor_names, m1_data, m2_data
-
-
 def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:
     """
     Calculates deviations of the test data from the median of the reference data
@@ -207,16 +118,13 @@ def find_most_deviated_factors(
     if not any(ood_mask):
         return MostDeviatedFactorsOutput([])
 
-
-
-
-        metadata_1=metadata_ref,
-        metadata_2=metadata_tst,
-    )
+    factor_names = metadata_ref.factor_names
+    ref_data = metadata_ref.factor_data
+    tst_data = metadata_tst.factor_data
 
-
-
-
+    _compare_keys(factor_names, metadata_tst.factor_names)
+    _validate_factors_and_data(factor_names, ref_data)
+    _validate_factors_and_data(factor_names, tst_data)
 
     if len(ref_data) < 3:
         warnings.warn(
@@ -256,6 +164,7 @@ which is what many library functions return, multiply it by _NATS2BITS to get it
 """
 
 
+@set_metadata
 def find_ood_predictors(
     metadata: Metadata,
     ood: OODOutput,
@@ -305,8 +214,8 @@ def find_ood_predictors(
 
     ood_mask: NDArray[np.bool_] = ood.is_ood
 
-
-
+    factors = metadata.factor_names
+    data = metadata.factor_data
 
     # No metadata correlated with out of distribution data, return 0.0 for all factors
     if not any(ood_mask):
@@ -320,14 +229,13 @@ def find_ood_predictors(
     # Calculate mean, std of each factor over all samples
     scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1)  # (S, F)
 
-    discrete_features =
-    discrete_features[:discrete_features_count] = True
+    discrete_features = [info.factor_type != "continuous" for info in metadata.factor_info.values()]
 
     mutual_info_values = (
        mutual_info_classif(
            X=scaled_data,
            y=ood_mask,
-            discrete_features=discrete_features,  # type: ignore
+            discrete_features=discrete_features,  # type: ignore - sklearn function not typed
            random_state=get_seed(),
        )
        * _NATS2BITS