dataeval 0.86.9__tar.gz → 0.87.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.86.9 → dataeval-0.87.0}/PKG-INFO +5 -17
- {dataeval-0.86.9 → dataeval-0.87.0}/README.md +3 -12
- {dataeval-0.86.9 → dataeval-0.87.0}/pyproject.toml +1 -3
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/__init__.py +1 -1
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/_version.py +2 -2
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/config.py +4 -19
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_metadata.py +56 -27
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_split.py +1 -1
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_classbalance.py +4 -3
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_classfilter.py +5 -5
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_indices.py +2 -2
- dataeval-0.87.0/src/dataeval/data/selections/_prioritize.py +513 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_reverse.py +1 -1
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_shuffle.py +2 -2
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/ood/__init__.py +2 -1
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/ood/base.py +38 -1
- dataeval-0.87.0/src/dataeval/detectors/ood/knn.py +95 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/bias/_balance.py +28 -21
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/bias/_diversity.py +4 -4
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/bias/_parity.py +2 -2
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_hashstats.py +19 -2
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_workflows.py +20 -7
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/typing.py +14 -2
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/__init__.py +2 -2
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_bin.py +7 -6
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/data/__init__.py +2 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/data/_dataset.py +13 -6
- dataeval-0.87.0/src/dataeval/utils/data/_validate.py +169 -0
- dataeval-0.86.9/src/dataeval/data/selections/_prioritize.py +0 -293
- dataeval-0.86.9/src/dataeval/utils/datasets/__init__.py +0 -21
- dataeval-0.86.9/src/dataeval/utils/datasets/_antiuav.py +0 -189
- dataeval-0.86.9/src/dataeval/utils/datasets/_base.py +0 -266
- dataeval-0.86.9/src/dataeval/utils/datasets/_cifar10.py +0 -201
- dataeval-0.86.9/src/dataeval/utils/datasets/_fileio.py +0 -142
- dataeval-0.86.9/src/dataeval/utils/datasets/_milco.py +0 -197
- dataeval-0.86.9/src/dataeval/utils/datasets/_mixin.py +0 -54
- dataeval-0.86.9/src/dataeval/utils/datasets/_mnist.py +0 -202
- dataeval-0.86.9/src/dataeval/utils/datasets/_seadrone.py +0 -512
- dataeval-0.86.9/src/dataeval/utils/datasets/_ships.py +0 -144
- dataeval-0.86.9/src/dataeval/utils/datasets/_types.py +0 -48
- dataeval-0.86.9/src/dataeval/utils/datasets/_voc.py +0 -583
- {dataeval-0.86.9 → dataeval-0.87.0}/.gitignore +0 -0
- /dataeval-0.86.9/LICENSE.txt → /dataeval-0.87.0/LICENSE +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/_log.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_embeddings.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_images.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_selection.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_limit.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_base.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_cvm.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_ks.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_mmd.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_mvdc.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_nml/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_nml/_base.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_nml/_chunk.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_nml/_domainclassifier.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_nml/_result.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_nml/_thresholds.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/_uncertainty.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/drift/updates.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/linters/duplicates.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/linters/outliers.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/ood/ae.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/detectors/ood/mixin.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metadata/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metadata/_distance.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metadata/_ood.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metadata/_utils.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/bias/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/bias/_completeness.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/bias/_coverage.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/estimators/_ber.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/estimators/_clusterer.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/estimators/_divergence.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/estimators/_uap.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_base.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_boxratiostats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_dimensionstats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_imagestats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_labelstats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_pixelstats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/metrics/stats/_visualstats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_base.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_bias.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_drift.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_estimators.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_linters.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_metadata.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_ood.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_stats.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/outputs/_utils.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/py.typed +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_array.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_clusterer.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_fast_mst.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_image.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_method.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_mst.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/_plot.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/data/collate.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/data/metadata.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/torch/_blocks.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/torch/_gmm.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/torch/_internal.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/workflows/__init__.py +0 -0
- {dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/workflows/sufficiency.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.86.9
|
3
|
+
Version: 0.87.0
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
@@ -8,7 +8,7 @@ Project-URL: Documentation, https://dataeval.readthedocs.io/
|
|
8
8
|
Author-email: Andrew Weng <andrew.weng@ariacoustics.com>, Bill Peria <bill.peria@ariacoustics.com>, Jon Botts <jonathan.botts@ariacoustics.com>, Jonathan Christian <jonathan.christian@ariacoustics.com>, Justin McMillan <justin.mcmillan@ariacoustics.com>, Ryan Wood <ryan.wood@ariacoustics.com>, Scott Swan <scott.swan@ariacoustics.com>, Shaun Jullens <shaun.jullens@ariacoustics.com>
|
9
9
|
Maintainer-email: ARiA <dataeval@ariacoustics.com>
|
10
10
|
License-Expression: MIT
|
11
|
-
License-File: LICENSE.txt
|
11
|
+
License-File: LICENSE
|
12
12
|
Classifier: Development Status :: 4 - Beta
|
13
13
|
Classifier: Intended Audience :: Science/Research
|
14
14
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -20,15 +20,12 @@ Classifier: Programming Language :: Python :: 3.11
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering
|
22
22
|
Requires-Python: <3.13,>=3.9
|
23
|
-
Requires-Dist: defusedxml>=0.7.1
|
24
23
|
Requires-Dist: fast-hdbscan==0.2.0
|
25
24
|
Requires-Dist: lightgbm>=4
|
26
25
|
Requires-Dist: numba>=0.59.1
|
27
26
|
Requires-Dist: numpy>=1.24.2
|
28
27
|
Requires-Dist: pandas>=2.0
|
29
|
-
Requires-Dist: pillow>=10.3.0
|
30
28
|
Requires-Dist: polars>=1.0.0
|
31
|
-
Requires-Dist: requests>=2.32.3
|
32
29
|
Requires-Dist: scikit-learn>=1.5.0
|
33
30
|
Requires-Dist: scipy>=1.10
|
34
31
|
Requires-Dist: torch>=2.2.0
|
@@ -123,14 +120,8 @@ micromamba create -f environment\environment.yaml -c pytorch
|
|
123
120
|
|
124
121
|
### **Installing from GitHub**
|
125
122
|
|
126
|
-
To install DataEval from source locally on Ubuntu,
|
127
|
-
|
128
|
-
|
129
|
-
```bash
|
130
|
-
sudo apt-get install git-lfs
|
131
|
-
```
|
132
|
-
|
133
|
-
Pull the source down and change to the DataEval project directory.
|
123
|
+
To install DataEval from source locally on Ubuntu, pull the source down and
|
124
|
+
change to the DataEval project directory.
|
134
125
|
|
135
126
|
```bash
|
136
127
|
git clone https://github.com/aria-ml/dataeval.git
|
@@ -167,10 +158,7 @@ source .venv/bin/activate
|
|
167
158
|
|
168
159
|
## Contact Us
|
169
160
|
|
170
|
-
If you have any questions, feel free to reach out to
|
171
|
-
|
172
|
-
- **POC**: Scott Swan @scott.swan
|
173
|
-
- **DPOC**: Andrew Weng @aweng
|
161
|
+
If you have any questions, feel free to reach out to [us](mailto:dataeval@ariacoustics.com)!
|
174
162
|
|
175
163
|
## Acknowledgement
|
176
164
|
|
@@ -72,14 +72,8 @@ micromamba create -f environment\environment.yaml -c pytorch
|
|
72
72
|
|
73
73
|
### **Installing from GitHub**
|
74
74
|
|
75
|
-
To install DataEval from source locally on Ubuntu,
|
76
|
-
|
77
|
-
|
78
|
-
```bash
|
79
|
-
sudo apt-get install git-lfs
|
80
|
-
```
|
81
|
-
|
82
|
-
Pull the source down and change to the DataEval project directory.
|
75
|
+
To install DataEval from source locally on Ubuntu, pull the source down and
|
76
|
+
change to the DataEval project directory.
|
83
77
|
|
84
78
|
```bash
|
85
79
|
git clone https://github.com/aria-ml/dataeval.git
|
@@ -116,10 +110,7 @@ source .venv/bin/activate
|
|
116
110
|
|
117
111
|
## Contact Us
|
118
112
|
|
119
|
-
If you have any questions, feel free to reach out to
|
120
|
-
|
121
|
-
- **POC**: Scott Swan @scott.swan
|
122
|
-
- **DPOC**: Andrew Weng @aweng
|
113
|
+
If you have any questions, feel free to reach out to [us](mailto:dataeval@ariacoustics.com)!
|
123
114
|
|
124
115
|
## Acknowledgement
|
125
116
|
|
@@ -31,15 +31,12 @@ classifiers = [
|
|
31
31
|
"Topic :: Scientific/Engineering",
|
32
32
|
]
|
33
33
|
dependencies = [
|
34
|
-
"defusedxml>=0.7.1",
|
35
34
|
"fast_hdbscan==0.2.0",
|
36
35
|
"lightgbm>=4",
|
37
36
|
"numba>=0.59.1",
|
38
37
|
"numpy>=1.24.2",
|
39
38
|
"pandas>=2.0",
|
40
|
-
"pillow>=10.3.0",
|
41
39
|
"polars>=1.0.0",
|
42
|
-
"requests>=2.32.3",
|
43
40
|
"scipy>=1.10",
|
44
41
|
"scikit-learn>=1.5.0",
|
45
42
|
"torch>=2.2.0",
|
@@ -96,6 +93,7 @@ docs = [
|
|
96
93
|
"jinja2>=3.1.6",
|
97
94
|
"jupyter-client>=8.6.0",
|
98
95
|
"jupyter-cache>=1.0",
|
96
|
+
"maite-datasets>=0.0.1",
|
99
97
|
"myst-nb>=1.0",
|
100
98
|
"sphinx-autoapi>=3.6.0",
|
101
99
|
"sphinx-design>=0.6.1",
|
@@ -4,19 +4,15 @@ Global configuration settings for DataEval.
|
|
4
4
|
|
5
5
|
from __future__ import annotations
|
6
6
|
|
7
|
-
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"
|
7
|
+
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"]
|
8
8
|
|
9
|
-
import sys
|
10
|
-
from typing import Any, Union
|
11
|
-
|
12
|
-
if sys.version_info >= (3, 10):
|
13
|
-
from typing import TypeAlias
|
14
|
-
else:
|
15
|
-
from typing_extensions import TypeAlias
|
9
|
+
from typing import Any
|
16
10
|
|
17
11
|
import numpy as np
|
18
12
|
import torch
|
19
13
|
|
14
|
+
from dataeval.typing import DeviceLike
|
15
|
+
|
20
16
|
### GLOBALS ###
|
21
17
|
|
22
18
|
_device: torch.device | None = None
|
@@ -27,17 +23,6 @@ _seed: int | None = None
|
|
27
23
|
|
28
24
|
EPSILON = 1e-12
|
29
25
|
|
30
|
-
### TYPES ###
|
31
|
-
|
32
|
-
DeviceLike: TypeAlias = Union[int, str, tuple[str, int], torch.device]
|
33
|
-
"""
|
34
|
-
Type alias for types that are acceptable for specifying a torch.device.
|
35
|
-
|
36
|
-
See Also
|
37
|
-
--------
|
38
|
-
`torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
|
39
|
-
"""
|
40
|
-
|
41
26
|
### FUNCS ###
|
42
27
|
|
43
28
|
|
@@ -16,18 +16,31 @@ from dataeval.typing import (
|
|
16
16
|
ObjectDetectionTarget,
|
17
17
|
)
|
18
18
|
from dataeval.utils._array import as_numpy
|
19
|
-
from dataeval.utils._bin import bin_data, digitize_data
|
19
|
+
from dataeval.utils._bin import bin_data, digitize_data, is_continuous
|
20
20
|
from dataeval.utils.data.metadata import merge
|
21
21
|
|
22
22
|
|
23
23
|
def _binned(name: str) -> str:
|
24
|
-
return f"{name}
|
24
|
+
return f"{name}↕"
|
25
|
+
|
26
|
+
|
27
|
+
def _digitized(name: str) -> str:
|
28
|
+
return f"{name}#"
|
25
29
|
|
26
30
|
|
27
31
|
@dataclass
|
28
32
|
class FactorInfo:
|
29
|
-
factor_type: Literal["categorical", "continuous", "discrete"]
|
30
|
-
|
33
|
+
factor_type: Literal["categorical", "continuous", "discrete"]
|
34
|
+
is_binned: bool = False
|
35
|
+
is_digitized: bool = False
|
36
|
+
|
37
|
+
|
38
|
+
def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
|
39
|
+
if binned and info.is_binned:
|
40
|
+
return _binned(name)
|
41
|
+
if info.is_digitized:
|
42
|
+
return _digitized(name)
|
43
|
+
return name
|
31
44
|
|
32
45
|
|
33
46
|
class Metadata:
|
@@ -60,7 +73,7 @@ class Metadata:
|
|
60
73
|
self._class_labels: NDArray[np.intp]
|
61
74
|
self._class_names: list[str]
|
62
75
|
self._image_indices: NDArray[np.intp]
|
63
|
-
self._factors: dict[str, FactorInfo]
|
76
|
+
self._factors: dict[str, FactorInfo | None]
|
64
77
|
self._dropped_factors: dict[str, list[str]]
|
65
78
|
self._dataframe: pl.DataFrame
|
66
79
|
self._raw: Sequence[Mapping[str, Any]]
|
@@ -146,14 +159,27 @@ class Metadata:
|
|
146
159
|
return self._dropped_factors
|
147
160
|
|
148
161
|
@property
|
149
|
-
def
|
150
|
-
"""Factor data with
|
162
|
+
def digitized_data(self) -> NDArray[np.int64]:
|
163
|
+
"""Factor data with digitized categorical data."""
|
164
|
+
if not self.factor_names:
|
165
|
+
return np.array([], dtype=np.int64)
|
166
|
+
|
167
|
+
self._bin()
|
168
|
+
return (
|
169
|
+
self.dataframe.select([_to_col(k, v, False) for k, v in self.factor_info.items()])
|
170
|
+
.to_numpy()
|
171
|
+
.astype(np.int64)
|
172
|
+
)
|
173
|
+
|
174
|
+
@property
|
175
|
+
def binned_data(self) -> NDArray[np.int64]:
|
176
|
+
"""Factor data with binned continuous data."""
|
151
177
|
if not self.factor_names:
|
152
178
|
return np.array([], dtype=np.int64)
|
153
179
|
|
154
180
|
self._bin()
|
155
181
|
return (
|
156
|
-
self.dataframe.select([
|
182
|
+
self.dataframe.select([_to_col(k, v, True) for k, v in self.factor_info.items()])
|
157
183
|
.to_numpy()
|
158
184
|
.astype(np.int64)
|
159
185
|
)
|
@@ -168,7 +194,7 @@ class Metadata:
|
|
168
194
|
def factor_info(self) -> Mapping[str, FactorInfo]:
|
169
195
|
"""Factor types of the metadata."""
|
170
196
|
self._bin()
|
171
|
-
return dict(filter(self._filter, self._factors.items()))
|
197
|
+
return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
|
172
198
|
|
173
199
|
@property
|
174
200
|
def factor_data(self) -> NDArray[Any]:
|
@@ -194,7 +220,7 @@ class Metadata:
|
|
194
220
|
@property
|
195
221
|
def image_indices(self) -> NDArray[np.intp]:
|
196
222
|
"""Indices of images as a NumPy array."""
|
197
|
-
self.
|
223
|
+
self._structure()
|
198
224
|
return self._image_indices
|
199
225
|
|
200
226
|
@property
|
@@ -212,7 +238,7 @@ class Metadata:
|
|
212
238
|
columns = self._dataframe.columns
|
213
239
|
for col in (col for col in cols or columns if _binned(col) in columns):
|
214
240
|
self._dataframe.drop_in_place(_binned(col))
|
215
|
-
self._factors[col] =
|
241
|
+
self._factors[col] = None
|
216
242
|
self._is_binned = False
|
217
243
|
|
218
244
|
def _structure(self) -> None:
|
@@ -277,7 +303,7 @@ class Metadata:
|
|
277
303
|
self._class_labels = labels
|
278
304
|
self._class_names = list(index2label.values())
|
279
305
|
self._image_indices = target_dict["image_index"]
|
280
|
-
self._factors = dict.fromkeys(factor_dict,
|
306
|
+
self._factors = dict.fromkeys(factor_dict, None)
|
281
307
|
self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
|
282
308
|
self._dropped_factors = merged[1]
|
283
309
|
self._is_structured = True
|
@@ -303,24 +329,25 @@ class Metadata:
|
|
303
329
|
)
|
304
330
|
|
305
331
|
column_set = set(df.columns)
|
306
|
-
for col in (col for col in self.factor_names if _binned(col) not in column_set):
|
332
|
+
for col in (col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set):
|
307
333
|
# Get data as numpy array for processing
|
308
334
|
data = df[col].to_numpy()
|
309
|
-
col_dz = _binned(col)
|
310
335
|
if col in factor_bins:
|
311
336
|
# User provided binning
|
312
337
|
bins = factor_bins[col]
|
313
|
-
|
314
|
-
|
338
|
+
col_bn = _binned(col)
|
339
|
+
df = df.with_columns(pl.Series(name=col_bn, values=digitize_data(data, bins).astype(np.int64)))
|
340
|
+
factor_info[col] = FactorInfo("continuous", is_binned=True)
|
315
341
|
else:
|
316
342
|
# Check if data is numeric
|
317
|
-
|
318
|
-
if not np.issubdtype(data.dtype, np.number)
|
319
|
-
# Non-numeric data
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
343
|
+
_, ordinal = np.unique(data, return_inverse=True)
|
344
|
+
if not np.issubdtype(data.dtype, np.number):
|
345
|
+
# Non-numeric data - convert to categorical
|
346
|
+
col_dg = _digitized(col)
|
347
|
+
df = df.with_columns(pl.Series(name=col_dg, values=ordinal.astype(np.int64)))
|
348
|
+
factor_info[col] = FactorInfo("categorical", is_digitized=True)
|
349
|
+
elif is_continuous(data, self.image_indices):
|
350
|
+
# Continuous values - discretize by binning
|
324
351
|
warnings.warn(
|
325
352
|
f"A user defined binning was not provided for {col}. "
|
326
353
|
f"Using the {self.auto_bin_method} method to discretize the data. "
|
@@ -330,10 +357,12 @@ class Metadata:
|
|
330
357
|
)
|
331
358
|
# Create binned version
|
332
359
|
binned_data = bin_data(data, self.auto_bin_method)
|
333
|
-
|
334
|
-
|
360
|
+
col_bn = _binned(col)
|
361
|
+
df = df.with_columns(pl.Series(name=col_bn, values=binned_data.astype(np.int64)))
|
362
|
+
factor_info[col] = FactorInfo("continuous", is_binned=True)
|
335
363
|
else:
|
336
|
-
|
364
|
+
# Non-continuous values - treat as discrete
|
365
|
+
factor_info[col] = FactorInfo("discrete")
|
337
366
|
|
338
367
|
# Store the results
|
339
368
|
self._dataframe = df
|
@@ -367,7 +396,7 @@ class Metadata:
|
|
367
396
|
for k, v in factors.items():
|
368
397
|
data = as_numpy(v)[self.image_indices]
|
369
398
|
new_columns.append(pl.Series(name=k, values=data))
|
370
|
-
self._factors[k] =
|
399
|
+
self._factors[k] = None
|
371
400
|
|
372
401
|
if new_columns:
|
373
402
|
self._dataframe = self.dataframe.with_columns(new_columns)
|
@@ -208,7 +208,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
|
|
208
208
|
|
209
209
|
split_set = set(split_on)
|
210
210
|
indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
|
211
|
-
binned_features = metadata.
|
211
|
+
binned_features = metadata.binned_data[:, indices]
|
212
212
|
return np.unique(binned_features, axis=0, return_inverse=True)[1]
|
213
213
|
|
214
214
|
|
@@ -11,12 +11,13 @@ from dataeval.utils._array import as_numpy
|
|
11
11
|
|
12
12
|
class ClassBalance(Selection[ImageClassificationDatum]):
|
13
13
|
"""
|
14
|
-
|
14
|
+
Select indices of a dataset that will equalize the occurrences of all classes.
|
15
15
|
|
16
16
|
Note
|
17
17
|
----
|
18
|
-
The total number of instances of each class will be equalized which may result
|
18
|
+
1. The total number of instances of each class will be equalized which may result
|
19
19
|
in a lower total number of instances than specified by the selection limit.
|
20
|
+
2. This selection currently only supports classification tasks
|
20
21
|
"""
|
21
22
|
|
22
23
|
stage = SelectionStage.FILTER
|
@@ -29,7 +30,7 @@ class ClassBalance(Selection[ImageClassificationDatum]):
|
|
29
30
|
label = int(np.argmax(as_numpy(target)))
|
30
31
|
else:
|
31
32
|
# ObjectDetectionTarget and SegmentationTarget not supported yet
|
32
|
-
raise TypeError("
|
33
|
+
raise TypeError("ClassBalance only supports classification targets as an array of class probabilities.")
|
33
34
|
class_indices.setdefault(label, []).append(i)
|
34
35
|
|
35
36
|
per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))
|
@@ -14,12 +14,12 @@ from dataeval.utils._array import as_numpy
|
|
14
14
|
|
15
15
|
class ClassFilter(Selection[Any]):
|
16
16
|
"""
|
17
|
-
|
17
|
+
Select dataset indices based on class labels, keeping only those present in `classes`.
|
18
18
|
|
19
19
|
Parameters
|
20
20
|
----------
|
21
21
|
classes : Sequence[int]
|
22
|
-
The classes to
|
22
|
+
The sequence of classes to keep.
|
23
23
|
filter_detections : bool, default True
|
24
24
|
Whether to filter detections from targets for object detection and segmentation datasets.
|
25
25
|
"""
|
@@ -41,16 +41,16 @@ class ClassFilter(Selection[Any]):
|
|
41
41
|
if isinstance(target, Array):
|
42
42
|
# Get the label for the image
|
43
43
|
label = int(np.argmax(as_numpy(target)))
|
44
|
-
# Check to see if the label is in the classes to filter for
|
44
|
+
# Check to see if the label is in the classes to keep
|
45
45
|
if label in self.classes:
|
46
|
-
# Include the image
|
46
|
+
# Include the image index
|
47
47
|
selection.append(idx)
|
48
48
|
elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
|
49
49
|
# Get the set of labels from the target
|
50
50
|
labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
|
51
51
|
# Check to see if any labels are in the classes to filter for
|
52
52
|
if labels.intersection(self.classes):
|
53
|
-
# Include the image
|
53
|
+
# Include the image index
|
54
54
|
selection.append(idx)
|
55
55
|
# If we are filtering out other labels and there are other labels, add a subselection filter
|
56
56
|
if self.filter_detections and labels.difference(self.classes):
|
@@ -9,12 +9,12 @@ from dataeval.data._selection import Select, Selection, SelectionStage
|
|
9
9
|
|
10
10
|
class Indices(Selection[Any]):
|
11
11
|
"""
|
12
|
-
Selects
|
12
|
+
Selects only the given indices from the dataset.
|
13
13
|
|
14
14
|
Parameters
|
15
15
|
----------
|
16
16
|
indices : Sequence[int]
|
17
|
-
The indices to select
|
17
|
+
The specific indices to select.
|
18
18
|
"""
|
19
19
|
|
20
20
|
stage = SelectionStage.FILTER
|