dataeval 1.0.6__tar.gz → 1.1.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/PKG-INFO +30 -11
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/README.md +18 -3
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/pyproject.toml +63 -42
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/__init__.py +17 -8
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_embeddings.py +16 -11
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_experimental.py +4 -12
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_metadata.py +9 -6
- dataeval-1.1.0rc0/src/dataeval/_ontology.py +558 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_version.py +2 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/bias/_balance.py +6 -6
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/bias/_diversity.py +4 -4
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/bias/_parity.py +2 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/config.py +35 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/__init__.py +15 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_bin.py +66 -6
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_cache.py +5 -8
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_completeness.py +76 -25
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_compute_stats.py +8 -7
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_diversity.py +5 -5
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_mst.py +7 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_feature_distance.py +2 -3
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_hash.py +11 -10
- dataeval-1.1.0rc0/src/dataeval/core/_label_alignment.py +271 -0
- dataeval-1.1.0rc0/src/dataeval/core/_label_coverage.py +246 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_label_parity.py +5 -3
- dataeval-1.1.0rc0/src/dataeval/core/_label_reconciliation.py +147 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_mst.py +15 -15
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_mutual_info.py +12 -12
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_nullmodel.py +315 -5
- dataeval-1.1.0rc0/src/dataeval/core/_ontology_validation.py +196 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_parity.py +4 -4
- dataeval-1.1.0rc0/src/dataeval/core/_track_stats.py +501 -0
- dataeval-1.1.0rc0/src/dataeval/data/__init__.py +37 -0
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_classbalance.py +5 -3
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_classfilter.py +8 -33
- dataeval-1.1.0rc0/src/dataeval/data/_conform.py +138 -0
- dataeval-1.1.0rc0/src/dataeval/data/_crops.py +371 -0
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_indices.py +1 -1
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_limit.py +1 -1
- dataeval-1.1.0rc0/src/dataeval/data/_merge.py +87 -0
- dataeval-1.1.0rc0/src/dataeval/data/_relabel.py +193 -0
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_reverse.py +1 -1
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_select.py +49 -10
- {dataeval-1.0.6/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_shuffle.py +2 -2
- dataeval-1.0.6/src/dataeval/utils/data.py → dataeval-1.1.0rc0/src/dataeval/data/_split.py +13 -78
- dataeval-1.1.0rc0/src/dataeval/data/_tracks.py +83 -0
- dataeval-1.1.0rc0/src/dataeval/data/_unzip.py +84 -0
- dataeval-1.1.0rc0/src/dataeval/exceptions.py +77 -0
- dataeval-1.1.0rc0/src/dataeval/extractors/__init__.py +31 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/extractors/_bovw.py +7 -7
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/extractors/_flatten.py +2 -2
- dataeval-1.1.0rc0/src/dataeval/extractors/_geometry.py +53 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/extractors/_onnx.py +45 -16
- dataeval-1.1.0rc0/src/dataeval/extractors/_resize.py +22 -0
- dataeval-1.1.0rc0/src/dataeval/extractors/_scores.py +41 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/extractors/_torch.py +63 -15
- dataeval-1.1.0rc0/src/dataeval/extractors/_uncertainty.py +473 -0
- dataeval-1.1.0rc0/src/dataeval/models/__init__.py +20 -0
- dataeval-1.1.0rc0/src/dataeval/models/_backends.py +174 -0
- dataeval-1.1.0rc0/src/dataeval/models/_input.py +125 -0
- dataeval-1.1.0rc0/src/dataeval/models/_metadata.py +140 -0
- dataeval-1.1.0rc0/src/dataeval/models/_predictors.py +455 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/performance/_sufficiency.py +14 -13
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/protocols.py +257 -137
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/quality/_duplicates.py +23 -16
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/quality/_outliers.py +6 -6
- dataeval-1.1.0rc0/src/dataeval/scope/__init__.py +14 -0
- dataeval-1.1.0rc0/src/dataeval/scope/_coverage.py +407 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/scope/_prioritize.py +6 -6
- dataeval-1.1.0rc0/src/dataeval/scope/_representation.py +301 -0
- dataeval-1.1.0rc0/src/dataeval/selection/__init__.py +26 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/__init__.py +2 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_base.py +7 -4
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_chunk.py +1 -1
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_domain_classifier.py +2 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_kneighbors.py +1 -1
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_mmd.py +6 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_reconstruction.py +2 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_univariate.py +15 -5
- dataeval-1.1.0rc0/src/dataeval/shift/_drift/_wasserstein.py +423 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_reconstruction.py +3 -3
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/types.py +387 -4
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/__init__.py +1 -2
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/_internal.py +92 -2
- dataeval-1.1.0rc0/src/dataeval/utils/_validate.py +258 -0
- dataeval-1.1.0rc0/src/dataeval/utils/data.py +34 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/preprocessing.py +111 -21
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/training.py +101 -16
- dataeval-1.0.6/src/dataeval/exceptions.py +0 -41
- dataeval-1.0.6/src/dataeval/extractors/__init__.py +0 -15
- dataeval-1.0.6/src/dataeval/extractors/_uncertainty.py +0 -245
- dataeval-1.0.6/src/dataeval/scope/__init__.py +0 -10
- dataeval-1.0.6/src/dataeval/selection/__init__.py +0 -20
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/.gitignore +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/LICENSE +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_helpers.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_log.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/_warm_cache.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/bias/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_ber.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_base.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_hashstats.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_register.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_registry.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_visualstats.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_clusterer.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_compute_ratios.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_coverage.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_divergence.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_label_errors.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_label_stats.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_metadata_insights.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_rank.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/core/_uap.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/flags.py +5 -5
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/performance/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/performance/_aggregator.py +8 -8
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/performance/_output.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/performance/schedules.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/py.typed +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/quality/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/quality/_shared.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_base.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_domain_classifier.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/__init__.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/shift/update_strategies.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/losses.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/models.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/onnx.py +0 -0
- {dataeval-1.0.6 → dataeval-1.1.0rc0}/src/dataeval/utils/thresholds.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataeval
|
|
3
|
-
Version: 1.0.6
|
|
3
|
+
Version: 1.1.0rc0
|
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
|
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering
|
|
22
22
|
Requires-Python: >=3.10
|
|
23
23
|
Requires-Dist: lightgbm>=4
|
|
24
|
+
Requires-Dist: maite>=0.9.4
|
|
24
25
|
Requires-Dist: numba>=0.61.0
|
|
25
26
|
Requires-Dist: numpy>=1.24.2
|
|
26
27
|
Requires-Dist: polars>=1.0.0
|
|
@@ -37,23 +38,27 @@ Requires-Dist: torchvision>=0.17.0; extra == 'cpu'
|
|
|
37
38
|
Provides-Extra: cu118
|
|
38
39
|
Requires-Dist: torch>=2.2.0; extra == 'cu118'
|
|
39
40
|
Requires-Dist: torchvision>=0.17.0; extra == 'cu118'
|
|
40
|
-
Provides-Extra: cu124
|
|
41
|
-
Requires-Dist: torch>=2.2.0; extra == 'cu124'
|
|
42
|
-
Requires-Dist: torchvision>=0.17.0; extra == 'cu124'
|
|
43
41
|
Provides-Extra: cu128
|
|
44
42
|
Requires-Dist: torch>=2.2.0; extra == 'cu128'
|
|
45
43
|
Requires-Dist: torchvision>=0.17.0; extra == 'cu128'
|
|
44
|
+
Provides-Extra: litert
|
|
45
|
+
Requires-Dist: ai-edge-litert>=2.0; (python_version <= '3.14') and extra == 'litert'
|
|
46
46
|
Provides-Extra: onnx
|
|
47
|
-
Requires-Dist: onnx; extra == 'onnx'
|
|
48
|
-
Requires-Dist: onnxruntime
|
|
47
|
+
Requires-Dist: onnx>=1.14.0; extra == 'onnx'
|
|
48
|
+
Requires-Dist: onnxruntime<1.24,>=1.14.0; (python_version == '3.10') and extra == 'onnx'
|
|
49
|
+
Requires-Dist: onnxruntime>=1.14.0; (python_version >= '3.11') and extra == 'onnx'
|
|
49
50
|
Provides-Extra: onnx-gpu
|
|
50
|
-
Requires-Dist: onnx; extra == 'onnx-gpu'
|
|
51
|
-
Requires-Dist: onnxruntime-gpu
|
|
51
|
+
Requires-Dist: onnx>=1.14.0; extra == 'onnx-gpu'
|
|
52
|
+
Requires-Dist: onnxruntime-gpu<1.24,>=1.14.0; (python_version == '3.10') and extra == 'onnx-gpu'
|
|
53
|
+
Requires-Dist: onnxruntime-gpu>=1.14.0; (python_version >= '3.11') and extra == 'onnx-gpu'
|
|
54
|
+
Provides-Extra: ontology
|
|
55
|
+
Requires-Dist: rdflib>=7.0; extra == 'ontology'
|
|
52
56
|
Provides-Extra: opencv
|
|
53
57
|
Requires-Dist: opencv-python-headless>=4.8.0; extra == 'opencv'
|
|
54
58
|
Description-Content-Type: text/markdown
|
|
55
59
|
|
|
56
60
|
<!-- markdownlint-disable MD041 -->
|
|
61
|
+
|
|
57
62
|

|
|
58
63
|
|
|
59
64
|
<!-- :auto badges: -->
|
|
@@ -130,14 +135,28 @@ You can install DataEval directly from pypi.org using the following command.
|
|
|
130
135
|
pip install dataeval
|
|
131
136
|
```
|
|
132
137
|
|
|
138
|
+
By default, PyTorch is installed from PyPI which includes CUDA support on Linux.
|
|
139
|
+
To install a specific PyTorch variant, use `--extra-index-url`:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# CPU only
|
|
143
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cpu
|
|
144
|
+
|
|
145
|
+
# CUDA 11.8
|
|
146
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu118
|
|
147
|
+
|
|
148
|
+
# CUDA 12.8
|
|
149
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu128
|
|
150
|
+
```
|
|
151
|
+
|
|
133
152
|
### **Installing with conda**
|
|
134
153
|
|
|
135
154
|
DataEval can be installed in a Conda/Mamba environment using the provided
|
|
136
|
-
`environment.
|
|
155
|
+
`environment.yml` file. As some dependencies are installed from the `pytorch`
|
|
137
156
|
channel, the channel is specified in the below example.
|
|
138
157
|
|
|
139
158
|
```bash
|
|
140
|
-
micromamba create -f environment\environment.
|
|
159
|
+
micromamba create -f environment\environment.yml -c pytorch
|
|
141
160
|
```
|
|
142
161
|
|
|
143
162
|
### **Installing from GitHub**
|
|
@@ -401,7 +420,7 @@ shape: (3, 5)
|
|
|
401
420
|
|
|
402
421
|
A result with many large groups is a signal that your dataset contains
|
|
403
422
|
repeated collection events. Before training, remove all but one sample from
|
|
404
|
-
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.
|
|
423
|
+
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.py)
|
|
405
424
|
for a complete walkthrough, including how to choose which sample to keep.
|
|
406
425
|
|
|
407
426
|
### Where to go next
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
<!-- markdownlint-disable MD041 -->
|
|
2
|
+
|
|
2
3
|

|
|
3
4
|
|
|
4
5
|
<!-- :auto badges: -->
|
|
@@ -75,14 +76,28 @@ You can install DataEval directly from pypi.org using the following command.
|
|
|
75
76
|
pip install dataeval
|
|
76
77
|
```
|
|
77
78
|
|
|
79
|
+
By default, PyTorch is installed from PyPI which includes CUDA support on Linux.
|
|
80
|
+
To install a specific PyTorch variant, use `--extra-index-url`:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# CPU only
|
|
84
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cpu
|
|
85
|
+
|
|
86
|
+
# CUDA 11.8
|
|
87
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu118
|
|
88
|
+
|
|
89
|
+
# CUDA 12.8
|
|
90
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu128
|
|
91
|
+
```
|
|
92
|
+
|
|
78
93
|
### **Installing with conda**
|
|
79
94
|
|
|
80
95
|
DataEval can be installed in a Conda/Mamba environment using the provided
|
|
81
|
-
`environment.
|
|
96
|
+
`environment.yml` file. As some dependencies are installed from the `pytorch`
|
|
82
97
|
channel, the channel is specified in the below example.
|
|
83
98
|
|
|
84
99
|
```bash
|
|
85
|
-
micromamba create -f environment\environment.
|
|
100
|
+
micromamba create -f environment\environment.yml -c pytorch
|
|
86
101
|
```
|
|
87
102
|
|
|
88
103
|
### **Installing from GitHub**
|
|
@@ -346,7 +361,7 @@ shape: (3, 5)
|
|
|
346
361
|
|
|
347
362
|
A result with many large groups is a signal that your dataset contains
|
|
348
363
|
repeated collection events. Before training, remove all but one sample from
|
|
349
|
-
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.
|
|
364
|
+
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.py)
|
|
350
365
|
for a complete walkthrough, including how to choose which sample to keep.
|
|
351
366
|
|
|
352
367
|
### Where to go next
|
|
@@ -31,6 +31,7 @@ classifiers = [
|
|
|
31
31
|
"Topic :: Scientific/Engineering",
|
|
32
32
|
]
|
|
33
33
|
dependencies = [
|
|
34
|
+
"maite>=0.9.4",
|
|
34
35
|
"numba>=0.61.0",
|
|
35
36
|
"lightgbm>=4",
|
|
36
37
|
"numpy>=1.24.2",
|
|
@@ -47,17 +48,43 @@ dependencies = [
|
|
|
47
48
|
[project.optional-dependencies]
|
|
48
49
|
cpu = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
49
50
|
cu118 = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
50
|
-
cu124 = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
51
51
|
cu128 = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
52
|
+
litert = ["ai-edge-litert>=2.0; python_version <= '3.14'"]
|
|
52
53
|
opencv = ["opencv-python-headless>=4.8.0"]
|
|
53
|
-
onnx = [
|
|
54
|
-
|
|
54
|
+
onnx = [
|
|
55
|
+
"onnx>=1.14.0",
|
|
56
|
+
"onnxruntime>=1.14.0,<1.24; python_version == '3.10'",
|
|
57
|
+
"onnxruntime>=1.14.0; python_version >= '3.11'",
|
|
58
|
+
]
|
|
59
|
+
onnx-gpu = [
|
|
60
|
+
"onnx>=1.14.0",
|
|
61
|
+
"onnxruntime-gpu>=1.14.0,<1.24; python_version == '3.10'",
|
|
62
|
+
"onnxruntime-gpu>=1.14.0; python_version >= '3.11'",
|
|
63
|
+
]
|
|
64
|
+
ontology = ["rdflib>=7.0"]
|
|
55
65
|
|
|
56
66
|
[project.urls]
|
|
57
67
|
Homepage = "https://dataeval.ai/"
|
|
58
68
|
Repository = "https://github.com/aria-ml/dataeval/"
|
|
59
69
|
Documentation = "https://dataeval.readthedocs.io/"
|
|
60
70
|
|
|
71
|
+
# MAITE interoperability entry-points.
|
|
72
|
+
[project.entry-points."maite.tasks"]
|
|
73
|
+
dataeval_balance = "dataeval.bias:Balance"
|
|
74
|
+
dataeval_diversity = "dataeval.bias:Diversity"
|
|
75
|
+
dataeval_parity = "dataeval.bias:Parity"
|
|
76
|
+
dataeval_outliers = "dataeval.quality:Outliers"
|
|
77
|
+
dataeval_duplicates = "dataeval.quality:Duplicates"
|
|
78
|
+
dataeval_sufficiency = "dataeval.performance:Sufficiency"
|
|
79
|
+
|
|
80
|
+
[project.entry-points."maite.protocols.image_classification.Model"]
|
|
81
|
+
dataeval_OnnxImageClassifier = "dataeval.models:OnnxImageClassifier"
|
|
82
|
+
dataeval_LiteRtImageClassifier = "dataeval.models:LiteRtImageClassifier"
|
|
83
|
+
|
|
84
|
+
[project.entry-points."maite.protocols.object_detection.Model"]
|
|
85
|
+
dataeval_OnnxObjectDetector = "dataeval.models:OnnxObjectDetector"
|
|
86
|
+
dataeval_LiteRtObjectDetector = "dataeval.models:LiteRtObjectDetector"
|
|
87
|
+
|
|
61
88
|
[dependency-groups]
|
|
62
89
|
base = [
|
|
63
90
|
"uv>=0.8.0",
|
|
@@ -65,7 +92,7 @@ base = [
|
|
|
65
92
|
lock = [
|
|
66
93
|
{ include-group = "base" },
|
|
67
94
|
"pyproject2conda>=0.22",
|
|
68
|
-
"poetry
|
|
95
|
+
"poetry==2.2.0; python_version<'3.14'",
|
|
69
96
|
]
|
|
70
97
|
lint = [
|
|
71
98
|
"ruff>=0.11",
|
|
@@ -73,19 +100,15 @@ lint = [
|
|
|
73
100
|
]
|
|
74
101
|
docsync = [
|
|
75
102
|
"jupytext>=1.19.1",
|
|
76
|
-
"mdformat-myst",
|
|
77
|
-
]
|
|
78
|
-
doclint = [
|
|
79
|
-
{ include-group = "docs"},
|
|
80
|
-
"ruff>=0.11",
|
|
81
|
-
"pyright[nodejs]>=1.1.400",
|
|
82
103
|
]
|
|
83
104
|
test = [
|
|
84
105
|
"coverage[toml]>=7.6",
|
|
85
106
|
"onnx>=1.14.0",
|
|
107
|
+
"onnxscript>=0.6.0",
|
|
86
108
|
"pytest>=8.3",
|
|
87
109
|
"pytest-cov>=6.1",
|
|
88
110
|
"pytest-xdist>=3.6.1",
|
|
111
|
+
"rdflib>=7.0",
|
|
89
112
|
]
|
|
90
113
|
verify = [
|
|
91
114
|
"pytest>=8.3",
|
|
@@ -103,10 +126,13 @@ docs = [
|
|
|
103
126
|
"jinja2>=3.1.6",
|
|
104
127
|
"jupyter-client>=8.6.0",
|
|
105
128
|
"jupyter-cache>=1.0",
|
|
106
|
-
"maite-datasets>=0.0.
|
|
129
|
+
"maite-datasets>=0.0.15",
|
|
107
130
|
"myst-nb>=1.0",
|
|
108
131
|
"opencv-python-headless>=4.8.0",
|
|
132
|
+
"pandas>=2.0.0",
|
|
109
133
|
"plotly>=6.2.0",
|
|
134
|
+
"rapidfuzz>=3.0",
|
|
135
|
+
"rdflib>=7.0",
|
|
110
136
|
"sphinx-autoapi>=3.6.0",
|
|
111
137
|
"sphinx-design>=0.6.1",
|
|
112
138
|
"sphinx-immaterial>=0.12.5",
|
|
@@ -116,21 +142,15 @@ docs = [
|
|
|
116
142
|
"torchmetrics>=1.0.0",
|
|
117
143
|
"markupsafe>=3,<3.0.2",
|
|
118
144
|
"jupytext>=1.19.1",
|
|
145
|
+
"ultralytics>=8.0.0",
|
|
119
146
|
]
|
|
120
147
|
security = [ # keep in sync with [tool.uv.constraint-dependencies]
|
|
121
148
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
122
149
|
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
123
|
-
"onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
|
|
124
|
-
# CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
|
|
125
|
-
# CVE-2026-27489: Vulnerable to Path Traversal via Symlink
|
|
126
|
-
# GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_dat
|
|
127
150
|
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
128
|
-
"poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
|
|
129
151
|
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
130
152
|
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
131
153
|
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
132
|
-
"tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
|
|
133
|
-
# CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
|
|
134
154
|
]
|
|
135
155
|
dev = [
|
|
136
156
|
{ include-group = "base" },
|
|
@@ -149,24 +169,16 @@ conflicts = [
|
|
|
149
169
|
[
|
|
150
170
|
{ extra = "cpu" },
|
|
151
171
|
{ extra = "cu118" },
|
|
152
|
-
{ extra = "cu124" },
|
|
153
172
|
{ extra = "cu128" },
|
|
154
173
|
],
|
|
155
174
|
]
|
|
156
175
|
constraint-dependencies = [
|
|
157
176
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
158
177
|
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
159
|
-
"onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
|
|
160
|
-
# CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
|
|
161
|
-
# CVE-2026-27489: Vulnerable to Path Traversal via Symlink
|
|
162
|
-
# GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_dat
|
|
163
178
|
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
164
|
-
"poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
|
|
165
179
|
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
166
180
|
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
167
181
|
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
168
|
-
"tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
|
|
169
|
-
# CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
|
|
170
182
|
]
|
|
171
183
|
|
|
172
184
|
[[tool.uv.index]]
|
|
@@ -179,11 +191,6 @@ name = "pytorch-cu118"
|
|
|
179
191
|
url = "https://download.pytorch.org/whl/cu118"
|
|
180
192
|
explicit = true
|
|
181
193
|
|
|
182
|
-
[[tool.uv.index]]
|
|
183
|
-
name = "pytorch-cu124"
|
|
184
|
-
url = "https://download.pytorch.org/whl/cu124"
|
|
185
|
-
explicit = true
|
|
186
|
-
|
|
187
194
|
[[tool.uv.index]]
|
|
188
195
|
name = "pytorch-cu128"
|
|
189
196
|
url = "https://download.pytorch.org/whl/cu128"
|
|
@@ -193,19 +200,28 @@ explicit = true
|
|
|
193
200
|
torch = [
|
|
194
201
|
{ index = "pytorch-cpu", extra = "cpu" },
|
|
195
202
|
{ index = "pytorch-cu118", extra = "cu118" },
|
|
196
|
-
{ index = "pytorch-cu124", extra = "cu124" },
|
|
197
203
|
{ index = "pytorch-cu128", extra = "cu128" },
|
|
198
204
|
]
|
|
199
205
|
torchvision = [
|
|
200
206
|
{ index = "pytorch-cpu", extra = "cpu" },
|
|
201
207
|
{ index = "pytorch-cu118", extra = "cu118" },
|
|
202
|
-
{ index = "pytorch-cu124", extra = "cu124" },
|
|
203
208
|
{ index = "pytorch-cu128", extra = "cu128" },
|
|
204
209
|
]
|
|
205
210
|
|
|
206
211
|
[tool.uv.extra-build-dependencies]
|
|
207
212
|
numba = ["tbb>=2021.6"]
|
|
208
213
|
|
|
214
|
+
[tool.poetry]
|
|
215
|
+
version = "0.0.0" # overwritten by poetry-dynamic-versioning
|
|
216
|
+
|
|
217
|
+
[[tool.poetry.source]]
|
|
218
|
+
name = "pytorch-cpu"
|
|
219
|
+
url = "https://download.pytorch.org/whl/cpu"
|
|
220
|
+
priority = "supplemental"
|
|
221
|
+
|
|
222
|
+
[tool.poetry.dependencies]
|
|
223
|
+
torch = { version = ">=2.2.0", source = "pytorch-cpu" }
|
|
224
|
+
|
|
209
225
|
[tool.hatch.build.targets.sdist]
|
|
210
226
|
include = ["src/dataeval"]
|
|
211
227
|
|
|
@@ -221,11 +237,11 @@ source = "vcs"
|
|
|
221
237
|
[tool.hatch.build.hooks.vcs]
|
|
222
238
|
version-file = "src/dataeval/_version.py"
|
|
223
239
|
|
|
224
|
-
[tool.poetry]
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
240
|
+
[tool.poetry-dynamic-versioning]
|
|
241
|
+
enable = true
|
|
242
|
+
vcs = "git"
|
|
243
|
+
style = "pep440"
|
|
244
|
+
pattern = "^v?(?P<base>\\d+\\.\\d+\\.\\d+)"
|
|
229
245
|
|
|
230
246
|
[tool.pyproject2conda.dependencies]
|
|
231
247
|
numpy = { skip = true, packages = "numpy>=1.24.2" }
|
|
@@ -235,7 +251,7 @@ torch = { pip = true } # PyTorch is no longer maintained on conda-forge
|
|
|
235
251
|
xxhash = { skip = true, packages = "python-xxhash>=3.3" }
|
|
236
252
|
|
|
237
253
|
[tool.pyright]
|
|
238
|
-
include = ["src", "tests"]
|
|
254
|
+
include = ["src", "tests", "verification", "docs/source/notebooks"]
|
|
239
255
|
exclude = [
|
|
240
256
|
"**/__pycache__",
|
|
241
257
|
"**/node_modules",
|
|
@@ -248,6 +264,10 @@ reportMissingImports = false
|
|
|
248
264
|
|
|
249
265
|
[tool.pytest.ini_options]
|
|
250
266
|
testpaths = ["tests"]
|
|
267
|
+
filterwarnings = [
|
|
268
|
+
"ignore:The default value of normalize_pixel_values changed:FutureWarning",
|
|
269
|
+
"ignore:Clustering metrics expect discrete values but received continuous values:UserWarning",
|
|
270
|
+
]
|
|
251
271
|
addopts = [
|
|
252
272
|
"--pythonwarnings=ignore::DeprecationWarning",
|
|
253
273
|
"--verbose",
|
|
@@ -294,6 +314,7 @@ exclude = [
|
|
|
294
314
|
".tox",
|
|
295
315
|
"prototype",
|
|
296
316
|
"src/dataeval/_version.py",
|
|
317
|
+
"*.ipynb",
|
|
297
318
|
]
|
|
298
319
|
line-length = 120
|
|
299
320
|
indent-width = 4
|
|
@@ -308,7 +329,7 @@ ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF
|
|
|
308
329
|
fixable = ["ALL"]
|
|
309
330
|
unfixable = []
|
|
310
331
|
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
|
|
311
|
-
per-file-ignores = { "!src/*" = ["ANN", "ARG", "BLE", "S", "SLF", "RET", "C90", "D", "FIX", "N", "PERF"], "docs/*" = ["B904"] }
|
|
332
|
+
per-file-ignores = { "!src/*" = ["ANN", "ARG", "BLE", "S", "SLF", "RET", "C90", "D", "FIX", "N", "PERF"], "docs/*" = ["B904"], "docs/source/notebooks/*" = ["E402", "E501", "E703", "RUF100", "SIM105", "UP009"] }
|
|
312
333
|
|
|
313
334
|
[tool.ruff.lint.flake8-builtins]
|
|
314
335
|
builtins-strict-checking = false
|
|
@@ -332,8 +353,8 @@ docstring-code-format = true
|
|
|
332
353
|
docstring-code-line-length = "dynamic"
|
|
333
354
|
|
|
334
355
|
[tool.codespell]
|
|
335
|
-
skip = './*env*,./output,./docs/build,./docs/source/.jupyter_cache,./docs/source/*/data,CHANGELOG.md,uv.lock,requirements
|
|
336
|
-
ignore-words-list = ["Hart","FPR"]
|
|
356
|
+
skip = './*env*,./output,./docs/build,./docs/source/.jupyter_cache,./docs/source/*/data,CHANGELOG.md,uv.lock,requirements.*.txt,*.html,*.lock,*.ipynb'
|
|
357
|
+
ignore-words-list = ["Hart","FPR", "MOT", "mot"]
|
|
337
358
|
|
|
338
359
|
[build-system]
|
|
339
360
|
requires = ["hatchling", "hatch-vcs"]
|
|
@@ -20,24 +20,30 @@ __all__ = [
|
|
|
20
20
|
"exceptions",
|
|
21
21
|
"flags",
|
|
22
22
|
"log",
|
|
23
|
+
"models",
|
|
23
24
|
"protocols",
|
|
24
25
|
"types",
|
|
25
26
|
"Embeddings",
|
|
26
27
|
"Metadata",
|
|
28
|
+
"Ontology",
|
|
27
29
|
]
|
|
28
30
|
|
|
29
31
|
import logging
|
|
30
32
|
|
|
31
|
-
from . import config, exceptions, flags, protocols, types
|
|
33
|
+
from . import config, exceptions, flags, models, protocols, types
|
|
32
34
|
from ._embeddings import Embeddings
|
|
33
35
|
from ._metadata import Metadata
|
|
36
|
+
from ._ontology import Ontology
|
|
34
37
|
|
|
35
38
|
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> None:
|
|
39
42
|
"""
|
|
40
|
-
Add a
|
|
43
|
+
Add a handler to the logger quickly for debugging.
|
|
44
|
+
|
|
45
|
+
Calling this more than once is idempotent: a handler equal to one already
|
|
46
|
+
attached to the logger is not added again, so log lines are not duplicated.
|
|
41
47
|
|
|
42
48
|
Parameters
|
|
43
49
|
----------
|
|
@@ -45,18 +51,21 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
|
|
|
45
51
|
Set the logging level for the logger.
|
|
46
52
|
handler : logging.Handler, optional
|
|
47
53
|
Sets the logging handler for the logger if provided, otherwise logger will be
|
|
48
|
-
provided with a StreamHandler.
|
|
54
|
+
provided with a StreamHandler. When a custom handler is supplied its formatter
|
|
55
|
+
is left untouched; the default StreamHandler is given a verbose debugging
|
|
56
|
+
formatter.
|
|
49
57
|
"""
|
|
50
58
|
import logging
|
|
51
59
|
|
|
52
|
-
|
|
60
|
+
_logger = logging.getLogger(__name__)
|
|
53
61
|
if handler is None:
|
|
54
|
-
handler = logging.StreamHandler()
|
|
62
|
+
handler = logging.StreamHandler()
|
|
55
63
|
handler.setFormatter(
|
|
56
64
|
logging.Formatter(
|
|
57
65
|
"%(asctime)s %(levelname)-8s %(name)s.%(filename)s:%(lineno)s - %(funcName)10s() | %(message)s",
|
|
58
66
|
),
|
|
59
67
|
)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
68
|
+
if handler not in _logger.handlers:
|
|
69
|
+
_logger.addHandler(handler)
|
|
70
|
+
_logger.setLevel(level)
|
|
71
|
+
_logger.debug("Added logging handler %s to logger: %s", handler, __name__)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Embeddings class for extracting and managing image embeddings."""
|
|
2
2
|
|
|
3
|
-
__all__ = []
|
|
3
|
+
__all__ = ["Embeddings"]
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
@@ -14,7 +14,7 @@ import xxhash as xxh
|
|
|
14
14
|
from numpy.typing import NDArray
|
|
15
15
|
from typing_extensions import Self
|
|
16
16
|
|
|
17
|
-
from dataeval.config import
|
|
17
|
+
from dataeval.config import resolve_batch_size
|
|
18
18
|
from dataeval.exceptions import NotFittedError
|
|
19
19
|
from dataeval.extractors import FlattenExtractor
|
|
20
20
|
from dataeval.protocols import (
|
|
@@ -25,6 +25,8 @@ from dataeval.protocols import (
|
|
|
25
25
|
FeatureExtractor,
|
|
26
26
|
ProgressCallback,
|
|
27
27
|
)
|
|
28
|
+
from dataeval.utils._internal import unwrap_image
|
|
29
|
+
from dataeval.utils._validate import requires_maite_dataset
|
|
28
30
|
|
|
29
31
|
_logger = logging.getLogger(__name__)
|
|
30
32
|
|
|
@@ -53,8 +55,14 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
53
55
|
:class:`~dataeval.extractors.FlattenExtractor` for simple baseline
|
|
54
56
|
compatibility with all DataEval tools.
|
|
55
57
|
batch_size : int or None, default None
|
|
56
|
-
|
|
57
|
-
|
|
58
|
+
I/O chunk size: how many images are loaded from the dataset, encoded, and
|
|
59
|
+
written to storage per step. Resolved via
|
|
60
|
+
:func:`~dataeval.config.resolve_batch_size` as the first set of
|
|
61
|
+
``batch_size`` (this argument), the extractor's own ``batch_size``, then
|
|
62
|
+
the global default. This is distinct from an extractor's own forward-pass
|
|
63
|
+
(compute) batch size: an extractor with its own ``batch_size`` sub-batches
|
|
64
|
+
each chunk for the model, so the smaller of the two bounds the forward
|
|
65
|
+
pass. Batching never changes the resulting embeddings.
|
|
58
66
|
path : Path, str, or None, default None
|
|
59
67
|
File path for memory-mapped storage. When None, caches embeddings in memory only.
|
|
60
68
|
When Path or string is provided, uses memory-mapped storage for large embeddings
|
|
@@ -93,6 +101,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
93
101
|
|
|
94
102
|
memory_threshold: float
|
|
95
103
|
|
|
104
|
+
@requires_maite_dataset("dataset", expected="image_only")
|
|
96
105
|
def __init__(
|
|
97
106
|
self,
|
|
98
107
|
# Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
|
|
@@ -104,7 +113,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
104
113
|
progress_callback: ProgressCallback | None = None,
|
|
105
114
|
) -> None:
|
|
106
115
|
self._extractor = extractor if extractor is not None else FlattenExtractor()
|
|
107
|
-
self._batch_size =
|
|
116
|
+
self._batch_size = resolve_batch_size(batch_size, getattr(self._extractor, "batch_size", None))
|
|
108
117
|
self.memory_threshold = max(0.0, min(1.0, memory_threshold))
|
|
109
118
|
self._progress_callback = progress_callback
|
|
110
119
|
|
|
@@ -159,6 +168,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
159
168
|
"""
|
|
160
169
|
return self._dataset is not None
|
|
161
170
|
|
|
171
|
+
@requires_maite_dataset("dataset", expected="image_only")
|
|
162
172
|
def bind(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Self:
|
|
163
173
|
"""Bind this instance to a dataset.
|
|
164
174
|
|
|
@@ -502,12 +512,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
502
512
|
if self._dataset is None:
|
|
503
513
|
raise NotFittedError("No dataset bound. Call bind() first.")
|
|
504
514
|
|
|
505
|
-
|
|
506
|
-
for idx in indices:
|
|
507
|
-
item = self._dataset[idx]
|
|
508
|
-
image = item[0] if isinstance(item, tuple) else item
|
|
509
|
-
images.append(image)
|
|
510
|
-
return images
|
|
515
|
+
return [unwrap_image(self._dataset[idx]) for idx in indices]
|
|
511
516
|
|
|
512
517
|
def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
|
|
513
518
|
"""Process indices in batches using the extractor."""
|
|
@@ -39,14 +39,6 @@ def _make_warning_message( # noqa: C901
|
|
|
39
39
|
return msg
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
def _prepend_doc_note(doc: str | None, note: str) -> str:
|
|
43
|
-
"""Prepend a status note to a docstring."""
|
|
44
|
-
header = f".. warning::\n {note}"
|
|
45
|
-
if doc:
|
|
46
|
-
return f"{header}\n\n{doc}"
|
|
47
|
-
return header
|
|
48
|
-
|
|
49
|
-
|
|
50
42
|
@overload
|
|
51
43
|
def experimental(_target: F) -> F: ...
|
|
52
44
|
@overload
|
|
@@ -89,7 +81,7 @@ def experimental( # noqa: C901
|
|
|
89
81
|
original_init(self, *args, **kwargs)
|
|
90
82
|
|
|
91
83
|
target.__init__ = new_init # type: ignore[attr-defined]
|
|
92
|
-
target.
|
|
84
|
+
target.__experimental__ = msg # type: ignore[attr-defined]
|
|
93
85
|
return target # type: ignore[return-value]
|
|
94
86
|
|
|
95
87
|
@functools.wraps(target)
|
|
@@ -100,7 +92,7 @@ def experimental( # noqa: C901
|
|
|
100
92
|
warned = True
|
|
101
93
|
return target(*args, **kwargs)
|
|
102
94
|
|
|
103
|
-
wrapper.
|
|
95
|
+
wrapper.__experimental__ = msg # type: ignore[attr-defined]
|
|
104
96
|
return wrapper # type: ignore[return-value]
|
|
105
97
|
|
|
106
98
|
if _target is not None:
|
|
@@ -165,7 +157,7 @@ def deprecated( # noqa: C901
|
|
|
165
157
|
original_init(self, *args, **kwargs)
|
|
166
158
|
|
|
167
159
|
target.__init__ = new_init # type: ignore[attr-defined]
|
|
168
|
-
target.
|
|
160
|
+
target.__deprecated__ = msg # type: ignore[attr-defined]
|
|
169
161
|
return target # type: ignore[return-value]
|
|
170
162
|
|
|
171
163
|
@functools.wraps(target)
|
|
@@ -176,7 +168,7 @@ def deprecated( # noqa: C901
|
|
|
176
168
|
warned = True
|
|
177
169
|
return target(*args, **kwargs)
|
|
178
170
|
|
|
179
|
-
wrapper.
|
|
171
|
+
wrapper.__deprecated__ = msg # type: ignore[attr-defined]
|
|
180
172
|
return wrapper # type: ignore[return-value]
|
|
181
173
|
|
|
182
174
|
if _target is not None:
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
__all__ = []
|
|
1
|
+
__all__ = ["Metadata"]
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Sized
|
|
@@ -22,6 +22,7 @@ from dataeval.protocols import (
|
|
|
22
22
|
)
|
|
23
23
|
from dataeval.types import Array1D
|
|
24
24
|
from dataeval.utils._internal import as_numpy, merge_metadata
|
|
25
|
+
from dataeval.utils._validate import requires_maite_dataset
|
|
25
26
|
|
|
26
27
|
_logger = logging.getLogger(__name__)
|
|
27
28
|
|
|
@@ -105,6 +106,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
105
106
|
>>> test_factors = metadata(test_dataset) # Extract from new dataset
|
|
106
107
|
"""
|
|
107
108
|
|
|
109
|
+
@requires_maite_dataset("dataset", expected="any_target")
|
|
108
110
|
def __init__(
|
|
109
111
|
self,
|
|
110
112
|
dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]] | None = None,
|
|
@@ -168,6 +170,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
168
170
|
"""
|
|
169
171
|
return self._dataset is not None
|
|
170
172
|
|
|
173
|
+
@requires_maite_dataset("dataset", expected="any_target")
|
|
171
174
|
def bind(self, dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]]) -> Self:
|
|
172
175
|
"""Bind this instance to a dataset.
|
|
173
176
|
|
|
@@ -573,6 +576,11 @@ class Metadata(Array, FeatureExtractor):
|
|
|
573
576
|
Rows where target_index is None contain datum-level data.
|
|
574
577
|
Rows where target_index is an integer contain target/detection-level data.
|
|
575
578
|
|
|
579
|
+
See Also
|
|
580
|
+
--------
|
|
581
|
+
:attr:`~dataeval.Metadata.image_data` : Filter to image-level rows only
|
|
582
|
+
:attr:`~dataeval.Metadata.target_data` : Filter to target-level rows only
|
|
583
|
+
|
|
576
584
|
Notes
|
|
577
585
|
-----
|
|
578
586
|
This property triggers dataset structure analysis on first access.
|
|
@@ -581,11 +589,6 @@ class Metadata(Array, FeatureExtractor):
|
|
|
581
589
|
For Object Detection datasets, the dataframe now contains:
|
|
582
590
|
- Image-level rows (target_index=None): One per image with image-level factors
|
|
583
591
|
- Target-level rows (target_index=0,1,2...): One per detection with detection data
|
|
584
|
-
|
|
585
|
-
See Also
|
|
586
|
-
--------
|
|
587
|
-
image_data : Filter to image-level rows only
|
|
588
|
-
target_data : Filter to target-level rows only
|
|
589
592
|
"""
|
|
590
593
|
self._structure()
|
|
591
594
|
return self._dataframe
|