dataeval 1.0.5__tar.gz → 1.1.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/PKG-INFO +30 -11
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/README.md +18 -3
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/pyproject.toml +64 -26
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/__init__.py +17 -8
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_embeddings.py +16 -11
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_experimental.py +4 -12
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_metadata.py +10 -7
- dataeval-1.1.0rc0/src/dataeval/_ontology.py +558 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_version.py +2 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/_balance.py +25 -22
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/_diversity.py +8 -6
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/_parity.py +2 -4
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/config.py +35 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/__init__.py +15 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_ber.py +10 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_bin.py +66 -6
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_cache.py +5 -8
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_clusterer.py +1 -1
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_completeness.py +76 -25
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_compute_stats.py +8 -7
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_coverage.py +31 -13
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_diversity.py +5 -5
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_mst.py +7 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_feature_distance.py +6 -6
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_hash.py +11 -10
- dataeval-1.1.0rc0/src/dataeval/core/_label_alignment.py +271 -0
- dataeval-1.1.0rc0/src/dataeval/core/_label_coverage.py +246 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_label_parity.py +8 -6
- dataeval-1.1.0rc0/src/dataeval/core/_label_reconciliation.py +147 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_metadata_insights.py +24 -20
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_mst.py +16 -16
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_mutual_info.py +32 -32
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_nullmodel.py +315 -5
- dataeval-1.1.0rc0/src/dataeval/core/_ontology_validation.py +196 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_parity.py +4 -4
- dataeval-1.1.0rc0/src/dataeval/core/_track_stats.py +501 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_uap.py +2 -2
- dataeval-1.1.0rc0/src/dataeval/data/__init__.py +37 -0
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_classbalance.py +5 -3
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_classfilter.py +8 -33
- dataeval-1.1.0rc0/src/dataeval/data/_conform.py +138 -0
- dataeval-1.1.0rc0/src/dataeval/data/_crops.py +371 -0
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_indices.py +1 -1
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_limit.py +1 -1
- dataeval-1.1.0rc0/src/dataeval/data/_merge.py +87 -0
- dataeval-1.1.0rc0/src/dataeval/data/_relabel.py +193 -0
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_reverse.py +1 -1
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_select.py +49 -10
- {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_shuffle.py +2 -2
- dataeval-1.0.5/src/dataeval/utils/data.py → dataeval-1.1.0rc0/src/dataeval/data/_split.py +13 -78
- dataeval-1.1.0rc0/src/dataeval/data/_tracks.py +83 -0
- dataeval-1.1.0rc0/src/dataeval/data/_unzip.py +84 -0
- dataeval-1.1.0rc0/src/dataeval/exceptions.py +77 -0
- dataeval-1.1.0rc0/src/dataeval/extractors/__init__.py +31 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_bovw.py +7 -7
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_flatten.py +2 -2
- dataeval-1.1.0rc0/src/dataeval/extractors/_geometry.py +53 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_onnx.py +45 -16
- dataeval-1.1.0rc0/src/dataeval/extractors/_resize.py +22 -0
- dataeval-1.1.0rc0/src/dataeval/extractors/_scores.py +41 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_torch.py +63 -15
- dataeval-1.1.0rc0/src/dataeval/extractors/_uncertainty.py +473 -0
- dataeval-1.1.0rc0/src/dataeval/models/__init__.py +20 -0
- dataeval-1.1.0rc0/src/dataeval/models/_backends.py +174 -0
- dataeval-1.1.0rc0/src/dataeval/models/_input.py +125 -0
- dataeval-1.1.0rc0/src/dataeval/models/_metadata.py +140 -0
- dataeval-1.1.0rc0/src/dataeval/models/_predictors.py +455 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/_sufficiency.py +15 -14
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/protocols.py +257 -137
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/_duplicates.py +24 -17
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/_outliers.py +8 -8
- dataeval-1.1.0rc0/src/dataeval/scope/__init__.py +14 -0
- dataeval-1.1.0rc0/src/dataeval/scope/_coverage.py +407 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/scope/_prioritize.py +12 -9
- dataeval-1.1.0rc0/src/dataeval/scope/_representation.py +301 -0
- dataeval-1.1.0rc0/src/dataeval/selection/__init__.py +26 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/__init__.py +2 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_base.py +9 -5
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_chunk.py +1 -1
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_domain_classifier.py +2 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_kneighbors.py +1 -1
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_mmd.py +6 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_reconstruction.py +2 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_univariate.py +15 -5
- dataeval-1.1.0rc0/src/dataeval/shift/_drift/_wasserstein.py +423 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_domain_classifier.py +25 -9
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_reconstruction.py +3 -3
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/types.py +387 -4
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/__init__.py +1 -2
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/_internal.py +92 -2
- dataeval-1.1.0rc0/src/dataeval/utils/_validate.py +258 -0
- dataeval-1.1.0rc0/src/dataeval/utils/data.py +34 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/preprocessing.py +111 -21
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/training.py +101 -16
- dataeval-1.0.5/src/dataeval/exceptions.py +0 -41
- dataeval-1.0.5/src/dataeval/extractors/__init__.py +0 -15
- dataeval-1.0.5/src/dataeval/extractors/_uncertainty.py +0 -245
- dataeval-1.0.5/src/dataeval/scope/__init__.py +0 -10
- dataeval-1.0.5/src/dataeval/selection/__init__.py +0 -20
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/.gitignore +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/LICENSE +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_helpers.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_log.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_warm_cache.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_base.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_hashstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_register.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_registry.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_visualstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_compute_ratios.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_divergence.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_label_errors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_label_stats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_rank.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/flags.py +5 -5
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/_aggregator.py +8 -8
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/_output.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/schedules.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/py.typed +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/_shared.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_base.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/update_strategies.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/losses.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/models.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/onnx.py +0 -0
- {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/thresholds.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataeval
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.1.0rc0
|
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
|
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering
|
|
22
22
|
Requires-Python: >=3.10
|
|
23
23
|
Requires-Dist: lightgbm>=4
|
|
24
|
+
Requires-Dist: maite>=0.9.4
|
|
24
25
|
Requires-Dist: numba>=0.61.0
|
|
25
26
|
Requires-Dist: numpy>=1.24.2
|
|
26
27
|
Requires-Dist: polars>=1.0.0
|
|
@@ -37,23 +38,27 @@ Requires-Dist: torchvision>=0.17.0; extra == 'cpu'
|
|
|
37
38
|
Provides-Extra: cu118
|
|
38
39
|
Requires-Dist: torch>=2.2.0; extra == 'cu118'
|
|
39
40
|
Requires-Dist: torchvision>=0.17.0; extra == 'cu118'
|
|
40
|
-
Provides-Extra: cu124
|
|
41
|
-
Requires-Dist: torch>=2.2.0; extra == 'cu124'
|
|
42
|
-
Requires-Dist: torchvision>=0.17.0; extra == 'cu124'
|
|
43
41
|
Provides-Extra: cu128
|
|
44
42
|
Requires-Dist: torch>=2.2.0; extra == 'cu128'
|
|
45
43
|
Requires-Dist: torchvision>=0.17.0; extra == 'cu128'
|
|
44
|
+
Provides-Extra: litert
|
|
45
|
+
Requires-Dist: ai-edge-litert>=2.0; (python_version <= '3.14') and extra == 'litert'
|
|
46
46
|
Provides-Extra: onnx
|
|
47
|
-
Requires-Dist: onnx; extra == 'onnx'
|
|
48
|
-
Requires-Dist: onnxruntime
|
|
47
|
+
Requires-Dist: onnx>=1.14.0; extra == 'onnx'
|
|
48
|
+
Requires-Dist: onnxruntime<1.24,>=1.14.0; (python_version == '3.10') and extra == 'onnx'
|
|
49
|
+
Requires-Dist: onnxruntime>=1.14.0; (python_version >= '3.11') and extra == 'onnx'
|
|
49
50
|
Provides-Extra: onnx-gpu
|
|
50
|
-
Requires-Dist: onnx; extra == 'onnx-gpu'
|
|
51
|
-
Requires-Dist: onnxruntime-gpu
|
|
51
|
+
Requires-Dist: onnx>=1.14.0; extra == 'onnx-gpu'
|
|
52
|
+
Requires-Dist: onnxruntime-gpu<1.24,>=1.14.0; (python_version == '3.10') and extra == 'onnx-gpu'
|
|
53
|
+
Requires-Dist: onnxruntime-gpu>=1.14.0; (python_version >= '3.11') and extra == 'onnx-gpu'
|
|
54
|
+
Provides-Extra: ontology
|
|
55
|
+
Requires-Dist: rdflib>=7.0; extra == 'ontology'
|
|
52
56
|
Provides-Extra: opencv
|
|
53
57
|
Requires-Dist: opencv-python-headless>=4.8.0; extra == 'opencv'
|
|
54
58
|
Description-Content-Type: text/markdown
|
|
55
59
|
|
|
56
60
|
<!-- markdownlint-disable MD041 -->
|
|
61
|
+
|
|
57
62
|

|
|
58
63
|
|
|
59
64
|
<!-- :auto badges: -->
|
|
@@ -130,14 +135,28 @@ You can install DataEval directly from pypi.org using the following command.
|
|
|
130
135
|
pip install dataeval
|
|
131
136
|
```
|
|
132
137
|
|
|
138
|
+
By default, PyTorch is installed from PyPI which includes CUDA support on Linux.
|
|
139
|
+
To install a specific PyTorch variant, use `--extra-index-url`:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# CPU only
|
|
143
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cpu
|
|
144
|
+
|
|
145
|
+
# CUDA 11.8
|
|
146
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu118
|
|
147
|
+
|
|
148
|
+
# CUDA 12.8
|
|
149
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu128
|
|
150
|
+
```
|
|
151
|
+
|
|
133
152
|
### **Installing with conda**
|
|
134
153
|
|
|
135
154
|
DataEval can be installed in a Conda/Mamba environment using the provided
|
|
136
|
-
`environment.
|
|
155
|
+
`environment.yml` file. As some dependencies are installed from the `pytorch`
|
|
137
156
|
channel, the channel is specified in the below example.
|
|
138
157
|
|
|
139
158
|
```bash
|
|
140
|
-
micromamba create -f environment\environment.
|
|
159
|
+
micromamba create -f environment\environment.yml -c pytorch
|
|
141
160
|
```
|
|
142
161
|
|
|
143
162
|
### **Installing from GitHub**
|
|
@@ -401,7 +420,7 @@ shape: (3, 5)
|
|
|
401
420
|
|
|
402
421
|
A result with many large groups is a signal that your dataset contains
|
|
403
422
|
repeated collection events. Before training, remove all but one sample from
|
|
404
|
-
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.
|
|
423
|
+
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.py)
|
|
405
424
|
for a complete walkthrough, including how to choose which sample to keep.
|
|
406
425
|
|
|
407
426
|
### Where to go next
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
<!-- markdownlint-disable MD041 -->
|
|
2
|
+
|
|
2
3
|

|
|
3
4
|
|
|
4
5
|
<!-- :auto badges: -->
|
|
@@ -75,14 +76,28 @@ You can install DataEval directly from pypi.org using the following command.
|
|
|
75
76
|
pip install dataeval
|
|
76
77
|
```
|
|
77
78
|
|
|
79
|
+
By default, PyTorch is installed from PyPI which includes CUDA support on Linux.
|
|
80
|
+
To install a specific PyTorch variant, use `--extra-index-url`:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# CPU only
|
|
84
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cpu
|
|
85
|
+
|
|
86
|
+
# CUDA 11.8
|
|
87
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu118
|
|
88
|
+
|
|
89
|
+
# CUDA 12.8
|
|
90
|
+
pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu128
|
|
91
|
+
```
|
|
92
|
+
|
|
78
93
|
### **Installing with conda**
|
|
79
94
|
|
|
80
95
|
DataEval can be installed in a Conda/Mamba environment using the provided
|
|
81
|
-
`environment.
|
|
96
|
+
`environment.yml` file. As some dependencies are installed from the `pytorch`
|
|
82
97
|
channel, the channel is specified in the below example.
|
|
83
98
|
|
|
84
99
|
```bash
|
|
85
|
-
micromamba create -f environment\environment.
|
|
100
|
+
micromamba create -f environment\environment.yml -c pytorch
|
|
86
101
|
```
|
|
87
102
|
|
|
88
103
|
### **Installing from GitHub**
|
|
@@ -346,7 +361,7 @@ shape: (3, 5)
|
|
|
346
361
|
|
|
347
362
|
A result with many large groups is a signal that your dataset contains
|
|
348
363
|
repeated collection events. Before training, remove all but one sample from
|
|
349
|
-
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.
|
|
364
|
+
each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.py)
|
|
350
365
|
for a complete walkthrough, including how to choose which sample to keep.
|
|
351
366
|
|
|
352
367
|
### Where to go next
|
|
@@ -31,6 +31,7 @@ classifiers = [
|
|
|
31
31
|
"Topic :: Scientific/Engineering",
|
|
32
32
|
]
|
|
33
33
|
dependencies = [
|
|
34
|
+
"maite>=0.9.4",
|
|
34
35
|
"numba>=0.61.0",
|
|
35
36
|
"lightgbm>=4",
|
|
36
37
|
"numpy>=1.24.2",
|
|
@@ -47,17 +48,43 @@ dependencies = [
|
|
|
47
48
|
[project.optional-dependencies]
|
|
48
49
|
cpu = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
49
50
|
cu118 = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
50
|
-
cu124 = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
51
51
|
cu128 = ["torch>=2.2.0", "torchvision>=0.17.0"]
|
|
52
|
+
litert = ["ai-edge-litert>=2.0; python_version <= '3.14'"]
|
|
52
53
|
opencv = ["opencv-python-headless>=4.8.0"]
|
|
53
|
-
onnx = [
|
|
54
|
-
|
|
54
|
+
onnx = [
|
|
55
|
+
"onnx>=1.14.0",
|
|
56
|
+
"onnxruntime>=1.14.0,<1.24; python_version == '3.10'",
|
|
57
|
+
"onnxruntime>=1.14.0; python_version >= '3.11'",
|
|
58
|
+
]
|
|
59
|
+
onnx-gpu = [
|
|
60
|
+
"onnx>=1.14.0",
|
|
61
|
+
"onnxruntime-gpu>=1.14.0,<1.24; python_version == '3.10'",
|
|
62
|
+
"onnxruntime-gpu>=1.14.0; python_version >= '3.11'",
|
|
63
|
+
]
|
|
64
|
+
ontology = ["rdflib>=7.0"]
|
|
55
65
|
|
|
56
66
|
[project.urls]
|
|
57
67
|
Homepage = "https://dataeval.ai/"
|
|
58
68
|
Repository = "https://github.com/aria-ml/dataeval/"
|
|
59
69
|
Documentation = "https://dataeval.readthedocs.io/"
|
|
60
70
|
|
|
71
|
+
# MAITE interoperability entry-points.
|
|
72
|
+
[project.entry-points."maite.tasks"]
|
|
73
|
+
dataeval_balance = "dataeval.bias:Balance"
|
|
74
|
+
dataeval_diversity = "dataeval.bias:Diversity"
|
|
75
|
+
dataeval_parity = "dataeval.bias:Parity"
|
|
76
|
+
dataeval_outliers = "dataeval.quality:Outliers"
|
|
77
|
+
dataeval_duplicates = "dataeval.quality:Duplicates"
|
|
78
|
+
dataeval_sufficiency = "dataeval.performance:Sufficiency"
|
|
79
|
+
|
|
80
|
+
[project.entry-points."maite.protocols.image_classification.Model"]
|
|
81
|
+
dataeval_OnnxImageClassifier = "dataeval.models:OnnxImageClassifier"
|
|
82
|
+
dataeval_LiteRtImageClassifier = "dataeval.models:LiteRtImageClassifier"
|
|
83
|
+
|
|
84
|
+
[project.entry-points."maite.protocols.object_detection.Model"]
|
|
85
|
+
dataeval_OnnxObjectDetector = "dataeval.models:OnnxObjectDetector"
|
|
86
|
+
dataeval_LiteRtObjectDetector = "dataeval.models:LiteRtObjectDetector"
|
|
87
|
+
|
|
61
88
|
[dependency-groups]
|
|
62
89
|
base = [
|
|
63
90
|
"uv>=0.8.0",
|
|
@@ -65,7 +92,7 @@ base = [
|
|
|
65
92
|
lock = [
|
|
66
93
|
{ include-group = "base" },
|
|
67
94
|
"pyproject2conda>=0.22",
|
|
68
|
-
"poetry
|
|
95
|
+
"poetry==2.2.0; python_version<'3.14'",
|
|
69
96
|
]
|
|
70
97
|
lint = [
|
|
71
98
|
"ruff>=0.11",
|
|
@@ -73,19 +100,15 @@ lint = [
|
|
|
73
100
|
]
|
|
74
101
|
docsync = [
|
|
75
102
|
"jupytext>=1.19.1",
|
|
76
|
-
"mdformat-myst",
|
|
77
|
-
]
|
|
78
|
-
doclint = [
|
|
79
|
-
{ include-group = "docs"},
|
|
80
|
-
"ruff>=0.11",
|
|
81
|
-
"pyright[nodejs]>=1.1.400",
|
|
82
103
|
]
|
|
83
104
|
test = [
|
|
84
105
|
"coverage[toml]>=7.6",
|
|
85
106
|
"onnx>=1.14.0",
|
|
107
|
+
"onnxscript>=0.6.0",
|
|
86
108
|
"pytest>=8.3",
|
|
87
109
|
"pytest-cov>=6.1",
|
|
88
110
|
"pytest-xdist>=3.6.1",
|
|
111
|
+
"rdflib>=7.0",
|
|
89
112
|
]
|
|
90
113
|
verify = [
|
|
91
114
|
"pytest>=8.3",
|
|
@@ -103,10 +126,13 @@ docs = [
|
|
|
103
126
|
"jinja2>=3.1.6",
|
|
104
127
|
"jupyter-client>=8.6.0",
|
|
105
128
|
"jupyter-cache>=1.0",
|
|
106
|
-
"maite-datasets>=0.0.
|
|
129
|
+
"maite-datasets>=0.0.15",
|
|
107
130
|
"myst-nb>=1.0",
|
|
108
131
|
"opencv-python-headless>=4.8.0",
|
|
132
|
+
"pandas>=2.0.0",
|
|
109
133
|
"plotly>=6.2.0",
|
|
134
|
+
"rapidfuzz>=3.0",
|
|
135
|
+
"rdflib>=7.0",
|
|
110
136
|
"sphinx-autoapi>=3.6.0",
|
|
111
137
|
"sphinx-design>=0.6.1",
|
|
112
138
|
"sphinx-immaterial>=0.12.5",
|
|
@@ -114,9 +140,9 @@ docs = [
|
|
|
114
140
|
"sphinx-tabs>=3.4.7",
|
|
115
141
|
"Sphinx>=7.2.6,<9.0.0", # sphinx-immaterial <= 0.13.9 is not compatible with sphinx >=9.0
|
|
116
142
|
"torchmetrics>=1.0.0",
|
|
117
|
-
"torchvision>=0.17.0",
|
|
118
143
|
"markupsafe>=3,<3.0.2",
|
|
119
144
|
"jupytext>=1.19.1",
|
|
145
|
+
"ultralytics>=8.0.0",
|
|
120
146
|
]
|
|
121
147
|
security = [ # keep in sync with [tool.uv.constraint-dependencies]
|
|
122
148
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
@@ -143,7 +169,6 @@ conflicts = [
|
|
|
143
169
|
[
|
|
144
170
|
{ extra = "cpu" },
|
|
145
171
|
{ extra = "cu118" },
|
|
146
|
-
{ extra = "cu124" },
|
|
147
172
|
{ extra = "cu128" },
|
|
148
173
|
],
|
|
149
174
|
]
|
|
@@ -166,11 +191,6 @@ name = "pytorch-cu118"
|
|
|
166
191
|
url = "https://download.pytorch.org/whl/cu118"
|
|
167
192
|
explicit = true
|
|
168
193
|
|
|
169
|
-
[[tool.uv.index]]
|
|
170
|
-
name = "pytorch-cu124"
|
|
171
|
-
url = "https://download.pytorch.org/whl/cu124"
|
|
172
|
-
explicit = true
|
|
173
|
-
|
|
174
194
|
[[tool.uv.index]]
|
|
175
195
|
name = "pytorch-cu128"
|
|
176
196
|
url = "https://download.pytorch.org/whl/cu128"
|
|
@@ -180,19 +200,28 @@ explicit = true
|
|
|
180
200
|
torch = [
|
|
181
201
|
{ index = "pytorch-cpu", extra = "cpu" },
|
|
182
202
|
{ index = "pytorch-cu118", extra = "cu118" },
|
|
183
|
-
{ index = "pytorch-cu124", extra = "cu124" },
|
|
184
203
|
{ index = "pytorch-cu128", extra = "cu128" },
|
|
185
204
|
]
|
|
186
205
|
torchvision = [
|
|
187
206
|
{ index = "pytorch-cpu", extra = "cpu" },
|
|
188
207
|
{ index = "pytorch-cu118", extra = "cu118" },
|
|
189
|
-
{ index = "pytorch-cu124", extra = "cu124" },
|
|
190
208
|
{ index = "pytorch-cu128", extra = "cu128" },
|
|
191
209
|
]
|
|
192
210
|
|
|
193
211
|
[tool.uv.extra-build-dependencies]
|
|
194
212
|
numba = ["tbb>=2021.6"]
|
|
195
213
|
|
|
214
|
+
[tool.poetry]
|
|
215
|
+
version = "0.0.0" # overwritten by poetry-dynamic-versioning
|
|
216
|
+
|
|
217
|
+
[[tool.poetry.source]]
|
|
218
|
+
name = "pytorch-cpu"
|
|
219
|
+
url = "https://download.pytorch.org/whl/cpu"
|
|
220
|
+
priority = "supplemental"
|
|
221
|
+
|
|
222
|
+
[tool.poetry.dependencies]
|
|
223
|
+
torch = { version = ">=2.2.0", source = "pytorch-cpu" }
|
|
224
|
+
|
|
196
225
|
[tool.hatch.build.targets.sdist]
|
|
197
226
|
include = ["src/dataeval"]
|
|
198
227
|
|
|
@@ -208,8 +237,11 @@ source = "vcs"
|
|
|
208
237
|
[tool.hatch.build.hooks.vcs]
|
|
209
238
|
version-file = "src/dataeval/_version.py"
|
|
210
239
|
|
|
211
|
-
[tool.poetry]
|
|
212
|
-
|
|
240
|
+
[tool.poetry-dynamic-versioning]
|
|
241
|
+
enable = true
|
|
242
|
+
vcs = "git"
|
|
243
|
+
style = "pep440"
|
|
244
|
+
pattern = "^v?(?P<base>\\d+\\.\\d+\\.\\d+)"
|
|
213
245
|
|
|
214
246
|
[tool.pyproject2conda.dependencies]
|
|
215
247
|
numpy = { skip = true, packages = "numpy>=1.24.2" }
|
|
@@ -219,7 +251,7 @@ torch = { pip = true } # PyTorch is no longer maintained on conda-forge
|
|
|
219
251
|
xxhash = { skip = true, packages = "python-xxhash>=3.3" }
|
|
220
252
|
|
|
221
253
|
[tool.pyright]
|
|
222
|
-
include = ["src", "tests"]
|
|
254
|
+
include = ["src", "tests", "verification", "docs/source/notebooks"]
|
|
223
255
|
exclude = [
|
|
224
256
|
"**/__pycache__",
|
|
225
257
|
"**/node_modules",
|
|
@@ -232,6 +264,10 @@ reportMissingImports = false
|
|
|
232
264
|
|
|
233
265
|
[tool.pytest.ini_options]
|
|
234
266
|
testpaths = ["tests"]
|
|
267
|
+
filterwarnings = [
|
|
268
|
+
"ignore:The default value of normalize_pixel_values changed:FutureWarning",
|
|
269
|
+
"ignore:Clustering metrics expect discrete values but received continuous values:UserWarning",
|
|
270
|
+
]
|
|
235
271
|
addopts = [
|
|
236
272
|
"--pythonwarnings=ignore::DeprecationWarning",
|
|
237
273
|
"--verbose",
|
|
@@ -278,6 +314,7 @@ exclude = [
|
|
|
278
314
|
".tox",
|
|
279
315
|
"prototype",
|
|
280
316
|
"src/dataeval/_version.py",
|
|
317
|
+
"*.ipynb",
|
|
281
318
|
]
|
|
282
319
|
line-length = 120
|
|
283
320
|
indent-width = 4
|
|
@@ -292,7 +329,7 @@ ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF
|
|
|
292
329
|
fixable = ["ALL"]
|
|
293
330
|
unfixable = []
|
|
294
331
|
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
|
|
295
|
-
per-file-ignores = { "!src/*" = ["ANN", "ARG", "BLE", "S", "SLF", "RET", "C90", "D", "FIX", "N", "PERF"], "docs/*" = ["B904"] }
|
|
332
|
+
per-file-ignores = { "!src/*" = ["ANN", "ARG", "BLE", "S", "SLF", "RET", "C90", "D", "FIX", "N", "PERF"], "docs/*" = ["B904"], "docs/source/notebooks/*" = ["E402", "E501", "E703", "RUF100", "SIM105", "UP009"] }
|
|
296
333
|
|
|
297
334
|
[tool.ruff.lint.flake8-builtins]
|
|
298
335
|
builtins-strict-checking = false
|
|
@@ -307,6 +344,7 @@ max-complexity = 5
|
|
|
307
344
|
convention = "numpy"
|
|
308
345
|
|
|
309
346
|
[tool.ruff.format]
|
|
347
|
+
preview = true
|
|
310
348
|
quote-style = "double"
|
|
311
349
|
indent-style = "space"
|
|
312
350
|
skip-magic-trailing-comma = false
|
|
@@ -315,8 +353,8 @@ docstring-code-format = true
|
|
|
315
353
|
docstring-code-line-length = "dynamic"
|
|
316
354
|
|
|
317
355
|
[tool.codespell]
|
|
318
|
-
skip = './*env*,./output,./docs/build,./docs/source/.jupyter_cache,./docs/source/*/data,CHANGELOG.md,uv.lock,requirements
|
|
319
|
-
ignore-words-list = ["Hart","FPR"]
|
|
356
|
+
skip = './*env*,./output,./docs/build,./docs/source/.jupyter_cache,./docs/source/*/data,CHANGELOG.md,uv.lock,requirements.*.txt,*.html,*.lock,*.ipynb'
|
|
357
|
+
ignore-words-list = ["Hart","FPR", "MOT", "mot"]
|
|
320
358
|
|
|
321
359
|
[build-system]
|
|
322
360
|
requires = ["hatchling", "hatch-vcs"]
|
|
@@ -20,24 +20,30 @@ __all__ = [
|
|
|
20
20
|
"exceptions",
|
|
21
21
|
"flags",
|
|
22
22
|
"log",
|
|
23
|
+
"models",
|
|
23
24
|
"protocols",
|
|
24
25
|
"types",
|
|
25
26
|
"Embeddings",
|
|
26
27
|
"Metadata",
|
|
28
|
+
"Ontology",
|
|
27
29
|
]
|
|
28
30
|
|
|
29
31
|
import logging
|
|
30
32
|
|
|
31
|
-
from . import config, exceptions, flags, protocols, types
|
|
33
|
+
from . import config, exceptions, flags, models, protocols, types
|
|
32
34
|
from ._embeddings import Embeddings
|
|
33
35
|
from ._metadata import Metadata
|
|
36
|
+
from ._ontology import Ontology
|
|
34
37
|
|
|
35
38
|
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> None:
|
|
39
42
|
"""
|
|
40
|
-
Add a
|
|
43
|
+
Add a handler to the logger quickly for debugging.
|
|
44
|
+
|
|
45
|
+
Calling this more than once is idempotent: a handler equal to one already
|
|
46
|
+
attached to the logger is not added again, so log lines are not duplicated.
|
|
41
47
|
|
|
42
48
|
Parameters
|
|
43
49
|
----------
|
|
@@ -45,18 +51,21 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
|
|
|
45
51
|
Set the logging level for the logger.
|
|
46
52
|
handler : logging.Handler, optional
|
|
47
53
|
Sets the logging handler for the logger if provided, otherwise logger will be
|
|
48
|
-
provided with a StreamHandler.
|
|
54
|
+
provided with a StreamHandler. When a custom handler is supplied its formatter
|
|
55
|
+
is left untouched; the default StreamHandler is given a verbose debugging
|
|
56
|
+
formatter.
|
|
49
57
|
"""
|
|
50
58
|
import logging
|
|
51
59
|
|
|
52
|
-
|
|
60
|
+
_logger = logging.getLogger(__name__)
|
|
53
61
|
if handler is None:
|
|
54
|
-
handler = logging.StreamHandler()
|
|
62
|
+
handler = logging.StreamHandler()
|
|
55
63
|
handler.setFormatter(
|
|
56
64
|
logging.Formatter(
|
|
57
65
|
"%(asctime)s %(levelname)-8s %(name)s.%(filename)s:%(lineno)s - %(funcName)10s() | %(message)s",
|
|
58
66
|
),
|
|
59
67
|
)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
68
|
+
if handler not in _logger.handlers:
|
|
69
|
+
_logger.addHandler(handler)
|
|
70
|
+
_logger.setLevel(level)
|
|
71
|
+
_logger.debug("Added logging handler %s to logger: %s", handler, __name__)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Embeddings class for extracting and managing image embeddings."""
|
|
2
2
|
|
|
3
|
-
__all__ = []
|
|
3
|
+
__all__ = ["Embeddings"]
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
@@ -14,7 +14,7 @@ import xxhash as xxh
|
|
|
14
14
|
from numpy.typing import NDArray
|
|
15
15
|
from typing_extensions import Self
|
|
16
16
|
|
|
17
|
-
from dataeval.config import
|
|
17
|
+
from dataeval.config import resolve_batch_size
|
|
18
18
|
from dataeval.exceptions import NotFittedError
|
|
19
19
|
from dataeval.extractors import FlattenExtractor
|
|
20
20
|
from dataeval.protocols import (
|
|
@@ -25,6 +25,8 @@ from dataeval.protocols import (
|
|
|
25
25
|
FeatureExtractor,
|
|
26
26
|
ProgressCallback,
|
|
27
27
|
)
|
|
28
|
+
from dataeval.utils._internal import unwrap_image
|
|
29
|
+
from dataeval.utils._validate import requires_maite_dataset
|
|
28
30
|
|
|
29
31
|
_logger = logging.getLogger(__name__)
|
|
30
32
|
|
|
@@ -53,8 +55,14 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
53
55
|
:class:`~dataeval.extractors.FlattenExtractor` for simple baseline
|
|
54
56
|
compatibility with all DataEval tools.
|
|
55
57
|
batch_size : int or None, default None
|
|
56
|
-
|
|
57
|
-
|
|
58
|
+
I/O chunk size: how many images are loaded from the dataset, encoded, and
|
|
59
|
+
written to storage per step. Resolved via
|
|
60
|
+
:func:`~dataeval.config.resolve_batch_size` as the first set of
|
|
61
|
+
``batch_size`` (this argument), the extractor's own ``batch_size``, then
|
|
62
|
+
the global default. This is distinct from an extractor's own forward-pass
|
|
63
|
+
(compute) batch size: an extractor with its own ``batch_size`` sub-batches
|
|
64
|
+
each chunk for the model, so the smaller of the two bounds the forward
|
|
65
|
+
pass. Batching never changes the resulting embeddings.
|
|
58
66
|
path : Path, str, or None, default None
|
|
59
67
|
File path for memory-mapped storage. When None, caches embeddings in memory only.
|
|
60
68
|
When Path or string is provided, uses memory-mapped storage for large embeddings
|
|
@@ -93,6 +101,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
93
101
|
|
|
94
102
|
memory_threshold: float
|
|
95
103
|
|
|
104
|
+
@requires_maite_dataset("dataset", expected="image_only")
|
|
96
105
|
def __init__(
|
|
97
106
|
self,
|
|
98
107
|
# Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
|
|
@@ -104,7 +113,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
104
113
|
progress_callback: ProgressCallback | None = None,
|
|
105
114
|
) -> None:
|
|
106
115
|
self._extractor = extractor if extractor is not None else FlattenExtractor()
|
|
107
|
-
self._batch_size =
|
|
116
|
+
self._batch_size = resolve_batch_size(batch_size, getattr(self._extractor, "batch_size", None))
|
|
108
117
|
self.memory_threshold = max(0.0, min(1.0, memory_threshold))
|
|
109
118
|
self._progress_callback = progress_callback
|
|
110
119
|
|
|
@@ -159,6 +168,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
159
168
|
"""
|
|
160
169
|
return self._dataset is not None
|
|
161
170
|
|
|
171
|
+
@requires_maite_dataset("dataset", expected="image_only")
|
|
162
172
|
def bind(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Self:
|
|
163
173
|
"""Bind this instance to a dataset.
|
|
164
174
|
|
|
@@ -502,12 +512,7 @@ class Embeddings(Array, FeatureExtractor):
|
|
|
502
512
|
if self._dataset is None:
|
|
503
513
|
raise NotFittedError("No dataset bound. Call bind() first.")
|
|
504
514
|
|
|
505
|
-
|
|
506
|
-
for idx in indices:
|
|
507
|
-
item = self._dataset[idx]
|
|
508
|
-
image = item[0] if isinstance(item, tuple) else item
|
|
509
|
-
images.append(image)
|
|
510
|
-
return images
|
|
515
|
+
return [unwrap_image(self._dataset[idx]) for idx in indices]
|
|
511
516
|
|
|
512
517
|
def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
|
|
513
518
|
"""Process indices in batches using the extractor."""
|
|
@@ -39,14 +39,6 @@ def _make_warning_message( # noqa: C901
|
|
|
39
39
|
return msg
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
def _prepend_doc_note(doc: str | None, note: str) -> str:
|
|
43
|
-
"""Prepend a status note to a docstring."""
|
|
44
|
-
header = f".. warning::\n {note}"
|
|
45
|
-
if doc:
|
|
46
|
-
return f"{header}\n\n{doc}"
|
|
47
|
-
return header
|
|
48
|
-
|
|
49
|
-
|
|
50
42
|
@overload
|
|
51
43
|
def experimental(_target: F) -> F: ...
|
|
52
44
|
@overload
|
|
@@ -89,7 +81,7 @@ def experimental( # noqa: C901
|
|
|
89
81
|
original_init(self, *args, **kwargs)
|
|
90
82
|
|
|
91
83
|
target.__init__ = new_init # type: ignore[attr-defined]
|
|
92
|
-
target.
|
|
84
|
+
target.__experimental__ = msg # type: ignore[attr-defined]
|
|
93
85
|
return target # type: ignore[return-value]
|
|
94
86
|
|
|
95
87
|
@functools.wraps(target)
|
|
@@ -100,7 +92,7 @@ def experimental( # noqa: C901
|
|
|
100
92
|
warned = True
|
|
101
93
|
return target(*args, **kwargs)
|
|
102
94
|
|
|
103
|
-
wrapper.
|
|
95
|
+
wrapper.__experimental__ = msg # type: ignore[attr-defined]
|
|
104
96
|
return wrapper # type: ignore[return-value]
|
|
105
97
|
|
|
106
98
|
if _target is not None:
|
|
@@ -165,7 +157,7 @@ def deprecated( # noqa: C901
|
|
|
165
157
|
original_init(self, *args, **kwargs)
|
|
166
158
|
|
|
167
159
|
target.__init__ = new_init # type: ignore[attr-defined]
|
|
168
|
-
target.
|
|
160
|
+
target.__deprecated__ = msg # type: ignore[attr-defined]
|
|
169
161
|
return target # type: ignore[return-value]
|
|
170
162
|
|
|
171
163
|
@functools.wraps(target)
|
|
@@ -176,7 +168,7 @@ def deprecated( # noqa: C901
|
|
|
176
168
|
warned = True
|
|
177
169
|
return target(*args, **kwargs)
|
|
178
170
|
|
|
179
|
-
wrapper.
|
|
171
|
+
wrapper.__deprecated__ = msg # type: ignore[attr-defined]
|
|
180
172
|
return wrapper # type: ignore[return-value]
|
|
181
173
|
|
|
182
174
|
if _target is not None:
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
__all__ = []
|
|
1
|
+
__all__ = ["Metadata"]
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Sized
|
|
@@ -22,6 +22,7 @@ from dataeval.protocols import (
|
|
|
22
22
|
)
|
|
23
23
|
from dataeval.types import Array1D
|
|
24
24
|
from dataeval.utils._internal import as_numpy, merge_metadata
|
|
25
|
+
from dataeval.utils._validate import requires_maite_dataset
|
|
25
26
|
|
|
26
27
|
_logger = logging.getLogger(__name__)
|
|
27
28
|
|
|
@@ -105,6 +106,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
105
106
|
>>> test_factors = metadata(test_dataset) # Extract from new dataset
|
|
106
107
|
"""
|
|
107
108
|
|
|
109
|
+
@requires_maite_dataset("dataset", expected="any_target")
|
|
108
110
|
def __init__(
|
|
109
111
|
self,
|
|
110
112
|
dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]] | None = None,
|
|
@@ -168,6 +170,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
168
170
|
"""
|
|
169
171
|
return self._dataset is not None
|
|
170
172
|
|
|
173
|
+
@requires_maite_dataset("dataset", expected="any_target")
|
|
171
174
|
def bind(self, dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]]) -> Self:
|
|
172
175
|
"""Bind this instance to a dataset.
|
|
173
176
|
|
|
@@ -573,6 +576,11 @@ class Metadata(Array, FeatureExtractor):
|
|
|
573
576
|
Rows where target_index is None contain datum-level data.
|
|
574
577
|
Rows where target_index is an integer contain target/detection-level data.
|
|
575
578
|
|
|
579
|
+
See Also
|
|
580
|
+
--------
|
|
581
|
+
:attr:`~dataeval.Metadata.image_data` : Filter to image-level rows only
|
|
582
|
+
:attr:`~dataeval.Metadata.target_data` : Filter to target-level rows only
|
|
583
|
+
|
|
576
584
|
Notes
|
|
577
585
|
-----
|
|
578
586
|
This property triggers dataset structure analysis on first access.
|
|
@@ -581,11 +589,6 @@ class Metadata(Array, FeatureExtractor):
|
|
|
581
589
|
For Object Detection datasets, the dataframe now contains:
|
|
582
590
|
- Image-level rows (target_index=None): One per image with image-level factors
|
|
583
591
|
- Target-level rows (target_index=0,1,2...): One per detection with detection data
|
|
584
|
-
|
|
585
|
-
See Also
|
|
586
|
-
--------
|
|
587
|
-
image_data : Filter to image-level rows only
|
|
588
|
-
target_data : Filter to target-level rows only
|
|
589
592
|
"""
|
|
590
593
|
self._structure()
|
|
591
594
|
return self._dataframe
|
|
@@ -650,7 +653,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
650
653
|
-------
|
|
651
654
|
Sequence[str]
|
|
652
655
|
List of factor names that passed filtering and preprocessing steps.
|
|
653
|
-
Order matches columns in factor_data
|
|
656
|
+
Order matches columns in factor_data.
|
|
654
657
|
|
|
655
658
|
Notes
|
|
656
659
|
-----
|