dataeval 1.0.5__tar.gz → 1.1.0rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. {dataeval-1.0.5 → dataeval-1.1.0rc0}/PKG-INFO +30 -11
  2. {dataeval-1.0.5 → dataeval-1.1.0rc0}/README.md +18 -3
  3. {dataeval-1.0.5 → dataeval-1.1.0rc0}/pyproject.toml +64 -26
  4. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/__init__.py +17 -8
  5. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_embeddings.py +16 -11
  6. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_experimental.py +4 -12
  7. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_metadata.py +10 -7
  8. dataeval-1.1.0rc0/src/dataeval/_ontology.py +558 -0
  9. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_version.py +2 -2
  10. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/_balance.py +25 -22
  11. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/_diversity.py +8 -6
  12. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/_parity.py +2 -4
  13. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/config.py +35 -2
  14. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/__init__.py +15 -0
  15. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_ber.py +10 -2
  16. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_bin.py +66 -6
  17. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_cache.py +5 -8
  18. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_clusterer.py +1 -1
  19. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_completeness.py +76 -25
  20. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_compute_stats.py +8 -7
  21. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_coverage.py +31 -13
  22. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_diversity.py +5 -5
  23. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_mst.py +7 -2
  24. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_feature_distance.py +6 -6
  25. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_hash.py +11 -10
  26. dataeval-1.1.0rc0/src/dataeval/core/_label_alignment.py +271 -0
  27. dataeval-1.1.0rc0/src/dataeval/core/_label_coverage.py +246 -0
  28. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_label_parity.py +8 -6
  29. dataeval-1.1.0rc0/src/dataeval/core/_label_reconciliation.py +147 -0
  30. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_metadata_insights.py +24 -20
  31. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_mst.py +16 -16
  32. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_mutual_info.py +32 -32
  33. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_nullmodel.py +315 -5
  34. dataeval-1.1.0rc0/src/dataeval/core/_ontology_validation.py +196 -0
  35. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_parity.py +4 -4
  36. dataeval-1.1.0rc0/src/dataeval/core/_track_stats.py +501 -0
  37. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_uap.py +2 -2
  38. dataeval-1.1.0rc0/src/dataeval/data/__init__.py +37 -0
  39. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_classbalance.py +5 -3
  40. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_classfilter.py +8 -33
  41. dataeval-1.1.0rc0/src/dataeval/data/_conform.py +138 -0
  42. dataeval-1.1.0rc0/src/dataeval/data/_crops.py +371 -0
  43. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_indices.py +1 -1
  44. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_limit.py +1 -1
  45. dataeval-1.1.0rc0/src/dataeval/data/_merge.py +87 -0
  46. dataeval-1.1.0rc0/src/dataeval/data/_relabel.py +193 -0
  47. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_reverse.py +1 -1
  48. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_select.py +49 -10
  49. {dataeval-1.0.5/src/dataeval/selection → dataeval-1.1.0rc0/src/dataeval/data}/_shuffle.py +2 -2
  50. dataeval-1.0.5/src/dataeval/utils/data.py → dataeval-1.1.0rc0/src/dataeval/data/_split.py +13 -78
  51. dataeval-1.1.0rc0/src/dataeval/data/_tracks.py +83 -0
  52. dataeval-1.1.0rc0/src/dataeval/data/_unzip.py +84 -0
  53. dataeval-1.1.0rc0/src/dataeval/exceptions.py +77 -0
  54. dataeval-1.1.0rc0/src/dataeval/extractors/__init__.py +31 -0
  55. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_bovw.py +7 -7
  56. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_flatten.py +2 -2
  57. dataeval-1.1.0rc0/src/dataeval/extractors/_geometry.py +53 -0
  58. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_onnx.py +45 -16
  59. dataeval-1.1.0rc0/src/dataeval/extractors/_resize.py +22 -0
  60. dataeval-1.1.0rc0/src/dataeval/extractors/_scores.py +41 -0
  61. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/extractors/_torch.py +63 -15
  62. dataeval-1.1.0rc0/src/dataeval/extractors/_uncertainty.py +473 -0
  63. dataeval-1.1.0rc0/src/dataeval/models/__init__.py +20 -0
  64. dataeval-1.1.0rc0/src/dataeval/models/_backends.py +174 -0
  65. dataeval-1.1.0rc0/src/dataeval/models/_input.py +125 -0
  66. dataeval-1.1.0rc0/src/dataeval/models/_metadata.py +140 -0
  67. dataeval-1.1.0rc0/src/dataeval/models/_predictors.py +455 -0
  68. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/_sufficiency.py +15 -14
  69. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/protocols.py +257 -137
  70. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/_duplicates.py +24 -17
  71. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/_outliers.py +8 -8
  72. dataeval-1.1.0rc0/src/dataeval/scope/__init__.py +14 -0
  73. dataeval-1.1.0rc0/src/dataeval/scope/_coverage.py +407 -0
  74. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/scope/_prioritize.py +12 -9
  75. dataeval-1.1.0rc0/src/dataeval/scope/_representation.py +301 -0
  76. dataeval-1.1.0rc0/src/dataeval/selection/__init__.py +26 -0
  77. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/__init__.py +2 -0
  78. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_base.py +9 -5
  79. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_chunk.py +1 -1
  80. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_domain_classifier.py +2 -2
  81. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_kneighbors.py +1 -1
  82. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_mmd.py +6 -2
  83. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_reconstruction.py +2 -2
  84. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/_univariate.py +15 -5
  85. dataeval-1.1.0rc0/src/dataeval/shift/_drift/_wasserstein.py +423 -0
  86. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_domain_classifier.py +25 -9
  87. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_reconstruction.py +3 -3
  88. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/types.py +387 -4
  89. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/__init__.py +1 -2
  90. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/_internal.py +92 -2
  91. dataeval-1.1.0rc0/src/dataeval/utils/_validate.py +258 -0
  92. dataeval-1.1.0rc0/src/dataeval/utils/data.py +34 -0
  93. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/preprocessing.py +111 -21
  94. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/training.py +101 -16
  95. dataeval-1.0.5/src/dataeval/exceptions.py +0 -41
  96. dataeval-1.0.5/src/dataeval/extractors/__init__.py +0 -15
  97. dataeval-1.0.5/src/dataeval/extractors/_uncertainty.py +0 -245
  98. dataeval-1.0.5/src/dataeval/scope/__init__.py +0 -10
  99. dataeval-1.0.5/src/dataeval/selection/__init__.py +0 -20
  100. {dataeval-1.0.5 → dataeval-1.1.0rc0}/.gitignore +0 -0
  101. {dataeval-1.0.5 → dataeval-1.1.0rc0}/LICENSE +0 -0
  102. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_helpers.py +0 -0
  103. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_log.py +0 -0
  104. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/_warm_cache.py +0 -0
  105. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/bias/__init__.py +0 -0
  106. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/__init__.py +0 -0
  107. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_base.py +0 -0
  108. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  109. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_hashstats.py +0 -0
  110. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
  111. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_register.py +0 -0
  112. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_registry.py +0 -0
  113. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  114. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_compute_ratios.py +0 -0
  115. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_divergence.py +0 -0
  116. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
  117. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  118. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_label_errors.py +0 -0
  119. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_label_stats.py +0 -0
  120. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/core/_rank.py +0 -0
  121. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/flags.py +5 -5
  122. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/__init__.py +0 -0
  123. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/_aggregator.py +8 -8
  124. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/_output.py +0 -0
  125. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/performance/schedules.py +0 -0
  126. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/py.typed +0 -0
  127. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/__init__.py +0 -0
  128. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/quality/_shared.py +0 -0
  129. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_drift/__init__.py +0 -0
  130. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/__init__.py +0 -0
  131. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_base.py +0 -0
  132. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  133. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  134. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/__init__.py +0 -0
  135. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
  136. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
  137. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/shift/update_strategies.py +0 -0
  138. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/losses.py +0 -0
  139. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/models.py +0 -0
  140. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/onnx.py +0 -0
  141. {dataeval-1.0.5 → dataeval-1.1.0rc0}/src/dataeval/utils/thresholds.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.5
3
+ Version: 1.1.0rc0
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.14
21
21
  Classifier: Topic :: Scientific/Engineering
22
22
  Requires-Python: >=3.10
23
23
  Requires-Dist: lightgbm>=4
24
+ Requires-Dist: maite>=0.9.4
24
25
  Requires-Dist: numba>=0.61.0
25
26
  Requires-Dist: numpy>=1.24.2
26
27
  Requires-Dist: polars>=1.0.0
@@ -37,23 +38,27 @@ Requires-Dist: torchvision>=0.17.0; extra == 'cpu'
37
38
  Provides-Extra: cu118
38
39
  Requires-Dist: torch>=2.2.0; extra == 'cu118'
39
40
  Requires-Dist: torchvision>=0.17.0; extra == 'cu118'
40
- Provides-Extra: cu124
41
- Requires-Dist: torch>=2.2.0; extra == 'cu124'
42
- Requires-Dist: torchvision>=0.17.0; extra == 'cu124'
43
41
  Provides-Extra: cu128
44
42
  Requires-Dist: torch>=2.2.0; extra == 'cu128'
45
43
  Requires-Dist: torchvision>=0.17.0; extra == 'cu128'
44
+ Provides-Extra: litert
45
+ Requires-Dist: ai-edge-litert>=2.0; (python_version <= '3.14') and extra == 'litert'
46
46
  Provides-Extra: onnx
47
- Requires-Dist: onnx; extra == 'onnx'
48
- Requires-Dist: onnxruntime>=1.14.0; extra == 'onnx'
47
+ Requires-Dist: onnx>=1.14.0; extra == 'onnx'
48
+ Requires-Dist: onnxruntime<1.24,>=1.14.0; (python_version == '3.10') and extra == 'onnx'
49
+ Requires-Dist: onnxruntime>=1.14.0; (python_version >= '3.11') and extra == 'onnx'
49
50
  Provides-Extra: onnx-gpu
50
- Requires-Dist: onnx; extra == 'onnx-gpu'
51
- Requires-Dist: onnxruntime-gpu>=1.14.0; extra == 'onnx-gpu'
51
+ Requires-Dist: onnx>=1.14.0; extra == 'onnx-gpu'
52
+ Requires-Dist: onnxruntime-gpu<1.24,>=1.14.0; (python_version == '3.10') and extra == 'onnx-gpu'
53
+ Requires-Dist: onnxruntime-gpu>=1.14.0; (python_version >= '3.11') and extra == 'onnx-gpu'
54
+ Provides-Extra: ontology
55
+ Requires-Dist: rdflib>=7.0; extra == 'ontology'
52
56
  Provides-Extra: opencv
53
57
  Requires-Dist: opencv-python-headless>=4.8.0; extra == 'opencv'
54
58
  Description-Content-Type: text/markdown
55
59
 
56
60
  <!-- markdownlint-disable MD041 -->
61
+
57
62
  ![dataeval-logo](docs/source/_static/images/DataEval_ImageText.png)
58
63
 
59
64
  <!-- :auto badges: -->
@@ -130,14 +135,28 @@ You can install DataEval directly from pypi.org using the following command.
130
135
  pip install dataeval
131
136
  ```
132
137
 
138
+ By default, PyTorch is installed from PyPI which includes CUDA support on Linux.
139
+ To install a specific PyTorch variant, use `--extra-index-url`:
140
+
141
+ ```bash
142
+ # CPU only
143
+ pip install dataeval --extra-index-url https://download.pytorch.org/whl/cpu
144
+
145
+ # CUDA 11.8
146
+ pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu118
147
+
148
+ # CUDA 12.8
149
+ pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu128
150
+ ```
151
+
133
152
  ### **Installing with conda**
134
153
 
135
154
  DataEval can be installed in a Conda/Mamba environment using the provided
136
- `environment.yaml` file. As some dependencies are installed from the `pytorch`
155
+ `environment.yml` file. As some dependencies are installed from the `pytorch`
137
156
  channel, the channel is specified in the below example.
138
157
 
139
158
  ```bash
140
- micromamba create -f environment\environment.yaml -c pytorch
159
+ micromamba create -f environment\environment.yml -c pytorch
141
160
  ```
142
161
 
143
162
  ### **Installing from GitHub**
@@ -401,7 +420,7 @@ shape: (3, 5)
401
420
 
402
421
  A result with many large groups is a signal that your dataset contains
403
422
  repeated collection events. Before training, remove all but one sample from
404
- each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.md)
423
+ each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.py)
405
424
  for a complete walkthrough, including how to choose which sample to keep.
406
425
 
407
426
  ### Where to go next
@@ -1,4 +1,5 @@
1
1
  <!-- markdownlint-disable MD041 -->
2
+
2
3
  ![dataeval-logo](docs/source/_static/images/DataEval_ImageText.png)
3
4
 
4
5
  <!-- :auto badges: -->
@@ -75,14 +76,28 @@ You can install DataEval directly from pypi.org using the following command.
75
76
  pip install dataeval
76
77
  ```
77
78
 
79
+ By default, PyTorch is installed from PyPI which includes CUDA support on Linux.
80
+ To install a specific PyTorch variant, use `--extra-index-url`:
81
+
82
+ ```bash
83
+ # CPU only
84
+ pip install dataeval --extra-index-url https://download.pytorch.org/whl/cpu
85
+
86
+ # CUDA 11.8
87
+ pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu118
88
+
89
+ # CUDA 12.8
90
+ pip install dataeval --extra-index-url https://download.pytorch.org/whl/cu128
91
+ ```
92
+
78
93
  ### **Installing with conda**
79
94
 
80
95
  DataEval can be installed in a Conda/Mamba environment using the provided
81
- `environment.yaml` file. As some dependencies are installed from the `pytorch`
96
+ `environment.yml` file. As some dependencies are installed from the `pytorch`
82
97
  channel, the channel is specified in the below example.
83
98
 
84
99
  ```bash
85
- micromamba create -f environment\environment.yaml -c pytorch
100
+ micromamba create -f environment\environment.yml -c pytorch
86
101
  ```
87
102
 
88
103
  ### **Installing from GitHub**
@@ -346,7 +361,7 @@ shape: (3, 5)
346
361
 
347
362
  A result with many large groups is a signal that your dataset contains
348
363
  repeated collection events. Before training, remove all but one sample from
349
- each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.md)
364
+ each group. See the [deduplication how-to guide](./docs/source/notebooks/h2_deduplicate.py)
350
365
  for a complete walkthrough, including how to choose which sample to keep.
351
366
 
352
367
  ### Where to go next
@@ -31,6 +31,7 @@ classifiers = [
31
31
  "Topic :: Scientific/Engineering",
32
32
  ]
33
33
  dependencies = [
34
+ "maite>=0.9.4",
34
35
  "numba>=0.61.0",
35
36
  "lightgbm>=4",
36
37
  "numpy>=1.24.2",
@@ -47,17 +48,43 @@ dependencies = [
47
48
  [project.optional-dependencies]
48
49
  cpu = ["torch>=2.2.0", "torchvision>=0.17.0"]
49
50
  cu118 = ["torch>=2.2.0", "torchvision>=0.17.0"]
50
- cu124 = ["torch>=2.2.0", "torchvision>=0.17.0"]
51
51
  cu128 = ["torch>=2.2.0", "torchvision>=0.17.0"]
52
+ litert = ["ai-edge-litert>=2.0; python_version <= '3.14'"]
52
53
  opencv = ["opencv-python-headless>=4.8.0"]
53
- onnx = ["onnx", "onnxruntime>=1.14.0"]
54
- onnx-gpu = ["onnx", "onnxruntime-gpu>=1.14.0"]
54
+ onnx = [
55
+ "onnx>=1.14.0",
56
+ "onnxruntime>=1.14.0,<1.24; python_version == '3.10'",
57
+ "onnxruntime>=1.14.0; python_version >= '3.11'",
58
+ ]
59
+ onnx-gpu = [
60
+ "onnx>=1.14.0",
61
+ "onnxruntime-gpu>=1.14.0,<1.24; python_version == '3.10'",
62
+ "onnxruntime-gpu>=1.14.0; python_version >= '3.11'",
63
+ ]
64
+ ontology = ["rdflib>=7.0"]
55
65
 
56
66
  [project.urls]
57
67
  Homepage = "https://dataeval.ai/"
58
68
  Repository = "https://github.com/aria-ml/dataeval/"
59
69
  Documentation = "https://dataeval.readthedocs.io/"
60
70
 
71
+ # MAITE interoperability entry-points.
72
+ [project.entry-points."maite.tasks"]
73
+ dataeval_balance = "dataeval.bias:Balance"
74
+ dataeval_diversity = "dataeval.bias:Diversity"
75
+ dataeval_parity = "dataeval.bias:Parity"
76
+ dataeval_outliers = "dataeval.quality:Outliers"
77
+ dataeval_duplicates = "dataeval.quality:Duplicates"
78
+ dataeval_sufficiency = "dataeval.performance:Sufficiency"
79
+
80
+ [project.entry-points."maite.protocols.image_classification.Model"]
81
+ dataeval_OnnxImageClassifier = "dataeval.models:OnnxImageClassifier"
82
+ dataeval_LiteRtImageClassifier = "dataeval.models:LiteRtImageClassifier"
83
+
84
+ [project.entry-points."maite.protocols.object_detection.Model"]
85
+ dataeval_OnnxObjectDetector = "dataeval.models:OnnxObjectDetector"
86
+ dataeval_LiteRtObjectDetector = "dataeval.models:LiteRtObjectDetector"
87
+
61
88
  [dependency-groups]
62
89
  base = [
63
90
  "uv>=0.8.0",
@@ -65,7 +92,7 @@ base = [
65
92
  lock = [
66
93
  { include-group = "base" },
67
94
  "pyproject2conda>=0.22",
68
- "poetry>=2; python_version<'3.14'",
95
+ "poetry==2.2.0; python_version<'3.14'",
69
96
  ]
70
97
  lint = [
71
98
  "ruff>=0.11",
@@ -73,19 +100,15 @@ lint = [
73
100
  ]
74
101
  docsync = [
75
102
  "jupytext>=1.19.1",
76
- "mdformat-myst",
77
- ]
78
- doclint = [
79
- { include-group = "docs"},
80
- "ruff>=0.11",
81
- "pyright[nodejs]>=1.1.400",
82
103
  ]
83
104
  test = [
84
105
  "coverage[toml]>=7.6",
85
106
  "onnx>=1.14.0",
107
+ "onnxscript>=0.6.0",
86
108
  "pytest>=8.3",
87
109
  "pytest-cov>=6.1",
88
110
  "pytest-xdist>=3.6.1",
111
+ "rdflib>=7.0",
89
112
  ]
90
113
  verify = [
91
114
  "pytest>=8.3",
@@ -103,10 +126,13 @@ docs = [
103
126
  "jinja2>=3.1.6",
104
127
  "jupyter-client>=8.6.0",
105
128
  "jupyter-cache>=1.0",
106
- "maite-datasets>=0.0.12",
129
+ "maite-datasets>=0.0.15",
107
130
  "myst-nb>=1.0",
108
131
  "opencv-python-headless>=4.8.0",
132
+ "pandas>=2.0.0",
109
133
  "plotly>=6.2.0",
134
+ "rapidfuzz>=3.0",
135
+ "rdflib>=7.0",
110
136
  "sphinx-autoapi>=3.6.0",
111
137
  "sphinx-design>=0.6.1",
112
138
  "sphinx-immaterial>=0.12.5",
@@ -114,9 +140,9 @@ docs = [
114
140
  "sphinx-tabs>=3.4.7",
115
141
  "Sphinx>=7.2.6,<9.0.0", # sphinx-immaterial <= 0.13.9 is not compatible with sphinx >=9.0
116
142
  "torchmetrics>=1.0.0",
117
- "torchvision>=0.17.0",
118
143
  "markupsafe>=3,<3.0.2",
119
144
  "jupytext>=1.19.1",
145
+ "ultralytics>=8.0.0",
120
146
  ]
121
147
  security = [ # keep in sync with [tool.uv.constraint-dependencies]
122
148
  "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
@@ -143,7 +169,6 @@ conflicts = [
143
169
  [
144
170
  { extra = "cpu" },
145
171
  { extra = "cu118" },
146
- { extra = "cu124" },
147
172
  { extra = "cu128" },
148
173
  ],
149
174
  ]
@@ -166,11 +191,6 @@ name = "pytorch-cu118"
166
191
  url = "https://download.pytorch.org/whl/cu118"
167
192
  explicit = true
168
193
 
169
- [[tool.uv.index]]
170
- name = "pytorch-cu124"
171
- url = "https://download.pytorch.org/whl/cu124"
172
- explicit = true
173
-
174
194
  [[tool.uv.index]]
175
195
  name = "pytorch-cu128"
176
196
  url = "https://download.pytorch.org/whl/cu128"
@@ -180,19 +200,28 @@ explicit = true
180
200
  torch = [
181
201
  { index = "pytorch-cpu", extra = "cpu" },
182
202
  { index = "pytorch-cu118", extra = "cu118" },
183
- { index = "pytorch-cu124", extra = "cu124" },
184
203
  { index = "pytorch-cu128", extra = "cu128" },
185
204
  ]
186
205
  torchvision = [
187
206
  { index = "pytorch-cpu", extra = "cpu" },
188
207
  { index = "pytorch-cu118", extra = "cu118" },
189
- { index = "pytorch-cu124", extra = "cu124" },
190
208
  { index = "pytorch-cu128", extra = "cu128" },
191
209
  ]
192
210
 
193
211
  [tool.uv.extra-build-dependencies]
194
212
  numba = ["tbb>=2021.6"]
195
213
 
214
+ [tool.poetry]
215
+ version = "0.0.0" # overwritten by poetry-dynamic-versioning
216
+
217
+ [[tool.poetry.source]]
218
+ name = "pytorch-cpu"
219
+ url = "https://download.pytorch.org/whl/cpu"
220
+ priority = "supplemental"
221
+
222
+ [tool.poetry.dependencies]
223
+ torch = { version = ">=2.2.0", source = "pytorch-cpu" }
224
+
196
225
  [tool.hatch.build.targets.sdist]
197
226
  include = ["src/dataeval"]
198
227
 
@@ -208,8 +237,11 @@ source = "vcs"
208
237
  [tool.hatch.build.hooks.vcs]
209
238
  version-file = "src/dataeval/_version.py"
210
239
 
211
- [tool.poetry]
212
- version = "0.0.0" # unused
240
+ [tool.poetry-dynamic-versioning]
241
+ enable = true
242
+ vcs = "git"
243
+ style = "pep440"
244
+ pattern = "^v?(?P<base>\\d+\\.\\d+\\.\\d+)"
213
245
 
214
246
  [tool.pyproject2conda.dependencies]
215
247
  numpy = { skip = true, packages = "numpy>=1.24.2" }
@@ -219,7 +251,7 @@ torch = { pip = true } # PyTorch is no longer maintained on conda-forge
219
251
  xxhash = { skip = true, packages = "python-xxhash>=3.3" }
220
252
 
221
253
  [tool.pyright]
222
- include = ["src", "tests"]
254
+ include = ["src", "tests", "verification", "docs/source/notebooks"]
223
255
  exclude = [
224
256
  "**/__pycache__",
225
257
  "**/node_modules",
@@ -232,6 +264,10 @@ reportMissingImports = false
232
264
 
233
265
  [tool.pytest.ini_options]
234
266
  testpaths = ["tests"]
267
+ filterwarnings = [
268
+ "ignore:The default value of normalize_pixel_values changed:FutureWarning",
269
+ "ignore:Clustering metrics expect discrete values but received continuous values:UserWarning",
270
+ ]
235
271
  addopts = [
236
272
  "--pythonwarnings=ignore::DeprecationWarning",
237
273
  "--verbose",
@@ -278,6 +314,7 @@ exclude = [
278
314
  ".tox",
279
315
  "prototype",
280
316
  "src/dataeval/_version.py",
317
+ "*.ipynb",
281
318
  ]
282
319
  line-length = 120
283
320
  indent-width = 4
@@ -292,7 +329,7 @@ ignore = ["ANN101", "ANN102", "ANN401", "C408", "C416", "COM812", "NPY002", "SLF
292
329
  fixable = ["ALL"]
293
330
  unfixable = []
294
331
  dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
295
- per-file-ignores = { "!src/*" = ["ANN", "ARG", "BLE", "S", "SLF", "RET", "C90", "D", "FIX", "N", "PERF"], "docs/*" = ["B904"] }
332
+ per-file-ignores = { "!src/*" = ["ANN", "ARG", "BLE", "S", "SLF", "RET", "C90", "D", "FIX", "N", "PERF"], "docs/*" = ["B904"], "docs/source/notebooks/*" = ["E402", "E501", "E703", "RUF100", "SIM105", "UP009"] }
296
333
 
297
334
  [tool.ruff.lint.flake8-builtins]
298
335
  builtins-strict-checking = false
@@ -307,6 +344,7 @@ max-complexity = 5
307
344
  convention = "numpy"
308
345
 
309
346
  [tool.ruff.format]
347
+ preview = true
310
348
  quote-style = "double"
311
349
  indent-style = "space"
312
350
  skip-magic-trailing-comma = false
@@ -315,8 +353,8 @@ docstring-code-format = true
315
353
  docstring-code-line-length = "dynamic"
316
354
 
317
355
  [tool.codespell]
318
- skip = './*env*,./output,./docs/build,./docs/source/.jupyter_cache,./docs/source/*/data,CHANGELOG.md,uv.lock,requirements.txt,*.html,*.lock,*.ipynb'
319
- ignore-words-list = ["Hart","FPR"]
356
+ skip = './*env*,./output,./docs/build,./docs/source/.jupyter_cache,./docs/source/*/data,CHANGELOG.md,uv.lock,requirements.*.txt,*.html,*.lock,*.ipynb'
357
+ ignore-words-list = ["Hart","FPR", "MOT", "mot"]
320
358
 
321
359
  [build-system]
322
360
  requires = ["hatchling", "hatch-vcs"]
@@ -20,24 +20,30 @@ __all__ = [
20
20
  "exceptions",
21
21
  "flags",
22
22
  "log",
23
+ "models",
23
24
  "protocols",
24
25
  "types",
25
26
  "Embeddings",
26
27
  "Metadata",
28
+ "Ontology",
27
29
  ]
28
30
 
29
31
  import logging
30
32
 
31
- from . import config, exceptions, flags, protocols, types
33
+ from . import config, exceptions, flags, models, protocols, types
32
34
  from ._embeddings import Embeddings
33
35
  from ._metadata import Metadata
36
+ from ._ontology import Ontology
34
37
 
35
38
  logging.getLogger(__name__).addHandler(logging.NullHandler())
36
39
 
37
40
 
38
41
  def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> None:
39
42
  """
40
- Add a StreamHandler to the logger quickly for debugging.
43
+ Add a handler to the logger quickly for debugging.
44
+
45
+ Calling this more than once is idempotent: a handler equal to one already
46
+ attached to the logger is not added again, so log lines are not duplicated.
41
47
 
42
48
  Parameters
43
49
  ----------
@@ -45,18 +51,21 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
45
51
  Set the logging level for the logger.
46
52
  handler : logging.Handler, optional
47
53
  Sets the logging handler for the logger if provided, otherwise logger will be
48
- provided with a StreamHandler.
54
+ provided with a StreamHandler. When a custom handler is supplied its formatter
55
+ is left untouched; the default StreamHandler is given a verbose debugging
56
+ formatter.
49
57
  """
50
58
  import logging
51
59
 
52
- logger = logging.getLogger(__name__)
60
+ _logger = logging.getLogger(__name__)
53
61
  if handler is None:
54
- handler = logging.StreamHandler() if handler is None else handler
62
+ handler = logging.StreamHandler()
55
63
  handler.setFormatter(
56
64
  logging.Formatter(
57
65
  "%(asctime)s %(levelname)-8s %(name)s.%(filename)s:%(lineno)s - %(funcName)10s() | %(message)s",
58
66
  ),
59
67
  )
60
- logger.addHandler(handler)
61
- logger.setLevel(level)
62
- logger.debug(f"Added logging handler {handler} to logger: {__name__}")
68
+ if handler not in _logger.handlers:
69
+ _logger.addHandler(handler)
70
+ _logger.setLevel(level)
71
+ _logger.debug("Added logging handler %s to logger: %s", handler, __name__)
@@ -1,6 +1,6 @@
1
1
  """Embeddings class for extracting and managing image embeddings."""
2
2
 
3
- __all__ = []
3
+ __all__ = ["Embeddings"]
4
4
 
5
5
  import logging
6
6
  import os
@@ -14,7 +14,7 @@ import xxhash as xxh
14
14
  from numpy.typing import NDArray
15
15
  from typing_extensions import Self
16
16
 
17
- from dataeval.config import get_batch_size
17
+ from dataeval.config import resolve_batch_size
18
18
  from dataeval.exceptions import NotFittedError
19
19
  from dataeval.extractors import FlattenExtractor
20
20
  from dataeval.protocols import (
@@ -25,6 +25,8 @@ from dataeval.protocols import (
25
25
  FeatureExtractor,
26
26
  ProgressCallback,
27
27
  )
28
+ from dataeval.utils._internal import unwrap_image
29
+ from dataeval.utils._validate import requires_maite_dataset
28
30
 
29
31
  _logger = logging.getLogger(__name__)
30
32
 
@@ -53,8 +55,14 @@ class Embeddings(Array, FeatureExtractor):
53
55
  :class:`~dataeval.extractors.FlattenExtractor` for simple baseline
54
56
  compatibility with all DataEval tools.
55
57
  batch_size : int or None, default None
56
- Number of samples to process per batch. When None, uses DataEval's
57
- configured batch size via :func:`~dataeval.config.get_batch_size`.
58
+ I/O chunk size: how many images are loaded from the dataset, encoded, and
59
+ written to storage per step. Resolved via
60
+ :func:`~dataeval.config.resolve_batch_size` as the first set of
61
+ ``batch_size`` (this argument), the extractor's own ``batch_size``, then
62
+ the global default. This is distinct from an extractor's own forward-pass
63
+ (compute) batch size: an extractor with its own ``batch_size`` sub-batches
64
+ each chunk for the model, so the smaller of the two bounds the forward
65
+ pass. Batching never changes the resulting embeddings.
58
66
  path : Path, str, or None, default None
59
67
  File path for memory-mapped storage. When None, caches embeddings in memory only.
60
68
  When Path or string is provided, uses memory-mapped storage for large embeddings
@@ -93,6 +101,7 @@ class Embeddings(Array, FeatureExtractor):
93
101
 
94
102
  memory_threshold: float
95
103
 
104
+ @requires_maite_dataset("dataset", expected="image_only")
96
105
  def __init__(
97
106
  self,
98
107
  # Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
@@ -104,7 +113,7 @@ class Embeddings(Array, FeatureExtractor):
104
113
  progress_callback: ProgressCallback | None = None,
105
114
  ) -> None:
106
115
  self._extractor = extractor if extractor is not None else FlattenExtractor()
107
- self._batch_size = get_batch_size(batch_size)
116
+ self._batch_size = resolve_batch_size(batch_size, getattr(self._extractor, "batch_size", None))
108
117
  self.memory_threshold = max(0.0, min(1.0, memory_threshold))
109
118
  self._progress_callback = progress_callback
110
119
 
@@ -159,6 +168,7 @@ class Embeddings(Array, FeatureExtractor):
159
168
  """
160
169
  return self._dataset is not None
161
170
 
171
+ @requires_maite_dataset("dataset", expected="image_only")
162
172
  def bind(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Self:
163
173
  """Bind this instance to a dataset.
164
174
 
@@ -502,12 +512,7 @@ class Embeddings(Array, FeatureExtractor):
502
512
  if self._dataset is None:
503
513
  raise NotFittedError("No dataset bound. Call bind() first.")
504
514
 
505
- images: list[Any] = []
506
- for idx in indices:
507
- item = self._dataset[idx]
508
- image = item[0] if isinstance(item, tuple) else item
509
- images.append(image)
510
- return images
515
+ return [unwrap_image(self._dataset[idx]) for idx in indices]
511
516
 
512
517
  def _batch(self, indices: Sequence[int]) -> Iterator[NDArray[Any]]: # noqa: C901
513
518
  """Process indices in batches using the extractor."""
@@ -39,14 +39,6 @@ def _make_warning_message( # noqa: C901
39
39
  return msg
40
40
 
41
41
 
42
- def _prepend_doc_note(doc: str | None, note: str) -> str:
43
- """Prepend a status note to a docstring."""
44
- header = f".. warning::\n {note}"
45
- if doc:
46
- return f"{header}\n\n{doc}"
47
- return header
48
-
49
-
50
42
  @overload
51
43
  def experimental(_target: F) -> F: ...
52
44
  @overload
@@ -89,7 +81,7 @@ def experimental( # noqa: C901
89
81
  original_init(self, *args, **kwargs)
90
82
 
91
83
  target.__init__ = new_init # type: ignore[attr-defined]
92
- target.__doc__ = _prepend_doc_note(target.__doc__, msg)
84
+ target.__experimental__ = msg # type: ignore[attr-defined]
93
85
  return target # type: ignore[return-value]
94
86
 
95
87
  @functools.wraps(target)
@@ -100,7 +92,7 @@ def experimental( # noqa: C901
100
92
  warned = True
101
93
  return target(*args, **kwargs)
102
94
 
103
- wrapper.__doc__ = _prepend_doc_note(target.__doc__, msg)
95
+ wrapper.__experimental__ = msg # type: ignore[attr-defined]
104
96
  return wrapper # type: ignore[return-value]
105
97
 
106
98
  if _target is not None:
@@ -165,7 +157,7 @@ def deprecated( # noqa: C901
165
157
  original_init(self, *args, **kwargs)
166
158
 
167
159
  target.__init__ = new_init # type: ignore[attr-defined]
168
- target.__doc__ = _prepend_doc_note(target.__doc__, msg)
160
+ target.__deprecated__ = msg # type: ignore[attr-defined]
169
161
  return target # type: ignore[return-value]
170
162
 
171
163
  @functools.wraps(target)
@@ -176,7 +168,7 @@ def deprecated( # noqa: C901
176
168
  warned = True
177
169
  return target(*args, **kwargs)
178
170
 
179
- wrapper.__doc__ = _prepend_doc_note(target.__doc__, msg)
171
+ wrapper.__deprecated__ = msg # type: ignore[attr-defined]
180
172
  return wrapper # type: ignore[return-value]
181
173
 
182
174
  if _target is not None:
@@ -1,4 +1,4 @@
1
- __all__ = []
1
+ __all__ = ["Metadata"]
2
2
 
3
3
  import logging
4
4
  from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Sized
@@ -22,6 +22,7 @@ from dataeval.protocols import (
22
22
  )
23
23
  from dataeval.types import Array1D
24
24
  from dataeval.utils._internal import as_numpy, merge_metadata
25
+ from dataeval.utils._validate import requires_maite_dataset
25
26
 
26
27
  _logger = logging.getLogger(__name__)
27
28
 
@@ -105,6 +106,7 @@ class Metadata(Array, FeatureExtractor):
105
106
  >>> test_factors = metadata(test_dataset) # Extract from new dataset
106
107
  """
107
108
 
109
+ @requires_maite_dataset("dataset", expected="any_target")
108
110
  def __init__(
109
111
  self,
110
112
  dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]] | None = None,
@@ -168,6 +170,7 @@ class Metadata(Array, FeatureExtractor):
168
170
  """
169
171
  return self._dataset is not None
170
172
 
173
+ @requires_maite_dataset("dataset", expected="any_target")
171
174
  def bind(self, dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]]) -> Self:
172
175
  """Bind this instance to a dataset.
173
176
 
@@ -573,6 +576,11 @@ class Metadata(Array, FeatureExtractor):
573
576
  Rows where target_index is None contain datum-level data.
574
577
  Rows where target_index is an integer contain target/detection-level data.
575
578
 
579
+ See Also
580
+ --------
581
+ :attr:`~dataeval.Metadata.image_data` : Filter to image-level rows only
582
+ :attr:`~dataeval.Metadata.target_data` : Filter to target-level rows only
583
+
576
584
  Notes
577
585
  -----
578
586
  This property triggers dataset structure analysis on first access.
@@ -581,11 +589,6 @@ class Metadata(Array, FeatureExtractor):
581
589
  For Object Detection datasets, the dataframe now contains:
582
590
  - Image-level rows (target_index=None): One per image with image-level factors
583
591
  - Target-level rows (target_index=0,1,2...): One per detection with detection data
584
-
585
- See Also
586
- --------
587
- image_data : Filter to image-level rows only
588
- target_data : Filter to target-level rows only
589
592
  """
590
593
  self._structure()
591
594
  return self._dataframe
@@ -650,7 +653,7 @@ class Metadata(Array, FeatureExtractor):
650
653
  -------
651
654
  Sequence[str]
652
655
  List of factor names that passed filtering and preprocessing steps.
653
- Order matches columns in factor_data and binned_data.
656
+ Order matches columns in factor_data.
654
657
 
655
658
  Notes
656
659
  -----