deriva-ml 1.17.10__tar.gz → 1.17.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. deriva_ml-1.17.12/.DS_Store +0 -0
  2. deriva_ml-1.17.12/.cursor.config +3 -0
  3. deriva_ml-1.17.12/.vscode/settings.json +12 -0
  4. deriva_ml-1.17.12/CLAUDE.md +259 -0
  5. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/PKG-INFO +4 -4
  6. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/Notebooks/DerivaML Vocabulary.ipynb +20 -95
  7. deriva_ml-1.17.12/docs/architecture.md +480 -0
  8. deriva_ml-1.17.12/docs/code-docs/dataset.md +8 -0
  9. deriva_ml-1.17.12/docs/code-docs/dataset_aux_classes.md +7 -0
  10. deriva_ml-1.17.12/docs/code-docs/dataset_bag.md +8 -0
  11. deriva_ml-1.17.12/docs/code-docs/deriva_definitions.md +8 -0
  12. deriva_ml-1.17.12/docs/code-docs/deriva_model.md +8 -0
  13. deriva_ml-1.17.12/docs/code-docs/exceptions.md +7 -0
  14. deriva_ml-1.17.12/docs/code-docs/feature.md +8 -0
  15. deriva_ml-1.17.12/docs/code-docs/upload.md +6 -0
  16. deriva_ml-1.17.12/docs/code-docs/workflow.md +8 -0
  17. deriva_ml-1.17.12/docs/user-guide/annotations.md +634 -0
  18. deriva_ml-1.17.12/docs/user-guide/datasets.md +514 -0
  19. deriva_ml-1.17.12/docs/user-guide/execution-configuration.md +296 -0
  20. deriva_ml-1.17.12/docs/user-guide/features.md +266 -0
  21. deriva_ml-1.17.12/docs/user-guide/file-assets.md +251 -0
  22. deriva_ml-1.17.12/docs/user-guide/hydra-zen-configuration.md +718 -0
  23. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/user-guide/overview.md +19 -17
  24. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/mkdocs.yml +21 -10
  25. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/pyproject.toml +9 -8
  26. deriva_ml-1.17.12/src/deriva_ml/__init__.py +147 -0
  27. deriva_ml-1.17.12/src/deriva_ml/asset/__init__.py +17 -0
  28. deriva_ml-1.17.12/src/deriva_ml/asset/asset.py +357 -0
  29. deriva_ml-1.17.12/src/deriva_ml/asset/aux_classes.py +100 -0
  30. deriva_ml-1.17.12/src/deriva_ml/bump_version.py +385 -0
  31. deriva_ml-1.17.12/src/deriva_ml/catalog/__init__.py +31 -0
  32. deriva_ml-1.17.12/src/deriva_ml/catalog/clone.py +1939 -0
  33. deriva_ml-1.17.12/src/deriva_ml/catalog/localize.py +426 -0
  34. deriva_ml-1.17.12/src/deriva_ml/core/__init__.py +68 -0
  35. deriva_ml-1.17.12/src/deriva_ml/core/base.py +1305 -0
  36. deriva_ml-1.17.12/src/deriva_ml/core/config.py +217 -0
  37. deriva_ml-1.17.12/src/deriva_ml/core/constants.py +137 -0
  38. deriva_ml-1.17.12/src/deriva_ml/core/definitions.py +184 -0
  39. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/core/enums.py +47 -73
  40. deriva_ml-1.17.12/src/deriva_ml/core/ermrest.py +321 -0
  41. deriva_ml-1.17.12/src/deriva_ml/core/exceptions.py +311 -0
  42. deriva_ml-1.17.12/src/deriva_ml/core/filespec.py +187 -0
  43. deriva_ml-1.17.12/src/deriva_ml/core/logging_config.py +225 -0
  44. deriva_ml-1.17.12/src/deriva_ml/core/mixins/__init__.py +42 -0
  45. deriva_ml-1.17.12/src/deriva_ml/core/mixins/annotation.py +915 -0
  46. deriva_ml-1.17.12/src/deriva_ml/core/mixins/asset.py +384 -0
  47. deriva_ml-1.17.12/src/deriva_ml/core/mixins/dataset.py +237 -0
  48. deriva_ml-1.17.12/src/deriva_ml/core/mixins/execution.py +408 -0
  49. deriva_ml-1.17.12/src/deriva_ml/core/mixins/feature.py +365 -0
  50. deriva_ml-1.17.12/src/deriva_ml/core/mixins/file.py +263 -0
  51. deriva_ml-1.17.12/src/deriva_ml/core/mixins/path_builder.py +145 -0
  52. deriva_ml-1.17.12/src/deriva_ml/core/mixins/rid_resolution.py +204 -0
  53. deriva_ml-1.17.12/src/deriva_ml/core/mixins/vocabulary.py +400 -0
  54. deriva_ml-1.17.12/src/deriva_ml/core/mixins/workflow.py +322 -0
  55. deriva_ml-1.17.12/src/deriva_ml/core/validation.py +389 -0
  56. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/dataset/__init__.py +2 -1
  57. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/dataset/aux_classes.py +20 -4
  58. deriva_ml-1.17.12/src/deriva_ml/dataset/catalog_graph.py +575 -0
  59. deriva_ml-1.17.12/src/deriva_ml/dataset/dataset.py +1753 -0
  60. deriva_ml-1.17.12/src/deriva_ml/dataset/dataset_bag.py +1579 -0
  61. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/dataset/history.py +27 -14
  62. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/dataset/upload.py +225 -38
  63. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/demo_catalog.py +126 -110
  64. deriva_ml-1.17.12/src/deriva_ml/execution/__init__.py +70 -0
  65. deriva_ml-1.17.12/src/deriva_ml/execution/base_config.py +639 -0
  66. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/execution/execution.py +543 -242
  67. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/execution/execution_configuration.py +26 -11
  68. deriva_ml-1.17.12/src/deriva_ml/execution/execution_record.py +592 -0
  69. deriva_ml-1.17.12/src/deriva_ml/execution/find_caller.py +298 -0
  70. deriva_ml-1.17.12/src/deriva_ml/execution/model_protocol.py +175 -0
  71. deriva_ml-1.17.12/src/deriva_ml/execution/multirun_config.py +153 -0
  72. deriva_ml-1.17.12/src/deriva_ml/execution/runner.py +595 -0
  73. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/execution/workflow.py +223 -34
  74. deriva_ml-1.17.12/src/deriva_ml/experiment/__init__.py +8 -0
  75. deriva_ml-1.17.12/src/deriva_ml/experiment/experiment.py +411 -0
  76. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/feature.py +6 -1
  77. deriva_ml-1.17.12/src/deriva_ml/install_kernel.py +208 -0
  78. deriva_ml-1.17.12/src/deriva_ml/interfaces.py +862 -0
  79. deriva_ml-1.17.12/src/deriva_ml/model/__init__.py +99 -0
  80. deriva_ml-1.17.12/src/deriva_ml/model/annotations.py +1278 -0
  81. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/model/catalog.py +286 -60
  82. deriva_ml-1.17.12/src/deriva_ml/model/database.py +214 -0
  83. deriva_ml-1.17.12/src/deriva_ml/model/deriva_ml_database.py +308 -0
  84. deriva_ml-1.17.12/src/deriva_ml/model/handles.py +14 -0
  85. deriva_ml-1.17.12/src/deriva_ml/run_model.py +319 -0
  86. deriva_ml-1.17.12/src/deriva_ml/run_notebook.py +697 -0
  87. deriva_ml-1.17.12/src/deriva_ml/schema/__init__.py +19 -0
  88. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/schema/annotations.py +62 -33
  89. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/schema/create_schema.py +169 -69
  90. deriva_ml-1.17.12/src/deriva_ml/schema/validation.py +601 -0
  91. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml.egg-info/PKG-INFO +4 -4
  92. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml.egg-info/SOURCES.txt +67 -2
  93. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml.egg-info/entry_points.txt +1 -0
  94. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml.egg-info/requires.txt +2 -2
  95. deriva_ml-1.17.12/test_output.txt +9494 -0
  96. deriva_ml-1.17.12/tests/asset/test_asset.py +360 -0
  97. deriva_ml-1.17.12/tests/catalog/__init__.py +1 -0
  98. deriva_ml-1.17.12/tests/catalog/test_clone_catalog.py +1682 -0
  99. deriva_ml-1.17.12/tests/catalog_manager.py +445 -0
  100. deriva_ml-1.17.12/tests/conftest.py +315 -0
  101. deriva_ml-1.17.12/tests/core/test_catalog_annotations.py +190 -0
  102. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/core/test_file.py +16 -9
  103. deriva_ml-1.17.12/tests/core/test_hydra_zen_config.py +567 -0
  104. deriva_ml-1.17.12/tests/core/test_rid_resolution.py +107 -0
  105. deriva_ml-1.17.12/tests/core/test_vocabulary.py +289 -0
  106. deriva_ml-1.17.12/tests/dataset/test_catalog_dataset_functions.py +203 -0
  107. deriva_ml-1.17.12/tests/dataset/test_dataset_version.py +87 -0
  108. deriva_ml-1.17.12/tests/dataset/test_datasets.py +415 -0
  109. deriva_ml-1.17.12/tests/dataset/test_denormalize.py +776 -0
  110. deriva_ml-1.17.12/tests/dataset/test_download.py +417 -0
  111. deriva_ml-1.17.12/tests/dataset/test_restructure.py +1024 -0
  112. deriva_ml-1.17.12/tests/execution/test_execution.py +1350 -0
  113. deriva_ml-1.17.12/tests/execution/test_find_caller.py +375 -0
  114. deriva_ml-1.17.12/tests/execution/test_runner.py +310 -0
  115. deriva_ml-1.17.12/tests/execution/test_storage.py +265 -0
  116. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/execution/workflow-test.ipynb +5 -2
  117. deriva_ml-1.17.12/tests/experiment/__init__.py +1 -0
  118. deriva_ml-1.17.12/tests/experiment/test_experiment.py +680 -0
  119. deriva_ml-1.17.12/tests/factories.py +516 -0
  120. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/feature/test_features.py +43 -12
  121. deriva_ml-1.17.12/tests/model/test_annotations.py +508 -0
  122. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/model/test_database.py +58 -36
  123. deriva_ml-1.17.12/tests/model/test_handles.py +405 -0
  124. deriva_ml-1.17.12/tests/schema/__init__.py +1 -0
  125. deriva_ml-1.17.12/tests/schema/test_validation.py +265 -0
  126. deriva_ml-1.17.12/tests/test_factories.py +290 -0
  127. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/test_utils.py +35 -12
  128. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/uv.lock +510 -1169
  129. deriva_ml-1.17.10/docs/code-docs/dataset.md +0 -2
  130. deriva_ml-1.17.10/docs/code-docs/dataset_aux_classes.md +0 -4
  131. deriva_ml-1.17.10/docs/code-docs/dataset_bag.md +0 -4
  132. deriva_ml-1.17.10/docs/code-docs/deriva_definitions.md +0 -2
  133. deriva_ml-1.17.10/docs/code-docs/deriva_model.md +0 -2
  134. deriva_ml-1.17.10/docs/code-docs/feature.md +0 -2
  135. deriva_ml-1.17.10/docs/code-docs/upload.md +0 -2
  136. deriva_ml-1.17.10/docs/user-guide/datasets.md +0 -112
  137. deriva_ml-1.17.10/docs/user-guide/execution-configuration.md +0 -26
  138. deriva_ml-1.17.10/docs/user-guide/file-assets.md +0 -3
  139. deriva_ml-1.17.10/src/deriva_ml/__init__.py +0 -79
  140. deriva_ml-1.17.10/src/deriva_ml/bump_version.py +0 -142
  141. deriva_ml-1.17.10/src/deriva_ml/core/__init__.py +0 -39
  142. deriva_ml-1.17.10/src/deriva_ml/core/base.py +0 -1527
  143. deriva_ml-1.17.10/src/deriva_ml/core/config.py +0 -69
  144. deriva_ml-1.17.10/src/deriva_ml/core/constants.py +0 -36
  145. deriva_ml-1.17.10/src/deriva_ml/core/definitions.py +0 -74
  146. deriva_ml-1.17.10/src/deriva_ml/core/ermrest.py +0 -288
  147. deriva_ml-1.17.10/src/deriva_ml/core/exceptions.py +0 -28
  148. deriva_ml-1.17.10/src/deriva_ml/core/filespec.py +0 -116
  149. deriva_ml-1.17.10/src/deriva_ml/dataset/dataset.py +0 -1519
  150. deriva_ml-1.17.10/src/deriva_ml/dataset/dataset_bag.py +0 -450
  151. deriva_ml-1.17.10/src/deriva_ml/execution/__init__.py +0 -26
  152. deriva_ml-1.17.10/src/deriva_ml/install_kernel.py +0 -71
  153. deriva_ml-1.17.10/src/deriva_ml/model/database.py +0 -719
  154. deriva_ml-1.17.10/src/deriva_ml/protocols/dataset.py +0 -19
  155. deriva_ml-1.17.10/src/deriva_ml/run_notebook.py +0 -228
  156. deriva_ml-1.17.10/src/deriva_ml/schema/__init__.py +0 -3
  157. deriva_ml-1.17.10/src/deriva_ml/test.py +0 -94
  158. deriva_ml-1.17.10/tests/conftest.py +0 -89
  159. deriva_ml-1.17.10/tests/core/test_vocabulary.py +0 -68
  160. deriva_ml-1.17.10/tests/dataset/test_dataset_version.py +0 -59
  161. deriva_ml-1.17.10/tests/dataset/test_datasets.py +0 -184
  162. deriva_ml-1.17.10/tests/dataset/test_download.py +0 -173
  163. deriva_ml-1.17.10/tests/execution/test_execution.py +0 -232
  164. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/.github/release-drafter.yml +0 -0
  165. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/.github/workflows/publish-docs.yml +0 -0
  166. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/.github/workflows/release.yml +0 -0
  167. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/.gitignore +0 -0
  168. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/LICENSE +0 -0
  169. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/README.md +0 -0
  170. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/.DS_Store +0 -0
  171. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  172. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  173. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/Notebooks/DerivaML Execution.ipynb +0 -0
  174. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  175. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/Notebooks/DerivaML Ingest.ipynb +0 -0
  176. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/assets/ERD.png +0 -0
  177. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/assets/Launcher.png +0 -0
  178. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/assets/copy_minid.png +0 -0
  179. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/assets/deriva-logo.png +0 -0
  180. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/assets/deriva-ml.pdf +0 -0
  181. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/assets/sharing-at-home.pdf +0 -0
  182. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/code-docs/deriva_ml_base.md +0 -0
  183. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/code-docs/execution.md +0 -0
  184. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/code-docs/execution_configuration.md +0 -0
  185. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/index.md +0 -0
  186. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/release-notes.md +0 -0
  187. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/user-guide/deriva_ml_structure.md +0 -0
  188. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/user-guide/identifiers.md +0 -0
  189. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/user-guide/install.md +0 -0
  190. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/docs/user-guide/notebooks.md +0 -0
  191. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/setup.cfg +0 -0
  192. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/.DS_Store +0 -0
  193. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/.DS_Store +0 -0
  194. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/execution/environment.py +0 -0
  195. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/schema/check_schema.py +0 -0
  196. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/schema/deriva-ml-reference.json +0 -0
  197. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/schema/policy.json +0 -0
  198. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml/schema/table_comments_utils.py +0 -0
  199. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  200. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/src/deriva_ml.egg-info/top_level.txt +0 -0
  201. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/__init__.py +0 -0
  202. {deriva_ml-1.17.10/src/deriva_ml/model → deriva_ml-1.17.12/tests/asset}/__init__.py +0 -0
  203. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/core/__init__.py +0 -0
  204. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/core/test_basic_tables.py +0 -0
  205. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/dataset/__init__.py +0 -0
  206. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/dataset/demo-catalog-schema.json +0 -0
  207. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/dataset/deriva-ml-reference.json +0 -0
  208. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/dataset/eye-ai-catalog-schema.json +0 -0
  209. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/execution/__init__.py +0 -0
  210. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/execution/workflow-test.py +0 -0
  211. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/model/__init__.py +0 -0
  212. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/model/test_models.py +0 -0
  213. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/test-files/execution-parameters.json +0 -0
  214. {deriva_ml-1.17.10 → deriva_ml-1.17.12}/tests/test-files/notebook-parameters.json +0 -0
Binary file
@@ -0,0 +1,3 @@
1
+ {
2
+ "python.defaultInterpreterPath": "/Users/carl/opt/anaconda3/envs/deriva-test/bin/python"
3
+ }
@@ -0,0 +1,12 @@
1
+ {
2
+ "python.defaultInterpreterPath": "/Users/carl/opt/anaconda3/envs/deriva-test/bin/python",
3
+ "python.analysis.extraPaths": [
4
+ "./src"
5
+ ],
6
+ "python.analysis.typeCheckingMode": "basic",
7
+ "python.formatting.provider": "black",
8
+ "editor.formatOnSave": true,
9
+ "python.linting.enabled": true,
10
+ "python.linting.pylintEnabled": false,
11
+ "python.linting.flake8Enabled": true
12
+ }
@@ -0,0 +1,259 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ DerivaML is a Python library for creating and executing reproducible machine learning workflows using a Deriva catalog. It provides:
8
+ - Dataset versioning and management with BDBag support
9
+ - Execution tracking with provenance
10
+ - Feature management for ML experiments
11
+ - Controlled vocabulary management
12
+ - Asset tracking and upload
13
+
14
+ ## Build and Development Commands
15
+
16
+ ```bash
17
+ # Install dependencies
18
+ uv sync
19
+
20
+ # Run all tests (uses the DERIVA_HOST env var; defaults to localhost)
21
+ uv run pytest
22
+
23
+ # Run a single test file
24
+ uv run pytest tests/dataset/test_datasets.py
25
+
26
+ # Run a specific test
27
+ uv run pytest tests/dataset/test_datasets.py::test_function_name -v
28
+
29
+ # Run tests with coverage
30
+ uv run pytest --cov=deriva_ml --cov-report=term-missing
31
+
32
+ # Lint and format
33
+ uv run ruff check src/
34
+ uv run ruff format src/
35
+
36
+ # Build and serve documentation locally (live preview)
37
+ uv run mkdocs serve
38
+ ```
39
+
40
+ ## Architecture
41
+
42
+ ### Core Classes
43
+
44
+ **DerivaML** (`src/deriva_ml/core/base.py`): Main entry point for catalog operations. Provides:
45
+ - Catalog connection and authentication via Globus
46
+ - Vocabulary and feature management
47
+ - Dataset creation and lookup
48
+ - Workflow and execution management
49
+
50
+ **Execution** (`src/deriva_ml/execution/execution.py`): Manages ML workflow lifecycle:
51
+ - Downloads/materializes datasets specified in configuration
52
+ - Tracks execution status and provenance
53
+ - Handles asset upload after execution completes
54
+ - Used as context manager: `with ml.create_execution(config) as exe:`
55
+
56
+ **Dataset** (`src/deriva_ml/dataset/dataset.py`): Versioned dataset management:
57
+ - Semantic versioning (major.minor.patch)
58
+ - BDBag export with optional MINID creation
59
+ - Nested dataset support
60
+ - Version history tracking via catalog snapshots
61
+
62
+ **DatasetBag** (`src/deriva_ml/dataset/dataset_bag.py`): Downloaded dataset representation:
63
+ - Provides same interface as Dataset via `DatasetLike` protocol
64
+ - Works with local BDBag directories (no catalog connection needed)
65
+ - Supports nested dataset traversal and member listing
66
+ - Use `restructure_assets()` to reorganize files by dataset type/features
67
+
68
+ **ExecutionConfiguration** (`src/deriva_ml/execution/execution_configuration.py`): Pydantic model for execution setup:
69
+ - Dataset specifications with version and materialization options
70
+ - Input asset RIDs
71
+ - Workflow reference
72
+ - Execution parameters
73
+
74
+ ### Key Patterns
75
+
76
+ **Catalog Path Builder**: Most catalog queries use the fluent path builder API:
77
+ ```python
78
+ pb = ml.pathBuilder()
79
+ results = pb.schemas[schema_name].tables[table_name].entities().fetch()
80
+ ```
81
+
82
+ **Dataset Versioning**: Datasets use catalog snapshots for version isolation:
83
+ - Each version records a catalog snapshot timestamp
84
+ - `dataset.set_version(version)` returns a Dataset bound to that snapshot
85
+ - Version increments propagate to parent/child datasets via topological sort
86
+
87
+ **Asset Management**: Assets are tracked via association tables:
88
+ - `Asset_Type` vocabulary controls asset categorization
89
+ - `{Asset}_Execution` tables link assets to executions with Input/Output roles
90
+ - File uploads use Hatrac object store
91
+
92
+ ### Testing
93
+
94
+ Tests require a running Deriva catalog. The test fixtures in `tests/conftest.py`:
95
+ - `deriva_catalog`: Creates an empty test catalog (session-scoped)
96
+ - `test_ml`: Provides a DerivaML instance, resets catalog between tests
97
+ - `catalog_with_datasets`: Provides a catalog with populated demo data
98
+
99
+ Set `DERIVA_HOST` environment variable to specify the test server (defaults to `localhost`).
100
+
101
+ ## Schema Structure
102
+
103
+ The library uses two schemas:
104
+ - **deriva-ml** (`ML_SCHEMA`): Core ML tables (Dataset, Execution, Workflow, Feature_Name, etc.)
105
+ - **Domain schema**: Application-specific tables created by users
106
+
107
+ Controlled vocabularies: Dataset_Type, Asset_Type, Workflow_Type, Asset_Role, Feature_Name
108
+
109
+ ## Exception Hierarchy
110
+
111
+ DerivaML uses a structured exception hierarchy for error handling:
112
+
113
+ ```
114
+ DerivaMLException (base class)
115
+ ├── DerivaMLConfigurationError (configuration/initialization)
116
+ │ ├── DerivaMLSchemaError (schema structure issues)
117
+ │ └── DerivaMLAuthenticationError (auth failures)
118
+ ├── DerivaMLDataError (data access/validation)
119
+ │ ├── DerivaMLNotFoundError (entity not found)
120
+ │ │ ├── DerivaMLDatasetNotFound
121
+ │ │ ├── DerivaMLTableNotFound
122
+ │ │ └── DerivaMLInvalidTerm
123
+ │ ├── DerivaMLTableTypeError (wrong table type)
124
+ │ ├── DerivaMLValidationError (validation failures)
125
+ │ └── DerivaMLCycleError (relationship cycles)
126
+ ├── DerivaMLExecutionError (execution lifecycle)
127
+ │ ├── DerivaMLWorkflowError
128
+ │ └── DerivaMLUploadError
129
+ └── DerivaMLReadOnlyError (writes on read-only)
130
+ ```
131
+
132
+ Import from: `from deriva_ml.core.exceptions import ...`
133
+
134
+ ## Protocol Hierarchy
135
+
136
+ The library uses protocols for type-safe polymorphism:
137
+
138
+ **Dataset Protocols:**
139
+ - `DatasetLike`: Read-only operations (Dataset and DatasetBag)
140
+ - `WritableDataset`: Write operations (Dataset only)
141
+
142
+ **Catalog Protocols:**
143
+ - `DerivaMLCatalogReader`: Read-only catalog operations
144
+ - `DerivaMLCatalog`: Full catalog operations with writes
145
+
146
+ Import from: `from deriva_ml.interfaces import ...`
147
+
148
+ ## Shared Utilities
149
+
150
+ **Validation** (`deriva_ml.core.validation`):
151
+ - `VALIDATION_CONFIG`: Standard ConfigDict for `@validate_call`
152
+ - `STRICT_VALIDATION_CONFIG`: ConfigDict that forbids extra fields
153
+
154
+ **Logging** (`deriva_ml.core.logging_config`):
155
+ - `get_logger(name)`: Get a deriva_ml logger
156
+ - `configure_logging(level)`: Configure logging for all components
157
+ - `LoggerMixin`: Mixin providing `_logger` attribute
158
+
159
+ ## Future Decomposition
160
+
161
+ The `DerivaML` class (~1700 lines) handles multiple concerns. Future refactoring could extract:
162
+ - `VocabularyManager`: Term and vocabulary CRUD
163
+ - `FeatureManager`: Feature definition and values
164
+ - `WorkflowManager`: Workflow tracking and Git integration
165
+ - `DatasetManager`: Dataset creation and lookup
166
+ - `AssetManager`: Asset table operations
167
+
168
+ Similarly, `Execution` (~1100 lines) could be decomposed into:
169
+ - `DatasetDownloader`: Dataset materialization
170
+ - `AssetUploader`: Result upload and cataloging
171
+ - `StatusTracker`: Execution status management
172
+
173
+ ## Hydra-zen Configuration
174
+
175
+ DerivaML integrates with hydra-zen for reproducible configuration. Key config classes:
176
+
177
+ **DerivaMLConfig** (`deriva_ml.core.config`): Main connection configuration
178
+ ```python
179
+ from deriva_ml import DerivaML, DerivaMLConfig
180
+ config = DerivaMLConfig(hostname="example.org", catalog_id="42")
181
+ ml = DerivaML.instantiate(config)
182
+ ```
183
+
184
+ **DatasetSpecConfig** (`deriva_ml.dataset`): Dataset specification for executions
185
+ ```python
186
+ from deriva_ml.dataset import DatasetSpecConfig
187
+ spec = DatasetSpecConfig(rid="XXXX", version="1.0.0", materialize=True)
188
+ ```
189
+
190
+ **AssetRIDConfig** (`deriva_ml.execution`): Input asset specification
191
+ ```python
192
+ from deriva_ml.execution import AssetRIDConfig
193
+ asset = AssetRIDConfig(rid="YYYY", description="Pretrained weights")
194
+ ```
195
+
196
+ **ExecutionConfiguration** (`deriva_ml.execution`): Full execution setup
197
+ ```python
198
+ from deriva_ml.execution import ExecutionConfiguration
199
+ config = ExecutionConfiguration(
200
+ datasets=[DatasetSpecConfig(rid="DATA", version="1.0.0")],
201
+ assets=["WGTS"],
202
+ description="Training run"
203
+ )
204
+ ```
205
+
206
+ Use `builds()` with `populate_full_signature=True` for hydra-zen integration.
207
+ Use `zen_partial=True` for model functions that receive execution context at runtime.
208
+
209
+ See `docs/user-guide/hydra-zen-configuration.md` for complete documentation.
210
+
211
+ ## Best Practices & Patterns
212
+
213
+ ### Version Bumping
214
+
215
+ Use the `bump-version` script for releases - it handles the complete workflow:
216
+ ```bash
217
+ uv run bump-version patch # or minor, major
218
+ ```
219
+ This fetches tags, bumps the version, creates a tag, and pushes everything in one command.
220
+ Don't use `bump-my-version` directly as it doesn't push changes.
221
+
222
+ ### Asset Upload
223
+
224
+ Use `asset_file_path()` API to register files for upload:
225
+ ```python
226
+ path = execution.asset_file_path(
227
+ MLAsset.execution_metadata,
228
+ "my-file.yaml",
229
+ asset_types=ExecMetadataType.hydra_config.value,
230
+ )
231
+ with path.open("w") as f:
232
+ f.write(content)
233
+ ```
234
+ Don't manually create files in `working_dir / "Execution_Metadata"` - they won't be uploaded.
235
+
236
+ ### Upload Network Configuration
237
+
238
+ `upload_directory()` has two network configuration parameters:
239
+ - `timeout`: HTTP session timeout (connect, read) - passed to session config
240
+ - `chunk_size`: Hatrac chunk upload size in bytes - passed through upload spec
241
+
242
+ ### Workflow Deduplication
243
+
244
+ Workflows are deduplicated by checksum. When the same script runs multiple times, `add_workflow()` returns the existing workflow's RID rather than creating a new one. Tests that need distinct workflows must account for this.
245
+
246
+ ### Testing find_experiments
247
+
248
+ The `find_experiments()` function finds executions with Hydra config files (matching `*-config.yaml` in Execution_Metadata). Test fixtures must use `asset_file_path()` to properly register config files - see `execution_with_hydra_config` fixture.
249
+
250
+ ### Association Tables
251
+
252
+ Use `Table.define_association()` for creating association tables instead of manually defining columns, keys, and foreign keys:
253
+ ```python
254
+ Table.define_association(
255
+ associates=[("Execution", execution), ("Nested_Execution", execution)],
256
+ comment="Description",
257
+ metadata=[Column.define("Sequence", builtin_types.int4, nullok=True)]
258
+ )
259
+ ```
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.17.10
3
+ Version: 1.17.12
4
4
  Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
- Requires-Python: >=3.10
6
+ Requires-Python: >=3.12
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: bump-my-version
@@ -14,9 +14,9 @@ Requires-Dist: nbconvert
14
14
  Requires-Dist: pandas
15
15
  Requires-Dist: pydantic>=2.11
16
16
  Requires-Dist: papermill
17
- Requires-Dist: pandas-stubs==2.2.3.250527
17
+ Requires-Dist: pandas-stubs
18
18
  Requires-Dist: pyyaml
19
- Requires-Dist: regex~=2024.7.24
19
+ Requires-Dist: regex
20
20
  Requires-Dist: semver>3.0.0
21
21
  Requires-Dist: setuptools>=80
22
22
  Requires-Dist: setuptools-scm>=8.0
@@ -24,32 +24,24 @@
24
24
  },
25
25
  {
26
26
  "cell_type": "code",
27
+ "execution_count": null,
27
28
  "id": "2",
28
- "metadata": {
29
- "ExecuteTime": {
30
- "end_time": "2025-06-06T21:12:17.642500Z",
31
- "start_time": "2025-06-06T21:12:16.168200Z"
32
- }
33
- },
29
+ "metadata": {},
30
+ "outputs": [],
34
31
  "source": [
35
32
  "from IPython.display import display, Markdown, HTML\n",
36
33
  "import pandas as pd\n",
37
34
  "from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
38
35
  "from deriva_ml.demo_catalog import create_demo_catalog, DemoML\n",
39
36
  "from deriva_ml import MLVocab"
40
- ],
41
- "outputs": [],
42
- "execution_count": 1
37
+ ]
43
38
  },
44
39
  {
45
40
  "cell_type": "code",
41
+ "execution_count": null,
46
42
  "id": "3",
47
- "metadata": {
48
- "ExecuteTime": {
49
- "end_time": "2025-06-06T21:12:20.383347Z",
50
- "start_time": "2025-06-06T21:12:20.344740Z"
51
- }
52
- },
43
+ "metadata": {},
44
+ "outputs": [],
53
45
  "source": [
54
46
  "hostname = 'dev.eye-ai.org' # This needs to be changed.\n",
55
47
  "\n",
@@ -59,17 +51,7 @@
59
51
  "else:\n",
60
52
  " gnl.login([hostname], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)\n",
61
53
  " print(\"Login Successful\")"
62
- ],
63
- "outputs": [
64
- {
65
- "name": "stdout",
66
- "output_type": "stream",
67
- "text": [
68
- "You are already logged in.\n"
69
- ]
70
- }
71
- ],
72
- "execution_count": 2
54
+ ]
73
55
  },
74
56
  {
75
57
  "cell_type": "markdown",
@@ -82,37 +64,14 @@
82
64
  },
83
65
  {
84
66
  "cell_type": "code",
67
+ "execution_count": null,
85
68
  "id": "5",
86
- "metadata": {
87
- "ExecuteTime": {
88
- "end_time": "2025-06-06T21:12:53.290591Z",
89
- "start_time": "2025-06-06T21:12:24.856557Z"
90
- }
91
- },
69
+ "metadata": {},
70
+ "outputs": [],
92
71
  "source": [
93
72
  "test_catalog = create_demo_catalog(hostname)\n",
94
73
  "ml_instance = DemoML(hostname, test_catalog.catalog_id)"
95
- ],
96
- "outputs": [
97
- {
98
- "name": "stderr",
99
- "output_type": "stream",
100
- "text": [
101
- "2025-06-06 14:12:47,103 - deriva_ml.WARNING - File /Users/carl/Repos/Projects/deriva-ml/docs/Notebooks/DerivaML Vocabulary.ipynb has been modified since last commit. Consider commiting before executing\n"
102
- ]
103
- },
104
- {
105
- "data": {
106
- "text/plain": [
107
- "<IPython.core.display.Markdown object>"
108
- ],
109
- "text/markdown": "Execution RID: https://dev.eye-ai.org/id/2060/3SC@33D-VDH5-6N1W"
110
- },
111
- "metadata": {},
112
- "output_type": "display_data"
113
- }
114
- ],
115
- "execution_count": 3
74
+ ]
116
75
  },
117
76
  {
118
77
  "cell_type": "markdown",
@@ -125,30 +84,13 @@
125
84
  },
126
85
  {
127
86
  "cell_type": "code",
87
+ "execution_count": null,
128
88
  "id": "7",
129
- "metadata": {
130
- "ExecuteTime": {
131
- "end_time": "2025-06-06T21:12:53.473300Z",
132
- "start_time": "2025-06-06T21:12:53.305180Z"
133
- }
134
- },
89
+ "metadata": {},
90
+ "outputs": [],
135
91
  "source": [
136
92
  "ml_instance.find_vocabularies()"
137
- ],
138
- "outputs": [
139
- {
140
- "ename": "AttributeError",
141
- "evalue": "'DemoML' object has no attribute 'find_vocabularies'",
142
- "output_type": "error",
143
- "traceback": [
144
- "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
145
- "\u001B[0;31mAttributeError\u001B[0m Traceback (most recent call last)",
146
- "Cell \u001B[0;32mIn[4], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mml_instance\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfind_vocabularies\u001B[49m()\n",
147
- "\u001B[0;31mAttributeError\u001B[0m: 'DemoML' object has no attribute 'find_vocabularies'"
148
- ]
149
- }
150
- ],
151
- "execution_count": 4
93
+ ]
152
94
  },
153
95
  {
154
96
  "cell_type": "markdown",
@@ -223,33 +165,16 @@
223
165
  },
224
166
  {
225
167
  "cell_type": "code",
168
+ "execution_count": null,
226
169
  "id": "15",
227
- "metadata": {
228
- "ExecuteTime": {
229
- "end_time": "2025-06-06T21:11:15.795882Z",
230
- "start_time": "2025-06-06T21:11:15.335291Z"
231
- }
232
- },
170
+ "metadata": {},
171
+ "outputs": [],
233
172
  "source": [
234
173
  "display(\n",
235
174
  " Markdown('#### Contents of controlled vocabulary \"My term set'),\n",
236
175
  " pd.DataFrame([v.model_dump() for v in ml_instance.list_vocabulary_terms(\"My term set\")])\n",
237
176
  ")"
238
- ],
239
- "outputs": [
240
- {
241
- "ename": "NameError",
242
- "evalue": "name 'ml_instance' is not defined",
243
- "output_type": "error",
244
- "traceback": [
245
- "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
246
- "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
247
- "Cell \u001B[0;32mIn[2], line 3\u001B[0m\n\u001B[1;32m 1\u001B[0m display(\n\u001B[1;32m 2\u001B[0m Markdown(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m#### Contents of controlled vocabulary \u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mMy term set\u001B[39m\u001B[38;5;124m'\u001B[39m),\n\u001B[0;32m----> 3\u001B[0m pd\u001B[38;5;241m.\u001B[39mDataFrame([v\u001B[38;5;241m.\u001B[39mmodel_dump() \u001B[38;5;28;01mfor\u001B[39;00m v \u001B[38;5;129;01min\u001B[39;00m \u001B[43mml_instance\u001B[49m\u001B[38;5;241m.\u001B[39mlist_vocabulary_terms(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mMy term set\u001B[39m\u001B[38;5;124m\"\u001B[39m)])\n\u001B[1;32m 4\u001B[0m )\n",
248
- "\u001B[0;31mNameError\u001B[0m: name 'ml_instance' is not defined"
249
- ]
250
- }
251
- ],
252
- "execution_count": 2
177
+ ]
253
178
  },
254
179
  {
255
180
  "cell_type": "markdown",