deriva-ml 1.14.47__tar.gz → 1.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/PKG-INFO +10 -7
  2. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/pyproject.toml +12 -7
  3. deriva_ml-1.17.0/src/.DS_Store +0 -0
  4. deriva_ml-1.17.0/src/deriva_ml/.DS_Store +0 -0
  5. deriva_ml-1.17.0/src/deriva_ml/__init__.py +77 -0
  6. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/__init__.py +2 -2
  7. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/base.py +28 -16
  8. deriva_ml-1.17.0/src/deriva_ml/core/config.py +67 -0
  9. deriva_ml-1.17.0/src/deriva_ml/dataset/__init__.py +12 -0
  10. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/dataset/aux_classes.py +31 -2
  11. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/dataset/dataset.py +7 -5
  12. deriva_ml-1.17.0/src/deriva_ml/dataset/dataset_bag.py +450 -0
  13. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/dataset/upload.py +7 -4
  14. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/demo_catalog.py +17 -3
  15. deriva_ml-1.17.0/src/deriva_ml/execution/__init__.py +26 -0
  16. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/execution/execution.py +50 -28
  17. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/execution/execution_configuration.py +26 -31
  18. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/execution/workflow.py +8 -0
  19. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/model/catalog.py +119 -2
  20. deriva_ml-1.17.0/src/deriva_ml/model/database.py +719 -0
  21. deriva_ml-1.17.0/src/deriva_ml/protocols/dataset.py +19 -0
  22. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/run_notebook.py +55 -50
  23. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/annotations.py +7 -5
  24. deriva_ml-1.17.0/src/deriva_ml/test.py +94 -0
  25. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml.egg-info/PKG-INFO +10 -7
  26. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml.egg-info/SOURCES.txt +5 -1
  27. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml.egg-info/requires.txt +8 -5
  28. deriva_ml-1.17.0/uv.lock +3822 -0
  29. deriva_ml-1.14.47/src/deriva_ml/__init__.py +0 -48
  30. deriva_ml-1.14.47/src/deriva_ml/dataset/__init__.py +0 -4
  31. deriva_ml-1.14.47/src/deriva_ml/dataset/dataset_bag.py +0 -342
  32. deriva_ml-1.14.47/src/deriva_ml/model/database.py +0 -345
  33. deriva_ml-1.14.47/src/deriva_ml/model/sql_mapper.py +0 -44
  34. deriva_ml-1.14.47/tests/model/__init__.py +0 -0
  35. deriva_ml-1.14.47/uv.lock +0 -3823
  36. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/.github/release-drafter.yml +0 -0
  37. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/.github/workflows/publish-docs.yml +0 -0
  38. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/.github/workflows/release.yml +0 -0
  39. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/.gitignore +0 -0
  40. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/LICENSE +0 -0
  41. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/README.md +0 -0
  42. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/.DS_Store +0 -0
  43. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  44. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  45. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/Notebooks/DerivaML Execution.ipynb +0 -0
  46. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  47. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/Notebooks/DerivaML Ingest.ipynb +0 -0
  48. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
  49. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/assets/ERD.png +0 -0
  50. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/assets/Launcher.png +0 -0
  51. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/assets/copy_minid.png +0 -0
  52. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/assets/deriva-logo.png +0 -0
  53. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/assets/deriva-ml.pdf +0 -0
  54. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/assets/sharing-at-home.pdf +0 -0
  55. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/dataset.md +0 -0
  56. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/dataset_aux_classes.md +0 -0
  57. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/dataset_bag.md +0 -0
  58. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/deriva_definitions.md +0 -0
  59. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/deriva_ml_base.md +0 -0
  60. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/deriva_model.md +0 -0
  61. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/execution.md +0 -0
  62. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/execution_configuration.md +0 -0
  63. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/feature.md +0 -0
  64. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/code-docs/upload.md +0 -0
  65. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/index.md +0 -0
  66. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/release-notes.md +0 -0
  67. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/datasets.md +0 -0
  68. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/deriva_ml_structure.md +0 -0
  69. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/execution-configuration.md +0 -0
  70. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/file-assets.md +0 -0
  71. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/identifiers.md +0 -0
  72. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/install.md +0 -0
  73. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/notebooks.md +0 -0
  74. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/docs/user-guide/overview.md +0 -0
  75. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/mkdocs.yml +0 -0
  76. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/setup.cfg +0 -0
  77. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/bump_version.py +0 -0
  78. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/constants.py +0 -0
  79. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/definitions.py +0 -0
  80. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/enums.py +0 -0
  81. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/ermrest.py +0 -0
  82. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/exceptions.py +0 -0
  83. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/core/filespec.py +0 -0
  84. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/dataset/history.py +0 -0
  85. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/execution/environment.py +0 -0
  86. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/feature.py +0 -0
  87. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/install_kernel.py +0 -0
  88. {deriva_ml-1.14.47/src/deriva_ml/execution → deriva_ml-1.17.0/src/deriva_ml/model}/__init__.py +0 -0
  89. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/__init__.py +0 -0
  90. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/check_schema.py +0 -0
  91. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/create_schema.py +0 -0
  92. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/deriva-ml-reference.json +0 -0
  93. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/policy.json +0 -0
  94. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml/schema/table_comments_utils.py +0 -0
  95. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  96. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  97. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
  98. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/__init__.py +0 -0
  99. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/conftest.py +0 -0
  100. {deriva_ml-1.14.47/src/deriva_ml/model → deriva_ml-1.17.0/tests/core}/__init__.py +0 -0
  101. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/core/test_basic_tables.py +0 -0
  102. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/core/test_file.py +0 -0
  103. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/core/test_vocabulary.py +0 -0
  104. {deriva_ml-1.14.47/tests/core → deriva_ml-1.17.0/tests/dataset}/__init__.py +0 -0
  105. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/dataset/demo-catalog-schema.json +0 -0
  106. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/dataset/deriva-ml-reference.json +0 -0
  107. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/dataset/eye-ai-catalog-schema.json +0 -0
  108. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/dataset/test_dataset_version.py +0 -0
  109. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/dataset/test_datasets.py +0 -0
  110. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/dataset/test_download.py +0 -0
  111. {deriva_ml-1.14.47/tests/dataset → deriva_ml-1.17.0/tests/execution}/__init__.py +0 -0
  112. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/execution/test_execution.py +0 -0
  113. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/execution/workflow-test.ipynb +0 -0
  114. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/execution/workflow-test.py +0 -0
  115. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/feature/test_features.py +0 -0
  116. {deriva_ml-1.14.47/tests/execution → deriva_ml-1.17.0/tests/model}/__init__.py +0 -0
  117. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/model/test_database.py +0 -0
  118. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/model/test_models.py +0 -0
  119. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/test-files/execution-parameters.json +0 -0
  120. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/test-files/notebook-parameters.json +0 -0
  121. {deriva_ml-1.14.47 → deriva_ml-1.17.0}/tests/test_utils.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.14.47
3
+ Version: 1.17.0
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
- Requires-Python: >=3.10
6
+ Requires-Python: >=3.11
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: bump-my-version
@@ -12,15 +12,18 @@ Requires-Dist: deriva~=1.7.10
12
12
  Requires-Dist: deepdiff
13
13
  Requires-Dist: nbconvert
14
14
  Requires-Dist: pandas
15
- Requires-Dist: regex~=2024.7.24
15
+ Requires-Dist: pip-system-certs
16
16
  Requires-Dist: pydantic>=2.11
17
- Requires-Dist: semver>3.0.0
18
- Requires-Dist: setuptools>=64
19
- Requires-Dist: setuptools-scm>=8.0
20
- Requires-Dist: nbstripout
21
17
  Requires-Dist: papermill
22
18
  Requires-Dist: pandas-stubs==2.2.3.250527
23
19
  Requires-Dist: pyyaml
20
+ Requires-Dist: regex~=2024.7.24
21
+ Requires-Dist: semver>3.0.0
22
+ Requires-Dist: setuptools>=80
23
+ Requires-Dist: setuptools-scm>=8.0
24
+ Requires-Dist: nbstripout
25
+ Requires-Dist: hydra_zen
26
+ Requires-Dist: SQLAlchemy
24
27
  Dynamic: license-file
25
28
 
26
29
  # DerivaML
@@ -6,7 +6,7 @@ authors = [
6
6
  ]
7
7
  description = "Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines"
8
8
  readme = "README.md"
9
- requires-python = ">=3.10"
9
+ requires-python = ">=3.11"
10
10
  dependencies = [
11
11
  "bump-my-version",
12
12
  "bdbag",
@@ -14,15 +14,18 @@ dependencies = [
14
14
  "deepdiff",
15
15
  "nbconvert",
16
16
  "pandas",
17
- "regex~=2024.7.24",
17
+ "pip-system-certs",
18
18
  "pydantic>=2.11",
19
+ "papermill",
20
+ "pandas-stubs==2.2.3.250527",
21
+ "pyyaml",
22
+ "regex~=2024.7.24",
19
23
  "semver>3.0.0",
20
- "setuptools>=64",
24
+ "setuptools>=80",
21
25
  "setuptools-scm>=8.0",
22
26
  "nbstripout",
23
- "papermill",
24
- "pandas-stubs==2.2.3.250527",
25
- "pyyaml"
27
+ "hydra_zen",
28
+ "SQLAlchemy"
26
29
  ]
27
30
 
28
31
  [project.scripts]
@@ -38,10 +41,11 @@ deriva-ml-check-catalog-schema = "deriva_ml.schema.check_schema:main"
38
41
  [project.optional-dependencies]
39
42
 
40
43
  [build-system]
41
- requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2", "wheel"]
44
+ requires = ["setuptools>=80", "setuptools_scm[toml]>=8", "wheel"]
42
45
  build-backend = "setuptools.build_meta"
43
46
 
44
47
  [tool.uv]
48
+ python-preference = "only-managed"
45
49
 
46
50
  [tool.uv.sources]
47
51
  #bdbag = {git = "https://github.com/fair-research/bdbag", branch = "master" }
@@ -113,6 +117,7 @@ dev = [
113
117
  "pytest>=8.4.1",
114
118
  "pytest-mock",
115
119
  "pytest-coverage>=0.0",
120
+ "pip-system-certs",
116
121
  "ruff"
117
122
  ]
118
123
  lint = [
Binary file
Binary file
@@ -0,0 +1,77 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+ from typing import TYPE_CHECKING
3
+
4
+ # Safe imports - no circular dependencies
5
+ from deriva_ml.core.config import DerivaMLConfig
6
+ from deriva_ml.core.definitions import (
7
+ RID,
8
+ BuiltinTypes,
9
+ ColumnDefinition,
10
+ DerivaAssetColumns,
11
+ DerivaSystemColumns,
12
+ ExecAssetType,
13
+ ExecMetadataType,
14
+ FileSpec,
15
+ FileUploadState,
16
+ ForeignKeyDefinition,
17
+ KeyDefinition,
18
+ MLAsset,
19
+ MLVocab,
20
+ TableDefinition,
21
+ UploadState,
22
+ )
23
+ from deriva_ml.core.exceptions import (
24
+ DerivaMLException,
25
+ DerivaMLInvalidTerm,
26
+ DerivaMLTableTypeError,
27
+ )
28
+
29
+ # Type-checking only - avoid circular import at runtime
30
+ if TYPE_CHECKING:
31
+ from deriva_ml.core.base import DerivaML
32
+
33
+
34
+ # Lazy import function for runtime usage
35
+ def __getattr__(name):
36
+ """Lazy import to avoid circular dependencies."""
37
+ if name == "DerivaML":
38
+ from deriva_ml.core.base import DerivaML
39
+
40
+ return DerivaML
41
+ elif name == "Execution":
42
+ from deriva_ml.execution.execution import Execution
43
+
44
+ return Execution
45
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
46
+
47
+
48
+ __all__ = [
49
+ "DerivaML", # Lazy-loaded
50
+ "DerivaMLConfig",
51
+ # Exceptions
52
+ "DerivaMLException",
53
+ "DerivaMLInvalidTerm",
54
+ "DerivaMLTableTypeError",
55
+ # Definitions
56
+ "RID",
57
+ "BuiltinTypes",
58
+ "ColumnDefinition",
59
+ "DerivaSystemColumns",
60
+ "DerivaAssetColumns",
61
+ "ExecAssetType",
62
+ "ExecMetadataType",
63
+ "FileSpec",
64
+ "FileUploadState",
65
+ "ForeignKeyDefinition",
66
+ "KeyDefinition",
67
+ "MLAsset",
68
+ "MLVocab",
69
+ "TableDefinition",
70
+ "UploadState",
71
+ ]
72
+
73
+ try:
74
+ __version__ = version("deriva_ml")
75
+ except PackageNotFoundError:
76
+ # package is not installed
77
+ pass
@@ -1,4 +1,5 @@
1
1
  from deriva_ml.core.base import DerivaML
2
+ from deriva_ml.core.config import DerivaMLConfig
2
3
  from deriva_ml.core.definitions import (
3
4
  RID,
4
5
  BuiltinTypes,
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De
17
18
 
18
19
  __all__ = [
19
20
  "DerivaML",
20
-
21
+ "DerivaMLConfig",
21
22
  # Exceptions
22
23
  "DerivaMLException",
23
24
  "DerivaMLInvalidTerm",
24
25
  "DerivaMLTableTypeError",
25
-
26
26
  # Definitions
27
27
  "RID",
28
28
  "BuiltinTypes",
@@ -15,12 +15,11 @@ from __future__ import annotations # noqa: I001
15
15
 
16
16
  # Standard library imports
17
17
  from collections import defaultdict
18
- import getpass
19
18
  import logging
20
19
  from datetime import datetime
21
20
  from itertools import chain
22
21
  from pathlib import Path
23
- from typing import Dict, Iterable, List, cast, TYPE_CHECKING, Any
22
+ from typing import Dict, Iterable, List, cast, TYPE_CHECKING, Any, Self
24
23
  from urllib.parse import urlsplit
25
24
 
26
25
 
@@ -29,18 +28,14 @@ import requests
29
28
  from pydantic import ConfigDict, validate_call
30
29
 
31
30
  # Deriva imports
32
- from deriva.core import (
33
- DEFAULT_SESSION_CONFIG,
34
- format_exception,
35
- get_credential,
36
- urlquote,
37
- )
31
+ from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote
38
32
 
39
33
  import deriva.core.datapath as datapath
40
34
  from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
41
35
  from deriva.core.deriva_server import DerivaServer
42
36
  from deriva.core.ermrest_catalog import ResolveRidResult
43
37
  from deriva.core.ermrest_model import Key, Table
38
+ from deriva.core.utils.core_utils import DEFAULT_LOGGER_OVERRIDES
44
39
  from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
45
40
 
46
41
  from deriva_ml.core.exceptions import DerivaMLInvalidTerm
@@ -55,6 +50,7 @@ from deriva_ml.core.definitions import (
55
50
  TableDefinition,
56
51
  VocabularyTerm,
57
52
  )
53
+ from deriva_ml.core.config import DerivaMLConfig
58
54
  from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
59
55
  from deriva_ml.dataset.aux_classes import DatasetSpec
60
56
  from deriva_ml.dataset.dataset import Dataset
@@ -108,6 +104,10 @@ class DerivaML(Dataset):
108
104
  >>> ml.add_term('vocabulary_table', 'new_term', description='Description of term')
109
105
  """
110
106
 
107
+ @classmethod
108
+ def instantiate(cls, config: DerivaMLConfig) -> Self:
109
+ return cls(**config.model_dump())
110
+
111
111
  def __init__(
112
112
  self,
113
113
  hostname: str,
@@ -116,8 +116,10 @@ class DerivaML(Dataset):
116
116
  project_name: str | None = None,
117
117
  cache_dir: str | Path | None = None,
118
118
  working_dir: str | Path | None = None,
119
+ hydra_runtime_output_dir: str | Path | None = None,
119
120
  ml_schema: str = ML_SCHEMA,
120
121
  logging_level=logging.WARNING,
122
+ deriva_logging_level=logging.WARNING,
121
123
  credential=None,
122
124
  use_minid: bool = True,
123
125
  check_auth: bool = True,
@@ -152,7 +154,6 @@ class DerivaML(Dataset):
152
154
  credentials=self.credential,
153
155
  session_config=self._get_session_config(),
154
156
  )
155
-
156
157
  try:
157
158
  if check_auth and server.get_authn_session():
158
159
  pass
@@ -161,17 +162,14 @@ class DerivaML(Dataset):
161
162
  "You are not authorized to access this catalog. "
162
163
  "Please check your credentials and make sure you have logged in."
163
164
  )
164
-
165
165
  self.catalog = server.connect_ermrest(catalog_id)
166
166
  self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
167
167
 
168
168
  # Set up working and cache directories
169
- default_workdir = self.__class__.__name__ + "_working"
170
- self.working_dir = (
171
- Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
172
- ) / default_workdir
173
-
169
+ self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
174
170
  self.working_dir.mkdir(parents=True, exist_ok=True)
171
+ self.hydra_runtime_output_dir = hydra_runtime_output_dir
172
+
175
173
  self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
176
174
  self.cache_dir.mkdir(parents=True, exist_ok=True)
177
175
 
@@ -181,6 +179,15 @@ class DerivaML(Dataset):
181
179
  # Set up logging
182
180
  self._logger = logging.getLogger("deriva_ml")
183
181
  self._logger.setLevel(logging_level)
182
+ self._logging_level = logging_level
183
+ self._deriva_logging_level = deriva_logging_level
184
+
185
+ # Configure deriva logging level
186
+ logger_config = DEFAULT_LOGGER_OVERRIDES
187
+ # allow for reconfiguration of module-specific logging levels
188
+ [logging.getLogger(name).setLevel(level) for name, level in logger_config.items()]
189
+ logging.getLogger("bagit").setLevel(deriva_logging_level)
190
+ logging.getLogger("bdbag").setLevel(deriva_logging_level)
184
191
 
185
192
  # Store instance configuration
186
193
  self.host_name = hostname
@@ -1081,7 +1088,12 @@ class DerivaML(Dataset):
1081
1088
  return self._download_dataset_bag(
1082
1089
  dataset=dataset,
1083
1090
  execution_rid=execution_rid,
1084
- snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
1091
+ snapshot_catalog=DerivaML(
1092
+ self.host_name,
1093
+ self._version_snapshot(dataset),
1094
+ logging_level=self._logging_level,
1095
+ deriva_logging_level=self._deriva_logging_level,
1096
+ ),
1085
1097
  )
1086
1098
 
1087
1099
  def _update_status(self, new_status: Status, status_detail: str, execution_rid: RID):
@@ -0,0 +1,67 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from hydra.conf import HydraConf, RunDir
6
+ from hydra.core.hydra_config import HydraConfig
7
+ from hydra_zen import store
8
+ from omegaconf import OmegaConf
9
+ from pydantic import BaseModel, model_validator
10
+
11
+ from deriva_ml.core.definitions import ML_SCHEMA
12
+
13
+
14
+ class DerivaMLConfig(BaseModel):
15
+ hostname: str
16
+ catalog_id: str | int = 1
17
+ domain_schema: str | None = None
18
+ project_name: str | None = None
19
+ cache_dir: str | Path | None = None
20
+ working_dir: str | Path | None = None
21
+ hydra_runtime_output_dir: str | Path | None = None
22
+ ml_schema: str = ML_SCHEMA
23
+ logging_level: Any = logging.WARNING
24
+ deriva_logging_level: Any = logging.WARNING
25
+ credential: Any = None
26
+ use_minid: bool = True
27
+ check_auth: bool = True
28
+
29
+ @model_validator(mode="after")
30
+ def init_working_dir(self):
31
+ """
32
+ Sets up the working directory for the model.
33
+
34
+ This method configures the working directory, ensuring that all required
35
+ file operations are performed in the appropriate location. If the user does not
36
+ specify a directory, a default directory based on the user's home directory
37
+ will be used.
38
+
39
+ This is a repeat of what is in the DerivaML.__init__, but we put this here so that the working
40
+ directory is available to hydra.
41
+
42
+ Returns:
43
+ Self: The object instance with the working directory initialized.
44
+ """
45
+
46
+ self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
47
+ self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
48
+ return self
49
+
50
+ @staticmethod
51
+ def compute_workdir(working_dir) -> Path:
52
+ # Create a default working directory if none is provided
53
+ working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
54
+ return working_dir.absolute()
55
+
56
+
57
+ OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
58
+ store(
59
+ HydraConf(
60
+ run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
61
+ output_subdir="hydra-config",
62
+ ),
63
+ group="hydra",
64
+ name="config",
65
+ )
66
+
67
+ store.add_to_hydra_store()
@@ -0,0 +1,12 @@
1
+ from .aux_classes import DatasetSpec, DatasetSpecConfig, DatasetVersion, VersionPart
2
+ from .dataset import Dataset
3
+ from .dataset_bag import DatasetBag
4
+
5
+ __all__ = [
6
+ "Dataset",
7
+ "DatasetSpec",
8
+ "DatasetSpecConfig",
9
+ "DatasetBag",
10
+ "DatasetVersion",
11
+ "VersionPart",
12
+ ]
@@ -3,8 +3,9 @@ THis module defines the DataSet class with is used to manipulate n
3
3
  """
4
4
 
5
5
  from enum import Enum
6
- from typing import Any, Optional, SupportsInt
6
+ from typing import Any, Optional, SupportsInt, overload
7
7
 
8
+ from hydra_zen import hydrated_dataclass
8
9
  from pydantic import (
9
10
  BaseModel,
10
11
  ConfigDict,
@@ -42,6 +43,9 @@ class DatasetVersion(Version):
42
43
  replace(major, minor, patch): Replace the major and minor versions
43
44
  """
44
45
 
46
+ @overload
47
+ def __init__(self, version: str): ...
48
+ @overload
45
49
  def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
46
50
  """Initialize a DatasetVersion object.
47
51
 
@@ -50,6 +54,21 @@ class DatasetVersion(Version):
50
54
  minor: Minor version number. Used to indicate additional members added, or change in member values.
51
55
  patch: Patch number of the dataset. Used to indicate minor clean-up and edits
52
56
  """
57
+ ...
58
+
59
+ def __init__(self, *args):
60
+ """Initialize a DatasetVersion object.
61
+
62
+ Args:
63
+ major: Major version number. Used to indicate schema changes.
64
+ minor: Minor version number. Used to indicate additional members added, or change in member values.
65
+ patch: Patch number of the dataset. Used to indicate minor clean-up and edits
66
+ """
67
+ if len(args) == 1 and isinstance(args[0], str):
68
+ v = Version.parse(args[0])
69
+ major, minor, patch = v.major, v.minor, v.patch
70
+ else:
71
+ major, minor, patch = args
53
72
  super().__init__(major, minor, patch)
54
73
 
55
74
  def to_dict(self) -> dict[str, Any]:
@@ -182,8 +201,9 @@ class DatasetSpec(BaseModel):
182
201
  """
183
202
 
184
203
  rid: RID
185
- materialize: bool = True
186
204
  version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
205
+ materialize: bool = True
206
+ description: str = ""
187
207
 
188
208
  model_config = ConfigDict(arbitrary_types_allowed=True)
189
209
 
@@ -208,3 +228,12 @@ class DatasetSpec(BaseModel):
208
228
  @field_serializer("version")
209
229
  def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
210
230
  return version.to_dict()
231
+
232
+
233
+ # Interface for hydra-zen
234
+ @hydrated_dataclass(DatasetSpec)
235
+ class DatasetSpecConfig:
236
+ rid: str
237
+ version: str
238
+ materialize: bool = True
239
+ description: str = ""
@@ -22,14 +22,16 @@ Typical usage example:
22
22
 
23
23
  from __future__ import annotations
24
24
 
25
- # Standard library imports
26
25
  import json
27
26
  import logging
28
27
  from collections import defaultdict
28
+
29
+ # Standard library imports
29
30
  from graphlib import TopologicalSorter
30
31
  from pathlib import Path
31
32
  from tempfile import TemporaryDirectory
32
33
  from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator
34
+ from urllib.parse import urlparse
33
35
 
34
36
  import deriva.core.utils.hash_utils as hash_utils
35
37
  import requests
@@ -1039,7 +1041,6 @@ class Dataset:
1039
1041
  envars={"RID": dataset.rid},
1040
1042
  )
1041
1043
  minid_page_url = exporter.export()[0] # Get the MINID launch page
1042
-
1043
1044
  except (
1044
1045
  DerivaDownloadError,
1045
1046
  DerivaDownloadConfigurationError,
@@ -1095,7 +1096,8 @@ class Dataset:
1095
1096
 
1096
1097
  # Check or create MINID
1097
1098
  minid_url = version_record.minid
1098
- if not minid_url:
1099
+ # If we either don't have a MINID, or we have a MINID, but we don't want to use it, generate a new one.
1100
+ if (not minid_url) or (not self._use_minid):
1099
1101
  if not create:
1100
1102
  raise DerivaMLException(f"Minid for dataset {rid} doesn't exist")
1101
1103
  if self._use_minid:
@@ -1105,7 +1107,6 @@ class Dataset:
1105
1107
  # Return based on MINID usage
1106
1108
  if self._use_minid:
1107
1109
  return self._fetch_minid_metadata(minid_url, dataset.version)
1108
-
1109
1110
  return DatasetMinid(
1110
1111
  dataset_version=dataset.version,
1111
1112
  RID=f"{rid}@{version_record.snapshot}",
@@ -1138,7 +1139,8 @@ class Dataset:
1138
1139
  with TemporaryDirectory() as tmp_dir:
1139
1140
  if self._use_minid:
1140
1141
  # Get bag from S3
1141
- archive_path = fetch_single_file(minid.bag_url, output_path=tmp_dir)
1142
+ bag_path = Path(tmp_dir) / Path(urlparse(minid.bag_url).path).name
1143
+ archive_path = fetch_single_file(minid.bag_url, output_path=bag_path)
1142
1144
  else:
1143
1145
  exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
1144
1146
  archive_path = exporter.retrieve_file(minid.bag_url)