deriva-ml 1.14.46__tar.gz → 1.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/PKG-INFO +2 -1
  2. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/pyproject.toml +2 -1
  3. deriva_ml-1.16.0/src/deriva_ml/__init__.py +87 -0
  4. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/bump_version.py +1 -1
  5. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/__init__.py +2 -2
  6. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/base.py +12 -12
  7. deriva_ml-1.16.0/src/deriva_ml/core/config.py +67 -0
  8. deriva_ml-1.16.0/src/deriva_ml/dataset/__init__.py +17 -0
  9. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/aux_classes.py +20 -1
  10. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/dataset.py +3 -2
  11. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/dataset_bag.py +115 -0
  12. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/upload.py +1 -0
  13. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/demo_catalog.py +1 -1
  14. deriva_ml-1.16.0/src/deriva_ml/execution/__init__.py +25 -0
  15. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/execution/execution.py +46 -26
  16. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/execution/execution_configuration.py +8 -32
  17. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/model/catalog.py +113 -1
  18. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/model/database.py +2 -2
  19. deriva_ml-1.16.0/src/deriva_ml/protocols/dataset.py +19 -0
  20. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/run_notebook.py +55 -50
  21. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/annotations.py +7 -5
  22. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/PKG-INFO +2 -1
  23. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/SOURCES.txt +2 -0
  24. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/requires.txt +1 -0
  25. deriva_ml-1.16.0/uv.lock +4073 -0
  26. deriva_ml-1.14.46/src/deriva_ml/__init__.py +0 -48
  27. deriva_ml-1.14.46/src/deriva_ml/dataset/__init__.py +0 -4
  28. deriva_ml-1.14.46/tests/model/__init__.py +0 -0
  29. deriva_ml-1.14.46/uv.lock +0 -3823
  30. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/.github/release-drafter.yml +0 -0
  31. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/.github/workflows/publish-docs.yml +0 -0
  32. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/.github/workflows/release.yml +0 -0
  33. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/.gitignore +0 -0
  34. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/LICENSE +0 -0
  35. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/README.md +0 -0
  36. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/.DS_Store +0 -0
  37. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  38. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  39. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Execution.ipynb +0 -0
  40. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  41. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Ingest.ipynb +0 -0
  42. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
  43. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/assets/ERD.png +0 -0
  44. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/assets/Launcher.png +0 -0
  45. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/assets/copy_minid.png +0 -0
  46. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/assets/deriva-logo.png +0 -0
  47. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/assets/deriva-ml.pdf +0 -0
  48. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/assets/sharing-at-home.pdf +0 -0
  49. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/dataset.md +0 -0
  50. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/dataset_aux_classes.md +0 -0
  51. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/dataset_bag.md +0 -0
  52. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/deriva_definitions.md +0 -0
  53. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/deriva_ml_base.md +0 -0
  54. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/deriva_model.md +0 -0
  55. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/execution.md +0 -0
  56. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/execution_configuration.md +0 -0
  57. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/feature.md +0 -0
  58. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/code-docs/upload.md +0 -0
  59. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/index.md +0 -0
  60. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/release-notes.md +0 -0
  61. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/datasets.md +0 -0
  62. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/deriva_ml_structure.md +0 -0
  63. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/execution-configuration.md +0 -0
  64. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/file-assets.md +0 -0
  65. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/identifiers.md +0 -0
  66. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/install.md +0 -0
  67. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/notebooks.md +0 -0
  68. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/docs/user-guide/overview.md +0 -0
  69. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/mkdocs.yml +0 -0
  70. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/setup.cfg +0 -0
  71. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/constants.py +0 -0
  72. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/definitions.py +0 -0
  73. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/enums.py +0 -0
  74. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/ermrest.py +0 -0
  75. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/exceptions.py +0 -0
  76. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/filespec.py +0 -0
  77. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/history.py +0 -0
  78. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/execution/environment.py +0 -0
  79. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/execution/workflow.py +0 -0
  80. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/feature.py +0 -0
  81. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/install_kernel.py +0 -0
  82. {deriva_ml-1.14.46/src/deriva_ml/execution → deriva_ml-1.16.0/src/deriva_ml/model}/__init__.py +0 -0
  83. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/model/sql_mapper.py +0 -0
  84. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/__init__.py +0 -0
  85. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/check_schema.py +0 -0
  86. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/create_schema.py +0 -0
  87. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/deriva-ml-reference.json +0 -0
  88. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/policy.json +0 -0
  89. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/schema/table_comments_utils.py +0 -0
  90. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  91. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  92. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
  93. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/__init__.py +0 -0
  94. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/conftest.py +0 -0
  95. {deriva_ml-1.14.46/src/deriva_ml/model → deriva_ml-1.16.0/tests/core}/__init__.py +0 -0
  96. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/core/test_basic_tables.py +0 -0
  97. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/core/test_file.py +0 -0
  98. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/core/test_vocabulary.py +0 -0
  99. {deriva_ml-1.14.46/tests/core → deriva_ml-1.16.0/tests/dataset}/__init__.py +0 -0
  100. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/dataset/demo-catalog-schema.json +0 -0
  101. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/dataset/deriva-ml-reference.json +0 -0
  102. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/dataset/eye-ai-catalog-schema.json +0 -0
  103. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/dataset/test_dataset_version.py +0 -0
  104. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/dataset/test_datasets.py +0 -0
  105. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/dataset/test_download.py +0 -0
  106. {deriva_ml-1.14.46/tests/dataset → deriva_ml-1.16.0/tests/execution}/__init__.py +0 -0
  107. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/execution/test_execution.py +0 -0
  108. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/execution/workflow-test.ipynb +0 -0
  109. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/execution/workflow-test.py +0 -0
  110. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/feature/test_features.py +0 -0
  111. {deriva_ml-1.14.46/tests/execution → deriva_ml-1.16.0/tests/model}/__init__.py +0 -0
  112. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/model/test_database.py +0 -0
  113. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/model/test_models.py +0 -0
  114. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/test-files/execution-parameters.json +0 -0
  115. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/test-files/notebook-parameters.json +0 -0
  116. {deriva_ml-1.14.46 → deriva_ml-1.16.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.14.46
3
+ Version: 1.16.0
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -21,6 +21,7 @@ Requires-Dist: nbstripout
21
21
  Requires-Dist: papermill
22
22
  Requires-Dist: pandas-stubs==2.2.3.250527
23
23
  Requires-Dist: pyyaml
24
+ Requires-Dist: hydra_zen
24
25
  Dynamic: license-file
25
26
 
26
27
  # DerivaML
@@ -22,7 +22,8 @@ dependencies = [
22
22
  "nbstripout",
23
23
  "papermill",
24
24
  "pandas-stubs==2.2.3.250527",
25
- "pyyaml"
25
+ "pyyaml",
26
+ "hydra_zen",
26
27
  ]
27
28
 
28
29
  [project.scripts]
@@ -0,0 +1,87 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+ from typing import TYPE_CHECKING
3
+
4
+ # Safe imports - no circular dependencies
5
+ from deriva_ml.core.config import DerivaMLConfig
6
+ from deriva_ml.core.definitions import (
7
+ RID,
8
+ BuiltinTypes,
9
+ ColumnDefinition,
10
+ DerivaAssetColumns,
11
+ DerivaSystemColumns,
12
+ ExecAssetType,
13
+ ExecMetadataType,
14
+ FileSpec,
15
+ FileUploadState,
16
+ ForeignKeyDefinition,
17
+ KeyDefinition,
18
+ MLAsset,
19
+ MLVocab,
20
+ TableDefinition,
21
+ UploadState,
22
+ )
23
+ from deriva_ml.core.exceptions import (
24
+ DerivaMLException,
25
+ DerivaMLInvalidTerm,
26
+ DerivaMLTableTypeError,
27
+ )
28
+ from deriva_ml.dataset.aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion
29
+
30
# Only the light-weight execution helpers are imported eagerly.  ``Execution``
# is deliberately NOT imported here: importing it at module load time would
# make the lazy ``Execution`` branch of ``__getattr__`` below unreachable and
# defeat the purpose of the lazy hook.
from .execution import ExecutionConfiguration, Workflow

# Type-checking only - avoid circular import at runtime
if TYPE_CHECKING:
    from deriva_ml.core.base import DerivaML
    from deriva_ml.execution.execution import Execution


# Lazy import function for runtime usage
def __getattr__(name):
    """Resolve lazily loaded package attributes (module-level ``__getattr__``, PEP 562).

    Called by the import system only when ``name`` is not already defined in
    this module's namespace.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The ``DerivaML`` or ``Execution`` class, imported on first access.

    Raises:
        AttributeError: If ``name`` is not one of the lazily provided names.
    """
    if name == "DerivaML":
        from deriva_ml.core.base import DerivaML

        return DerivaML
    elif name == "Execution":
        from deriva_ml.execution.execution import Execution

        return Execution
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
49
+
50
+
51
+ __all__ = [
52
+ "DerivaML", # Lazy-loaded
53
+ "DerivaMLConfig",
54
+ "DatasetConfig",
55
+ "DatasetConfigList",
56
+ "DatasetSpec",
57
+ "DatasetVersion",
58
+ "Execution",
59
+ "ExecutionConfiguration",
60
+ "Workflow",
61
+ # Exceptions
62
+ "DerivaMLException",
63
+ "DerivaMLInvalidTerm",
64
+ "DerivaMLTableTypeError",
65
+ # Definitions
66
+ "RID",
67
+ "BuiltinTypes",
68
+ "ColumnDefinition",
69
+ "DerivaSystemColumns",
70
+ "DerivaAssetColumns",
71
+ "ExecAssetType",
72
+ "ExecMetadataType",
73
+ "FileSpec",
74
+ "FileUploadState",
75
+ "ForeignKeyDefinition",
76
+ "KeyDefinition",
77
+ "MLAsset",
78
+ "MLVocab",
79
+ "TableDefinition",
80
+ "UploadState",
81
+ ]
82
+
83
try:
    __version__ = version("deriva_ml")
except PackageNotFoundError:
    # The distribution metadata is not available (package is not installed,
    # e.g. when running from a source checkout).  Still define __version__ so
    # that reading it never fails; the value is a valid PEP 440 placeholder.
    __version__ = "0.0.0+unknown"
@@ -105,7 +105,7 @@ def main() -> int:
105
105
 
106
106
  # Find latest semver tag with prefix
107
107
  tag = latest_semver_tag(prefix)
108
-
108
+ print(f"Latest semver tag: {tag}")
109
109
  if not tag:
110
110
  seed_initial_tag(f"{prefix}{start}")
111
111
  print(f"Seeded {prefix}{start}. Done.")
@@ -1,4 +1,5 @@
1
1
  from deriva_ml.core.base import DerivaML
2
+ from deriva_ml.core.config import DerivaMLConfig
2
3
  from deriva_ml.core.definitions import (
3
4
  RID,
4
5
  BuiltinTypes,
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De
17
18
 
18
19
  __all__ = [
19
20
  "DerivaML",
20
-
21
+ "DerivaMLConfig",
21
22
  # Exceptions
22
23
  "DerivaMLException",
23
24
  "DerivaMLInvalidTerm",
24
25
  "DerivaMLTableTypeError",
25
-
26
26
  # Definitions
27
27
  "RID",
28
28
  "BuiltinTypes",
@@ -15,7 +15,6 @@ from __future__ import annotations # noqa: I001
15
15
 
16
16
  # Standard library imports
17
17
  from collections import defaultdict
18
- import getpass
19
18
  import logging
20
19
  from datetime import datetime
21
20
  from itertools import chain
@@ -29,12 +28,7 @@ import requests
29
28
  from pydantic import ConfigDict, validate_call
30
29
 
31
30
  # Deriva imports
32
- from deriva.core import (
33
- DEFAULT_SESSION_CONFIG,
34
- format_exception,
35
- get_credential,
36
- urlquote,
37
- )
31
+ from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote, init_logging
38
32
 
39
33
  import deriva.core.datapath as datapath
40
34
  from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
@@ -55,6 +49,7 @@ from deriva_ml.core.definitions import (
55
49
  TableDefinition,
56
50
  VocabularyTerm,
57
51
  )
52
+ from deriva_ml.core.config import DerivaMLConfig
58
53
  from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
59
54
  from deriva_ml.dataset.aux_classes import DatasetSpec
60
55
  from deriva_ml.dataset.dataset import Dataset
@@ -116,8 +111,10 @@ class DerivaML(Dataset):
116
111
  project_name: str | None = None,
117
112
  cache_dir: str | Path | None = None,
118
113
  working_dir: str | Path | None = None,
114
+ hydra_runtime_output_dir: str | Path | None = None,
119
115
  ml_schema: str = ML_SCHEMA,
120
116
  logging_level=logging.WARNING,
117
+ deriva_logging_level=logging.WARNING,
121
118
  credential=None,
122
119
  use_minid: bool = True,
123
120
  check_auth: bool = True,
@@ -166,12 +163,10 @@ class DerivaML(Dataset):
166
163
  self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
167
164
 
168
165
  # Set up working and cache directories
169
- default_workdir = self.__class__.__name__ + "_working"
170
- self.working_dir = (
171
- Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
172
- ) / default_workdir
173
-
166
+ self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
174
167
  self.working_dir.mkdir(parents=True, exist_ok=True)
168
+ self.hydra_runtime_output_dir = hydra_runtime_output_dir
169
+
175
170
  self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
176
171
  self.cache_dir.mkdir(parents=True, exist_ok=True)
177
172
 
@@ -182,6 +177,11 @@ class DerivaML(Dataset):
182
177
  self._logger = logging.getLogger("deriva_ml")
183
178
  self._logger.setLevel(logging_level)
184
179
 
180
+ # Configure deriva logging level
181
+ init_logging(deriva_logging_level)
182
+ logging.getLogger("bagit").setLevel(deriva_logging_level)
183
+ logging.getLogger("bdbag").setLevel(deriva_logging_level)
184
+
185
185
  # Store instance configuration
186
186
  self.host_name = hostname
187
187
  self.catalog_id = catalog_id
@@ -0,0 +1,67 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from hydra.conf import HydraConf, RunDir
6
+ from hydra.core.hydra_config import HydraConfig
7
+ from hydra_zen import store
8
+ from omegaconf import OmegaConf
9
+ from pydantic import BaseModel, model_validator
10
+
11
+ from deriva_ml.core.definitions import ML_SCHEMA
12
+
13
+
14
class DerivaMLConfig(BaseModel):
    # Pydantic model holding the settings used to construct a DerivaML instance.
    # The fields mirror the keyword arguments of DerivaML.__init__.
    # (A comment rather than a class docstring is used on purpose: pydantic
    # would otherwise publish the docstring as the schema description.)
    hostname: str
    catalog_id: str | int = 1
    domain_schema: str | None = None
    project_name: str | None = None
    cache_dir: str | Path | None = None
    working_dir: str | Path | None = None
    # Filled in by init_working_dir when running under Hydra.
    hydra_runtime_output_dir: str | Path | None = None
    ml_schema: str = ML_SCHEMA
    logging_level: Any = logging.WARNING
    deriva_logging_level: Any = logging.WARNING
    credential: Any = None
    use_minid: bool = True
    check_auth: bool = True

    @model_validator(mode="after")
    def init_working_dir(self):
        """
        Resolve the working directory and record the Hydra output directory.

        ``working_dir`` is replaced by the absolute path computed by
        :meth:`compute_workdir` (``~/deriva-ml`` when no directory was given).

        This repeats what DerivaML.__init__ does, but it is done here as well so that
        the working directory is available to hydra.

        ``hydra_runtime_output_dir`` is taken from the active Hydra run. When the
        model is created outside of a Hydra application there is no HydraConfig to
        read, so the field is left as supplied instead of failing validation.

        Returns:
            Self: The object instance with the working directory initialized.
        """

        self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
        # HydraConfig.get() raises if Hydra has not been set up, so check first.
        if HydraConfig.initialized():
            self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
        return self

    @staticmethod
    def compute_workdir(working_dir: str | Path | None) -> Path:
        """Return the absolute working directory.

        Args:
            working_dir: Requested directory; any falsy value selects the default.

        Returns:
            ``Path(working_dir)`` made absolute, or ``~/deriva-ml`` when no
            directory was provided.
        """
        # Create a default working directory if none is provided
        working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
        return working_dir.absolute()
55
+
56
+
57
# Import-time registration; the statements below run once when this module is imported.
#
# 1. Expose DerivaMLConfig.compute_workdir to OmegaConf interpolation under the
#    resolver name "compute_workdir".  replace=True is passed so that registering
#    the same name again (e.g. on a repeated import) does not fail - per OmegaConf's
#    documented behaviour; confirm if relied upon.
OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
# 2. Store a Hydra configuration (group "hydra", name "config") whose run directory is
#    "<compute_workdir(deriva_ml.working_dir)>/hydra/<timestamp>".  The template reads the
#    `deriva_ml.working_dir` key of the composed config, so that key is expected to exist.
store(
    HydraConf(
        run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
        output_subdir="hydra-config",
    ),
    group="hydra",
    name="config",
)

# 3. Push the hydra-zen store into Hydra's own config store.
store.add_to_hydra_store()
@@ -0,0 +1,17 @@
1
+ from typing import Protocol, runtime_checkable
2
+
3
+ from deriva_ml.core.definitions import RID
4
+
5
+ from .aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion, VersionPart
6
+ from .dataset import Dataset
7
+ from .dataset_bag import DatasetBag
8
+
9
+ __all__ = [
10
+ "Dataset",
11
+ "DatasetSpec",
12
+ "DatasetConfig",
13
+ "DatasetConfigList",
14
+ "DatasetBag",
15
+ "DatasetVersion",
16
+ "VersionPart",
17
+ ]
@@ -5,6 +5,7 @@ THis module defines the DataSet class with is used to manipulate n
5
5
  from enum import Enum
6
6
  from typing import Any, Optional, SupportsInt
7
7
 
8
+ from hydra_zen import hydrated_dataclass
8
9
  from pydantic import (
9
10
  BaseModel,
10
11
  ConfigDict,
@@ -182,8 +183,9 @@ class DatasetSpec(BaseModel):
182
183
  """
183
184
 
184
185
  rid: RID
185
- materialize: bool = True
186
186
  version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
187
+ materialize: bool = True
188
+ description: str = ""
187
189
 
188
190
  model_config = ConfigDict(arbitrary_types_allowed=True)
189
191
 
@@ -208,3 +210,20 @@ class DatasetSpec(BaseModel):
208
210
  @field_serializer("version")
209
211
  def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
210
212
  return version.to_dict()
213
+
214
+
215
# hydra-zen structured config for one dataset reference.  The decorator is given
# DatasetSpec as its target, so the fields below line up with DatasetSpec's fields
# (presumably instantiating this config yields a DatasetSpec - confirm against hydra-zen).
@hydrated_dataclass(DatasetSpec)
class DatasetConfig:
    rid: str  # corresponds to DatasetSpec.rid
    version: str  # version supplied as a string; DatasetSpec.version also accepts str
    materialize: bool = True  # same default as DatasetSpec.materialize
    description: str = ""  # same default as DatasetSpec.description
221
+
222
# Pydantic model grouping several DatasetSpec entries under one optional description.
# It is the target of the DatasetConfigList hydra-zen config defined in this module.
class DatasetList(BaseModel):
    datasets: list[DatasetSpec]  # the dataset specifications in this group
    description: str = ""  # free-text description; empty by default
225
+
226
# hydra-zen structured config whose target is DatasetList: a list of DatasetConfig
# entries plus an optional description.
@hydrated_dataclass(DatasetList)
class DatasetConfigList:
    datasets: list[DatasetConfig]  # one DatasetConfig per dataset
    description: str = ""  # same default as DatasetList.description
@@ -22,10 +22,11 @@ Typical usage example:
22
22
 
23
23
  from __future__ import annotations
24
24
 
25
- # Standard library imports
26
25
  import json
27
26
  import logging
28
27
  from collections import defaultdict
28
+
29
+ # Standard library imports
29
30
  from graphlib import TopologicalSorter
30
31
  from pathlib import Path
31
32
  from tempfile import TemporaryDirectory
@@ -1138,7 +1139,7 @@ class Dataset:
1138
1139
  with TemporaryDirectory() as tmp_dir:
1139
1140
  if self._use_minid:
1140
1141
  # Get bag from S3
1141
- archive_path = fetch_single_file(minid.bag_url)
1142
+ archive_path = fetch_single_file(minid.bag_url, output_path=tmp_dir)
1142
1143
  else:
1143
1144
  exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
1144
1145
  archive_path = exporter.retrieve_file(minid.bag_url)
@@ -266,6 +266,22 @@ class DatasetBag:
266
266
  sql_cmd = f'SELECT * FROM "{feature_table}"'
267
267
  return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
268
268
 
269
def list_dataset_element_types(self) -> list[Table]:
    """
    List the element types that can appear in a dataset.

    This is a thin delegate: the request is forwarded unchanged to the model
    attached to this bag and its answer is handed back to the caller.

    Returns:
        list[Table]: The result of ``self.model.list_dataset_element_types()``.
    """
    element_types = self.model.list_dataset_element_types()
    return element_types
284
+
269
285
  def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
270
286
  """Get nested datasets.
271
287
 
@@ -333,6 +349,105 @@ class DatasetBag:
333
349
  # Term not found
334
350
  raise DerivaMLInvalidTerm(vocab_table, term_name)
335
351
 
352
def _denormalize(self, include_tables: list[str] | None) -> str:
    """
    Generate the SQL statement that denormalizes the dataset into one wide table.

    The join plan (table order, join conditions, selected columns, nested datasets and
    element tables) is obtained from ``self.model._prepare_wide_table``; this method only
    renders that plan as SQL.

    Args:
        include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
            all tables from the dataset will be included.

    Returns:
        str: SQL query string that represents the process of denormalization.
    """

    def column_name(col: Column) -> str:
        # Fully qualified, quoted "table"."column" reference.
        return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'

    def string_literal(value) -> str:
        # SQL string literals use single quotes (embedded quotes are doubled).  Double
        # quotes denote identifiers and are only accepted as strings by a SQLite quirk.
        return "'" + str(value).replace("'", "''") + "'"

    # Skip over tables that we don't want to include in the denormalized dataset.
    # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
    # table.

    join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables = (
        self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
    )

    select_args = [
        # SQLite will strip out the table name from the column in the select statement, so we need to add
        # an explicit alias to the column name.
        f'"{self.model.normalize_table_name(table_name)}"."{column}" AS "{table_name}.{column}"'
        for table_name, column in denormalized_columns
    ]

    # First table in the table list is the table specified in the method call.
    normalized_join_tables = [self.model.normalize_table_name(t) for t in join_tables]
    clauses = [f'SELECT {",".join(select_args)} FROM "{normalized_join_tables[0]}"']
    for t in normalized_join_tables[1:]:
        # A table may be reachable through several key pairs; any one of them joins the row.
        conditions = " OR ".join(f"{column_name(o[0])} = {column_name(o[1])}" for o in tables[t])
        clauses.append(f'LEFT JOIN "{t}" ON {conditions}')

    # Select only rows from the datasets you wish to include.
    dataset_rid_list = ",".join(
        string_literal(rid) for rid in [self.dataset_rid, *(b.dataset_rid for b in dataset_rids)]
    )
    where = f'WHERE "{self.model.normalize_table_name("Dataset")}"."RID" IN ({dataset_rid_list})'

    # Only include rows that have actual values in them.  With no element tables there is
    # nothing to test, and an empty "AND ()" would be invalid SQL, so the filter is omitted.
    real_row = [f'"{self.model.normalize_table_name(t)}".RID IS NOT NULL' for t in dataset_element_tables]
    if real_row:
        where += f" AND ({' OR '.join(real_row)})"
    clauses.append(where)

    # Clauses are separated by single spaces so keywords never run into the previous token.
    return " ".join(clauses)
399
+
400
def denormalize_as_dataframe(self, include_tables: list[str] | None = None) -> pd.DataFrame:
    """
    Denormalize the dataset and return the wide table as a pandas dataframe.

    The SQL for the wide table is produced by ``_denormalize`` and executed against this
    bag's database via ``pandas.read_sql``.

    ``include_tables`` may name a subset of tables to appear in the denormalized view; it
    is passed through to ``_denormalize`` unchanged. When it is None, no subset is
    requested.

    Args:
        include_tables: List of table names to include in the denormalized dataset. If None, then the entire
            schema is used.

    Returns:
        Dataframe containing the denormalized dataset.
    """
    wide_table_query = self._denormalize(include_tables=include_tables)
    frame = pd.read_sql(wide_table_query, self.database)
    return frame
422
+
423
def denormalize_as_dict(self, include_tables: list[str] | None = None) -> Generator[dict[str, Any], None, None]:
    """
    Denormalize the dataset and yield the wide table one row at a time.

    The SQL for the wide table is produced by ``_denormalize`` and executed against this
    bag's database. Each result row is yielded as a dictionary keyed by the column names
    reported by the cursor.

    ``include_tables`` may name a subset of tables to appear in the denormalized view; it
    is passed through to ``_denormalize`` unchanged. When it is None, no subset is
    requested.

    Args:
        include_tables: List of table names to include in the denormalized dataset. If None, then the entire
            schema is used.

    Returns:
        A generator that returns a dictionary representation of each row in the denormalized dataset.
    """
    with self.database as connection:
        result = connection.execute(self._denormalize(include_tables=include_tables))
        # The first item of each cursor.description entry is the column name.
        field_names = [meta[0] for meta in result.description]
        yield from (dict(zip(field_names, record)) for record in result)
450
+
336
451
 
337
452
  # Add annotations after definition to deal with forward reference issues in pydantic
338
453
 
@@ -412,6 +412,7 @@ def asset_file_path(
412
412
  "Description",
413
413
  }.union(set(DerivaSystemColumns))
414
414
  asset_metadata = {c.name for c in asset_table.columns} - asset_columns
415
+
415
416
  if not (asset_metadata >= set(metadata.keys())):
416
417
  raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
417
418
 
@@ -367,7 +367,7 @@ def create_demo_catalog(
367
367
  create_features=False,
368
368
  create_datasets=False,
369
369
  on_exit_delete=True,
370
- logging_level=logging.INFO,
370
+ logging_level=logging.WARNING,
371
371
  ) -> ErmrestCatalog:
372
372
  test_catalog = create_ml_catalog(hostname, project_name=project_name)
373
373
  if on_exit_delete:
@@ -0,0 +1,25 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ # Safe imports - no circular dependencies
4
+ from deriva_ml.execution.execution_configuration import ExecutionConfiguration
5
+ from deriva_ml.execution.workflow import Workflow
6
+
7
+ if TYPE_CHECKING:
8
+ from deriva_ml.execution.execution import Execution
9
+
10
+
11
+ # Lazy import for runtime
12
def __getattr__(name):
    """Provide ``Execution`` on first access so it is not imported at package load time."""
    if name != "Execution":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred import: only performed when the attribute is actually requested.
    from deriva_ml.execution.execution import Execution

    return Execution
19
+
20
+
21
+ __all__ = [
22
+ "Execution", # Lazy-loaded
23
+ "ExecutionConfiguration",
24
+ "Workflow",
25
+ ]