deriva-ml 1.14.47__tar.gz → 1.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/PKG-INFO +2 -1
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/pyproject.toml +2 -1
- deriva_ml-1.16.0/src/deriva_ml/__init__.py +87 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/__init__.py +2 -2
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/base.py +12 -12
- deriva_ml-1.16.0/src/deriva_ml/core/config.py +67 -0
- deriva_ml-1.16.0/src/deriva_ml/dataset/__init__.py +17 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/dataset/aux_classes.py +20 -1
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/dataset/dataset.py +2 -1
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/dataset/dataset_bag.py +115 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/dataset/upload.py +1 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/demo_catalog.py +1 -1
- deriva_ml-1.16.0/src/deriva_ml/execution/__init__.py +25 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/execution/execution.py +46 -26
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/execution/execution_configuration.py +8 -32
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/model/catalog.py +113 -1
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/model/database.py +2 -2
- deriva_ml-1.16.0/src/deriva_ml/protocols/dataset.py +19 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/run_notebook.py +55 -50
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/annotations.py +7 -5
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/PKG-INFO +2 -1
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/SOURCES.txt +2 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/requires.txt +1 -0
- deriva_ml-1.16.0/uv.lock +4073 -0
- deriva_ml-1.14.47/src/deriva_ml/__init__.py +0 -48
- deriva_ml-1.14.47/src/deriva_ml/dataset/__init__.py +0 -4
- deriva_ml-1.14.47/tests/model/__init__.py +0 -0
- deriva_ml-1.14.47/uv.lock +0 -3823
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/.github/release-drafter.yml +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/.github/workflows/publish-docs.yml +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/.github/workflows/release.yml +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/.gitignore +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/LICENSE +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/README.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/.DS_Store +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Execution.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Ingest.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/assets/ERD.png +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/assets/Launcher.png +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/assets/copy_minid.png +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/assets/deriva-logo.png +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/assets/deriva-ml.pdf +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/assets/sharing-at-home.pdf +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/dataset.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/dataset_aux_classes.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/dataset_bag.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/deriva_definitions.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/deriva_ml_base.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/deriva_model.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/execution.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/execution_configuration.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/feature.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/code-docs/upload.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/index.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/release-notes.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/datasets.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/deriva_ml_structure.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/execution-configuration.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/file-assets.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/identifiers.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/install.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/notebooks.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/docs/user-guide/overview.md +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/mkdocs.yml +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/setup.cfg +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/bump_version.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/constants.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/definitions.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/enums.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/ermrest.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/exceptions.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/core/filespec.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/dataset/history.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/execution/environment.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/execution/workflow.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/feature.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/install_kernel.py +0 -0
- {deriva_ml-1.14.47/src/deriva_ml/execution → deriva_ml-1.16.0/src/deriva_ml/model}/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/model/sql_mapper.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/check_schema.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/create_schema.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/deriva-ml-reference.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/policy.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml/schema/table_comments_utils.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/conftest.py +0 -0
- {deriva_ml-1.14.47/src/deriva_ml/model → deriva_ml-1.16.0/tests/core}/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/core/test_basic_tables.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/core/test_file.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/core/test_vocabulary.py +0 -0
- {deriva_ml-1.14.47/tests/core → deriva_ml-1.16.0/tests/dataset}/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/dataset/demo-catalog-schema.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/dataset/deriva-ml-reference.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/dataset/eye-ai-catalog-schema.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/dataset/test_dataset_version.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/dataset/test_datasets.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/dataset/test_download.py +0 -0
- {deriva_ml-1.14.47/tests/dataset → deriva_ml-1.16.0/tests/execution}/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/execution/test_execution.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/execution/workflow-test.ipynb +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/execution/workflow-test.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/feature/test_features.py +0 -0
- {deriva_ml-1.14.47/tests/execution → deriva_ml-1.16.0/tests/model}/__init__.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/model/test_database.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/model/test_models.py +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/test-files/execution-parameters.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/test-files/notebook-parameters.json +0 -0
- {deriva_ml-1.14.47 → deriva_ml-1.16.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.14.47
|
|
3
|
+
Version: 1.16.0
|
|
4
4
|
Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -21,6 +21,7 @@ Requires-Dist: nbstripout
|
|
|
21
21
|
Requires-Dist: papermill
|
|
22
22
|
Requires-Dist: pandas-stubs==2.2.3.250527
|
|
23
23
|
Requires-Dist: pyyaml
|
|
24
|
+
Requires-Dist: hydra_zen
|
|
24
25
|
Dynamic: license-file
|
|
25
26
|
|
|
26
27
|
# DerivaML
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
# Safe imports - no circular dependencies
|
|
5
|
+
from deriva_ml.core.config import DerivaMLConfig
|
|
6
|
+
from deriva_ml.core.definitions import (
|
|
7
|
+
RID,
|
|
8
|
+
BuiltinTypes,
|
|
9
|
+
ColumnDefinition,
|
|
10
|
+
DerivaAssetColumns,
|
|
11
|
+
DerivaSystemColumns,
|
|
12
|
+
ExecAssetType,
|
|
13
|
+
ExecMetadataType,
|
|
14
|
+
FileSpec,
|
|
15
|
+
FileUploadState,
|
|
16
|
+
ForeignKeyDefinition,
|
|
17
|
+
KeyDefinition,
|
|
18
|
+
MLAsset,
|
|
19
|
+
MLVocab,
|
|
20
|
+
TableDefinition,
|
|
21
|
+
UploadState,
|
|
22
|
+
)
|
|
23
|
+
from deriva_ml.core.exceptions import (
|
|
24
|
+
DerivaMLException,
|
|
25
|
+
DerivaMLInvalidTerm,
|
|
26
|
+
DerivaMLTableTypeError,
|
|
27
|
+
)
|
|
28
|
+
from deriva_ml.dataset.aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion
|
|
29
|
+
|
|
30
|
+
from .execution import Execution, ExecutionConfiguration, Workflow
|
|
31
|
+
|
|
32
|
+
# Type-checking only - avoid circular import at runtime
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from deriva_ml.core.base import DerivaML
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Lazy import function for runtime usage
|
|
38
|
+
def __getattr__(name):
|
|
39
|
+
"""Lazy import to avoid circular dependencies."""
|
|
40
|
+
if name == "DerivaML":
|
|
41
|
+
from deriva_ml.core.base import DerivaML
|
|
42
|
+
|
|
43
|
+
return DerivaML
|
|
44
|
+
elif name == "Execution":
|
|
45
|
+
from deriva_ml.execution.execution import Execution
|
|
46
|
+
|
|
47
|
+
return Execution
|
|
48
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
__all__ = [
|
|
52
|
+
"DerivaML", # Lazy-loaded
|
|
53
|
+
"DerivaMLConfig",
|
|
54
|
+
"DatasetConfig",
|
|
55
|
+
"DatasetConfigList",
|
|
56
|
+
"DatasetSpec",
|
|
57
|
+
"DatasetVersion",
|
|
58
|
+
"Execution",
|
|
59
|
+
"ExecutionConfiguration",
|
|
60
|
+
"Workflow",
|
|
61
|
+
# Exceptions
|
|
62
|
+
"DerivaMLException",
|
|
63
|
+
"DerivaMLInvalidTerm",
|
|
64
|
+
"DerivaMLTableTypeError",
|
|
65
|
+
# Definitions
|
|
66
|
+
"RID",
|
|
67
|
+
"BuiltinTypes",
|
|
68
|
+
"ColumnDefinition",
|
|
69
|
+
"DerivaSystemColumns",
|
|
70
|
+
"DerivaAssetColumns",
|
|
71
|
+
"ExecAssetType",
|
|
72
|
+
"ExecMetadataType",
|
|
73
|
+
"FileSpec",
|
|
74
|
+
"FileUploadState",
|
|
75
|
+
"ForeignKeyDefinition",
|
|
76
|
+
"KeyDefinition",
|
|
77
|
+
"MLAsset",
|
|
78
|
+
"MLVocab",
|
|
79
|
+
"TableDefinition",
|
|
80
|
+
"UploadState",
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
try:
|
|
84
|
+
__version__ = version("deriva_ml")
|
|
85
|
+
except PackageNotFoundError:
|
|
86
|
+
# package is not installed
|
|
87
|
+
pass
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from deriva_ml.core.base import DerivaML
|
|
2
|
+
from deriva_ml.core.config import DerivaMLConfig
|
|
2
3
|
from deriva_ml.core.definitions import (
|
|
3
4
|
RID,
|
|
4
5
|
BuiltinTypes,
|
|
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De
|
|
|
17
18
|
|
|
18
19
|
__all__ = [
|
|
19
20
|
"DerivaML",
|
|
20
|
-
|
|
21
|
+
"DerivaMLConfig",
|
|
21
22
|
# Exceptions
|
|
22
23
|
"DerivaMLException",
|
|
23
24
|
"DerivaMLInvalidTerm",
|
|
24
25
|
"DerivaMLTableTypeError",
|
|
25
|
-
|
|
26
26
|
# Definitions
|
|
27
27
|
"RID",
|
|
28
28
|
"BuiltinTypes",
|
|
@@ -15,7 +15,6 @@ from __future__ import annotations # noqa: I001
|
|
|
15
15
|
|
|
16
16
|
# Standard library imports
|
|
17
17
|
from collections import defaultdict
|
|
18
|
-
import getpass
|
|
19
18
|
import logging
|
|
20
19
|
from datetime import datetime
|
|
21
20
|
from itertools import chain
|
|
@@ -29,12 +28,7 @@ import requests
|
|
|
29
28
|
from pydantic import ConfigDict, validate_call
|
|
30
29
|
|
|
31
30
|
# Deriva imports
|
|
32
|
-
from deriva.core import
|
|
33
|
-
DEFAULT_SESSION_CONFIG,
|
|
34
|
-
format_exception,
|
|
35
|
-
get_credential,
|
|
36
|
-
urlquote,
|
|
37
|
-
)
|
|
31
|
+
from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote, init_logging
|
|
38
32
|
|
|
39
33
|
import deriva.core.datapath as datapath
|
|
40
34
|
from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
|
|
@@ -55,6 +49,7 @@ from deriva_ml.core.definitions import (
|
|
|
55
49
|
TableDefinition,
|
|
56
50
|
VocabularyTerm,
|
|
57
51
|
)
|
|
52
|
+
from deriva_ml.core.config import DerivaMLConfig
|
|
58
53
|
from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
|
|
59
54
|
from deriva_ml.dataset.aux_classes import DatasetSpec
|
|
60
55
|
from deriva_ml.dataset.dataset import Dataset
|
|
@@ -116,8 +111,10 @@ class DerivaML(Dataset):
|
|
|
116
111
|
project_name: str | None = None,
|
|
117
112
|
cache_dir: str | Path | None = None,
|
|
118
113
|
working_dir: str | Path | None = None,
|
|
114
|
+
hydra_runtime_output_dir: str | Path | None = None,
|
|
119
115
|
ml_schema: str = ML_SCHEMA,
|
|
120
116
|
logging_level=logging.WARNING,
|
|
117
|
+
deriva_logging_level=logging.WARNING,
|
|
121
118
|
credential=None,
|
|
122
119
|
use_minid: bool = True,
|
|
123
120
|
check_auth: bool = True,
|
|
@@ -166,12 +163,10 @@ class DerivaML(Dataset):
|
|
|
166
163
|
self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
|
|
167
164
|
|
|
168
165
|
# Set up working and cache directories
|
|
169
|
-
|
|
170
|
-
self.working_dir = (
|
|
171
|
-
Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
|
|
172
|
-
) / default_workdir
|
|
173
|
-
|
|
166
|
+
self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
|
|
174
167
|
self.working_dir.mkdir(parents=True, exist_ok=True)
|
|
168
|
+
self.hydra_runtime_output_dir = hydra_runtime_output_dir
|
|
169
|
+
|
|
175
170
|
self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
|
|
176
171
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
177
172
|
|
|
@@ -182,6 +177,11 @@ class DerivaML(Dataset):
|
|
|
182
177
|
self._logger = logging.getLogger("deriva_ml")
|
|
183
178
|
self._logger.setLevel(logging_level)
|
|
184
179
|
|
|
180
|
+
# Configure deriva logging level
|
|
181
|
+
init_logging(deriva_logging_level)
|
|
182
|
+
logging.getLogger("bagit").setLevel(deriva_logging_level)
|
|
183
|
+
logging.getLogger("bdbag").setLevel(deriva_logging_level)
|
|
184
|
+
|
|
185
185
|
# Store instance configuration
|
|
186
186
|
self.host_name = hostname
|
|
187
187
|
self.catalog_id = catalog_id
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from hydra.conf import HydraConf, RunDir
|
|
6
|
+
from hydra.core.hydra_config import HydraConfig
|
|
7
|
+
from hydra_zen import store
|
|
8
|
+
from omegaconf import OmegaConf
|
|
9
|
+
from pydantic import BaseModel, model_validator
|
|
10
|
+
|
|
11
|
+
from deriva_ml.core.definitions import ML_SCHEMA
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DerivaMLConfig(BaseModel):
|
|
15
|
+
hostname: str
|
|
16
|
+
catalog_id: str | int = 1
|
|
17
|
+
domain_schema: str | None = None
|
|
18
|
+
project_name: str | None = None
|
|
19
|
+
cache_dir: str | Path | None = None
|
|
20
|
+
working_dir: str | Path | None = None
|
|
21
|
+
hydra_runtime_output_dir: str | Path | None = None
|
|
22
|
+
ml_schema: str = ML_SCHEMA
|
|
23
|
+
logging_level: Any = logging.WARNING
|
|
24
|
+
deriva_logging_level: Any = logging.WARNING
|
|
25
|
+
credential: Any = None
|
|
26
|
+
use_minid: bool = True
|
|
27
|
+
check_auth: bool = True
|
|
28
|
+
|
|
29
|
+
@model_validator(mode="after")
|
|
30
|
+
def init_working_dir(self):
|
|
31
|
+
"""
|
|
32
|
+
Sets up the working directory for the model.
|
|
33
|
+
|
|
34
|
+
This method configures the working directory, ensuring that all required
|
|
35
|
+
file operations are performed in the appropriate location. If the user does not
|
|
36
|
+
specify a directory, a default directory based on the user's home directory
|
|
37
|
+
or username will be used.
|
|
38
|
+
|
|
39
|
+
This is a repeat of what is in the DerivaML.__init__, but we put this here so that the working
|
|
40
|
+
directory is available to hydra.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
Self: The object instance with the working directory initialized.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
|
|
47
|
+
self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
|
|
48
|
+
return self
|
|
49
|
+
|
|
50
|
+
@staticmethod
|
|
51
|
+
def compute_workdir(working_dir) -> Path:
|
|
52
|
+
# Create a default working directory if none is provided
|
|
53
|
+
working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
|
|
54
|
+
return working_dir.absolute()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
|
|
58
|
+
store(
|
|
59
|
+
HydraConf(
|
|
60
|
+
run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
|
|
61
|
+
output_subdir="hydra-config",
|
|
62
|
+
),
|
|
63
|
+
group="hydra",
|
|
64
|
+
name="config",
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
store.add_to_hydra_store()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from typing import Protocol, runtime_checkable
|
|
2
|
+
|
|
3
|
+
from deriva_ml.core.definitions import RID
|
|
4
|
+
|
|
5
|
+
from .aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion, VersionPart
|
|
6
|
+
from .dataset import Dataset
|
|
7
|
+
from .dataset_bag import DatasetBag
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"Dataset",
|
|
11
|
+
"DatasetSpec",
|
|
12
|
+
"DatasetConfig",
|
|
13
|
+
"DatasetConfigList",
|
|
14
|
+
"DatasetBag",
|
|
15
|
+
"DatasetVersion",
|
|
16
|
+
"VersionPart",
|
|
17
|
+
]
|
|
@@ -5,6 +5,7 @@ THis module defines the DataSet class with is used to manipulate n
|
|
|
5
5
|
from enum import Enum
|
|
6
6
|
from typing import Any, Optional, SupportsInt
|
|
7
7
|
|
|
8
|
+
from hydra_zen import hydrated_dataclass
|
|
8
9
|
from pydantic import (
|
|
9
10
|
BaseModel,
|
|
10
11
|
ConfigDict,
|
|
@@ -182,8 +183,9 @@ class DatasetSpec(BaseModel):
|
|
|
182
183
|
"""
|
|
183
184
|
|
|
184
185
|
rid: RID
|
|
185
|
-
materialize: bool = True
|
|
186
186
|
version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
|
|
187
|
+
materialize: bool = True
|
|
188
|
+
description: str = ""
|
|
187
189
|
|
|
188
190
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
189
191
|
|
|
@@ -208,3 +210,20 @@ class DatasetSpec(BaseModel):
|
|
|
208
210
|
@field_serializer("version")
|
|
209
211
|
def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
|
|
210
212
|
return version.to_dict()
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
@hydrated_dataclass(DatasetSpec)
|
|
216
|
+
class DatasetConfig:
|
|
217
|
+
rid: str
|
|
218
|
+
version: str
|
|
219
|
+
materialize: bool = True
|
|
220
|
+
description: str = ""
|
|
221
|
+
|
|
222
|
+
class DatasetList(BaseModel):
|
|
223
|
+
datasets: list[DatasetSpec]
|
|
224
|
+
description: str = ""
|
|
225
|
+
|
|
226
|
+
@hydrated_dataclass(DatasetList)
|
|
227
|
+
class DatasetConfigList:
|
|
228
|
+
datasets: list[DatasetConfig]
|
|
229
|
+
description: str = ""
|
|
@@ -22,10 +22,11 @@ Typical usage example:
|
|
|
22
22
|
|
|
23
23
|
from __future__ import annotations
|
|
24
24
|
|
|
25
|
-
# Standard library imports
|
|
26
25
|
import json
|
|
27
26
|
import logging
|
|
28
27
|
from collections import defaultdict
|
|
28
|
+
|
|
29
|
+
# Standard library imports
|
|
29
30
|
from graphlib import TopologicalSorter
|
|
30
31
|
from pathlib import Path
|
|
31
32
|
from tempfile import TemporaryDirectory
|
|
@@ -266,6 +266,22 @@ class DatasetBag:
|
|
|
266
266
|
sql_cmd = f'SELECT * FROM "{feature_table}"'
|
|
267
267
|
return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
|
|
268
268
|
|
|
269
|
+
def list_dataset_element_types(self) -> list[Table]:
|
|
270
|
+
"""
|
|
271
|
+
Lists the data types of elements contained within a dataset.
|
|
272
|
+
|
|
273
|
+
This method analyzes the dataset and identifies the data types for all
|
|
274
|
+
elements within it. It is useful for understanding the structure and
|
|
275
|
+
content of the dataset and allows for better manipulation and usage of its
|
|
276
|
+
data.
|
|
277
|
+
|
|
278
|
+
Returns:
|
|
279
|
+
list[Table]: A list of tables where each table represents a data type
|
|
280
|
+
of an element found in the dataset.
|
|
281
|
+
|
|
282
|
+
"""
|
|
283
|
+
return self.model.list_dataset_element_types()
|
|
284
|
+
|
|
269
285
|
def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
|
|
270
286
|
"""Get nested datasets.
|
|
271
287
|
|
|
@@ -333,6 +349,105 @@ class DatasetBag:
|
|
|
333
349
|
# Term not found
|
|
334
350
|
raise DerivaMLInvalidTerm(vocab_table, term_name)
|
|
335
351
|
|
|
352
|
+
def _denormalize(self, include_tables: list[str] | None) -> str:
|
|
353
|
+
"""
|
|
354
|
+
Generates an SQL statement for denormalizing the dataset based on the tables to include. Processes cycles in
|
|
355
|
+
graph relationships, ensures proper join order, and generates selected columns for denormalization.
|
|
356
|
+
|
|
357
|
+
Args:
|
|
358
|
+
include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
|
|
359
|
+
all tables from the dataset will be included.
|
|
360
|
+
|
|
361
|
+
Returns:
|
|
362
|
+
str: SQL query string that represents the process of denormalization.
|
|
363
|
+
"""
|
|
364
|
+
|
|
365
|
+
def column_name(col: Column) -> str:
|
|
366
|
+
return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
|
|
367
|
+
|
|
368
|
+
# Skip over tables that we don't want to include in the denormalized dataset.
|
|
369
|
+
# Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
|
|
370
|
+
# table.
|
|
371
|
+
|
|
372
|
+
join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables = (
|
|
373
|
+
self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
select_args = [
|
|
377
|
+
# SQLite will strip out the table name from the column in the select statement, so we need to add
|
|
378
|
+
# an explicit alias to the column name.
|
|
379
|
+
f'"{self.model.normalize_table_name(table_name)}"."{column_name}" AS "{table_name}.{column_name}"'
|
|
380
|
+
for table_name, column_name in denormalized_columns
|
|
381
|
+
]
|
|
382
|
+
|
|
383
|
+
# First table in the table list is the table specified in the method call.
|
|
384
|
+
normalized_join_tables = [self.model.normalize_table_name(t) for t in join_tables]
|
|
385
|
+
sql_statement = f'SELECT {",".join(select_args)} FROM "{normalized_join_tables[0]}"'
|
|
386
|
+
for t in normalized_join_tables[1:]:
|
|
387
|
+
on = tables[t]
|
|
388
|
+
sql_statement += f' LEFT JOIN "{t}" ON '
|
|
389
|
+
sql_statement += "OR ".join([f"{column_name(o[0])} = {column_name(o[1])}" for o in on])
|
|
390
|
+
|
|
391
|
+
# Select only rows from the datasets you wish to include.
|
|
392
|
+
dataset_rid_list = ",".join([f'"{self.dataset_rid}"'] + [f'"{b.dataset_rid}"' for b in dataset_rids])
|
|
393
|
+
sql_statement += f'WHERE "{self.model.normalize_table_name("Dataset")}"."RID" IN ({dataset_rid_list})'
|
|
394
|
+
|
|
395
|
+
# Only include rows that have actual values in them.
|
|
396
|
+
real_row = [f'"{self.model.normalize_table_name(t)}".RID IS NOT NULL ' for t in dataset_element_tables]
|
|
397
|
+
sql_statement += f" AND ({' OR '.join(real_row)})"
|
|
398
|
+
return sql_statement
|
|
399
|
+
|
|
400
|
+
def denormalize_as_dataframe(self, include_tables: list[str] | None = None) -> pd.DataFrame:
|
|
401
|
+
"""
|
|
402
|
+
Denormalize the dataset and return the result as a dataframe.
|
|
403
|
+
|
|
404
|
+
This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
|
|
405
|
+
the dataset values into a single wide table. The result is returned as a dataframe.
|
|
406
|
+
|
|
407
|
+
The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
|
|
408
|
+
view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
|
|
409
|
+
additional tables are required to complete the denormalization process. If include_tables is not specified,
|
|
410
|
+
all of the tables in the schema will be included.
|
|
411
|
+
|
|
412
|
+
The resulting wide table will include a column for every table needed to complete the denormalization process.
|
|
413
|
+
|
|
414
|
+
Args:
|
|
415
|
+
include_tables: List of table names to include in the denormalized dataset. If None, then the entire schema
|
|
416
|
+
is used.
|
|
417
|
+
|
|
418
|
+
Returns:
|
|
419
|
+
Dataframe containing the denormalized dataset.
|
|
420
|
+
"""
|
|
421
|
+
return pd.read_sql(self._denormalize(include_tables=include_tables), self.database)
|
|
422
|
+
|
|
423
|
+
def denormalize_as_dict(self, include_tables: list[str] | None = None) -> Generator[dict[str, Any], None, None]:
|
|
424
|
+
"""
|
|
425
|
+
Denormalize the dataset and return the result as a set of dictionaries.
|
|
426
|
+
|
|
427
|
+
This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
|
|
428
|
+
the dataset values into a single wide table. The result is returned as a generator that returns a dictionary
|
|
429
|
+
for each row in the denormalized wide table.
|
|
430
|
+
|
|
431
|
+
The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
|
|
432
|
+
view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
|
|
433
|
+
additional tables are required to complete the denormalization process. If include_tables is not specified,
|
|
434
|
+
all of the tables in the schema will be included.
|
|
435
|
+
|
|
436
|
+
The resulting wide table will include a column for every table needed to complete the denormalization process.
|
|
437
|
+
|
|
438
|
+
Args:
|
|
439
|
+
include_tables: List of table names to include in the denormalized dataset. If None, then the entire schema
|
|
440
|
+
is used.
|
|
441
|
+
|
|
442
|
+
Returns:
|
|
443
|
+
A generator that returns a dictionary representation of each row in the denormalized dataset.
|
|
444
|
+
"""
|
|
445
|
+
with self.database as dbase:
|
|
446
|
+
cursor = dbase.execute(self._denormalize(include_tables=include_tables))
|
|
447
|
+
columns = [desc[0] for desc in cursor.description]
|
|
448
|
+
for row in cursor:
|
|
449
|
+
yield dict(zip(columns, row))
|
|
450
|
+
|
|
336
451
|
|
|
337
452
|
# Add annotations after definition to deal with forward reference issues in pydantic
|
|
338
453
|
|
|
@@ -412,6 +412,7 @@ def asset_file_path(
|
|
|
412
412
|
"Description",
|
|
413
413
|
}.union(set(DerivaSystemColumns))
|
|
414
414
|
asset_metadata = {c.name for c in asset_table.columns} - asset_columns
|
|
415
|
+
|
|
415
416
|
if not (asset_metadata >= set(metadata.keys())):
|
|
416
417
|
raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
|
|
417
418
|
|
|
@@ -367,7 +367,7 @@ def create_demo_catalog(
|
|
|
367
367
|
create_features=False,
|
|
368
368
|
create_datasets=False,
|
|
369
369
|
on_exit_delete=True,
|
|
370
|
-
logging_level=logging.
|
|
370
|
+
logging_level=logging.WARNING,
|
|
371
371
|
) -> ErmrestCatalog:
|
|
372
372
|
test_catalog = create_ml_catalog(hostname, project_name=project_name)
|
|
373
373
|
if on_exit_delete:
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
# Safe imports - no circular dependencies
|
|
4
|
+
from deriva_ml.execution.execution_configuration import ExecutionConfiguration
|
|
5
|
+
from deriva_ml.execution.workflow import Workflow
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from deriva_ml.execution.execution import Execution
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Lazy import for runtime
|
|
12
|
+
def __getattr__(name):
|
|
13
|
+
"""Lazy import to avoid circular dependencies."""
|
|
14
|
+
if name == "Execution":
|
|
15
|
+
from deriva_ml.execution.execution import Execution
|
|
16
|
+
|
|
17
|
+
return Execution
|
|
18
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Execution", # Lazy-loaded
|
|
23
|
+
"ExecutionConfiguration",
|
|
24
|
+
"Workflow",
|
|
25
|
+
]
|