deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +31 -0
- deriva_ml/catalog/clone.py +1939 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +845 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
- deriva_ml-1.17.12.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
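The largest change is the rework of `deriva_ml/core/base.py`, shown in the diff below: `DerivaML` is now composed from per-concern mixins, and its constructor gains multi-schema and S3/MINID options. A minimal sketch of the new constructor surface, based only on the signature visible in the diff — the hostname, catalog id, and schema names are illustrative placeholders, not real endpoints:

```python
# Sketch of the 1.17.12 DerivaML constructor surface from the base.py diff below.
# Hostname, catalog id, bucket, and schema names are hypothetical placeholders.
from deriva_ml import DerivaML

ml = DerivaML(
    hostname="deriva.example.org",
    catalog_id="42",
    domain_schemas={"imaging", "clinical"},  # new: several domain schemas per catalog
    default_schema="imaging",                # new: target schema for create_table()
    s3_bucket="s3://my-bucket",              # new: enables MINID creation + S3 export
    use_minid=None,                          # new: None -> auto-enabled when s3_bucket is set
    clean_execution_dir=True,                # new: remove execution dirs after upload
)
```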
deriva_ml/core/base.py
CHANGED
|
@@ -14,56 +14,53 @@ Typical usage example:
|
|
|
14
14
|
from __future__ import annotations # noqa: I001
|
|
15
15
|
|
|
16
16
|
# Standard library imports
|
|
17
|
-
from collections import defaultdict
|
|
18
17
|
import logging
|
|
19
18
|
from datetime import datetime
|
|
20
|
-
from itertools import chain
|
|
21
19
|
from pathlib import Path
|
|
22
|
-
from typing import Dict,
|
|
20
|
+
from typing import Dict, List, cast, TYPE_CHECKING, Any
|
|
23
21
|
from typing_extensions import Self
|
|
24
|
-
from urllib.parse import urlsplit
|
|
25
|
-
|
|
26
22
|
|
|
27
23
|
# Third-party imports
|
|
28
24
|
import requests
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
Status,
|
|
51
|
-
TableDefinition,
|
|
52
|
-
VocabularyTerm,
|
|
53
|
-
)
|
|
25
|
+
|
|
26
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
27
|
+
import importlib
|
|
28
|
+
_deriva_core = importlib.import_module("deriva.core")
|
|
29
|
+
_deriva_server = importlib.import_module("deriva.core.deriva_server")
|
|
30
|
+
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
|
|
31
|
+
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
32
|
+
_core_utils = importlib.import_module("deriva.core.utils.core_utils")
|
|
33
|
+
_globus_auth_utils = importlib.import_module("deriva.core.utils.globus_auth_utils")
|
|
34
|
+
|
|
35
|
+
DEFAULT_SESSION_CONFIG = _deriva_core.DEFAULT_SESSION_CONFIG
|
|
36
|
+
get_credential = _deriva_core.get_credential
|
|
37
|
+
urlquote = _deriva_core.urlquote
|
|
38
|
+
DerivaServer = _deriva_server.DerivaServer
|
|
39
|
+
ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
|
|
40
|
+
ErmrestSnapshot = _ermrest_catalog.ErmrestSnapshot
|
|
41
|
+
Table = _ermrest_model.Table
|
|
42
|
+
DEFAULT_LOGGER_OVERRIDES = _core_utils.DEFAULT_LOGGER_OVERRIDES
|
|
43
|
+
deriva_tags = _core_utils.tag
|
|
44
|
+
GlobusNativeLogin = _globus_auth_utils.GlobusNativeLogin
|
|
45
|
+
|
|
54
46
|
from deriva_ml.core.config import DerivaMLConfig
|
|
55
|
-
from deriva_ml.core.
|
|
56
|
-
from deriva_ml.
|
|
57
|
-
from deriva_ml.
|
|
58
|
-
from deriva_ml.dataset.
|
|
59
|
-
from deriva_ml.
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
47
|
+
from deriva_ml.core.definitions import ML_SCHEMA, RID, Status, TableDefinition, VocabularyTableDef
|
|
48
|
+
from deriva_ml.core.exceptions import DerivaMLException
|
|
49
|
+
from deriva_ml.core.logging_config import apply_logger_overrides, configure_logging
|
|
50
|
+
from deriva_ml.dataset.upload import bulk_upload_configuration
|
|
51
|
+
from deriva_ml.interfaces import DerivaMLCatalog
|
|
52
|
+
from deriva_ml.core.mixins import (
|
|
53
|
+
AnnotationMixin,
|
|
54
|
+
VocabularyMixin,
|
|
55
|
+
RidResolutionMixin,
|
|
56
|
+
PathBuilderMixin,
|
|
57
|
+
WorkflowMixin,
|
|
58
|
+
FeatureMixin,
|
|
59
|
+
DatasetMixin,
|
|
60
|
+
AssetMixin,
|
|
61
|
+
ExecutionMixin,
|
|
62
|
+
FileMixin,
|
|
63
|
+
)
|
|
67
64
|
|
|
68
65
|
# Optional debug imports
|
|
69
66
|
try:
|
|
@@ -74,13 +71,27 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
74
71
|
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
75
72
|
|
|
76
73
|
if TYPE_CHECKING:
|
|
74
|
+
from deriva_ml.catalog.clone import CatalogProvenance
|
|
77
75
|
from deriva_ml.execution.execution import Execution
|
|
76
|
+
from deriva_ml.model.catalog import DerivaModel
|
|
78
77
|
|
|
79
78
|
# Stop pycharm from complaining about undefined references.
|
|
80
79
|
ml: DerivaML
|
|
81
80
|
|
|
82
81
|
|
|
83
|
-
class DerivaML(
|
|
82
|
+
class DerivaML(
|
|
83
|
+
PathBuilderMixin,
|
|
84
|
+
RidResolutionMixin,
|
|
85
|
+
VocabularyMixin,
|
|
86
|
+
WorkflowMixin,
|
|
87
|
+
FeatureMixin,
|
|
88
|
+
DatasetMixin,
|
|
89
|
+
AssetMixin,
|
|
90
|
+
ExecutionMixin,
|
|
91
|
+
FileMixin,
|
|
92
|
+
AnnotationMixin,
|
|
93
|
+
DerivaMLCatalog,
|
|
94
|
+
):
|
|
84
95
|
"""Core class for machine learning operations on a Deriva catalog.
|
|
85
96
|
|
|
86
97
|
This class provides core functionality for managing ML workflows, features, and datasets in a Deriva catalog.
|
|
@@ -105,26 +116,79 @@ class DerivaML(Dataset):
|
|
|
105
116
|
>>> ml.add_term('vocabulary_table', 'new_term', description='Description of term')
|
|
106
117
|
"""
|
|
107
118
|
|
|
119
|
+
# Class-level type annotations for DerivaMLCatalog protocol compliance
|
|
120
|
+
ml_schema: str
|
|
121
|
+
domain_schemas: frozenset[str]
|
|
122
|
+
default_schema: str | None
|
|
123
|
+
model: DerivaModel
|
|
124
|
+
cache_dir: Path
|
|
125
|
+
working_dir: Path
|
|
126
|
+
catalog: ErmrestCatalog | ErmrestSnapshot
|
|
127
|
+
catalog_id: str | int
|
|
128
|
+
|
|
108
129
|
@classmethod
|
|
109
130
|
def instantiate(cls, config: DerivaMLConfig) -> Self:
|
|
131
|
+
"""Create a DerivaML instance from a configuration object.
|
|
132
|
+
|
|
133
|
+
This method is the preferred way to instantiate DerivaML when using hydra-zen
|
|
134
|
+
for configuration management. It accepts a DerivaMLConfig (Pydantic model) and
|
|
135
|
+
unpacks it to create the instance.
|
|
136
|
+
|
|
137
|
+
This pattern allows hydra-zen's `instantiate()` to work with DerivaML:
|
|
138
|
+
|
|
139
|
+
Example with hydra-zen:
|
|
140
|
+
>>> from hydra_zen import builds, instantiate
|
|
141
|
+
>>> from deriva_ml import DerivaML
|
|
142
|
+
>>> from deriva_ml.core.config import DerivaMLConfig
|
|
143
|
+
>>>
|
|
144
|
+
>>> # Create a structured config using hydra-zen
|
|
145
|
+
>>> DerivaMLConf = builds(DerivaMLConfig, populate_full_signature=True)
|
|
146
|
+
>>>
|
|
147
|
+
>>> # Configure for your environment
|
|
148
|
+
>>> conf = DerivaMLConf(
|
|
149
|
+
... hostname='deriva.example.org',
|
|
150
|
+
... catalog_id='42',
|
|
151
|
+
... domain_schema='my_domain',
|
|
152
|
+
... )
|
|
153
|
+
>>>
|
|
154
|
+
>>> # Instantiate the config to get a DerivaMLConfig object
|
|
155
|
+
>>> config = instantiate(conf)
|
|
156
|
+
>>>
|
|
157
|
+
>>> # Create the DerivaML instance
|
|
158
|
+
>>> ml = DerivaML.instantiate(config)
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
config: A DerivaMLConfig object containing all configuration parameters.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
A new DerivaML instance configured according to the config object.
|
|
165
|
+
|
|
166
|
+
Note:
|
|
167
|
+
The DerivaMLConfig class integrates with Hydra's configuration system
|
|
168
|
+
and registers custom resolvers for computing working directories.
|
|
169
|
+
See `deriva_ml.core.config` for details on configuration options.
|
|
170
|
+
"""
|
|
110
171
|
return cls(**config.model_dump())
|
|
111
172
|
|
|
112
173
|
def __init__(
|
|
113
174
|
self,
|
|
114
175
|
hostname: str,
|
|
115
176
|
catalog_id: str | int,
|
|
116
|
-
|
|
177
|
+
domain_schemas: set[str] | None = None,
|
|
178
|
+
default_schema: str | None = None,
|
|
117
179
|
project_name: str | None = None,
|
|
118
180
|
cache_dir: str | Path | None = None,
|
|
119
181
|
working_dir: str | Path | None = None,
|
|
120
182
|
hydra_runtime_output_dir: str | Path | None = None,
|
|
121
183
|
ml_schema: str = ML_SCHEMA,
|
|
122
|
-
logging_level=logging.WARNING,
|
|
123
|
-
deriva_logging_level=logging.WARNING,
|
|
124
|
-
credential=None,
|
|
125
|
-
|
|
184
|
+
logging_level: int = logging.WARNING,
|
|
185
|
+
deriva_logging_level: int = logging.WARNING,
|
|
186
|
+
credential: dict | None = None,
|
|
187
|
+
s3_bucket: str | None = None,
|
|
188
|
+
use_minid: bool | None = None,
|
|
126
189
|
check_auth: bool = True,
|
|
127
|
-
|
|
190
|
+
clean_execution_dir: bool = True,
|
|
191
|
+
) -> None:
|
|
128
192
|
"""Initializes a DerivaML instance.
|
|
129
193
|
|
|
130
194
|
This method will connect to a catalog and initialize local configuration for the ML execution.
|
|
@@ -133,17 +197,28 @@ class DerivaML(Dataset):
|
|
|
133
197
|
Args:
|
|
134
198
|
hostname: Hostname of the Deriva server.
|
|
135
199
|
catalog_id: Catalog ID. Either an identifier or a catalog name.
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
200
|
+
domain_schemas: Optional set of domain schema names. If None, auto-detects all
|
|
201
|
+
non-system schemas. Use this when working with catalogs that have multiple
|
|
202
|
+
user-defined schemas.
|
|
203
|
+
default_schema: The default schema for table creation operations. If None and
|
|
204
|
+
there is exactly one domain schema, that schema is used. If there are multiple
|
|
205
|
+
domain schemas, this must be specified for table creation to work without
|
|
206
|
+
explicit schema parameters.
|
|
139
207
|
ml_schema: Schema name for ML schema. Used if you have a non-standard configuration of deriva-ml.
|
|
140
|
-
project_name: Project name. Defaults to name of
|
|
208
|
+
project_name: Project name. Defaults to name of default_schema.
|
|
141
209
|
cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag. If not provided,
|
|
142
210
|
will default to working_dir.
|
|
143
211
|
working_dir: Directory path for storing data used by or generated by any computations. If no value is
|
|
144
212
|
provided, will default to ${HOME}/deriva_ml
|
|
145
|
-
|
|
213
|
+
s3_bucket: S3 bucket URL for dataset bag storage (e.g., 's3://my-bucket'). If provided,
|
|
214
|
+
enables MINID creation and S3 upload for dataset exports. If None, MINID functionality
|
|
215
|
+
is disabled regardless of use_minid setting.
|
|
216
|
+
use_minid: Use the MINID service when downloading dataset bags. Only effective when
|
|
217
|
+
s3_bucket is configured. If None (default), automatically set to True when s3_bucket
|
|
218
|
+
is provided, False otherwise.
|
|
146
219
|
check_auth: Check if the user has access to the catalog.
|
|
220
|
+
clean_execution_dir: Whether to automatically clean up execution working directories
|
|
221
|
+
after successful upload. Defaults to True. Set to False to retain local copies.
|
|
147
222
|
"""
|
|
148
223
|
# Get or use provided credentials for server access
|
|
149
224
|
self.credential = credential or get_credential(hostname)
|
|
@@ -164,32 +239,46 @@ class DerivaML(Dataset):
|
|
|
164
239
|
"Please check your credentials and make sure you have logged in."
|
|
165
240
|
)
|
|
166
241
|
self.catalog = server.connect_ermrest(catalog_id)
|
|
167
|
-
|
|
242
|
+
# Import here to avoid circular imports
|
|
243
|
+
from deriva_ml.model.catalog import DerivaModel
|
|
244
|
+
self.model = DerivaModel(
|
|
245
|
+
self.catalog.getCatalogModel(),
|
|
246
|
+
ml_schema=ml_schema,
|
|
247
|
+
domain_schemas=domain_schemas,
|
|
248
|
+
default_schema=default_schema,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
# Store S3 bucket configuration and resolve use_minid
|
|
252
|
+
self.s3_bucket = s3_bucket
|
|
253
|
+
if use_minid is None:
|
|
254
|
+
# Auto mode: enable MINID if s3_bucket is configured
|
|
255
|
+
self.use_minid = s3_bucket is not None
|
|
256
|
+
elif use_minid and s3_bucket is None:
|
|
257
|
+
# User requested MINID but no S3 bucket configured - disable MINID
|
|
258
|
+
self.use_minid = False
|
|
259
|
+
else:
|
|
260
|
+
self.use_minid = use_minid
|
|
168
261
|
|
|
169
262
|
# Set up working and cache directories
|
|
170
|
-
self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
|
|
263
|
+
self.working_dir = DerivaMLConfig.compute_workdir(working_dir, catalog_id)
|
|
171
264
|
self.working_dir.mkdir(parents=True, exist_ok=True)
|
|
172
265
|
self.hydra_runtime_output_dir = hydra_runtime_output_dir
|
|
173
266
|
|
|
174
267
|
self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
|
|
175
268
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
176
269
|
|
|
177
|
-
#
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
270
|
+
# Set up logging using centralized configuration
|
|
271
|
+
# This configures deriva_ml, Hydra, and deriva-py loggers without
|
|
272
|
+
# affecting the root logger or calling basicConfig()
|
|
273
|
+
self._logger = configure_logging(
|
|
274
|
+
level=logging_level,
|
|
275
|
+
deriva_level=deriva_logging_level,
|
|
276
|
+
)
|
|
183
277
|
self._logging_level = logging_level
|
|
184
278
|
self._deriva_logging_level = deriva_logging_level
|
|
185
279
|
|
|
186
|
-
#
|
|
187
|
-
|
|
188
|
-
# allow for reconfiguration of module-specific logging levels
|
|
189
|
-
[logging.getLogger(name).setLevel(level) for name, level in logger_config.items()]
|
|
190
|
-
logging.getLogger("root").setLevel(deriva_logging_level)
|
|
191
|
-
logging.getLogger("bagit").setLevel(deriva_logging_level)
|
|
192
|
-
logging.getLogger("bdbag").setLevel(deriva_logging_level)
|
|
280
|
+
# Apply deriva's default logger overrides for fine-grained control
|
|
281
|
+
apply_logger_overrides(DEFAULT_LOGGER_OVERRIDES)
|
|
193
282
|
|
|
194
283
|
# Store instance configuration
|
|
195
284
|
self.host_name = hostname
|
|
@@ -197,22 +286,14 @@ class DerivaML(Dataset):
|
|
|
197
286
|
self.ml_schema = ml_schema
|
|
198
287
|
self.configuration = None
|
|
199
288
|
self._execution: Execution | None = None
|
|
200
|
-
self.
|
|
201
|
-
self.
|
|
289
|
+
self.domain_schemas = self.model.domain_schemas
|
|
290
|
+
self.default_schema = self.model.default_schema
|
|
291
|
+
self.project_name = project_name or self.default_schema or "deriva-ml"
|
|
202
292
|
self.start_time = datetime.now()
|
|
203
293
|
self.status = Status.pending.value
|
|
294
|
+
self.clean_execution_dir = clean_execution_dir
|
|
204
295
|
|
|
205
|
-
|
|
206
|
-
logging.basicConfig(
|
|
207
|
-
level=logging_level,
|
|
208
|
-
format="%(asctime)s - %(name)s.%(levelname)s - %(message)s",
|
|
209
|
-
)
|
|
210
|
-
|
|
211
|
-
# Set Deriva library logging level
|
|
212
|
-
deriva_logger = logging.getLogger("deriva")
|
|
213
|
-
deriva_logger.setLevel(logging_level)
|
|
214
|
-
|
|
215
|
-
def __del__(self):
|
|
296
|
+
def __del__(self) -> None:
|
|
216
297
|
"""Cleanup method to handle incomplete executions."""
|
|
217
298
|
try:
|
|
218
299
|
# Mark execution as aborted if not completed
|
|
@@ -222,7 +303,7 @@ class DerivaML(Dataset):
|
|
|
222
303
|
pass
|
|
223
304
|
|
|
224
305
|
@staticmethod
|
|
225
|
-
def _get_session_config():
|
|
306
|
+
def _get_session_config() -> dict:
|
|
226
307
|
"""Returns customized HTTP session configuration.
|
|
227
308
|
|
|
228
309
|
Configures retry behavior and connection settings for HTTP requests to the Deriva server. Settings include:
|
|
@@ -254,58 +335,23 @@ class DerivaML(Dataset):
|
|
|
254
335
|
)
|
|
255
336
|
return session_config
|
|
256
337
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
"""Returns catalog path builder for queries.
|
|
260
|
-
|
|
261
|
-
The path builder provides a fluent interface for constructing complex queries against the catalog.
|
|
262
|
-
This is a core component used by many other methods to interact with the catalog.
|
|
338
|
+
def is_snapshot(self) -> bool:
|
|
339
|
+
return hasattr(self.catalog, "_snaptime")
|
|
263
340
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
341
|
+
def catalog_snapshot(self, version_snapshot: str) -> Self:
|
|
342
|
+
"""Returns a DerivaML instance for a specific snapshot of the catalog."""
|
|
343
|
+
return DerivaML(
|
|
344
|
+
self.host_name,
|
|
345
|
+
version_snapshot,
|
|
346
|
+
logging_level=self._logging_level,
|
|
347
|
+
deriva_logging_level=self._deriva_logging_level,
|
|
348
|
+
)
|
|
272
349
|
|
|
273
350
|
@property
|
|
274
|
-
def
|
|
275
|
-
""
|
|
351
|
+
def _dataset_table(self) -> Table:
|
|
352
|
+
return self.model.schemas[self.model.ml_schema].tables["Dataset"]
|
|
276
353
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
Returns:
|
|
280
|
-
datapath._CatalogWrapper: Path builder object scoped to the domain schema.
|
|
281
|
-
|
|
282
|
-
Example:
|
|
283
|
-
>>> domain = ml.domain_path
|
|
284
|
-
>>> results = domain.my_table.entities().fetch()
|
|
285
|
-
"""
|
|
286
|
-
return self.pathBuilder.schemas[self.domain_schema]
|
|
287
|
-
|
|
288
|
-
def table_path(self, table: str | Table) -> Path:
|
|
289
|
-
"""Returns a local filesystem path for table CSV files.
|
|
290
|
-
|
|
291
|
-
Generates a standardized path where CSV files should be placed when preparing to upload data to a table.
|
|
292
|
-
The path follows the project's directory structure conventions.
|
|
293
|
-
|
|
294
|
-
Args:
|
|
295
|
-
table: Name of the table or Table object to get the path for.
|
|
296
|
-
|
|
297
|
-
Returns:
|
|
298
|
-
Path: Filesystem path where the CSV file should be placed.
|
|
299
|
-
|
|
300
|
-
Example:
|
|
301
|
-
>>> path = ml.table_path("experiment_results")
|
|
302
|
-
>>> df.to_csv(path) # Save data for upload
|
|
303
|
-
"""
|
|
304
|
-
return table_path(
|
|
305
|
-
self.working_dir,
|
|
306
|
-
schema=self.domain_schema,
|
|
307
|
-
table=self.model.name_to_table(table).name,
|
|
308
|
-
)
|
|
354
|
+
# pathBuilder, domain_path, table_path moved to PathBuilderMixin
|
|
309
355
|
|
|
310
356
|
def download_dir(self, cached: bool = False) -> Path:
|
|
311
357
|
"""Returns the appropriate download directory.
|
|
@@ -384,27 +430,37 @@ class DerivaML(Dataset):
|
|
|
384
430
|
uri = self.cite(cast(str, table))
|
|
385
431
|
return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
|
|
386
432
|
|
|
387
|
-
def cite(self, entity: Dict[str, Any] | str) -> str:
|
|
388
|
-
"""Generates
|
|
433
|
+
def cite(self, entity: Dict[str, Any] | str, current: bool = False) -> str:
|
|
434
|
+
"""Generates citation URL for an entity.
|
|
389
435
|
|
|
390
|
-
Creates a
|
|
391
|
-
the catalog snapshot time to ensure version stability
|
|
436
|
+
Creates a URL that can be used to reference a specific entity in the catalog.
|
|
437
|
+
By default, includes the catalog snapshot time to ensure version stability
|
|
438
|
+
(permanent citation). With current=True, returns a URL to the current state.
|
|
392
439
|
|
|
393
440
|
Args:
|
|
394
441
|
entity: Either a RID string or a dictionary containing entity data with a 'RID' key.
|
|
442
|
+
current: If True, return URL to current catalog state (no snapshot).
|
|
443
|
+
If False (default), return permanent citation URL with snapshot time.
|
|
395
444
|
|
|
396
445
|
Returns:
|
|
397
|
-
str:
|
|
446
|
+
str: Citation URL. Format depends on `current` parameter:
|
|
447
|
+
- current=False: https://{host}/id/{catalog}/{rid}@{snapshot_time}
|
|
448
|
+
- current=True: https://{host}/id/{catalog}/{rid}
|
|
398
449
|
|
|
399
450
|
Raises:
|
|
400
451
|
DerivaMLException: If an entity doesn't exist or lacks a RID.
|
|
401
452
|
|
|
402
453
|
Examples:
|
|
403
|
-
|
|
454
|
+
Permanent citation (default):
|
|
404
455
|
>>> url = ml.cite("1-abc123")
|
|
405
456
|
>>> print(url)
|
|
406
457
|
'https://deriva.org/id/1/1-abc123@2024-01-01T12:00:00'
|
|
407
458
|
|
|
459
|
+
Current catalog URL:
|
|
460
|
+
>>> url = ml.cite("1-abc123", current=True)
|
|
461
|
+
>>> print(url)
|
|
462
|
+
'https://deriva.org/id/1/1-abc123'
|
|
463
|
+
|
|
408
464
|
Using a dictionary:
|
|
409
465
|
>>> url = ml.cite({"RID": "1-abc123"})
|
|
410
466
|
"""
|
|
@@ -413,14 +469,44 @@ class DerivaML(Dataset):
|
|
|
413
469
|
return entity
|
|
414
470
|
|
|
415
471
|
try:
|
|
416
|
-
# Resolve RID and create citation URL
|
|
472
|
+
# Resolve RID and create citation URL
|
|
417
473
|
self.resolve_rid(rid := entity if isinstance(entity, str) else entity["RID"])
|
|
418
|
-
|
|
474
|
+
base_url = f"https://{self.host_name}/id/{self.catalog_id}/{rid}"
|
|
475
|
+
if current:
|
|
476
|
+
return base_url
|
|
477
|
+
return f"{base_url}@{self.catalog.latest_snapshot().snaptime}"
|
|
419
478
|
except KeyError as e:
|
|
420
479
|
raise DerivaMLException(f"Entity {e} does not have RID column")
|
|
421
480
|
except DerivaMLException as _e:
|
|
422
481
|
raise DerivaMLException("Entity RID does not exist")
|
|
423
482
|
|
|
483
|
+
@property
|
|
484
|
+
def catalog_provenance(self) -> "CatalogProvenance | None":
|
|
485
|
+
"""Get the provenance information for this catalog.
|
|
486
|
+
|
|
487
|
+
Returns provenance information if the catalog has it set. This includes
|
|
488
|
+
information about how the catalog was created (clone, create, schema),
|
|
489
|
+
who created it, when, and any workflow information.
|
|
490
|
+
|
|
491
|
+
For cloned catalogs, additional details about the clone operation are
|
|
492
|
+
available in the `clone_details` attribute.
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
CatalogProvenance if available, None otherwise.
|
|
496
|
+
|
|
497
|
+
Example:
|
|
498
|
+
>>> ml = DerivaML('localhost', '45')
|
|
499
|
+
>>> prov = ml.catalog_provenance
|
|
500
|
+
>>> if prov:
|
|
501
|
+
... print(f"Created: {prov.created_at} by {prov.created_by}")
|
|
502
|
+
... print(f"Method: {prov.creation_method.value}")
|
|
503
|
+
... if prov.is_clone:
|
|
504
|
+
... print(f"Cloned from: {prov.clone_details.source_hostname}")
|
|
505
|
+
"""
|
|
506
|
+
from deriva_ml.catalog.clone import get_catalog_provenance
|
|
507
|
+
|
|
508
|
+
return get_catalog_provenance(self.catalog)
|
|
509
|
+
|
|
424
510
|
def user_list(self) -> List[Dict[str, str]]:
|
|
425
511
|
"""Returns catalog user list.
|
|
426
512
|
|
|
@@ -439,59 +525,247 @@ class DerivaML(Dataset):
|
|
|
439
525
|
... print(f"{user['Full_Name']} ({user['ID']})")
|
|
440
526
|
"""
|
|
441
527
|
# Get the user table path and fetch basic user info
|
|
442
|
-
user_path = self.pathBuilder.public.ERMrest_Client.path
|
|
528
|
+
user_path = self.pathBuilder().public.ERMrest_Client.path
|
|
443
529
|
return [{"ID": u["ID"], "Full_Name": u["Full_Name"]} for u in user_path.entities().fetch()]
|
|
444
530
|
|
|
445
|
-
|
|
446
|
-
"""Resolves RID to catalog location.
|
|
447
|
-
|
|
448
|
-
Looks up a RID and returns information about where it exists in the catalog, including schema,
|
|
449
|
-
table, and column metadata.
|
|
531
|
+
# resolve_rid, retrieve_rid moved to RidResolutionMixin
|
|
450
532
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
533
|
+
def apply_catalog_annotations(
|
|
534
|
+
self,
|
|
535
|
+
navbar_brand_text: str = "ML Data Browser",
|
|
536
|
+
head_title: str = "Catalog ML",
|
|
537
|
+
) -> None:
|
|
538
|
+
"""Apply catalog-level annotations including the navigation bar and display settings.
|
|
539
|
+
|
|
540
|
+
This method configures the Chaise web interface for the catalog. Chaise is Deriva's
|
|
541
|
+
web-based data browser that provides a user-friendly interface for exploring and
|
|
542
|
+
managing catalog data. This method sets up annotations that control how Chaise
|
|
543
|
+
displays and organizes the catalog.
|
|
544
|
+
|
|
545
|
+
**Navigation Bar Structure**:
|
|
546
|
+
The method creates a navigation bar with the following menus:
|
|
547
|
+
- **User Info**: Links to Users, Groups, and RID Lease tables
|
|
548
|
+
- **Deriva-ML**: Core ML tables (Workflow, Execution, Dataset, Dataset_Version, etc.)
|
|
549
|
+
- **WWW**: Web content tables (Page, File)
|
|
550
|
+
- **{Domain Schema}**: All domain-specific tables (excludes vocabularies and associations)
|
|
551
|
+
- **Vocabulary**: All controlled vocabulary tables from both ML and domain schemas
|
|
552
|
+
- **Assets**: All asset tables from both ML and domain schemas
|
|
553
|
+
- **Features**: All feature tables with entries named "TableName:FeatureName"
|
|
554
|
+
- **Catalog Registry**: Link to the ermrest registry
|
|
555
|
+
- **Documentation**: Links to ML notebook instructions and Deriva-ML docs
|
|
556
|
+
|
|
557
|
+
**Display Settings**:
|
|
558
|
+
- Underscores in table/column names displayed as spaces
|
|
559
|
+
- System columns (RID) shown in compact and entry views
|
|
560
|
+
- Default table set to Dataset
|
|
561
|
+
- Faceting and record deletion enabled
|
|
562
|
+
- Export configurations available to all users
|
|
563
|
+
|
|
564
|
+
**Bulk Upload Configuration**:
|
|
565
|
+
Configures upload patterns for asset tables, enabling drag-and-drop file uploads
|
|
566
|
+
through the Chaise interface.
|
|
567
|
+
|
|
568
|
+
Call this after creating the domain schema and all tables to initialize the catalog's
|
|
569
|
+
web interface. The navigation menus are dynamically built based on the current schema
|
|
570
|
+
structure, automatically organizing tables into appropriate categories.
|
|
479
571
|
|
|
480
572
|
Args:
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
Returns:
|
|
484
|
-
dict[str, Any]: Dictionary containing all column values for the entity.
|
|
485
|
-
|
|
486
|
-
Raises:
|
|
487
|
-
DerivaMLException: If the RID doesn't exist in the catalog.
|
|
573
|
+
navbar_brand_text: Text displayed in the navigation bar brand area.
|
|
574
|
+
head_title: Title displayed in the browser tab.
|
|
488
575
|
|
|
489
576
|
Example:
|
|
490
|
-
>>>
|
|
491
|
-
>>>
|
|
577
|
+
>>> ml = DerivaML('deriva.example.org', 'my_catalog')
|
|
578
|
+
>>> # After creating domain schema and tables...
|
|
579
|
+
>>> ml.apply_catalog_annotations()
|
|
580
|
+
>>> # Or with custom branding:
|
|
581
|
+
>>> ml.apply_catalog_annotations("My Project Browser", "My ML Project")
|
|
492
582
|
"""
|
|
493
|
-
|
|
494
|
-
|
|
583
|
+
catalog_id = self.model.catalog.catalog_id
|
|
584
|
+
ml_schema = self.ml_schema
|
|
585
|
+
|
|
586
|
+
# Build domain schema menu items (one menu per domain schema)
|
|
587
|
+
domain_schema_menus = []
|
|
588
|
+
for domain_schema in sorted(self.domain_schemas):
|
|
589
|
+
if domain_schema not in self.model.schemas:
|
|
590
|
+
continue
|
|
591
|
+
domain_schema_menus.append({
|
|
592
|
+
"name": domain_schema,
|
|
593
|
+
"children": [
|
|
594
|
+
{
|
|
595
|
+
"name": tname,
|
|
596
|
+
"url": f"/chaise/recordset/#{catalog_id}/{domain_schema}:{tname}",
|
|
597
|
+
}
|
|
598
|
+
for tname in self.model.schemas[domain_schema].tables
|
|
599
|
+
# Don't include controlled vocabularies, association tables, or feature tables.
|
|
600
|
+
if not (
|
|
601
|
+
self.model.is_vocabulary(tname)
|
|
602
|
+
or self.model.is_association(tname, pure=False, max_arity=3)
|
|
603
|
+
)
|
|
604
|
+
],
|
|
605
|
+
})
|
|
606
|
+
|
|
607
|
+
# Build vocabulary menu items (ML schema + all domain schemas)
|
|
608
|
+
vocab_children = [{"name": f"{ml_schema} Vocabularies", "header": True}]
|
|
609
|
+
vocab_children.extend([
|
|
610
|
+
{
|
|
611
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
|
|
612
|
+
"name": tname,
|
|
613
|
+
}
|
|
614
|
+
for tname in self.model.schemas[ml_schema].tables
|
|
615
|
+
if self.model.is_vocabulary(tname)
|
|
616
|
+
])
|
|
617
|
+
for domain_schema in sorted(self.domain_schemas):
|
|
618
|
+
if domain_schema not in self.model.schemas:
|
|
619
|
+
continue
|
|
620
|
+
vocab_children.append({"name": f"{domain_schema} Vocabularies", "header": True})
|
|
621
|
+
vocab_children.extend([
|
|
622
|
+
{
|
|
623
|
+
"url": f"/chaise/recordset/#{catalog_id}/{domain_schema}:{tname}",
|
|
624
|
+
"name": tname,
|
|
625
|
+
}
|
|
626
|
+
for tname in self.model.schemas[domain_schema].tables
|
|
627
|
+
if self.model.is_vocabulary(tname)
|
|
628
|
+
])
|
|
629
|
+
|
|
630
|
+
# Build asset menu items (ML schema + all domain schemas)
|
|
631
|
+
asset_children = [
|
|
632
|
+
{
|
|
633
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
|
|
634
|
+
"name": tname,
|
|
635
|
+
}
|
|
636
|
+
for tname in self.model.schemas[ml_schema].tables
|
|
637
|
+
if self.model.is_asset(tname)
|
|
638
|
+
]
|
|
639
|
+
for domain_schema in sorted(self.domain_schemas):
|
|
640
|
+
if domain_schema not in self.model.schemas:
|
|
641
|
+
continue
|
|
642
|
+
asset_children.extend([
|
|
643
|
+
{
|
|
644
|
+
"url": f"/chaise/recordset/#{catalog_id}/{domain_schema}:{tname}",
|
|
645
|
+
"name": tname,
|
|
646
|
+
}
|
|
647
|
+
for tname in self.model.schemas[domain_schema].tables
|
|
648
|
+
if self.model.is_asset(tname)
|
|
649
|
+
])
|
|
650
|
+
|
|
651
|
+
catalog_annotation = {
|
|
652
|
+
deriva_tags.display: {"name_style": {"underline_space": True}},
|
|
653
|
+
deriva_tags.chaise_config: {
|
|
654
|
+
"headTitle": head_title,
|
|
655
|
+
"navbarBrandText": navbar_brand_text,
|
|
656
|
+
"systemColumnsDisplayEntry": ["RID"],
|
|
657
|
+
"systemColumnsDisplayCompact": ["RID"],
|
|
658
|
+
"defaultTable": {"table": "Dataset", "schema": "deriva-ml"},
|
|
659
|
+
"deleteRecord": True,
|
|
660
|
+
"showFaceting": True,
|
|
661
|
+
"shareCiteAcls": True,
|
|
662
|
+
"exportConfigsSubmenu": {"acls": {"show": ["*"], "enable": ["*"]}},
|
|
663
|
+
"resolverImplicitCatalog": False,
|
|
664
|
+
"navbarMenu": {
|
|
665
|
+
"newTab": False,
|
|
666
|
+
"children": [
|
|
667
|
+
{
|
|
668
|
+
"name": "User Info",
|
|
669
|
+
"children": [
|
|
670
|
+
{
|
|
671
|
+
"url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_Client",
|
|
672
|
+
"name": "Users",
|
|
673
|
+
},
|
|
674
|
+
{
|
|
675
|
+
"url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_Group",
|
|
676
|
+
"name": "Groups",
|
|
677
|
+
},
|
|
678
|
+
{
|
|
679
|
+
"url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_RID_Lease",
|
|
680
|
+
"name": "ERMrest RID Lease",
|
|
681
|
+
},
|
|
682
|
+
],
|
|
683
|
+
},
|
|
684
|
+
{ # All the primary tables in deriva-ml schema.
|
|
685
|
+
"name": "Deriva-ML",
|
|
686
|
+
"children": [
|
|
687
|
+
{
|
|
688
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Workflow",
|
|
689
|
+
"name": "Workflow",
|
|
690
|
+
},
|
|
691
|
+
{
|
|
692
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution",
|
|
693
|
+
"name": "Execution",
|
|
694
|
+
},
|
|
695
|
+
{
|
|
696
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution_Metadata",
|
|
697
|
+
"name": "Execution Metadata",
|
|
698
|
+
},
|
|
699
|
+
{
|
|
700
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution_Asset",
|
|
701
|
+
"name": "Execution Asset",
|
|
702
|
+
},
|
|
703
|
+
{
|
|
704
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Dataset",
|
|
705
|
+
"name": "Dataset",
|
|
706
|
+
},
|
|
707
|
+
{
|
|
708
|
+
"url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Dataset_Version",
|
|
709
|
+
"name": "Dataset Version",
|
|
710
|
+
},
|
|
711
|
+
],
|
|
712
|
+
},
|
|
713
|
+
{ # WWW schema tables.
|
|
714
|
+
"name": "WWW",
|
|
715
|
+
"children": [
|
|
716
|
+
{
|
|
717
|
+
"url": f"/chaise/recordset/#{catalog_id}/WWW:Page",
|
|
718
|
+
"name": "Page",
|
|
719
|
+
},
|
|
720
|
+
{
|
|
721
|
+
"url": f"/chaise/recordset/#{catalog_id}/WWW:File",
|
|
722
|
+
"name": "File",
|
|
723
|
+
},
|
|
724
|
+
],
|
|
725
|
+
},
|
|
726
|
+
*domain_schema_menus, # One menu per domain schema
|
|
727
|
+
{ # Vocabulary menu with all controlled vocabularies.
|
|
728
|
+
"name": "Vocabulary",
|
|
729
|
+
"children": vocab_children,
|
|
730
|
+
},
|
|
731
|
+
{ # List of all asset tables.
|
|
732
|
+
"name": "Assets",
|
|
733
|
+
"children": asset_children,
|
|
734
|
+
},
|
|
735
|
+
{ # List of all feature tables in the catalog.
|
|
736
|
+
"name": "Features",
|
|
737
|
+
"children": [
|
|
738
|
+
{
|
|
739
|
+
"url": f"/chaise/recordset/#{catalog_id}/{f.feature_table.schema.name}:{f.feature_table.name}",
|
|
740
|
+
"name": f"{f.target_table.name}:{f.feature_name}",
|
|
741
|
+
}
|
|
742
|
+
for f in self.model.find_features()
|
|
743
|
+
],
|
|
744
|
+
},
|
|
745
|
+
{
|
|
746
|
+
"url": "/chaise/recordset/#0/ermrest:registry@sort(RID)",
|
|
747
|
+
"name": "Catalog Registry",
|
|
748
|
+
},
|
|
749
|
+
{
|
|
750
|
+
"name": "Documentation",
|
|
751
|
+
"children": [
|
|
752
|
+
{
|
|
753
|
+
"url": "https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/ml_workflow_instruction.md",
|
|
754
|
+
"name": "ML Notebook Instruction",
|
|
755
|
+
},
|
|
756
|
+
{
|
|
757
|
+
"url": "https://informatics-isi-edu.github.io/deriva-ml/",
|
|
758
|
+
"name": "Deriva-ML Documentation",
|
|
759
|
+
},
|
|
760
|
+
],
|
|
761
|
+
},
|
|
762
|
+
],
|
|
763
|
+
},
|
|
764
|
+
},
|
|
765
|
+
deriva_tags.bulk_upload: bulk_upload_configuration(model=self.model),
|
|
766
|
+
}
|
|
767
|
+
self.model.annotations.update(catalog_annotation)
|
|
768
|
+
self.model.apply()
|
|
495
769
|
|
|
496
770
|
def add_page(self, title: str, content: str) -> None:
|
|
497
771
|
"""Adds page to web interface.
|
|
@@ -513,9 +787,15 @@ class DerivaML(Dataset):
|
|
|
513
787
|
... )
|
|
514
788
|
"""
|
|
515
789
|
# Insert page into www tables with title and content
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
790
|
+
# Use default schema or first domain schema for www tables
|
|
791
|
+
schema = self.default_schema or (sorted(self.domain_schemas)[0] if self.domain_schemas else None)
|
|
792
|
+
if schema is None:
|
|
793
|
+
raise DerivaMLException("No domain schema available for adding pages")
|
|
794
|
+
self.pathBuilder().www.tables[schema].insert([{"Title": title, "Content": content}])
|
|
795
|
+
|
|
796
|
+
def create_vocabulary(
|
|
797
|
+
self, vocab_name: str, comment: str = "", schema: str | None = None, update_navbar: bool = True
|
|
798
|
+
) -> Table:
|
|
519
799
|
"""Creates a controlled vocabulary table.
|
|
520
800
|
|
|
521
801
|
A controlled vocabulary table maintains a list of standardized terms and their definitions. Each term can have
|
|
@@ -525,6 +805,9 @@ class DerivaML(Dataset):
|
|
|
525
805
|
vocab_name: Name for the new vocabulary table. Must be a valid SQL identifier.
|
|
526
806
|
comment: Description of the vocabulary's purpose and usage. Defaults to empty string.
|
|
527
807
|
schema: Schema name to create the table in. If None, uses domain_schema.
|
|
808
|
+
update_navbar: If True (default), automatically updates the navigation bar to include
|
|
809
|
+
the new vocabulary table. Set to False during batch table creation to avoid
|
|
810
|
+
redundant updates, then call apply_catalog_annotations() once at the end.
|
|
528
811
|
|
|
529
812
|
Returns:
|
|
530
813
|
Table: ERMRest table object representing the newly created vocabulary table.
|
|
@@ -540,988 +823,483 @@ class DerivaML(Dataset):
|
|
|
540
823
|
... comment="Standard tissue classifications",
|
|
541
824
|
... schema="bio_schema"
|
|
542
825
|
... )
|
|
826
|
+
|
|
827
|
+
Create multiple vocabularies without updating navbar until the end:
|
|
828
|
+
|
|
829
|
+
>>> ml.create_vocabulary("Species", update_navbar=False)
|
|
830
|
+
>>> ml.create_vocabulary("Tissue_Type", update_navbar=False)
|
|
831
|
+
>>> ml.apply_catalog_annotations() # Update navbar once
|
|
543
832
|
"""
|
|
544
|
-
# Use
|
|
545
|
-
schema = schema or self.
|
|
833
|
+
# Use default schema if none specified
|
|
834
|
+
schema = schema or self.model._require_default_schema()
|
|
546
835
|
|
|
547
836
|
# Create and return vocabulary table with RID-based URI pattern
|
|
548
837
|
try:
|
|
549
838
|
vocab_table = self.model.schemas[schema].create_table(
|
|
550
|
-
|
|
839
|
+
VocabularyTableDef(
|
|
840
|
+
name=vocab_name,
|
|
841
|
+
curie_template=f"{self.project_name}:{{RID}}",
|
|
842
|
+
comment=comment,
|
|
843
|
+
)
|
|
551
844
|
)
|
|
552
845
|
except ValueError:
|
|
553
846
|
raise DerivaMLException(f"Table {vocab_name} already exist")
|
|
554
|
-
return vocab_table
|
|
555
|
-
|
|
556
|
-
def create_table(self, table: TableDefinition) -> Table:
|
|
557
|
-
"""Creates a new table in the catalog.
|
|
558
|
-
|
|
559
|
-
Creates a table using the provided TableDefinition object, which specifies the table structure including
|
|
560
|
-
columns, keys, and foreign key relationships.
|
|
561
|
-
|
|
562
|
-
Args:
|
|
563
|
-
table: A TableDefinition object containing the complete specification of the table to create.
|
|
564
|
-
|
|
565
|
-
Returns:
|
|
566
|
-
Table: The newly created ERMRest table object.
|
|
567
|
-
|
|
568
|
-
Raises:
|
|
569
|
-
DerivaMLException: If table creation fails or the definition is invalid.
|
|
570
|
-
|
|
571
|
-
Example:
|
|
572
|
-
|
|
573
|
-
>>> table_def = TableDefinition(
|
|
574
|
-
... name="experiments",
|
|
575
|
-
... column_definitions=[
|
|
576
|
-
... ColumnDefinition(name="name", type=BuiltinTypes.text),
|
|
577
|
-
... ColumnDefinition(name="date", type=BuiltinTypes.date)
|
|
578
|
-
... ]
|
|
579
|
-
... )
|
|
580
|
-
>>> new_table = ml.create_table(table_def)
|
|
581
|
-
"""
|
|
582
|
-
# Create table in domain schema using provided definition
|
|
583
|
-
return self.model.schemas[self.domain_schema].create_table(table.model_dump())
|
|
584
|
-
|
|
585
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
586
|
-
def create_asset(
|
|
587
|
-
self,
|
|
588
|
-
asset_name: str,
|
|
589
|
-
column_defs: Iterable[ColumnDefinition] | None = None,
|
|
590
|
-
fkey_defs: Iterable[ColumnDefinition] | None = None,
|
|
591
|
-
referenced_tables: Iterable[Table] | None = None,
|
|
592
|
-
comment: str = "",
|
|
593
|
-
schema: str | None = None,
|
|
594
|
-
) -> Table:
|
|
595
|
-
"""Creates an asset table.
|
|
596
|
-
|
|
597
|
-
Args:
|
|
598
|
-
asset_name: Name of the asset table.
|
|
599
|
-
column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
|
|
600
|
-
fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
|
|
601
|
-
referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
|
|
602
|
-
comment: Description of the asset table. (Default value = '')
|
|
603
|
-
schema: Schema in which to create the asset table. Defaults to domain_schema.
|
|
604
|
-
|
|
605
|
-
Returns:
|
|
606
|
-
Table object for the asset table.
|
|
607
|
-
"""
|
|
608
|
-
# Initialize empty collections if None provided
|
|
609
|
-
column_defs = column_defs or []
|
|
610
|
-
fkey_defs = fkey_defs or []
|
|
611
|
-
referenced_tables = referenced_tables or []
|
|
612
|
-
schema = schema or self.domain_schema
|
|
613
|
-
|
|
614
|
-
# Add an asset type to vocabulary
|
|
615
|
-
self.add_term(MLVocab.asset_type, asset_name, description=f"A {asset_name} asset")
|
|
616
|
-
|
|
617
|
-
# Create the main asset table
|
|
618
|
-
asset_table = self.model.schemas[schema].create_table(
|
|
619
|
-
Table.define_asset(
|
|
620
|
-
schema,
|
|
621
|
-
asset_name,
|
|
622
|
-
column_defs=[c.model_dump() for c in column_defs],
|
|
623
|
-
fkey_defs=[fk.model_dump() for fk in fkey_defs],
|
|
624
|
-
comment=comment,
|
|
625
|
-
)
|
|
626
|
-
)
|
|
627
847
|
|
|
628
|
-
#
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
[
|
|
632
|
-
(asset_table.name, asset_table),
|
|
633
|
-
("Asset_Type", self.model.name_to_table("Asset_Type")),
|
|
634
|
-
]
|
|
635
|
-
)
|
|
636
|
-
)
|
|
848
|
+
# Update navbar to include the new vocabulary table
|
|
849
|
+
if update_navbar:
|
|
850
|
+
self.apply_catalog_annotations()
|
|
637
851
|
|
|
638
|
-
|
|
639
|
-
for t in referenced_tables:
|
|
640
|
-
asset_table.create_reference(self.model.name_to_table(t))
|
|
641
|
-
|
|
642
|
-
# Create an association table for tracking execution
|
|
643
|
-
atable = self.model.schemas[self.domain_schema].create_table(
|
|
644
|
-
Table.define_association(
|
|
645
|
-
[
|
|
646
|
-
(asset_name, asset_table),
|
|
647
|
-
(
|
|
648
|
-
"Execution",
|
|
649
|
-
self.model.schemas[self.ml_schema].tables["Execution"],
|
|
650
|
-
),
|
|
651
|
-
]
|
|
652
|
-
)
|
|
653
|
-
)
|
|
654
|
-
atable.create_reference(self.model.name_to_table("Asset_Role"))
|
|
655
|
-
|
|
656
|
-
# Add asset annotations
|
|
657
|
-
asset_annotation(asset_table)
|
|
658
|
-
return asset_table
|
|
852
|
+
return vocab_table
|
|
659
853
|
|
|
660
|
-
def
|
|
661
|
-
"""
|
|
854
|
+
def create_table(self, table: TableDefinition, schema: str | None = None, update_navbar: bool = True) -> Table:
|
|
855
|
+
"""Creates a new table in the domain schema.
|
|
662
856
|
|
|
663
|
-
|
|
857
|
+
Creates a table using the provided TableDefinition object, which specifies the table structure
|
|
858
|
+
including columns, keys, and foreign key relationships. The table is created in the domain
|
|
859
|
+
schema associated with this DerivaML instance.
|
|
664
860
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
Returns:
|
|
669
|
-
list[dict[str, Any]]: List of asset records, each containing:
|
|
670
|
-
- RID: Resource identifier
|
|
671
|
-
- Type: Asset type
|
|
672
|
-
- Metadata: Asset metadata
|
|
861
|
+
**Required Classes**:
|
|
862
|
+
Import the following classes from deriva_ml to define tables:
|
|
673
863
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
>>> for asset in assets:
|
|
680
|
-
... print(f"{asset['RID']}: {asset['Type']}")
|
|
681
|
-
"""
|
|
682
|
-
# Validate and get asset table reference
|
|
683
|
-
asset_table = self.model.name_to_table(asset_table)
|
|
684
|
-
if not self.model.is_asset(asset_table):
|
|
685
|
-
raise DerivaMLException(f"Table {asset_table.name} is not an asset")
|
|
686
|
-
|
|
687
|
-
# Get path builders for asset and type tables
|
|
688
|
-
pb = self._model.catalog.getPathBuilder()
|
|
689
|
-
asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
|
|
690
|
-
(
|
|
691
|
-
asset_type_table,
|
|
692
|
-
_,
|
|
693
|
-
_,
|
|
694
|
-
) = self._model.find_association(asset_table, MLVocab.asset_type)
|
|
695
|
-
type_path = pb.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
|
|
696
|
-
|
|
697
|
-
# Build a list of assets with their types
|
|
698
|
-
assets = []
|
|
699
|
-
for asset in asset_path.entities().fetch():
|
|
700
|
-
# Get associated asset types for each asset
|
|
701
|
-
asset_types = (
|
|
702
|
-
type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
|
|
703
|
-
.attributes(type_path.Asset_Type)
|
|
704
|
-
.fetch()
|
|
705
|
-
)
|
|
706
|
-
# Combine asset data with its types
|
|
707
|
-
assets.append(
|
|
708
|
-
asset | {MLVocab.asset_type.value: [asset_type[MLVocab.asset_type.value] for asset_type in asset_types]}
|
|
709
|
-
)
|
|
710
|
-
return assets
|
|
864
|
+
- ``TableDefinition``: Defines the complete table structure
|
|
865
|
+
- ``ColumnDefinition``: Defines individual columns with types and constraints
|
|
866
|
+
- ``KeyDefinition``: Defines unique key constraints (optional)
|
|
867
|
+
- ``ForeignKeyDefinition``: Defines foreign key relationships to other tables (optional)
|
|
868
|
+
- ``BuiltinTypes``: Enum of available column data types
|
|
711
869
|
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
terms: list[Table | str] | None = None,
|
|
718
|
-
assets: list[Table | str] | None = None,
|
|
719
|
-
metadata: list[ColumnDefinition | Table | Key | str] | None = None,
|
|
720
|
-
optional: list[str] | None = None,
|
|
721
|
-
comment: str = "",
|
|
722
|
-
) -> type[FeatureRecord]:
|
|
723
|
-
"""Creates a new feature definition.
|
|
724
|
-
|
|
725
|
-
A feature represents a measurable property or characteristic that can be associated with records in the target
|
|
726
|
-
table. Features can include vocabulary terms, asset references, and additional metadata.
|
|
870
|
+
**Available Column Types** (BuiltinTypes enum):
|
|
871
|
+
``text``, ``int2``, ``int4``, ``int8``, ``float4``, ``float8``, ``boolean``,
|
|
872
|
+
``date``, ``timestamp``, ``timestamptz``, ``json``, ``jsonb``, ``markdown``,
|
|
873
|
+
``ermrest_uri``, ``ermrest_rid``, ``ermrest_rcb``, ``ermrest_rmb``,
|
|
874
|
+
``ermrest_rct``, ``ermrest_rmt``
|
|
727
875
|
|
|
728
876
|
Args:
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
metadata: Optional columns, tables, or keys to include in a feature definition.
|
|
734
|
-
optional: Column names that are not required when creating feature instances.
|
|
735
|
-
comment: Description of the feature's purpose and usage.
|
|
877
|
+
table: A TableDefinition object containing the complete specification of the table to create.
|
|
878
|
+
update_navbar: If True (default), automatically updates the navigation bar to include
|
|
879
|
+
the new table. Set to False during batch table creation to avoid redundant updates,
|
|
880
|
+
then call apply_catalog_annotations() once at the end.
|
|
736
881
|
|
|
737
882
|
Returns:
|
|
738
|
-
|
|
883
|
+
Table: The newly created ERMRest table object.
|
|
739
884
|
|
|
740
885
|
Raises:
|
|
741
|
-
DerivaMLException: If
|
|
886
|
+
DerivaMLException: If table creation fails or the definition is invalid.
|
|
742
887
|
|
|
743
888
|
Examples:
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
...
|
|
750
|
-
...
|
|
889
|
+
**Simple table with basic columns**:
|
|
890
|
+
|
|
891
|
+
>>> from deriva_ml import TableDefinition, ColumnDefinition, BuiltinTypes
|
|
892
|
+
>>>
|
|
893
|
+
>>> table_def = TableDefinition(
|
|
894
|
+
... name="Experiment",
|
|
895
|
+
... column_defs=[
|
|
896
|
+
... ColumnDefinition(name="Name", type=BuiltinTypes.text, nullok=False),
|
|
897
|
+
... ColumnDefinition(name="Date", type=BuiltinTypes.date),
|
|
898
|
+
... ColumnDefinition(name="Description", type=BuiltinTypes.markdown),
|
|
899
|
+
... ColumnDefinition(name="Score", type=BuiltinTypes.float4),
|
|
900
|
+
... ],
|
|
901
|
+
... comment="Records of experimental runs"
|
|
751
902
|
... )
|
|
752
|
-
|
|
753
|
-
# Initialize empty collections if None provided
|
|
754
|
-
terms = terms or []
|
|
755
|
-
assets = assets or []
|
|
756
|
-
metadata = metadata or []
|
|
757
|
-
optional = optional or []
|
|
758
|
-
|
|
759
|
-
def normalize_metadata(m: Key | Table | ColumnDefinition | str):
|
|
760
|
-
"""Helper function to normalize metadata references."""
|
|
761
|
-
if isinstance(m, str):
|
|
762
|
-
return self.model.name_to_table(m)
|
|
763
|
-
elif isinstance(m, ColumnDefinition):
|
|
764
|
-
return m.model_dump()
|
|
765
|
-
else:
|
|
766
|
-
return m
|
|
767
|
-
|
|
768
|
-
# Validate asset and term tables
|
|
769
|
-
if not all(map(self.model.is_asset, assets)):
|
|
770
|
-
raise DerivaMLException("Invalid create_feature asset table.")
|
|
771
|
-
if not all(map(self.model.is_vocabulary, terms)):
|
|
772
|
-
raise DerivaMLException("Invalid create_feature asset table.")
|
|
773
|
-
|
|
774
|
-
# Get references to required tables
|
|
775
|
-
target_table = self.model.name_to_table(target_table)
|
|
776
|
-
execution = self.model.schemas[self.ml_schema].tables["Execution"]
|
|
777
|
-
feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]
|
|
778
|
-
|
|
779
|
-
# Add feature name to vocabulary
|
|
780
|
-
feature_name_term = self.add_term("Feature_Name", feature_name, description=comment)
|
|
781
|
-
atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
|
|
782
|
-
# Create an association table implementing the feature
|
|
783
|
-
atable = self.model.schemas[self.domain_schema].create_table(
|
|
784
|
-
target_table.define_association(
|
|
785
|
-
table_name=atable_name,
|
|
786
|
-
associates=[execution, target_table, feature_name_table],
|
|
787
|
-
metadata=[normalize_metadata(m) for m in chain(assets, terms, metadata)],
|
|
788
|
-
comment=comment,
|
|
789
|
-
)
|
|
790
|
-
)
|
|
791
|
-
# Configure optional columns and default feature name
|
|
792
|
-
for c in optional:
|
|
793
|
-
atable.columns[c].alter(nullok=True)
|
|
794
|
-
atable.columns["Feature_Name"].alter(default=feature_name_term.name)
|
|
795
|
-
|
|
796
|
-
# Return feature record class for creating instances
|
|
797
|
-
return self.feature_record_class(target_table, feature_name)
|
|
903
|
+
>>> experiment_table = ml.create_table(table_def)
|
|
798
904
|
|
|
799
|
-
|
|
800
|
-
"""Returns a pydantic model class for feature records.
|
|
905
|
+
**Table with foreign key to another table**:
|
|
801
906
|
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
will remove all feature values associated with this feature.
|
|
907
|
+
>>> from deriva_ml import (
|
|
908
|
+
... TableDefinition, ColumnDefinition, ForeignKeyDefinition, BuiltinTypes
|
|
909
|
+
... )
|
|
910
|
+
>>>
|
|
911
|
+
>>> # Create a Sample table that references Subject
|
|
912
|
+
>>> sample_def = TableDefinition(
|
|
913
|
+
... name="Sample",
|
|
914
|
+
... column_defs=[
|
|
915
|
+
... ColumnDefinition(name="Name", type=BuiltinTypes.text, nullok=False),
|
|
916
|
+
... ColumnDefinition(name="Subject", type=BuiltinTypes.text, nullok=False),
|
|
917
|
+
... ColumnDefinition(name="Collection_Date", type=BuiltinTypes.date),
|
|
918
|
+
... ],
|
|
919
|
+
... fkey_defs=[
|
|
920
|
+
... ForeignKeyDefinition(
|
|
921
|
+
... colnames=["Subject"],
|
|
922
|
+
... pk_sname=ml.default_schema, # Schema of referenced table
|
|
923
|
+
... pk_tname="Subject", # Name of referenced table
|
|
924
|
+
... pk_colnames=["RID"], # Column(s) in referenced table
|
|
925
|
+
... on_delete="CASCADE", # Delete samples when subject deleted
|
|
926
|
+
... )
|
|
927
|
+
... ],
|
|
928
|
+
... comment="Biological samples collected from subjects"
|
|
929
|
+
... )
|
|
930
|
+
>>> sample_table = ml.create_table(sample_def)
|
|
827
931
|
|
|
828
|
-
|
|
829
|
-
table: The table containing the feature, either as name or Table object.
|
|
830
|
-
feature_name: Name of the feature to delete.
|
|
932
|
+
**Table with unique key constraint**:
|
|
831
933
|
|
|
832
|
-
|
|
833
|
-
|
|
934
|
+
>>> from deriva_ml import (
|
|
935
|
+
... TableDefinition, ColumnDefinition, KeyDefinition, BuiltinTypes
|
|
936
|
+
... )
|
|
937
|
+
>>>
|
|
938
|
+
>>> protocol_def = TableDefinition(
|
|
939
|
+
... name="Protocol",
|
|
940
|
+
... column_defs=[
|
|
941
|
+
... ColumnDefinition(name="Name", type=BuiltinTypes.text, nullok=False),
|
|
942
|
+
... ColumnDefinition(name="Version", type=BuiltinTypes.text, nullok=False),
|
|
943
|
+
... ColumnDefinition(name="Description", type=BuiltinTypes.markdown),
|
|
944
|
+
... ],
|
|
945
|
+
... key_defs=[
|
|
946
|
+
... KeyDefinition(
|
|
947
|
+
... colnames=["Name", "Version"],
|
|
948
|
+
... constraint_names=[["myschema", "Protocol_Name_Version_key"]],
|
|
949
|
+
... comment="Each protocol name+version must be unique"
|
|
950
|
+
... )
|
|
951
|
+
... ],
|
|
952
|
+
... comment="Experimental protocols with versioning"
|
|
953
|
+
... )
|
|
954
|
+
>>> protocol_table = ml.create_table(protocol_def)
|
|
834
955
|
|
|
835
|
-
|
|
836
|
-
DerivaMLException: If deletion fails due to constraints or permissions.
|
|
956
|
+
**Batch creation without navbar updates**:
|
|
837
957
|
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
958
|
+
>>> ml.create_table(table1_def, update_navbar=False)
|
|
959
|
+
>>> ml.create_table(table2_def, update_navbar=False)
|
|
960
|
+
>>> ml.create_table(table3_def, update_navbar=False)
|
|
961
|
+
>>> ml.apply_catalog_annotations() # Update navbar once at the end
|
|
841
962
|
"""
|
|
842
|
-
#
|
|
843
|
-
|
|
844
|
-
try:
|
|
845
|
-
# Find and delete the feature's implementation table
|
|
846
|
-
feature = next(f for f in self.model.find_features(table) if f.feature_name == feature_name)
|
|
847
|
-
feature.feature_table.drop()
|
|
848
|
-
return True
|
|
849
|
-
except StopIteration:
|
|
850
|
-
return False
|
|
963
|
+
# Use default schema if none specified
|
|
964
|
+
schema = schema or self.model._require_default_schema()
|
|
851
965
|
|
|
852
|
-
|
|
853
|
-
|
|
966
|
+
# Create table in domain schema using provided definition
|
|
967
|
+
# Handle both TableDefinition (dataclass with to_dict) and plain dicts
|
|
968
|
+
table_dict = table.to_dict() if hasattr(table, 'to_dict') else table
|
|
969
|
+
new_table = self.model.schemas[schema].create_table(table_dict)
|
|
854
970
|
|
|
855
|
-
|
|
856
|
-
|
|
971
|
+
# Update navbar to include the new table
|
|
972
|
+
if update_navbar:
|
|
973
|
+
self.apply_catalog_annotations()
|
|
857
974
|
|
|
858
|
-
|
|
859
|
-
table: The table containing the feature, either as name or Table object.
|
|
860
|
-
feature_name: Name of the feature to look up.
|
|
861
|
-
|
|
862
|
-
Returns:
|
|
863
|
-
Feature: An object representing the feature and its implementation.
|
|
975
|
+
return new_table
|
|
864
976
|
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
Example:
|
|
869
|
-
>>> feature = ml.lookup_feature("samples", "expression_level")
|
|
870
|
-
>>> print(feature.feature_name)
|
|
871
|
-
'expression_level'
|
|
872
|
-
"""
|
|
873
|
-
return self.model.lookup_feature(table, feature_name)
+    # =========================================================================
+    # Cache and Directory Management
+    # =========================================================================

-
-
-        """Retrieves all values for a feature.
+    def clear_cache(self, older_than_days: int | None = None) -> dict[str, int]:
+        """Clear the dataset cache directory.

-
-
+        Removes cached dataset bags from the cache directory. Can optionally filter
+        by age to only remove old cache entries.

         Args:
-
-
+            older_than_days: If provided, only remove cache entries older than this
+                many days. If None, removes all cache entries.

         Returns:
-
-
-
-
+            dict with keys:
+            - 'files_removed': Number of files removed
+            - 'dirs_removed': Number of directories removed
+            - 'bytes_freed': Total bytes freed
+            - 'errors': Number of removal errors

         Example:
-            >>>
-            >>>
-
+            >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+            >>> # Clear all cache
+            >>> result = ml.clear_cache()
+            >>> print(f"Freed {result['bytes_freed'] / 1e6:.1f} MB")
+            >>>
+            >>> # Clear cache older than 7 days
+            >>> result = ml.clear_cache(older_than_days=7)
         """
-
-
-        feature = self.lookup_feature(table, feature_name)
-
-        # Build and execute query for feature values
-        pb = self.catalog.getPathBuilder()
-        return pb.schemas[feature.feature_table.schema.name].tables[feature.feature_table.name].entities().fetch()
-
-    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-    def add_term(
-        self,
-        table: str | Table,
-        term_name: str,
-        description: str,
-        synonyms: list[str] | None = None,
-        exists_ok: bool = True,
-    ) -> VocabularyTerm:
-        """Adds a term to a vocabulary table.
+        import shutil
+        import time

-
-        Can either create a new term or return an existing one if it already exists.
+        stats = {'files_removed': 0, 'dirs_removed': 0, 'bytes_freed': 0, 'errors': 0}

-
-
-            term_name: Primary name of the term (must be unique within vocabulary).
-            description: Explanation of term's meaning and usage.
-            synonyms: Alternative names for the term.
-            exists_ok: If True, return the existing term if found. If False, raise error.
+        if not self.cache_dir.exists():
+            return stats

-
-
-
-        Raises:
-            DerivaMLException: If a term exists and exists_ok=False, or if the table is not a vocabulary table.
-
-        Examples:
-            Add a new tissue type:
-            >>> term = ml.add_term(
-            ...     table="tissue_types",
-            ...     term_name="epithelial",
-            ...     description="Epithelial tissue type",
-            ...     synonyms=["epithelium"]
-            ... )
-
-            Attempt to add an existing term:
-            >>> term = ml.add_term("tissue_types", "epithelial", "...", exists_ok=True)
-        """
-        # Initialize an empty synonyms list if None
-        synonyms = synonyms or []
-
-        # Get table reference and validate if it is a vocabulary table
-        table = self.model.name_to_table(table)
-        pb = self.catalog.getPathBuilder()
-        if not (self.model.is_vocabulary(table)):
-            raise DerivaMLTableTypeError("vocabulary", table.name)
-
-        # Get schema and table names for path building
-        schema_name = table.schema.name
-        table_name = table.name
+        cutoff_time = None
+        if older_than_days is not None:
+            cutoff_time = time.time() - (older_than_days * 24 * 60 * 60)

         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            term_name: Name or synonym of the term to find.
+            for entry in self.cache_dir.iterdir():
+                try:
+                    # Check age if filtering
+                    if cutoff_time is not None:
+                        entry_mtime = entry.stat().st_mtime
+                        if entry_mtime > cutoff_time:
+                            continue  # Skip recent entries
+
+                    # Calculate size before removal
+                    if entry.is_dir():
+                        entry_size = sum(f.stat().st_size for f in entry.rglob('*') if f.is_file())
+                        shutil.rmtree(entry)
+                        stats['dirs_removed'] += 1
+                    else:
+                        entry_size = entry.stat().st_size
+                        entry.unlink()
+                        stats['files_removed'] += 1
+
+                    stats['bytes_freed'] += entry_size
+                except (OSError, PermissionError) as e:
+                    self._logger.warning(f"Failed to remove cache entry {entry}: {e}")
+                    stats['errors'] += 1
+
+        except OSError as e:
+            self._logger.error(f"Failed to iterate cache directory: {e}")
+            stats['errors'] += 1
+
+        return stats
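The age filter in `clear_cache()` compares each entry's mtime against a cutoff of `time.time() - older_than_days * 24 * 60 * 60`. A standalone sketch of the same test, useful for previewing what a given `older_than_days` would remove before clearing; the cache path below is an assumption (the real one is exposed as `ml.cache_dir`):

```python
# Sketch: preview which entries clear_cache(older_than_days=7) would remove.
# cache_dir is an assumed path for illustration only.
import time
from pathlib import Path

cache_dir = Path.home() / ".deriva_ml_cache"
cutoff = time.time() - 7 * 24 * 60 * 60  # same arithmetic as the method

for entry in cache_dir.iterdir() if cache_dir.exists() else []:
    mtime = entry.stat().st_mtime
    age_days = (time.time() - mtime) / 86400
    verdict = "remove" if mtime <= cutoff else "keep"
    print(f"{verdict:6} {entry.name} ({age_days:.1f} days old)")
```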
+
+    def get_cache_size(self) -> dict[str, int | float]:
+        """Get the current size of the cache directory.

         Returns:
-
-
-
-
-
-        Examples:
-            Look up by primary name:
-            >>> term = ml.lookup_term("tissue_types", "epithelial")
-            >>> print(term.description)
+            dict with keys:
+            - 'total_bytes': Total size in bytes
+            - 'total_mb': Total size in megabytes
+            - 'file_count': Number of files
+            - 'dir_count': Number of directories

-
-
+        Example:
+            >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+            >>> size = ml.get_cache_size()
+            >>> print(f"Cache size: {size['total_mb']:.1f} MB ({size['file_count']} files)")
         """
-
-        vocab_table = self.model.name_to_table(table)
-        if not self.model.is_vocabulary(vocab_table):
-            raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
+        stats = {'total_bytes': 0, 'total_mb': 0.0, 'file_count': 0, 'dir_count': 0}

-
-
-        schema_path = self.catalog.getPathBuilder().schemas[schema_name]
+        if not self.cache_dir.exists():
+            return stats

-
-
-
-
+        for entry in self.cache_dir.rglob('*'):
+            if entry.is_file():
+                stats['total_bytes'] += entry.stat().st_size
+                stats['file_count'] += 1
+            elif entry.is_dir():
+                stats['dir_count'] += 1

-
-
+        stats['total_mb'] = stats['total_bytes'] / (1024 * 1024)
+        return stats
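Taken together, `get_cache_size()` and `clear_cache()` support a simple size-triggered eviction policy. A sketch under stated assumptions (the host/catalog and the 500 MB threshold are illustrative, not package constants):

```python
# Sketch: evict week-old cache entries only once the cache passes a budget.
from deriva_ml import DerivaML

ml = DerivaML('deriva.example.org', 'my_catalog')  # illustrative host/catalog

MAX_CACHE_MB = 500  # assumed policy threshold
size = ml.get_cache_size()
if size['total_mb'] > MAX_CACHE_MB:
    result = ml.clear_cache(older_than_days=7)  # drop week-old bags first
    print(f"Freed {result['bytes_freed'] / 1e6:.1f} MB "
          f"({result['files_removed']} files, {result['dirs_removed']} dirs)")
```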

-    def
-        """
+    def list_execution_dirs(self) -> list[dict[str, Any]]:
+        """List execution working directories.

-
-
-        Args:
-            table: Vocabulary table to list terms from (name or Table object).
+        Returns information about each execution directory in the working directory,
+        useful for identifying orphaned or incomplete execution outputs.

         Returns:
-
-
-
-
+            List of dicts, each containing:
+            - 'execution_rid': The execution RID (directory name)
+            - 'path': Full path to the directory
+            - 'size_bytes': Total size in bytes
+            - 'size_mb': Total size in megabytes
+            - 'modified': Last modification time (datetime)
+            - 'file_count': Number of files

-
-            >>>
-            >>>
-
-            ...
-            ...     print(f"  Synonyms: {', '.join(term.synonyms)}")
+        Example:
+            >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+            >>> dirs = ml.list_execution_dirs()
+            >>> for d in dirs:
+            ...     print(f"{d['execution_rid']}: {d['size_mb']:.1f} MB")
         """
-
-
-        table = self.model.name_to_table(table.value if isinstance(table, MLVocab) else table)
-
-        # Validate table is a vocabulary table
-        if not (self.model.is_vocabulary(table)):
-            raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
-
-        # Fetch and convert all terms to VocabularyTerm objects
-        return [VocabularyTerm(**v) for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()]
-
-    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-    def download_dataset_bag(
-        self,
-        dataset: DatasetSpec,
-        execution_rid: RID | None = None,
-    ) -> DatasetBag:
-        """Downloads a dataset to the local filesystem and creates a MINID if needed.
+        from datetime import datetime
+        from deriva_ml.dataset.upload import upload_root

-
-
-        with an execution record.
+        results = []
+        exec_root = upload_root(self.working_dir) / "execution"

-
-
-            execution_rid: Optional execution RID to associate the download with.
-
-        Returns:
-            DatasetBag: Object containing:
-            - path: Local filesystem path to downloaded dataset
-            - rid: Dataset's Resource Identifier
-            - minid: Dataset's Minimal Viable Identifier
-
-        Examples:
-            Download with default options:
-            >>> spec = DatasetSpec(rid="1-abc123")
-            >>> bag = ml.download_dataset_bag(dataset=spec)
-            >>> print(f"Downloaded to {bag.path}")
-
-            Download with execution tracking:
-            >>> bag = ml.download_dataset_bag(
-            ...     dataset=DatasetSpec(rid="1-abc123", materialize=True),
-            ...     execution_rid="1-xyz789"
-            ... )
-        """
-        if not self._is_dataset_rid(dataset.rid):
-            raise DerivaMLTableTypeError("Dataset", dataset.rid)
-        return self._download_dataset_bag(
-            dataset=dataset,
-            execution_rid=execution_rid,
-            snapshot_catalog=DerivaML(
-                self.host_name,
-                self._version_snapshot(dataset),
-                logging_level=self._logging_level,
-                deriva_logging_level=self._deriva_logging_level,
-            ),
-        )
+        if not exec_root.exists():
+            return results

-
-
+        for entry in exec_root.iterdir():
+            if entry.is_dir():
+                size_bytes = sum(f.stat().st_size for f in entry.rglob('*') if f.is_file())
+                file_count = sum(1 for f in entry.rglob('*') if f.is_file())
+                mtime = datetime.fromtimestamp(entry.stat().st_mtime)

-
-
-
-
-
-
-
+                results.append({
+                    'execution_rid': entry.name,
+                    'path': str(entry),
+                    'size_bytes': size_bytes,
+                    'size_mb': size_bytes / (1024 * 1024),
+                    'modified': mtime,
+                    'file_count': file_count,
+                })

-
+        return sorted(results, key=lambda x: x['modified'], reverse=True)
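Because `list_execution_dirs()` returns entries sorted newest-first, stale or oversized execution outputs fall out of a one-line filter. A sketch; the 100 MB cut is an assumed budget:

```python
# Sketch: report execution dirs over an assumed 100 MB budget.
from deriva_ml import DerivaML

ml = DerivaML('deriva.example.org', 'my_catalog')  # illustrative host/catalog

big = [d for d in ml.list_execution_dirs() if d['size_mb'] > 100]
for d in big:
    print(f"{d['execution_rid']}: {d['size_mb']:.1f} MB, "
          f"{d['file_count']} files, last touched {d['modified']:%Y-%m-%d}")
```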

-
-        self.status = new_status.value
-        self.pathBuilder.schemas[self.ml_schema].Execution.update(
-            [
-                {
-                    "RID": execution_rid,
-                    "Status": self.status,
-                    "Status_Detail": status_detail,
-                }
-            ]
-        )
-
-    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-    def add_files(
+    def clean_execution_dirs(
         self,
-
-
-
-
-    ) -> RID:
-        """Adds files to the catalog with their metadata.
-
-        Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
-        specified file types. Optionally links files to an execution record.
-
-        Args:
-            files: File specifications containing MD5 checksum, length, and URL.
-            dataset_types: One or more dataset type terms from File_Type vocabulary.
-            description: Description of the files.
-            execution_rid: Optional execution RID to associate files with.
-
-        Returns:
-            RID: Resource of dataset that represents the newly added files.
-
-        Raises:
-            DerivaMLException: If file_types are invalid or execution_rid is not an execution record.
-
-        Examples:
-            Add a single file type:
-            >>> files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
-            >>> rids = ml.add_files(files, file_types="text")
-
-            Add multiple file types:
-            >>> rids = ml.add_files(
-            ...     files=[FileSpec(url="image.png", md5="def456", length=2000)],
-            ...     file_types=["image", "png"],
-            ...     execution_rid="1-xyz789"
-            ... )
-        """
-        if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
-            raise DerivaMLTableTypeError("Execution", execution_rid)
-
-        filespec_list = list(files)
-
-        # Get a list of all defined file types and their synonyms.
-        defined_types = set(
-            chain.from_iterable([[t.name] + t.synonyms for t in self.list_vocabulary_terms(MLVocab.asset_type)])
-        )
-
-        # Get a list of al of the file types used in the filespec_list
-        spec_types = set(chain.from_iterable(filespec.file_types for filespec in filespec_list))
-
-        # Now make sure that all of the file types and dataset_types in the spec list are defined.
-        if spec_types - defined_types:
-            raise DerivaMLInvalidTerm(MLVocab.asset_type.name, f"{spec_types - defined_types}")
-
-        # Normalize dataset_types, make sure FIle type is included.
-        if isinstance(dataset_types, list):
-            dataset_types = ["File"] + dataset_types if "File" not in dataset_types else dataset_types
-        else:
-            dataset_types = ["File", dataset_types] if dataset_types else ["File"]
-        for ds_type in dataset_types:
-            self.lookup_term(MLVocab.dataset_type, ds_type)
-
-        # Add files to the file table, and collect up the resulting entries by directory name.
-        pb = self._model.catalog.getPathBuilder()
-        file_records = list(
-            pb.schemas[self.ml_schema].tables["File"].insert([f.model_dump(by_alias=True) for f in filespec_list])
-        )
-
-        # Get the name of the association table between file_table and file_type and add file_type records
-        atable = self.model.find_association(MLTable.file, MLVocab.asset_type)[0].name
-        # Need to get a link between file record and file_types.
-        type_map = {
-            file_spec.md5: file_spec.file_types + ([] if "File" in file_spec.file_types else [])
-            for file_spec in filespec_list
-        }
-        file_type_records = [
-            {MLVocab.asset_type.value: file_type, "File": file_record["RID"]}
-            for file_record in file_records
-            for file_type in type_map[file_record["MD5"]]
-        ]
-        pb.schemas[self._ml_schema].tables[atable].insert(file_type_records)
-
-        if execution_rid:
-            # Get the name of the association table between file_table and execution.
-            pb.schemas[self._ml_schema].File_Execution.insert(
-                [
-                    {"File": file_record["RID"], "Execution": execution_rid, "Asset_Role": "Output"}
-                    for file_record in file_records
-                ]
-            )
+        older_than_days: int | None = None,
+        exclude_rids: list[str] | None = None,
+    ) -> dict[str, int]:
+        """Clean up execution working directories.

-
-
-        for e in file_records:
-            dir_rid_map[Path(urlsplit(e["URL"]).path).parent].append(e["RID"])
-
-        nested_datasets = []
-        path_length = 0
-        dataset = None
-        # Start with the longest path so we get subdirectories first.
-        for p, rids in sorted(dir_rid_map.items(), key=lambda kv: len(kv[0].parts), reverse=True):
-            dataset = self.create_dataset(
-                dataset_types=dataset_types, execution_rid=execution_rid, description=description
-            )
-            members = rids
-            if len(p.parts) < path_length:
-                # Going up one level in directory, so Create nested dataset
-                members = nested_datasets + rids
-                nested_datasets = []
-            self.add_dataset_members(dataset_rid=dataset, members=members, execution_rid=execution_rid)
-            nested_datasets.append(dataset)
-            path_length = len(p.parts)
-
-        return dataset
-
-    def list_files(self, file_types: list[str] | None = None) -> list[dict[str, Any]]:
-        """Lists files in the catalog with their metadata.
-
-        Returns a list of files with their metadata including URL, MD5 hash, length, description,
-        and associated file types. Files can be optionally filtered by type.
+        Removes execution output directories from the local working directory.
+        Use this to free up disk space from completed or orphaned executions.

         Args:
-
+            older_than_days: If provided, only remove directories older than this
+                many days. If None, removes all execution directories (except excluded).
+            exclude_rids: List of execution RIDs to preserve (never remove).

         Returns:
-
-            -
-            -
-            -
-            - Length: File size
-            - Description: File description
-            - File_Types: List of associated file types
-
-        Examples:
-            List all files:
-            >>> files = ml.list_files()
-            >>> for f in files:
-            ...     print(f"{f['RID']}: {f['URL']}")
-
-            Filter by file type:
-            >>> image_files = ml.list_files(["image", "png"])
-        """
-
-        asset_type_atable, file_fk, asset_type_fk = self.model.find_association("File", "Asset_Type")
-        ml_path = self.pathBuilder.schemas[self._ml_schema]
-        file = ml_path.File
-        asset_type = ml_path.tables[asset_type_atable.name]
-
-        path = file.path
-        path = path.link(asset_type.alias("AT"), on=file.RID == asset_type.columns[file_fk], join_type="left")
-        if file_types:
-            path = path.filter(asset_type.columns[asset_type_fk] == datapath.Any(*file_types))
-        path = path.attributes(
-            path.File.RID,
-            path.File.URL,
-            path.File.MD5,
-            path.File.Length,
-            path.File.Description,
-            path.AT.columns[asset_type_fk],
-        )
+            dict with keys:
+            - 'dirs_removed': Number of directories removed
+            - 'bytes_freed': Total bytes freed
+            - 'errors': Number of removal errors

-
-
-
-
-
-
-
-
-
-    def list_workflows(self) -> list[Workflow]:
-        """Lists all workflows in the catalog.
-
-        Retrieves all workflow definitions, including their names, URLs, types, versions,
-        and descriptions.
-
-        Returns:
-            list[Workflow]: List of workflow objects, each containing:
-            - name: Workflow name
-            - url: Source code URL
-            - workflow_type: Type of workflow
-            - version: Version identifier
-            - description: Workflow description
-            - rid: Resource identifier
-            - checksum: Source code checksum
-
-        Examples:
-            >>> workflows = ml.list_workflows()
-            >>> for w in workflows:
-                    print(f"{w.name} (v{w.version}): {w.description}")
-                    print(f"  Source: {w.url}")
+        Example:
+            >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+            >>> # Clean all execution dirs older than 30 days
+            >>> result = ml.clean_execution_dirs(older_than_days=30)
+            >>> print(f"Freed {result['bytes_freed'] / 1e9:.2f} GB")
+            >>>
+            >>> # Clean all except specific executions
+            >>> result = ml.clean_execution_dirs(exclude_rids=['1-ABC', '1-DEF'])
         """
-
-
-
-            Workflow(
-                name=w["Name"],
-                url=w["URL"],
-                workflow_type=w["Workflow_Type"],
-                version=w["Version"],
-                description=w["Description"],
-                rid=w["RID"],
-                checksum=w["Checksum"],
-            )
-            for w in workflow_path.entities().fetch()
-        ]
+        import shutil
+        import time
+        from deriva_ml.dataset.upload import upload_root

-
-
+        stats = {'dirs_removed': 0, 'bytes_freed': 0, 'errors': 0}
+        exclude_rids = set(exclude_rids or [])

-
-
+        exec_root = upload_root(self.working_dir) / "execution"
+        if not exec_root.exists():
+            return stats

-
+        cutoff_time = None
+        if older_than_days is not None:
+            cutoff_time = time.time() - (older_than_days * 24 * 60 * 60)

-
-
+        for entry in exec_root.iterdir():
+            if not entry.is_dir():
+                continue

-
-
+            # Skip excluded RIDs
+            if entry.name in exclude_rids:
+                continue

-
-
+            try:
+                # Check age if filtering
+                if cutoff_time is not None:
+                    entry_mtime = entry.stat().st_mtime
+                    if entry_mtime > cutoff_time:
+                        continue

-
-
-
-
-
-            ...     version="1.0.0",
-            ...     description="Analyzes gene expression patterns"
-            ... )
-            >>> workflow_rid = ml.add_workflow(workflow)
-        """
-        # Check if a workflow already exists by URL
-        if workflow_rid := self.lookup_workflow(workflow.checksum or workflow.url):
-            return workflow_rid
+                # Calculate size before removal
+                entry_size = sum(f.stat().st_size for f in entry.rglob('*') if f.is_file())
+                shutil.rmtree(entry)
+                stats['dirs_removed'] += 1
+                stats['bytes_freed'] += entry_size

-
-
+            except (OSError, PermissionError) as e:
+                self._logger.warning(f"Failed to remove execution dir {entry}: {e}")
+                stats['errors'] += 1

-
-        # Create a workflow record
-        workflow_record = {
-            "URL": workflow.url,
-            "Name": workflow.name,
-            "Description": workflow.description,
-            "Checksum": workflow.checksum,
-            "Version": workflow.version,
-            MLVocab.workflow_type: self.lookup_term(MLVocab.workflow_type, workflow.workflow_type).name,
-        }
-        # Insert a workflow and get its RID
-        workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
-        except Exception as e:
-            error = format_exception(e)
-            raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
-        return workflow_rid
+        return stats
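A natural pattern is to pair `list_execution_dirs()` with `clean_execution_dirs()` so the newest run is always preserved. A sketch; keeping exactly one run is an assumed policy, not package behavior:

```python
# Sketch: remove all execution dirs except the most recent one.
from deriva_ml import DerivaML

ml = DerivaML('deriva.example.org', 'my_catalog')  # illustrative host/catalog

dirs = ml.list_execution_dirs()                 # sorted newest-first
keep = [d['execution_rid'] for d in dirs[:1]]   # assumed policy: keep latest run
result = ml.clean_execution_dirs(exclude_rids=keep)
print(f"Removed {result['dirs_removed']} dirs, freed {result['bytes_freed'] / 1e6:.1f} MB")
```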

-    def
-        """
+    def get_storage_summary(self) -> dict[str, Any]:
+        """Get a summary of local storage usage.

-        Args:
-            url_or_checksum: URL or checksum of the workflow.
         Returns:
-
+            dict with keys:
+            - 'working_dir': Path to working directory
+            - 'cache_dir': Path to cache directory
+            - 'cache_size_mb': Cache size in MB
+            - 'cache_file_count': Number of files in cache
+            - 'execution_dir_count': Number of execution directories
+            - 'execution_size_mb': Total size of execution directories in MB
+            - 'total_size_mb': Combined size in MB

         Example:
-            >>>
-            >>>
-
-
-
-        workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
-        try:
-            # Search for workflow by URL
-            url_column = workflow_path.URL
-            checksum_column = workflow_path.Checksum
-            return list(
-                workflow_path.path.filter(
-                    (url_column == url_or_checksum) | (checksum_column == url_or_checksum)
-                ).entities()
-            )[0]["RID"]
-        except IndexError:
-            return None
-
-    def create_workflow(self, name: str, workflow_type: str, description: str = "") -> Workflow:
-        """Creates a new workflow definition.
-
-        Creates a Workflow object that represents a computational process or analysis pipeline. The workflow type
-        must be a term from the controlled vocabulary. This method is typically used to define new analysis
-        workflows before execution.
-
-        Args:
-            name: Name of the workflow.
-            workflow_type: Type of workflow (must exist in workflow_type vocabulary).
-            description: Description of what the workflow does.
-
-        Returns:
-            Workflow: New workflow object ready for registration.
-
-        Raises:
-            DerivaMLException: If workflow_type is not in the vocabulary.
-
-        Examples:
-            >>> workflow = ml.create_workflow(
-            ...     name="RNA Analysis",
-            ...     workflow_type="python_notebook",
-            ...     description="RNA sequence analysis pipeline"
-            ... )
-            >>> rid = ml.add_workflow(workflow)
+            >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+            >>> summary = ml.get_storage_summary()
+            >>> print(f"Total storage: {summary['total_size_mb']:.1f} MB")
+            >>> print(f"  Cache: {summary['cache_size_mb']:.1f} MB")
+            >>> print(f"  Executions: {summary['execution_size_mb']:.1f} MB")
         """
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-        1. The datasets specified in the configuration are downloaded and placed in the cache-dir. If a version is
-        not specified in the configuration, then a new minor version number is created for the dataset and downloaded.
-
-        2. If any execution assets are provided in the configuration, they are downloaded
-        and placed in the working directory.
-
-
-        Args:
-            configuration: ExecutionConfiguration:
-            workflow: Workflow object representing the workflow to execute if not present in the ExecutionConfiguration.
-            dry_run: Do not create an execution record or upload results.
-
-        Returns:
-            An execution object.
-        """
-        # Import here to avoid circular dependency
-        from deriva_ml.execution.execution import Execution
+        cache_stats = self.get_cache_size()
+        exec_dirs = self.list_execution_dirs()
+
+        exec_size_mb = sum(d['size_mb'] for d in exec_dirs)
+
+        return {
+            'working_dir': str(self.working_dir),
+            'cache_dir': str(self.cache_dir),
+            'cache_size_mb': cache_stats['total_mb'],
+            'cache_file_count': cache_stats['file_count'],
+            'execution_dir_count': len(exec_dirs),
+            'execution_size_mb': exec_size_mb,
+            'total_size_mb': cache_stats['total_mb'] + exec_size_mb,
+        }
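Since `get_storage_summary()` just aggregates `get_cache_size()` and `list_execution_dirs()`, it is cheap enough to log at the start of a run. A sketch of a disk-pressure warning; the 80% figure is an assumed policy:

```python
# Sketch: warn when deriva-ml local storage sits on a nearly full filesystem.
# The 80% threshold is an assumption, not package behavior.
import shutil
from deriva_ml import DerivaML

ml = DerivaML('deriva.example.org', 'my_catalog')  # illustrative host/catalog

summary = ml.get_storage_summary()
disk = shutil.disk_usage(summary['working_dir'])
if disk.used / disk.total > 0.80:
    print(f"Disk {disk.used / disk.total:.0%} full; "
          f"deriva-ml holds {summary['total_size_mb']:.1f} MB "
          f"(cache {summary['cache_size_mb']:.1f} MB, "
          f"executions {summary['execution_size_mb']:.1f} MB)")
```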

-
-
-
+    # =========================================================================
+    # Schema Validation
+    # =========================================================================

-    def
-        """
+    def validate_schema(self, strict: bool = False) -> "SchemaValidationReport":
+        """Validate that the catalog's ML schema matches the expected structure.

-
-
+        This method inspects the catalog schema and verifies that it contains all
+        the required tables, columns, vocabulary terms, and relationships that are
+        created by the ML schema initialization routines in create_schema.py.

-
-
+        The validation checks:
+        - All required ML tables exist (Dataset, Execution, Workflow, etc.)
+        - All required columns exist with correct types
+        - All required vocabulary tables exist (Asset_Type, Dataset_Type, etc.)
+        - All required vocabulary terms are initialized
+        - All association tables exist for relationships

-
-        in the
+        In strict mode, the validator also reports errors for:
+        - Extra tables not in the expected schema
+        - Extra columns not in the expected table definitions

         Args:
-
+            strict: If True, extra tables and columns are reported as errors.
+                If False (default), they are reported as informational items.
+                Use strict=True to verify a clean ML catalog matches exactly.
+                Use strict=False to validate a catalog that may have domain extensions.

         Returns:
-
-
-
-
+            SchemaValidationReport with validation results. Key attributes:
+            - is_valid: True if no errors were found
+            - errors: List of error-level issues
+            - warnings: List of warning-level issues
+            - info: List of informational items
+            - to_text(): Human-readable report
+            - to_dict(): JSON-serializable dictionary

         Example:
-            >>>
+            >>> ml = DerivaML('localhost', 'my_catalog')
+            >>> report = ml.validate_schema(strict=False)
+            >>> if report.is_valid:
+            ...     print("Schema is valid!")
+            ... else:
+            ...     print(report.to_text())
+
+            >>> # Strict validation for a fresh ML catalog
+            >>> report = ml.validate_schema(strict=True)
+            >>> print(f"Found {len(report.errors)} errors, {len(report.warnings)} warnings")
+
+            >>> # Get report as dictionary for JSON/logging
+            >>> import json
+            >>> print(json.dumps(report.to_dict(), indent=2))
+
+        Note:
+            This method validates the ML schema (typically 'deriva-ml'), not the
+            domain schema. Domain-specific tables and columns are not checked
+            unless they are part of the ML schema itself.
+
+        See Also:
+            - deriva_ml.schema.validation.SchemaValidationReport
+            - deriva_ml.schema.validation.validate_ml_schema
         """
-
-
-
-        # If no RID provided, try to find single execution in working directory
-        if not execution_rid:
-            e_rids = execution_rids(self.working_dir)
-            if len(e_rids) != 1:
-                raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
-            execution_rid = e_rids[0]
-
-        # Try to load configuration from a file
-        cfile = asset_file_path(
-            prefix=self.working_dir,
-            exec_rid=execution_rid,
-            file_name="configuration.json",
-            asset_table=self.model.name_to_table("Execution_Metadata"),
-            metadata={},
-        )
+        from deriva_ml.schema.validation import SchemaValidationReport, validate_ml_schema
+        return validate_ml_schema(self, strict=strict)
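Because `validate_schema()` returns a report object rather than raising, it wires cleanly into a CI gate. A sketch, assuming a catalog reachable from the CI host:

```python
# Sketch: fail a CI job when the ML schema drifts from the expected structure.
import sys
from deriva_ml import DerivaML

ml = DerivaML('localhost', 'my_catalog')  # illustrative host/catalog

report = ml.validate_schema(strict=False)  # tolerate domain extensions
if not report.is_valid:
    print(report.to_text())  # human-readable list of issues
    sys.exit(1)
print(f"Schema OK ({len(report.warnings)} warnings, {len(report.info)} info items)")
```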

-
-
-
-
-
-
-
-            description=execution["Description"],
-        )
+    # Methods moved to mixins:
+    # - create_asset, list_assets -> AssetMixin
+    # - create_feature, feature_record_class, delete_feature, lookup_feature, list_feature_values -> FeatureMixin
+    # - find_datasets, create_dataset, lookup_dataset, delete_dataset, list_dataset_element_types,
+    #   add_dataset_element_type, download_dataset_bag -> DatasetMixin
+    # - _update_status, create_execution, restore_execution -> ExecutionMixin
+    # - add_files, list_files, _bootstrap_versions, _synchronize_dataset_versions, _set_version_snapshot -> FileMixin

-        # Create and return an execution instance
-        return Execution(configuration, self, reload=execution_rid)