deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/core/base.py CHANGED
@@ -14,56 +14,53 @@ Typical usage example:
  from __future__ import annotations # noqa: I001

  # Standard library imports
- from collections import defaultdict
  import logging
  from datetime import datetime
- from itertools import chain
  from pathlib import Path
- from typing import Dict, Iterable, List, cast, TYPE_CHECKING, Any
+ from typing import Dict, List, cast, TYPE_CHECKING, Any
  from typing_extensions import Self
- from urllib.parse import urlsplit
-

  # Third-party imports
  import requests
- from pydantic import ConfigDict, validate_call
-
- # Deriva imports
- from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote
-
- import deriva.core.datapath as datapath
- from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
- from deriva.core.deriva_server import DerivaServer
- from deriva.core.ermrest_catalog import ResolveRidResult
- from deriva.core.ermrest_model import Key, Table
- from deriva.core.utils.core_utils import DEFAULT_LOGGER_OVERRIDES
- from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
-
- from deriva_ml.core.exceptions import DerivaMLInvalidTerm
- from deriva_ml.core.definitions import (
-     ML_SCHEMA,
-     RID,
-     ColumnDefinition,
-     FileSpec,
-     MLVocab,
-     MLTable,
-     Status,
-     TableDefinition,
-     VocabularyTerm,
- )
+
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+ import importlib
+ _deriva_core = importlib.import_module("deriva.core")
+ _deriva_server = importlib.import_module("deriva.core.deriva_server")
+ _ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+ _core_utils = importlib.import_module("deriva.core.utils.core_utils")
+ _globus_auth_utils = importlib.import_module("deriva.core.utils.globus_auth_utils")
+
+ DEFAULT_SESSION_CONFIG = _deriva_core.DEFAULT_SESSION_CONFIG
+ get_credential = _deriva_core.get_credential
+ urlquote = _deriva_core.urlquote
+ DerivaServer = _deriva_server.DerivaServer
+ ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
+ ErmrestSnapshot = _ermrest_catalog.ErmrestSnapshot
+ Table = _ermrest_model.Table
+ DEFAULT_LOGGER_OVERRIDES = _core_utils.DEFAULT_LOGGER_OVERRIDES
+ deriva_tags = _core_utils.tag
+ GlobusNativeLogin = _globus_auth_utils.GlobusNativeLogin
+
  from deriva_ml.core.config import DerivaMLConfig
- from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
- from deriva_ml.dataset.aux_classes import DatasetSpec
- from deriva_ml.dataset.dataset import Dataset
- from deriva_ml.dataset.dataset_bag import DatasetBag
- from deriva_ml.dataset.upload import asset_file_path, execution_rids, table_path
-
- # Local imports
- from deriva_ml.execution.execution_configuration import ExecutionConfiguration
- from deriva_ml.execution.workflow import Workflow
- from deriva_ml.feature import Feature, FeatureRecord
- from deriva_ml.model.catalog import DerivaModel
- from deriva_ml.schema.annotations import asset_annotation
+ from deriva_ml.core.definitions import ML_SCHEMA, RID, Status, TableDefinition, VocabularyTableDef
+ from deriva_ml.core.exceptions import DerivaMLException
+ from deriva_ml.core.logging_config import apply_logger_overrides, configure_logging
+ from deriva_ml.dataset.upload import bulk_upload_configuration
+ from deriva_ml.interfaces import DerivaMLCatalog
+ from deriva_ml.core.mixins import (
+     AnnotationMixin,
+     VocabularyMixin,
+     RidResolutionMixin,
+     PathBuilderMixin,
+     WorkflowMixin,
+     FeatureMixin,
+     DatasetMixin,
+     AssetMixin,
+     ExecutionMixin,
+     FileMixin,
+ )

  # Optional debug imports
  try:
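Reviewer note on the import changes above: the new code replaces plain `import deriva.core` statements with `importlib.import_module(...)` calls because, per the inline comment, a scratch file named deriva.py in the user's working directory can shadow the installed deriva package. A minimal, hypothetical sketch (not part of the package) of how such shadowing can be detected:

    # Hypothetical shadowing check; `deriva` here is whatever module the
    # interpreter resolves, which may be a local scratch file rather than
    # the installed package.
    import importlib
    import pathlib
    import sys

    deriva = importlib.import_module("deriva")
    origin = pathlib.Path(deriva.__file__ or "").resolve()
    if origin.parent == pathlib.Path.cwd():
        print(f"'deriva' resolved to a local file: {origin}", file=sys.stderr)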
@@ -74,13 +71,27 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
      ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

  if TYPE_CHECKING:
+     from deriva_ml.catalog.clone import CatalogProvenance
      from deriva_ml.execution.execution import Execution
+     from deriva_ml.model.catalog import DerivaModel

      # Stop pycharm from complaining about undefined references.
      ml: DerivaML


- class DerivaML(Dataset):
+ class DerivaML(
+     PathBuilderMixin,
+     RidResolutionMixin,
+     VocabularyMixin,
+     WorkflowMixin,
+     FeatureMixin,
+     DatasetMixin,
+     AssetMixin,
+     ExecutionMixin,
+     FileMixin,
+     AnnotationMixin,
+     DerivaMLCatalog,
+ ):
      """Core class for machine learning operations on a Deriva catalog.

      This class provides core functionality for managing ML workflows, features, and datasets in a Deriva catalog.
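Reviewer note: DerivaML is now assembled from cooperating mixins rather than inheriting from the old Dataset base class. Python resolves attribute lookups left to right along the method resolution order, so each mixin contributes one slice of the API while sharing the state initialized in DerivaML.__init__. A self-contained sketch of the pattern with invented names (not deriva-ml's actual classes):

    # Illustrative mixin composition; class and attribute names are invented.
    class VocabularyOps:
        def term_curie(self, table: str, term: str) -> str:
            # Relies on state the host class is expected to initialize.
            return f"{self.ml_schema}:{table}/{term}"

    class PathOps:
        def table_csv(self, table: str) -> str:
            return f"{self.working_dir}/{table}.csv"

    class Client(PathOps, VocabularyOps):
        def __init__(self, ml_schema: str, working_dir: str) -> None:
            self.ml_schema = ml_schema
            self.working_dir = working_dir

    c = Client("deriva-ml", "/tmp/work")
    print(c.term_curie("Species", "mouse"))      # deriva-ml:Species/mouse
    print([k.__name__ for k in Client.__mro__])  # Client, PathOps, VocabularyOps, object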
@@ -105,26 +116,79 @@ class DerivaML(Dataset):
      >>> ml.add_term('vocabulary_table', 'new_term', description='Description of term')
      """

+     # Class-level type annotations for DerivaMLCatalog protocol compliance
+     ml_schema: str
+     domain_schemas: frozenset[str]
+     default_schema: str | None
+     model: DerivaModel
+     cache_dir: Path
+     working_dir: Path
+     catalog: ErmrestCatalog | ErmrestSnapshot
+     catalog_id: str | int
+
      @classmethod
      def instantiate(cls, config: DerivaMLConfig) -> Self:
+         """Create a DerivaML instance from a configuration object.
+
+         This method is the preferred way to instantiate DerivaML when using hydra-zen
+         for configuration management. It accepts a DerivaMLConfig (Pydantic model) and
+         unpacks it to create the instance.
+
+         This pattern allows hydra-zen's `instantiate()` to work with DerivaML:
+
+         Example with hydra-zen:
+             >>> from hydra_zen import builds, instantiate
+             >>> from deriva_ml import DerivaML
+             >>> from deriva_ml.core.config import DerivaMLConfig
+             >>>
+             >>> # Create a structured config using hydra-zen
+             >>> DerivaMLConf = builds(DerivaMLConfig, populate_full_signature=True)
+             >>>
+             >>> # Configure for your environment
+             >>> conf = DerivaMLConf(
+             ...     hostname='deriva.example.org',
+             ...     catalog_id='42',
+             ...     domain_schema='my_domain',
+             ... )
+             >>>
+             >>> # Instantiate the config to get a DerivaMLConfig object
+             >>> config = instantiate(conf)
+             >>>
+             >>> # Create the DerivaML instance
+             >>> ml = DerivaML.instantiate(config)
+
+         Args:
+             config: A DerivaMLConfig object containing all configuration parameters.
+
+         Returns:
+             A new DerivaML instance configured according to the config object.
+
+         Note:
+             The DerivaMLConfig class integrates with Hydra's configuration system
+             and registers custom resolvers for computing working directories.
+             See `deriva_ml.core.config` for details on configuration options.
+         """
          return cls(**config.model_dump())

      def __init__(
          self,
          hostname: str,
          catalog_id: str | int,
-         domain_schema: str | None = None,
+         domain_schemas: set[str] | None = None,
+         default_schema: str | None = None,
          project_name: str | None = None,
          cache_dir: str | Path | None = None,
          working_dir: str | Path | None = None,
          hydra_runtime_output_dir: str | Path | None = None,
          ml_schema: str = ML_SCHEMA,
-         logging_level=logging.WARNING,
-         deriva_logging_level=logging.WARNING,
-         credential=None,
-         use_minid: bool = True,
+         logging_level: int = logging.WARNING,
+         deriva_logging_level: int = logging.WARNING,
+         credential: dict | None = None,
+         s3_bucket: str | None = None,
+         use_minid: bool | None = None,
          check_auth: bool = True,
-     ):
+         clean_execution_dir: bool = True,
+     ) -> None:
          """Initializes a DerivaML instance.

          This method will connect to a catalog and initialize local configuration for the ML execution.
@@ -133,17 +197,28 @@ class DerivaML(Dataset):
          Args:
              hostname: Hostname of the Deriva server.
              catalog_id: Catalog ID. Either an identifier or a catalog name.
-             domain_schema: Schema name for domain-specific tables and relationships. Defaults to the name of the
-                 schema that is not one of the standard schemas. If there is more than one user-defined schema, then
-                 this argument must be provided a value.
+             domain_schemas: Optional set of domain schema names. If None, auto-detects all
+                 non-system schemas. Use this when working with catalogs that have multiple
+                 user-defined schemas.
+             default_schema: The default schema for table creation operations. If None and
+                 there is exactly one domain schema, that schema is used. If there are multiple
+                 domain schemas, this must be specified for table creation to work without
+                 explicit schema parameters.
              ml_schema: Schema name for ML schema. Used if you have a non-standard configuration of deriva-ml.
-             project_name: Project name. Defaults to name of domain schema.
+             project_name: Project name. Defaults to name of default_schema.
              cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag. If not provided,
                  will default to working_dir.
              working_dir: Directory path for storing data used by or generated by any computations. If no value is
                  provided, will default to ${HOME}/deriva_ml
-             use_minid: Use the MINID service when downloading dataset bags.
+             s3_bucket: S3 bucket URL for dataset bag storage (e.g., 's3://my-bucket'). If provided,
+                 enables MINID creation and S3 upload for dataset exports. If None, MINID functionality
+                 is disabled regardless of use_minid setting.
+             use_minid: Use the MINID service when downloading dataset bags. Only effective when
+                 s3_bucket is configured. If None (default), automatically set to True when s3_bucket
+                 is provided, False otherwise.
              check_auth: Check if the user has access to the catalog.
+             clean_execution_dir: Whether to automatically clean up execution working directories
+                 after successful upload. Defaults to True. Set to False to retain local copies.
          """
          # Get or use provided credentials for server access
          self.credential = credential or get_credential(hostname)
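Reviewer note: with the signature change above, a multi-schema catalog is opened roughly as in the following hedged sketch (hostname, catalog id, schema names, and bucket are placeholders; the semantics follow the Args documented above):

    # Placeholder values throughout; mirrors the documented argument semantics.
    ml = DerivaML(
        hostname="deriva.example.org",
        catalog_id="42",
        domain_schemas={"imaging", "clinical"},  # more than one user-defined schema
        default_schema="imaging",        # needed so create_table() can omit schema=
        s3_bucket="s3://my-ml-bags",     # with use_minid=None this resolves to True
    )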
@@ -164,32 +239,46 @@
                  "Please check your credentials and make sure you have logged in."
              )
          self.catalog = server.connect_ermrest(catalog_id)
-         self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
+         # Import here to avoid circular imports
+         from deriva_ml.model.catalog import DerivaModel
+         self.model = DerivaModel(
+             self.catalog.getCatalogModel(),
+             ml_schema=ml_schema,
+             domain_schemas=domain_schemas,
+             default_schema=default_schema,
+         )
+
+         # Store S3 bucket configuration and resolve use_minid
+         self.s3_bucket = s3_bucket
+         if use_minid is None:
+             # Auto mode: enable MINID if s3_bucket is configured
+             self.use_minid = s3_bucket is not None
+         elif use_minid and s3_bucket is None:
+             # User requested MINID but no S3 bucket configured - disable MINID
+             self.use_minid = False
+         else:
+             self.use_minid = use_minid

          # Set up working and cache directories
-         self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
+         self.working_dir = DerivaMLConfig.compute_workdir(working_dir, catalog_id)
          self.working_dir.mkdir(parents=True, exist_ok=True)
          self.hydra_runtime_output_dir = hydra_runtime_output_dir

          self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
          self.cache_dir.mkdir(parents=True, exist_ok=True)

-         # Initialize dataset functionality from the parent class
-         super().__init__(self.model, self.cache_dir, self.working_dir, use_minid=use_minid)
-
-         # Set up logging
-         self._logger = logging.getLogger("deriva_ml")
-         self._logger.setLevel(logging_level)
+         # Set up logging using centralized configuration
+         # This configures deriva_ml, Hydra, and deriva-py loggers without
+         # affecting the root logger or calling basicConfig()
+         self._logger = configure_logging(
+             level=logging_level,
+             deriva_level=deriva_logging_level,
+         )
          self._logging_level = logging_level
          self._deriva_logging_level = deriva_logging_level

-         # Configure deriva logging level
-         logger_config = DEFAULT_LOGGER_OVERRIDES
-         # allow for reconfiguration of module-specific logging levels
-         [logging.getLogger(name).setLevel(level) for name, level in logger_config.items()]
-         logging.getLogger("root").setLevel(deriva_logging_level)
-         logging.getLogger("bagit").setLevel(deriva_logging_level)
-         logging.getLogger("bdbag").setLevel(deriva_logging_level)
+         # Apply deriva's default logger overrides for fine-grained control
+         apply_logger_overrides(DEFAULT_LOGGER_OVERRIDES)

          # Store instance configuration
          self.host_name = hostname
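Reviewer note: the use_minid resolution above reduces to a small truth table. A standalone restatement of the same three-branch logic, extracted here purely for illustration:

    # Standalone restatement of the resolution logic shown in the hunk above.
    def resolve_use_minid(use_minid: bool | None, s3_bucket: str | None) -> bool:
        if use_minid is None:                 # auto mode: follow S3 configuration
            return s3_bucket is not None
        if use_minid and s3_bucket is None:   # requested, but no bucket to upload to
            return False
        return use_minid

    assert resolve_use_minid(None, "s3://bags") is True
    assert resolve_use_minid(None, None) is False
    assert resolve_use_minid(True, None) is False    # silently downgraded
    assert resolve_use_minid(False, "s3://bags") is False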
@@ -197,22 +286,14 @@
          self.ml_schema = ml_schema
          self.configuration = None
          self._execution: Execution | None = None
-         self.domain_schema = self.model.domain_schema
-         self.project_name = project_name or self.domain_schema
+         self.domain_schemas = self.model.domain_schemas
+         self.default_schema = self.model.default_schema
+         self.project_name = project_name or self.default_schema or "deriva-ml"
          self.start_time = datetime.now()
          self.status = Status.pending.value
+         self.clean_execution_dir = clean_execution_dir

-         # Configure logging format
-         logging.basicConfig(
-             level=logging_level,
-             format="%(asctime)s - %(name)s.%(levelname)s - %(message)s",
-         )
-
-         # Set Deriva library logging level
-         deriva_logger = logging.getLogger("deriva")
-         deriva_logger.setLevel(logging_level)
-
-     def __del__(self):
+     def __del__(self) -> None:
          """Cleanup method to handle incomplete executions."""
          try:
              # Mark execution as aborted if not completed
@@ -222,7 +303,7 @@
              pass

      @staticmethod
-     def _get_session_config():
+     def _get_session_config() -> dict:
          """Returns customized HTTP session configuration.

          Configures retry behavior and connection settings for HTTP requests to the Deriva server. Settings include:
@@ -254,58 +335,23 @@
          )
          return session_config

-     @property
-     def pathBuilder(self) -> SchemaWrapper:
-         """Returns catalog path builder for queries.
-
-         The path builder provides a fluent interface for constructing complex queries against the catalog.
-         This is a core component used by many other methods to interact with the catalog.
+     def is_snapshot(self) -> bool:
+         return hasattr(self.catalog, "_snaptime")

-         Returns:
-             datapath._CatalogWrapper: A new instance of the catalog path builder.
-
-         Example:
-             >>> path = ml.pathBuilder.schemas['my_schema'].tables['my_table']
-             >>> results = path.entities().fetch()
-         """
-         return self.catalog.getPathBuilder()
+     def catalog_snapshot(self, version_snapshot: str) -> Self:
+         """Returns a DerivaML instance for a specific snapshot of the catalog."""
+         return DerivaML(
+             self.host_name,
+             version_snapshot,
+             logging_level=self._logging_level,
+             deriva_logging_level=self._deriva_logging_level,
+         )

      @property
-     def domain_path(self) -> datapath.DataPath:
-         """Returns path builder for domain schema.
+     def _dataset_table(self) -> Table:
+         return self.model.schemas[self.model.ml_schema].tables["Dataset"]

-         Provides a convenient way to access tables and construct queries within the domain-specific schema.
-
-         Returns:
-             datapath._CatalogWrapper: Path builder object scoped to the domain schema.
-
-         Example:
-             >>> domain = ml.domain_path
-             >>> results = domain.my_table.entities().fetch()
-         """
-         return self.pathBuilder.schemas[self.domain_schema]
-
-     def table_path(self, table: str | Table) -> Path:
-         """Returns a local filesystem path for table CSV files.
-
-         Generates a standardized path where CSV files should be placed when preparing to upload data to a table.
-         The path follows the project's directory structure conventions.
-
-         Args:
-             table: Name of the table or Table object to get the path for.
-
-         Returns:
-             Path: Filesystem path where the CSV file should be placed.
-
-         Example:
-             >>> path = ml.table_path("experiment_results")
-             >>> df.to_csv(path) # Save data for upload
-         """
-         return table_path(
-             self.working_dir,
-             schema=self.domain_schema,
-             table=self.model.name_to_table(table).name,
-         )
+     # pathBuilder, domain_path, table_path moved to PathBuilderMixin

      def download_dir(self, cached: bool = False) -> Path:
          """Returns the appropriate download directory.
@@ -384,27 +430,37 @@
          uri = self.cite(cast(str, table))
          return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"

-     def cite(self, entity: Dict[str, Any] | str) -> str:
-         """Generates permanent citation URL.
+     def cite(self, entity: Dict[str, Any] | str, current: bool = False) -> str:
+         """Generates citation URL for an entity.

-         Creates a versioned URL that can be used to reference a specific entity in the catalog. The URL includes
-         the catalog snapshot time to ensure version stability.
+         Creates a URL that can be used to reference a specific entity in the catalog.
+         By default, includes the catalog snapshot time to ensure version stability
+         (permanent citation). With current=True, returns a URL to the current state.

          Args:
              entity: Either a RID string or a dictionary containing entity data with a 'RID' key.
+             current: If True, return URL to current catalog state (no snapshot).
+                 If False (default), return permanent citation URL with snapshot time.

          Returns:
-             str: Permanent citation URL in format: https://{host}/id/{catalog}/{rid}@{snapshot_time}
+             str: Citation URL. Format depends on `current` parameter:
+                 - current=False: https://{host}/id/{catalog}/{rid}@{snapshot_time}
+                 - current=True: https://{host}/id/{catalog}/{rid}

          Raises:
              DerivaMLException: If an entity doesn't exist or lacks a RID.

          Examples:
-             Using a RID string:
+             Permanent citation (default):
              >>> url = ml.cite("1-abc123")
              >>> print(url)
              'https://deriva.org/id/1/1-abc123@2024-01-01T12:00:00'

+             Current catalog URL:
+             >>> url = ml.cite("1-abc123", current=True)
+             >>> print(url)
+             'https://deriva.org/id/1/1-abc123'
+
              Using a dictionary:
              >>> url = ml.cite({"RID": "1-abc123"})
          """
@@ -413,14 +469,44 @@
              return entity

          try:
-             # Resolve RID and create citation URL with snapshot time
+             # Resolve RID and create citation URL
              self.resolve_rid(rid := entity if isinstance(entity, str) else entity["RID"])
-             return f"https://{self.host_name}/id/{self.catalog_id}/{rid}@{self.catalog.latest_snapshot().snaptime}"
+             base_url = f"https://{self.host_name}/id/{self.catalog_id}/{rid}"
+             if current:
+                 return base_url
+             return f"{base_url}@{self.catalog.latest_snapshot().snaptime}"
          except KeyError as e:
              raise DerivaMLException(f"Entity {e} does not have RID column")
          except DerivaMLException as _e:
              raise DerivaMLException("Entity RID does not exist")

+     @property
+     def catalog_provenance(self) -> "CatalogProvenance | None":
+         """Get the provenance information for this catalog.
+
+         Returns provenance information if the catalog has it set. This includes
+         information about how the catalog was created (clone, create, schema),
+         who created it, when, and any workflow information.
+
+         For cloned catalogs, additional details about the clone operation are
+         available in the `clone_details` attribute.
+
+         Returns:
+             CatalogProvenance if available, None otherwise.
+
+         Example:
+             >>> ml = DerivaML('localhost', '45')
+             >>> prov = ml.catalog_provenance
+             >>> if prov:
+             ...     print(f"Created: {prov.created_at} by {prov.created_by}")
+             ...     print(f"Method: {prov.creation_method.value}")
+             ...     if prov.is_clone:
+             ...         print(f"Cloned from: {prov.clone_details.source_hostname}")
+         """
+         from deriva_ml.catalog.clone import get_catalog_provenance
+
+         return get_catalog_provenance(self.catalog)
+
      def user_list(self) -> List[Dict[str, str]]:
          """Returns catalog user list.

@@ -439,59 +525,247 @@
              ... print(f"{user['Full_Name']} ({user['ID']})")
          """
          # Get the user table path and fetch basic user info
-         user_path = self.pathBuilder.public.ERMrest_Client.path
+         user_path = self.pathBuilder().public.ERMrest_Client.path
          return [{"ID": u["ID"], "Full_Name": u["Full_Name"]} for u in user_path.entities().fetch()]

-     def resolve_rid(self, rid: RID) -> ResolveRidResult:
-         """Resolves RID to catalog location.
-
-         Looks up a RID and returns information about where it exists in the catalog, including schema,
-         table, and column metadata.
+     # resolve_rid, retrieve_rid moved to RidResolutionMixin

-         Args:
-             rid: Resource Identifier to resolve.
-
-         Returns:
-             ResolveRidResult: Named tuple containing:
-                 - schema: Schema name
-                 - table: Table name
-                 - columns: Column definitions
-                 - datapath: Path builder for accessing the entity
-
-         Raises:
-             DerivaMLException: If RID doesn't exist in catalog.
-
-         Examples:
-             >>> result = ml.resolve_rid("1-abc123")
-             >>> print(f"Found in {result.schema}.{result.table}")
-             >>> data = result.datapath.entities().fetch()
-         """
-         try:
-             # Attempt to resolve RID using catalog model
-             return self.catalog.resolve_rid(rid, self.model.model)
-         except KeyError as _e:
-             raise DerivaMLException(f"Invalid RID {rid}")
-
-     def retrieve_rid(self, rid: RID) -> dict[str, Any]:
-         """Retrieves complete record for RID.
-
-         Fetches all column values for the entity identified by the RID.
+     def apply_catalog_annotations(
+         self,
+         navbar_brand_text: str = "ML Data Browser",
+         head_title: str = "Catalog ML",
+     ) -> None:
+         """Apply catalog-level annotations including the navigation bar and display settings.
+
+         This method configures the Chaise web interface for the catalog. Chaise is Deriva's
+         web-based data browser that provides a user-friendly interface for exploring and
+         managing catalog data. This method sets up annotations that control how Chaise
+         displays and organizes the catalog.
+
+         **Navigation Bar Structure**:
+         The method creates a navigation bar with the following menus:
+         - **User Info**: Links to Users, Groups, and RID Lease tables
+         - **Deriva-ML**: Core ML tables (Workflow, Execution, Dataset, Dataset_Version, etc.)
+         - **WWW**: Web content tables (Page, File)
+         - **{Domain Schema}**: All domain-specific tables (excludes vocabularies and associations)
+         - **Vocabulary**: All controlled vocabulary tables from both ML and domain schemas
+         - **Assets**: All asset tables from both ML and domain schemas
+         - **Features**: All feature tables with entries named "TableName:FeatureName"
+         - **Catalog Registry**: Link to the ermrest registry
+         - **Documentation**: Links to ML notebook instructions and Deriva-ML docs
+
+         **Display Settings**:
+         - Underscores in table/column names displayed as spaces
+         - System columns (RID) shown in compact and entry views
+         - Default table set to Dataset
+         - Faceting and record deletion enabled
+         - Export configurations available to all users
+
+         **Bulk Upload Configuration**:
+         Configures upload patterns for asset tables, enabling drag-and-drop file uploads
+         through the Chaise interface.
+
+         Call this after creating the domain schema and all tables to initialize the catalog's
+         web interface. The navigation menus are dynamically built based on the current schema
+         structure, automatically organizing tables into appropriate categories.

          Args:
-             rid: Resource Identifier of the record to retrieve.
-
-         Returns:
-             dict[str, Any]: Dictionary containing all column values for the entity.
-
-         Raises:
-             DerivaMLException: If the RID doesn't exist in the catalog.
+             navbar_brand_text: Text displayed in the navigation bar brand area.
+             head_title: Title displayed in the browser tab.

          Example:
-             >>> record = ml.retrieve_rid("1-abc123")
-             >>> print(f"Name: {record['name']}, Created: {record['creation_date']}")
+             >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+             >>> # After creating domain schema and tables...
+             >>> ml.apply_catalog_annotations()
+             >>> # Or with custom branding:
+             >>> ml.apply_catalog_annotations("My Project Browser", "My ML Project")
          """
-         # Resolve RID and fetch the first (only) matching record
-         return self.resolve_rid(rid).datapath.entities().fetch()[0]
+         catalog_id = self.model.catalog.catalog_id
+         ml_schema = self.ml_schema
+
+         # Build domain schema menu items (one menu per domain schema)
+         domain_schema_menus = []
+         for domain_schema in sorted(self.domain_schemas):
+             if domain_schema not in self.model.schemas:
+                 continue
+             domain_schema_menus.append({
+                 "name": domain_schema,
+                 "children": [
+                     {
+                         "name": tname,
+                         "url": f"/chaise/recordset/#{catalog_id}/{domain_schema}:{tname}",
+                     }
+                     for tname in self.model.schemas[domain_schema].tables
+                     # Don't include controlled vocabularies, association tables, or feature tables.
+                     if not (
+                         self.model.is_vocabulary(tname)
+                         or self.model.is_association(tname, pure=False, max_arity=3)
+                     )
+                 ],
+             })
+
+         # Build vocabulary menu items (ML schema + all domain schemas)
+         vocab_children = [{"name": f"{ml_schema} Vocabularies", "header": True}]
+         vocab_children.extend([
+             {
+                 "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
+                 "name": tname,
+             }
+             for tname in self.model.schemas[ml_schema].tables
+             if self.model.is_vocabulary(tname)
+         ])
+         for domain_schema in sorted(self.domain_schemas):
+             if domain_schema not in self.model.schemas:
+                 continue
+             vocab_children.append({"name": f"{domain_schema} Vocabularies", "header": True})
+             vocab_children.extend([
+                 {
+                     "url": f"/chaise/recordset/#{catalog_id}/{domain_schema}:{tname}",
+                     "name": tname,
+                 }
+                 for tname in self.model.schemas[domain_schema].tables
+                 if self.model.is_vocabulary(tname)
+             ])
+
+         # Build asset menu items (ML schema + all domain schemas)
+         asset_children = [
+             {
+                 "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
+                 "name": tname,
+             }
+             for tname in self.model.schemas[ml_schema].tables
+             if self.model.is_asset(tname)
+         ]
+         for domain_schema in sorted(self.domain_schemas):
+             if domain_schema not in self.model.schemas:
+                 continue
+             asset_children.extend([
+                 {
+                     "url": f"/chaise/recordset/#{catalog_id}/{domain_schema}:{tname}",
+                     "name": tname,
+                 }
+                 for tname in self.model.schemas[domain_schema].tables
+                 if self.model.is_asset(tname)
+             ])
+
+         catalog_annotation = {
+             deriva_tags.display: {"name_style": {"underline_space": True}},
+             deriva_tags.chaise_config: {
+                 "headTitle": head_title,
+                 "navbarBrandText": navbar_brand_text,
+                 "systemColumnsDisplayEntry": ["RID"],
+                 "systemColumnsDisplayCompact": ["RID"],
+                 "defaultTable": {"table": "Dataset", "schema": "deriva-ml"},
+                 "deleteRecord": True,
+                 "showFaceting": True,
+                 "shareCiteAcls": True,
+                 "exportConfigsSubmenu": {"acls": {"show": ["*"], "enable": ["*"]}},
+                 "resolverImplicitCatalog": False,
+                 "navbarMenu": {
+                     "newTab": False,
+                     "children": [
+                         {
+                             "name": "User Info",
+                             "children": [
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_Client",
+                                     "name": "Users",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_Group",
+                                     "name": "Groups",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_RID_Lease",
+                                     "name": "ERMrest RID Lease",
+                                 },
+                             ],
+                         },
+                         { # All the primary tables in deriva-ml schema.
+                             "name": "Deriva-ML",
+                             "children": [
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Workflow",
+                                     "name": "Workflow",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution",
+                                     "name": "Execution",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution_Metadata",
+                                     "name": "Execution Metadata",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution_Asset",
+                                     "name": "Execution Asset",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Dataset",
+                                     "name": "Dataset",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Dataset_Version",
+                                     "name": "Dataset Version",
+                                 },
+                             ],
+                         },
+                         { # WWW schema tables.
+                             "name": "WWW",
+                             "children": [
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/WWW:Page",
+                                     "name": "Page",
+                                 },
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/WWW:File",
+                                     "name": "File",
+                                 },
+                             ],
+                         },
+                         *domain_schema_menus, # One menu per domain schema
+                         { # Vocabulary menu with all controlled vocabularies.
+                             "name": "Vocabulary",
+                             "children": vocab_children,
+                         },
+                         { # List of all asset tables.
+                             "name": "Assets",
+                             "children": asset_children,
+                         },
+                         { # List of all feature tables in the catalog.
+                             "name": "Features",
+                             "children": [
+                                 {
+                                     "url": f"/chaise/recordset/#{catalog_id}/{f.feature_table.schema.name}:{f.feature_table.name}",
+                                     "name": f"{f.target_table.name}:{f.feature_name}",
+                                 }
+                                 for f in self.model.find_features()
+                             ],
+                         },
+                         {
+                             "url": "/chaise/recordset/#0/ermrest:registry@sort(RID)",
+                             "name": "Catalog Registry",
+                         },
+                         {
+                             "name": "Documentation",
+                             "children": [
+                                 {
+                                     "url": "https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/ml_workflow_instruction.md",
+                                     "name": "ML Notebook Instruction",
+                                 },
+                                 {
+                                     "url": "https://informatics-isi-edu.github.io/deriva-ml/",
+                                     "name": "Deriva-ML Documentation",
+                                 },
+                             ],
+                         },
+                     ],
+                 },
+             },
+             deriva_tags.bulk_upload: bulk_upload_configuration(model=self.model),
+         }
+         self.model.annotations.update(catalog_annotation)
+         self.model.apply()

      def add_page(self, title: str, content: str) -> None:
          """Adds page to web interface.
@@ -513,9 +787,15 @@
              ... )
          """
          # Insert page into www tables with title and content
-         self.pathBuilder.www.tables[self.domain_schema].insert([{"Title": title, "Content": content}])
-
-     def create_vocabulary(self, vocab_name: str, comment: str = "", schema: str | None = None) -> Table:
+         # Use default schema or first domain schema for www tables
+         schema = self.default_schema or (sorted(self.domain_schemas)[0] if self.domain_schemas else None)
+         if schema is None:
+             raise DerivaMLException("No domain schema available for adding pages")
+         self.pathBuilder().www.tables[schema].insert([{"Title": title, "Content": content}])
+
+     def create_vocabulary(
+         self, vocab_name: str, comment: str = "", schema: str | None = None, update_navbar: bool = True
+     ) -> Table:
          """Creates a controlled vocabulary table.

          A controlled vocabulary table maintains a list of standardized terms and their definitions. Each term can have
@@ -525,6 +805,9 @@
              vocab_name: Name for the new vocabulary table. Must be a valid SQL identifier.
              comment: Description of the vocabulary's purpose and usage. Defaults to empty string.
              schema: Schema name to create the table in. If None, uses domain_schema.
+             update_navbar: If True (default), automatically updates the navigation bar to include
+                 the new vocabulary table. Set to False during batch table creation to avoid
+                 redundant updates, then call apply_catalog_annotations() once at the end.

          Returns:
              Table: ERMRest table object representing the newly created vocabulary table.
@@ -540,988 +823,483 @@ class DerivaML(Dataset):
540
823
  ... comment="Standard tissue classifications",
541
824
  ... schema="bio_schema"
542
825
  ... )
826
+
827
+ Create multiple vocabularies without updating navbar until the end:
828
+
829
+ >>> ml.create_vocabulary("Species", update_navbar=False)
830
+ >>> ml.create_vocabulary("Tissue_Type", update_navbar=False)
831
+ >>> ml.apply_catalog_annotations() # Update navbar once
543
832
  """
544
- # Use domain schema if none specified
545
- schema = schema or self.domain_schema
833
+ # Use default schema if none specified
834
+ schema = schema or self.model._require_default_schema()
546
835
 
547
836
  # Create and return vocabulary table with RID-based URI pattern
548
837
  try:
549
838
  vocab_table = self.model.schemas[schema].create_table(
550
- Table.define_vocabulary(vocab_name, f"{self.project_name}:{{RID}}", comment=comment)
839
+ VocabularyTableDef(
840
+ name=vocab_name,
841
+ curie_template=f"{self.project_name}:{{RID}}",
842
+ comment=comment,
843
+ )
551
844
  )
552
845
  except ValueError:
553
846
  raise DerivaMLException(f"Table {vocab_name} already exist")
554
- return vocab_table
555
-
556
- def create_table(self, table: TableDefinition) -> Table:
557
- """Creates a new table in the catalog.
558
-
559
- Creates a table using the provided TableDefinition object, which specifies the table structure including
560
- columns, keys, and foreign key relationships.
561
-
562
- Args:
563
- table: A TableDefinition object containing the complete specification of the table to create.
564
-
565
- Returns:
566
- Table: The newly created ERMRest table object.
567
-
568
- Raises:
569
- DerivaMLException: If table creation fails or the definition is invalid.
570
-
571
- Example:
572
-
573
- >>> table_def = TableDefinition(
574
- ... name="experiments",
575
- ... column_definitions=[
576
- ... ColumnDefinition(name="name", type=BuiltinTypes.text),
577
- ... ColumnDefinition(name="date", type=BuiltinTypes.date)
578
- ... ]
579
- ... )
580
- >>> new_table = ml.create_table(table_def)
581
- """
582
- # Create table in domain schema using provided definition
583
- return self.model.schemas[self.domain_schema].create_table(table.model_dump())
584
-
585
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
586
- def create_asset(
587
- self,
588
- asset_name: str,
589
- column_defs: Iterable[ColumnDefinition] | None = None,
590
- fkey_defs: Iterable[ColumnDefinition] | None = None,
591
- referenced_tables: Iterable[Table] | None = None,
592
- comment: str = "",
593
- schema: str | None = None,
594
- ) -> Table:
595
- """Creates an asset table.
596
-
597
- Args:
598
- asset_name: Name of the asset table.
599
- column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
600
- fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
601
- referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
602
- comment: Description of the asset table. (Default value = '')
603
- schema: Schema in which to create the asset table. Defaults to domain_schema.
604
-
605
- Returns:
606
- Table object for the asset table.
607
- """
608
- # Initialize empty collections if None provided
609
- column_defs = column_defs or []
610
- fkey_defs = fkey_defs or []
611
- referenced_tables = referenced_tables or []
612
- schema = schema or self.domain_schema
613
-
614
- # Add an asset type to vocabulary
615
- self.add_term(MLVocab.asset_type, asset_name, description=f"A {asset_name} asset")
616
-
617
- # Create the main asset table
618
- asset_table = self.model.schemas[schema].create_table(
619
- Table.define_asset(
620
- schema,
621
- asset_name,
622
- column_defs=[c.model_dump() for c in column_defs],
623
- fkey_defs=[fk.model_dump() for fk in fkey_defs],
624
- comment=comment,
625
- )
626
- )
627
847
 
628
- # Create an association table between asset and asset type
629
- self.model.schemas[self.domain_schema].create_table(
630
- Table.define_association(
631
- [
632
- (asset_table.name, asset_table),
633
- ("Asset_Type", self.model.name_to_table("Asset_Type")),
634
- ]
635
- )
636
- )
848
+ # Update navbar to include the new vocabulary table
849
+ if update_navbar:
850
+ self.apply_catalog_annotations()
637
851
 
638
- # Create references to other tables if specified
639
- for t in referenced_tables:
640
- asset_table.create_reference(self.model.name_to_table(t))
641
-
642
- # Create an association table for tracking execution
643
- atable = self.model.schemas[self.domain_schema].create_table(
644
- Table.define_association(
645
- [
646
- (asset_name, asset_table),
647
- (
648
- "Execution",
649
- self.model.schemas[self.ml_schema].tables["Execution"],
650
- ),
651
- ]
652
- )
653
- )
654
- atable.create_reference(self.model.name_to_table("Asset_Role"))
655
-
656
- # Add asset annotations
657
- asset_annotation(asset_table)
658
- return asset_table
852
+ return vocab_table
659
853
 
660
- def list_assets(self, asset_table: Table | str) -> list[dict[str, Any]]:
661
- """Lists contents of an asset table.
854
+ def create_table(self, table: TableDefinition, schema: str | None = None, update_navbar: bool = True) -> Table:
855
+ """Creates a new table in the domain schema.
662
856
 
663
- Returns a list of assets with their types for the specified asset table.
857
+ Creates a table using the provided TableDefinition object, which specifies the table structure
858
+ including columns, keys, and foreign key relationships. The table is created in the domain
859
+ schema associated with this DerivaML instance.
664
860
 
665
- Args:
666
- asset_table: Table or name of the asset table to list assets for.
667
-
668
- Returns:
669
- list[dict[str, Any]]: List of asset records, each containing:
670
- - RID: Resource identifier
671
- - Type: Asset type
672
- - Metadata: Asset metadata
861
+ **Required Classes**:
862
+ Import the following classes from deriva_ml to define tables:
673
863
 
674
- Raises:
675
- DerivaMLException: If the table is not an asset table or doesn't exist.
676
-
677
- Example:
678
- >>> assets = ml.list_assets("tissue_types")
679
- >>> for asset in assets:
680
- ... print(f"{asset['RID']}: {asset['Type']}")
681
- """
682
- # Validate and get asset table reference
683
- asset_table = self.model.name_to_table(asset_table)
684
- if not self.model.is_asset(asset_table):
685
- raise DerivaMLException(f"Table {asset_table.name} is not an asset")
686
-
687
- # Get path builders for asset and type tables
688
- pb = self._model.catalog.getPathBuilder()
689
- asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
690
- (
691
- asset_type_table,
692
- _,
693
- _,
694
- ) = self._model.find_association(asset_table, MLVocab.asset_type)
695
- type_path = pb.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
696
-
697
- # Build a list of assets with their types
698
- assets = []
699
- for asset in asset_path.entities().fetch():
700
- # Get associated asset types for each asset
701
- asset_types = (
702
- type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
703
- .attributes(type_path.Asset_Type)
704
- .fetch()
705
- )
706
- # Combine asset data with its types
707
- assets.append(
708
- asset | {MLVocab.asset_type.value: [asset_type[MLVocab.asset_type.value] for asset_type in asset_types]}
709
- )
710
- return assets
864
+ - ``TableDefinition``: Defines the complete table structure
865
+ - ``ColumnDefinition``: Defines individual columns with types and constraints
866
+ - ``KeyDefinition``: Defines unique key constraints (optional)
867
+ - ``ForeignKeyDefinition``: Defines foreign key relationships to other tables (optional)
868
+ - ``BuiltinTypes``: Enum of available column data types
711
869
 
712
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
713
- def create_feature(
714
- self,
715
- target_table: Table | str,
716
- feature_name: str,
717
- terms: list[Table | str] | None = None,
718
- assets: list[Table | str] | None = None,
719
- metadata: list[ColumnDefinition | Table | Key | str] | None = None,
720
- optional: list[str] | None = None,
721
- comment: str = "",
722
- ) -> type[FeatureRecord]:
723
- """Creates a new feature definition.
724
-
725
- A feature represents a measurable property or characteristic that can be associated with records in the target
726
- table. Features can include vocabulary terms, asset references, and additional metadata.
870
+ **Available Column Types** (BuiltinTypes enum):
871
+ ``text``, ``int2``, ``int4``, ``int8``, ``float4``, ``float8``, ``boolean``,
872
+ ``date``, ``timestamp``, ``timestamptz``, ``json``, ``jsonb``, ``markdown``,
873
+ ``ermrest_uri``, ``ermrest_rid``, ``ermrest_rcb``, ``ermrest_rmb``,
874
+ ``ermrest_rct``, ``ermrest_rmt``
727
875
 
728
876
  Args:
729
- target_table: Table to associate the feature with (name or Table object).
730
- feature_name: Unique name for the feature within the target table.
731
- terms: Optional vocabulary tables/names whose terms can be used as feature values.
732
- assets: Optional asset tables/names that can be referenced by this feature.
733
- metadata: Optional columns, tables, or keys to include in a feature definition.
734
- optional: Column names that are not required when creating feature instances.
735
- comment: Description of the feature's purpose and usage.
877
+ table: A TableDefinition object containing the complete specification of the table to create.
878
+ update_navbar: If True (default), automatically updates the navigation bar to include
879
+ the new table. Set to False during batch table creation to avoid redundant updates,
880
+ then call apply_catalog_annotations() once at the end.
736
881
 
737
882
  Returns:
738
- type[FeatureRecord]: Feature class for creating validated instances.
883
+ Table: The newly created ERMRest table object.
739
884
 
740
885
  Raises:
741
- DerivaMLException: If a feature definition is invalid or conflicts with existing features.
886
+ DerivaMLException: If table creation fails or the definition is invalid.
742
887
 
743
888
  Examples:
744
- Create a feature with confidence score:
745
- >>> feature_class = ml.create_feature(
746
- ... target_table="samples",
747
- ... feature_name="expression_level",
748
- ... terms=["expression_values"],
749
- ... metadata=[ColumnDefinition(name="confidence", type=BuiltinTypes.float4)],
750
- ... comment="Gene expression measurement"
889
+ **Simple table with basic columns**:
890
+
891
+ >>> from deriva_ml import TableDefinition, ColumnDefinition, BuiltinTypes
892
+ >>>
893
+ >>> table_def = TableDefinition(
894
+ ... name="Experiment",
895
+ ... column_defs=[
896
+ ... ColumnDefinition(name="Name", type=BuiltinTypes.text, nullok=False),
897
+ ... ColumnDefinition(name="Date", type=BuiltinTypes.date),
898
+ ... ColumnDefinition(name="Description", type=BuiltinTypes.markdown),
899
+ ... ColumnDefinition(name="Score", type=BuiltinTypes.float4),
900
+ ... ],
901
+ ... comment="Records of experimental runs"
751
902
  ... )
752
- """
753
- # Initialize empty collections if None provided
754
- terms = terms or []
755
- assets = assets or []
756
- metadata = metadata or []
757
- optional = optional or []
758
-
759
- def normalize_metadata(m: Key | Table | ColumnDefinition | str):
760
- """Helper function to normalize metadata references."""
761
- if isinstance(m, str):
762
- return self.model.name_to_table(m)
763
- elif isinstance(m, ColumnDefinition):
764
- return m.model_dump()
765
- else:
766
- return m
767
-
768
- # Validate asset and term tables
769
- if not all(map(self.model.is_asset, assets)):
770
- raise DerivaMLException("Invalid create_feature asset table.")
771
- if not all(map(self.model.is_vocabulary, terms)):
772
- raise DerivaMLException("Invalid create_feature asset table.")
773
-
774
- # Get references to required tables
775
- target_table = self.model.name_to_table(target_table)
776
- execution = self.model.schemas[self.ml_schema].tables["Execution"]
777
- feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]
778
-
779
- # Add feature name to vocabulary
780
- feature_name_term = self.add_term("Feature_Name", feature_name, description=comment)
781
- atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
782
- # Create an association table implementing the feature
783
- atable = self.model.schemas[self.domain_schema].create_table(
784
- target_table.define_association(
785
- table_name=atable_name,
786
- associates=[execution, target_table, feature_name_table],
787
- metadata=[normalize_metadata(m) for m in chain(assets, terms, metadata)],
788
- comment=comment,
789
- )
790
- )
791
- # Configure optional columns and default feature name
792
- for c in optional:
793
- atable.columns[c].alter(nullok=True)
794
- atable.columns["Feature_Name"].alter(default=feature_name_term.name)
795
-
796
- # Return feature record class for creating instances
797
- return self.feature_record_class(target_table, feature_name)
903
+ >>> experiment_table = ml.create_table(table_def)
798
904
 
799
- def feature_record_class(self, table: str | Table, feature_name: str) -> type[FeatureRecord]:
800
- """Returns a pydantic model class for feature records.
905
+ **Table with foreign key to another table**:
801
906
 
802
- Creates a typed interface for creating new instances of the specified feature. The returned class includes
803
- validation and type checking based on the feature's definition.
804
-
805
- Args:
806
- table: The table containing the feature, either as name or Table object.
807
- feature_name: Name of the feature to create a record class for.
808
-
809
- Returns:
810
- type[FeatureRecord]: A pydantic model class for creating validated feature records.
811
-
812
- Raises:
813
- DerivaMLException: If the feature doesn't exist or the table is invalid.
814
-
815
- Example:
816
- >>> ExpressionFeature = ml.feature_record_class("samples", "expression_level")
817
- >>> feature = ExpressionFeature(value="high", confidence=0.95)
818
- """
819
- # Look up a feature and return its record class
820
- return self.lookup_feature(table, feature_name).feature_record_class()
821
-
822
- def delete_feature(self, table: Table | str, feature_name: str) -> bool:
823
- """Removes a feature definition and its data.
824
-
825
- Deletes the feature and its implementation table from the catalog. This operation cannot be undone and
826
- will remove all feature values associated with this feature.
907
+ >>> from deriva_ml import (
908
+ ... TableDefinition, ColumnDefinition, ForeignKeyDefinition, BuiltinTypes
909
+ ... )
910
+ >>>
911
+ >>> # Create a Sample table that references Subject
912
+ >>> sample_def = TableDefinition(
913
+ ... name="Sample",
914
+ ... column_defs=[
915
+ ... ColumnDefinition(name="Name", type=BuiltinTypes.text, nullok=False),
916
+ ... ColumnDefinition(name="Subject", type=BuiltinTypes.text, nullok=False),
917
+ ... ColumnDefinition(name="Collection_Date", type=BuiltinTypes.date),
918
+ ... ],
919
+ ... fkey_defs=[
920
+ ... ForeignKeyDefinition(
921
+ ... colnames=["Subject"],
922
+ ... pk_sname=ml.default_schema, # Schema of referenced table
923
+ ... pk_tname="Subject", # Name of referenced table
924
+ ... pk_colnames=["RID"], # Column(s) in referenced table
925
+ ... on_delete="CASCADE", # Delete samples when subject deleted
926
+ ... )
927
+ ... ],
928
+ ... comment="Biological samples collected from subjects"
929
+ ... )
930
+ >>> sample_table = ml.create_table(sample_def)
827
931
 
828
- Args:
829
- table: The table containing the feature, either as name or Table object.
830
- feature_name: Name of the feature to delete.
932
+ **Table with unique key constraint**:
831
933
 
832
- Returns:
833
- bool: True if the feature was successfully deleted, False if it didn't exist.
934
+ >>> from deriva_ml import (
935
+ ... TableDefinition, ColumnDefinition, KeyDefinition, BuiltinTypes
936
+ ... )
937
+ >>>
938
+ >>> protocol_def = TableDefinition(
939
+ ... name="Protocol",
940
+ ... column_defs=[
941
+ ... ColumnDefinition(name="Name", type=BuiltinTypes.text, nullok=False),
942
+ ... ColumnDefinition(name="Version", type=BuiltinTypes.text, nullok=False),
943
+ ... ColumnDefinition(name="Description", type=BuiltinTypes.markdown),
944
+ ... ],
945
+ ... key_defs=[
946
+ ... KeyDefinition(
947
+ ... colnames=["Name", "Version"],
948
+ ... constraint_names=[["myschema", "Protocol_Name_Version_key"]],
949
+ ... comment="Each protocol name+version must be unique"
950
+ ... )
951
+ ... ],
952
+ ... comment="Experimental protocols with versioning"
953
+ ... )
954
+ >>> protocol_table = ml.create_table(protocol_def)
834
955
 
835
- Raises:
836
- DerivaMLException: If deletion fails due to constraints or permissions.
956
+ **Batch creation without navbar updates**:
837
957
 
838
- Example:
839
- >>> success = ml.delete_feature("samples", "obsolete_feature")
840
- >>> print("Deleted" if success else "Not found")
958
+ >>> ml.create_table(table1_def, update_navbar=False)
959
+ >>> ml.create_table(table2_def, update_navbar=False)
960
+ >>> ml.create_table(table3_def, update_navbar=False)
961
+ >>> ml.apply_catalog_annotations() # Update navbar once at the end
841
962
  """
842
- # Get table reference and find feature
843
- table = self.model.name_to_table(table)
844
- try:
845
- # Find and delete the feature's implementation table
846
- feature = next(f for f in self.model.find_features(table) if f.feature_name == feature_name)
847
- feature.feature_table.drop()
848
- return True
849
- except StopIteration:
850
- return False
963
+ # Use default schema if none specified
964
+ schema = schema or self.model._require_default_schema()
851
965
 
852
- def lookup_feature(self, table: str | Table, feature_name: str) -> Feature:
853
- """Retrieves a Feature object.
966
+ # Create table in domain schema using provided definition
967
+ # Handle both TableDefinition (dataclass with to_dict) and plain dicts
968
+ table_dict = table.to_dict() if hasattr(table, 'to_dict') else table
969
+ new_table = self.model.schemas[schema].create_table(table_dict)
854
970
 
855
- Looks up and returns a Feature object that provides an interface to work with an existing feature
856
- definition in the catalog.
971
+ # Update navbar to include the new table
972
+ if update_navbar:
973
+ self.apply_catalog_annotations()
857
974
 
858
- Args:
859
- table: The table containing the feature, either as name or Table object.
860
- feature_name: Name of the feature to look up.
861
-
862
- Returns:
863
- Feature: An object representing the feature and its implementation.
975
+ return new_table
864
976
 
-         Raises:
-             DerivaMLException: If the feature doesn't exist in the specified table.
-
-         Example:
-             >>> feature = ml.lookup_feature("samples", "expression_level")
-             >>> print(feature.feature_name)
-             'expression_level'
-         """
-         return self.model.lookup_feature(table, feature_name)
+     # =========================================================================
+     # Cache and Directory Management
+     # =========================================================================

-     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-     def list_feature_values(self, table: Table | str, feature_name: str) -> datapath._ResultSet:
-         """Retrieves all values for a feature.
+     def clear_cache(self, older_than_days: int | None = None) -> dict[str, int]:
+         """Clear the dataset cache directory.

-         Returns all instances of the specified feature that have been created, including their associated
-         metadata and references.
+         Removes cached dataset bags from the cache directory. Can optionally filter
+         by age to only remove old cache entries.

          Args:
-             table: The table containing the feature, either as name or Table object.
-             feature_name: Name of the feature to retrieve values for.
+             older_than_days: If provided, only remove cache entries older than this
+                 many days. If None, removes all cache entries.

          Returns:
-             datapath._ResultSet: A result set containing all feature values and their metadata.
-
-         Raises:
-             DerivaMLException: If the feature doesn't exist or cannot be accessed.
+             dict with keys:
+             - 'files_removed': Number of files removed
+             - 'dirs_removed': Number of directories removed
+             - 'bytes_freed': Total bytes freed
+             - 'errors': Number of removal errors

          Example:
-             >>> values = ml.list_feature_values("samples", "expression_level")
-             >>> for value in values:
-             ...     print(f"Sample {value['RID']}: {value['value']}")
+             >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+             >>> # Clear all cache
+             >>> result = ml.clear_cache()
+             >>> print(f"Freed {result['bytes_freed'] / 1e6:.1f} MB")
+             >>>
+             >>> # Clear cache older than 7 days
+             >>> result = ml.clear_cache(older_than_days=7)
          """
-         # Get table and feature references
-         table = self.model.name_to_table(table)
-         feature = self.lookup_feature(table, feature_name)
-
-         # Build and execute query for feature values
-         pb = self.catalog.getPathBuilder()
-         return pb.schemas[feature.feature_table.schema.name].tables[feature.feature_table.name].entities().fetch()
-
-     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-     def add_term(
-         self,
-         table: str | Table,
-         term_name: str,
-         description: str,
-         synonyms: list[str] | None = None,
-         exists_ok: bool = True,
-     ) -> VocabularyTerm:
-         """Adds a term to a vocabulary table.
+         import shutil
+         import time

-         Creates a new standardized term with description and optional synonyms in a vocabulary table.
-         Can either create a new term or return an existing one if it already exists.
+         stats = {'files_removed': 0, 'dirs_removed': 0, 'bytes_freed': 0, 'errors': 0}

-         Args:
-             table: Vocabulary table to add term to (name or Table object).
-             term_name: Primary name of the term (must be unique within vocabulary).
-             description: Explanation of term's meaning and usage.
-             synonyms: Alternative names for the term.
-             exists_ok: If True, return the existing term if found. If False, raise error.
+         if not self.cache_dir.exists():
+             return stats

-         Returns:
-             VocabularyTerm: Object representing the created or existing term.
-
-         Raises:
-             DerivaMLException: If a term exists and exists_ok=False, or if the table is not a vocabulary table.
-
-         Examples:
-             Add a new tissue type:
-             >>> term = ml.add_term(
-             ...     table="tissue_types",
-             ...     term_name="epithelial",
-             ...     description="Epithelial tissue type",
-             ...     synonyms=["epithelium"]
-             ... )
-
-             Attempt to add an existing term:
-             >>> term = ml.add_term("tissue_types", "epithelial", "...", exists_ok=True)
-         """
-         # Initialize an empty synonyms list if None
-         synonyms = synonyms or []
-
-         # Get table reference and validate if it is a vocabulary table
-         table = self.model.name_to_table(table)
-         pb = self.catalog.getPathBuilder()
-         if not (self.model.is_vocabulary(table)):
-             raise DerivaMLTableTypeError("vocabulary", table.name)
-
-         # Get schema and table names for path building
-         schema_name = table.schema.name
-         table_name = table.name
+         cutoff_time = None
+         if older_than_days is not None:
+             cutoff_time = time.time() - (older_than_days * 24 * 60 * 60)

          try:
-             # Attempt to insert a new term
-             term_id = VocabularyTerm.model_validate(
-                 pb.schemas[schema_name]
-                 .tables[table_name]
-                 .insert(
-                     [
-                         {
-                             "Name": term_name,
-                             "Description": description,
-                             "Synonyms": synonyms,
-                         }
-                     ],
-                     defaults={"ID", "URI"},
-                 )[0]
-             )
-         except DataPathException:
-             # Term exists - look it up or raise an error
-             term_id = self.lookup_term(table, term_name)
-             if not exists_ok:
-                 raise DerivaMLInvalidTerm(table.name, term_name, msg="term already exists")
-         return term_id
-
-     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-     def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
-         """Finds a term in a vocabulary table.
-
-         Searches for a term in the specified vocabulary table, matching either the primary name
-         or any of its synonyms.
-
-         Args:
-             table: Vocabulary table to search in (name or Table object).
-             term_name: Name or synonym of the term to find.
+             for entry in self.cache_dir.iterdir():
+                 try:
+                     # Check age if filtering
+                     if cutoff_time is not None:
+                         entry_mtime = entry.stat().st_mtime
+                         if entry_mtime > cutoff_time:
+                             continue  # Skip recent entries
+
+                     # Calculate size before removal
+                     if entry.is_dir():
+                         entry_size = sum(f.stat().st_size for f in entry.rglob('*') if f.is_file())
+                         shutil.rmtree(entry)
+                         stats['dirs_removed'] += 1
+                     else:
+                         entry_size = entry.stat().st_size
+                         entry.unlink()
+                         stats['files_removed'] += 1
+
+                     stats['bytes_freed'] += entry_size
+                 except (OSError, PermissionError) as e:
+                     self._logger.warning(f"Failed to remove cache entry {entry}: {e}")
+                     stats['errors'] += 1
+
+         except OSError as e:
+             self._logger.error(f"Failed to iterate cache directory: {e}")
+             stats['errors'] += 1
+
+         return stats
+
+     def get_cache_size(self) -> dict[str, int | float]:
+         """Get the current size of the cache directory.

          Returns:
-             VocabularyTerm: The matching vocabulary term.
-
-         Raises:
-             DerivaMLVocabularyException: If the table is not a vocabulary table, or term is not found.
-
-         Examples:
-             Look up by primary name:
-             >>> term = ml.lookup_term("tissue_types", "epithelial")
-             >>> print(term.description)
+             dict with keys:
+             - 'total_bytes': Total size in bytes
+             - 'total_mb': Total size in megabytes
+             - 'file_count': Number of files
+             - 'dir_count': Number of directories

-             Look up by synonym:
-             >>> term = ml.lookup_term("tissue_types", "epithelium")
+         Example:
+             >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+             >>> size = ml.get_cache_size()
+             >>> print(f"Cache size: {size['total_mb']:.1f} MB ({size['file_count']} files)")
          """
-         # Get and validate vocabulary table reference
-         vocab_table = self.model.name_to_table(table)
-         if not self.model.is_vocabulary(vocab_table):
-             raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
+         stats = {'total_bytes': 0, 'total_mb': 0.0, 'file_count': 0, 'dir_count': 0}

-         # Get schema and table paths
-         schema_name, table_name = vocab_table.schema.name, vocab_table.name
-         schema_path = self.catalog.getPathBuilder().schemas[schema_name]
+         if not self.cache_dir.exists():
+             return stats

-         # Search for term by name or synonym
-         for term in schema_path.tables[table_name].entities().fetch():
-             if term_name == term["Name"] or (term["Synonyms"] and term_name in term["Synonyms"]):
-                 return VocabularyTerm.model_validate(term)
+         for entry in self.cache_dir.rglob('*'):
+             if entry.is_file():
+                 stats['total_bytes'] += entry.stat().st_size
+                 stats['file_count'] += 1
+             elif entry.is_dir():
+                 stats['dir_count'] += 1

-         # Term not found
-         raise DerivaMLInvalidTerm(table_name, term_name)
+         stats['total_mb'] = stats['total_bytes'] / (1024 * 1024)
+         return stats

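Taken together, get_cache_size and clear_cache are enough to express a size-based eviction policy. A minimal sketch, assuming a caller-chosen 500 MB budget and a seven-day grace period; neither number is a package default, and trim_cache itself is a hypothetical helper:

def trim_cache(ml, budget_mb: float = 500.0) -> None:
    # Hypothetical maintenance helper: only evict when over budget.
    if ml.get_cache_size()['total_mb'] <= budget_mb:
        return
    # Prefer dropping entries untouched for a week; fall back to a full clear.
    result = ml.clear_cache(older_than_days=7)
    if ml.get_cache_size()['total_mb'] > budget_mb:
        result = ml.clear_cache()
    print(f"Freed {result['bytes_freed'] / 1e6:.1f} MB ({result['errors']} errors)")
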
-     def list_vocabulary_terms(self, table: str | Table) -> list[VocabularyTerm]:
-         """Lists all terms in a vocabulary table.
+     def list_execution_dirs(self) -> list[dict[str, any]]:
+         """List execution working directories.

-         Retrieves all terms, their descriptions, and synonyms from a controlled vocabulary table.
-
-         Args:
-             table: Vocabulary table to list terms from (name or Table object).
+         Returns information about each execution directory in the working directory,
+         useful for identifying orphaned or incomplete execution outputs.

          Returns:
-             list[VocabularyTerm]: List of vocabulary terms with their metadata.
-
-         Raises:
-             DerivaMLException: If table doesn't exist or is not a vocabulary table.
+             List of dicts, each containing:
+             - 'execution_rid': The execution RID (directory name)
+             - 'path': Full path to the directory
+             - 'size_bytes': Total size in bytes
+             - 'size_mb': Total size in megabytes
+             - 'modified': Last modification time (datetime)
+             - 'file_count': Number of files

-         Examples:
-             >>> terms = ml.list_vocabulary_terms("tissue_types")
-             >>> for term in terms:
-             ...     print(f"{term.name}: {term.description}")
-             ...     if term.synonyms:
-             ...         print(f"  Synonyms: {', '.join(term.synonyms)}")
+         Example:
+             >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+             >>> dirs = ml.list_execution_dirs()
+             >>> for d in dirs:
+             ...     print(f"{d['execution_rid']}: {d['size_mb']:.1f} MB")
          """
-         # Get path builder and table reference
-         pb = self.catalog.getPathBuilder()
-         table = self.model.name_to_table(table.value if isinstance(table, MLVocab) else table)
-
-         # Validate table is a vocabulary table
-         if not (self.model.is_vocabulary(table)):
-             raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
-
-         # Fetch and convert all terms to VocabularyTerm objects
-         return [VocabularyTerm(**v) for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()]
-
-     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-     def download_dataset_bag(
-         self,
-         dataset: DatasetSpec,
-         execution_rid: RID | None = None,
-     ) -> DatasetBag:
-         """Downloads a dataset to the local filesystem and creates a MINID if needed.
+         from datetime import datetime
+         from deriva_ml.dataset.upload import upload_root

-         Downloads a dataset specified by DatasetSpec to the local filesystem. If the dataset doesn't have
-         a MINID (Minimal Viable Identifier), one will be created. The dataset can optionally be associated
-         with an execution record.
+         results = []
+         exec_root = upload_root(self.working_dir) / "execution"

-         Args:
-             dataset: Specification of the dataset to download, including version and materialization options.
-             execution_rid: Optional execution RID to associate the download with.
-
-         Returns:
-             DatasetBag: Object containing:
-                 - path: Local filesystem path to downloaded dataset
-                 - rid: Dataset's Resource Identifier
-                 - minid: Dataset's Minimal Viable Identifier
-
-         Examples:
-             Download with default options:
-             >>> spec = DatasetSpec(rid="1-abc123")
-             >>> bag = ml.download_dataset_bag(dataset=spec)
-             >>> print(f"Downloaded to {bag.path}")
-
-             Download with execution tracking:
-             >>> bag = ml.download_dataset_bag(
-             ...     dataset=DatasetSpec(rid="1-abc123", materialize=True),
-             ...     execution_rid="1-xyz789"
-             ... )
-         """
-         if not self._is_dataset_rid(dataset.rid):
-             raise DerivaMLTableTypeError("Dataset", dataset.rid)
-         return self._download_dataset_bag(
-             dataset=dataset,
-             execution_rid=execution_rid,
-             snapshot_catalog=DerivaML(
-                 self.host_name,
-                 self._version_snapshot(dataset),
-                 logging_level=self._logging_level,
-                 deriva_logging_level=self._deriva_logging_level,
-             ),
-         )
+         if not exec_root.exists():
+             return results

-     def _update_status(self, new_status: Status, status_detail: str, execution_rid: RID):
-         """Update the status of an execution in the catalog.
+         for entry in exec_root.iterdir():
+             if entry.is_dir():
+                 size_bytes = sum(f.stat().st_size for f in entry.rglob('*') if f.is_file())
+                 file_count = sum(1 for f in entry.rglob('*') if f.is_file())
+                 mtime = datetime.fromtimestamp(entry.stat().st_mtime)

-         Args:
-             new_status: New status.
-             status_detail: Details of the status.
-             execution_rid: Resource Identifier (RID) of the execution.
-             new_status: Status:
-             status_detail: str:
-             execution_rid: RID:
+                 results.append({
+                     'execution_rid': entry.name,
+                     'path': str(entry),
+                     'size_bytes': size_bytes,
+                     'size_mb': size_bytes / (1024 * 1024),
+                     'modified': mtime,
+                     'file_count': file_count,
+                 })

-         Returns:
+         return sorted(results, key=lambda x: x['modified'], reverse=True)

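Because the result is sorted newest-first, stale directories cluster at the tail of the list. A minimal audit sketch; the 30-day cutoff is an assumption for illustration, not a package default:

from datetime import datetime, timedelta

cutoff = datetime.now() - timedelta(days=30)  # assumed site policy
stale = [d for d in ml.list_execution_dirs() if d['modified'] < cutoff]
for d in stale:
    print(f"stale: {d['execution_rid']} ({d['size_mb']:.1f} MB, "
          f"last touched {d['modified']:%Y-%m-%d})")
print(f"Reclaimable: {sum(d['size_mb'] for d in stale):.1f} MB in {len(stale)} dirs")
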
-         """
-         self.status = new_status.value
-         self.pathBuilder.schemas[self.ml_schema].Execution.update(
-             [
-                 {
-                     "RID": execution_rid,
-                     "Status": self.status,
-                     "Status_Detail": status_detail,
-                 }
-             ]
-         )
-
-     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-     def add_files(
+     def clean_execution_dirs(
          self,
-         files: Iterable[FileSpec],
-         dataset_types: str | list[str] | None = None,
-         description: str = "",
-         execution_rid: RID | None = None,
-     ) -> RID:
-         """Adds files to the catalog with their metadata.
-
-         Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
-         specified file types. Optionally links files to an execution record.
-
-         Args:
-             files: File specifications containing MD5 checksum, length, and URL.
-             dataset_types: One or more dataset type terms from File_Type vocabulary.
-             description: Description of the files.
-             execution_rid: Optional execution RID to associate files with.
-
-         Returns:
-             RID: Resource of dataset that represents the newly added files.
-
-         Raises:
-             DerivaMLException: If file_types are invalid or execution_rid is not an execution record.
-
-         Examples:
-             Add a single file type:
-             >>> files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
-             >>> rids = ml.add_files(files, file_types="text")
-
-             Add multiple file types:
-             >>> rids = ml.add_files(
-             ...     files=[FileSpec(url="image.png", md5="def456", length=2000)],
-             ...     file_types=["image", "png"],
-             ...     execution_rid="1-xyz789"
-             ... )
-         """
-         if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
-             raise DerivaMLTableTypeError("Execution", execution_rid)
-
-         filespec_list = list(files)
-
-         # Get a list of all defined file types and their synonyms.
-         defined_types = set(
-             chain.from_iterable([[t.name] + t.synonyms for t in self.list_vocabulary_terms(MLVocab.asset_type)])
-         )
-
-         # Get a list of al of the file types used in the filespec_list
-         spec_types = set(chain.from_iterable(filespec.file_types for filespec in filespec_list))
-
-         # Now make sure that all of the file types and dataset_types in the spec list are defined.
-         if spec_types - defined_types:
-             raise DerivaMLInvalidTerm(MLVocab.asset_type.name, f"{spec_types - defined_types}")
-
-         # Normalize dataset_types, make sure FIle type is included.
-         if isinstance(dataset_types, list):
-             dataset_types = ["File"] + dataset_types if "File" not in dataset_types else dataset_types
-         else:
-             dataset_types = ["File", dataset_types] if dataset_types else ["File"]
-         for ds_type in dataset_types:
-             self.lookup_term(MLVocab.dataset_type, ds_type)
-
-         # Add files to the file table, and collect up the resulting entries by directory name.
-         pb = self._model.catalog.getPathBuilder()
-         file_records = list(
-             pb.schemas[self.ml_schema].tables["File"].insert([f.model_dump(by_alias=True) for f in filespec_list])
-         )
-
-         # Get the name of the association table between file_table and file_type and add file_type records
-         atable = self.model.find_association(MLTable.file, MLVocab.asset_type)[0].name
-         # Need to get a link between file record and file_types.
-         type_map = {
-             file_spec.md5: file_spec.file_types + ([] if "File" in file_spec.file_types else [])
-             for file_spec in filespec_list
-         }
-         file_type_records = [
-             {MLVocab.asset_type.value: file_type, "File": file_record["RID"]}
-             for file_record in file_records
-             for file_type in type_map[file_record["MD5"]]
-         ]
-         pb.schemas[self._ml_schema].tables[atable].insert(file_type_records)
-
-         if execution_rid:
-             # Get the name of the association table between file_table and execution.
-             pb.schemas[self._ml_schema].File_Execution.insert(
-                 [
-                     {"File": file_record["RID"], "Execution": execution_rid, "Asset_Role": "Output"}
-                     for file_record in file_records
-                 ]
-             )
+         older_than_days: int | None = None,
+         exclude_rids: list[str] | None = None,
+     ) -> dict[str, int]:
+         """Clean up execution working directories.

-         # Now create datasets to capture the original directory structure of the files.
-         dir_rid_map = defaultdict(list)
-         for e in file_records:
-             dir_rid_map[Path(urlsplit(e["URL"]).path).parent].append(e["RID"])
-
-         nested_datasets = []
-         path_length = 0
-         dataset = None
-         # Start with the longest path so we get subdirectories first.
-         for p, rids in sorted(dir_rid_map.items(), key=lambda kv: len(kv[0].parts), reverse=True):
-             dataset = self.create_dataset(
-                 dataset_types=dataset_types, execution_rid=execution_rid, description=description
-             )
-             members = rids
-             if len(p.parts) < path_length:
-                 # Going up one level in directory, so Create nested dataset
-                 members = nested_datasets + rids
-                 nested_datasets = []
-             self.add_dataset_members(dataset_rid=dataset, members=members, execution_rid=execution_rid)
-             nested_datasets.append(dataset)
-             path_length = len(p.parts)
-
-         return dataset
-
-     def list_files(self, file_types: list[str] | None = None) -> list[dict[str, Any]]:
-         """Lists files in the catalog with their metadata.
-
-         Returns a list of files with their metadata including URL, MD5 hash, length, description,
-         and associated file types. Files can be optionally filtered by type.
+         Removes execution output directories from the local working directory.
+         Use this to free up disk space from completed or orphaned executions.

          Args:
-             file_types: Filter results to only include these file types.
+             older_than_days: If provided, only remove directories older than this
+                 many days. If None, removes all execution directories (except excluded).
+             exclude_rids: List of execution RIDs to preserve (never remove).

          Returns:
-             list[dict[str, Any]]: List of file records, each containing:
-                 - RID: Resource identifier
-                 - URL: File location
-                 - MD5: File hash
-                 - Length: File size
-                 - Description: File description
-                 - File_Types: List of associated file types
-
-         Examples:
-             List all files:
-             >>> files = ml.list_files()
-             >>> for f in files:
-             ...     print(f"{f['RID']}: {f['URL']}")
-
-             Filter by file type:
-             >>> image_files = ml.list_files(["image", "png"])
-         """
-
-         asset_type_atable, file_fk, asset_type_fk = self.model.find_association("File", "Asset_Type")
-         ml_path = self.pathBuilder.schemas[self._ml_schema]
-         file = ml_path.File
-         asset_type = ml_path.tables[asset_type_atable.name]
-
-         path = file.path
-         path = path.link(asset_type.alias("AT"), on=file.RID == asset_type.columns[file_fk], join_type="left")
-         if file_types:
-             path = path.filter(asset_type.columns[asset_type_fk] == datapath.Any(*file_types))
-         path = path.attributes(
-             path.File.RID,
-             path.File.URL,
-             path.File.MD5,
-             path.File.Length,
-             path.File.Description,
-             path.AT.columns[asset_type_fk],
-         )
+             dict with keys:
+             - 'dirs_removed': Number of directories removed
+             - 'bytes_freed': Total bytes freed
+             - 'errors': Number of removal errors

-         file_map = {}
-         for f in path.fetch():
-             entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
-             if ft := f.get("Asset_Type"):  # assign-and-test in one go
-                 entry["File_Types"].append(ft)
-
-         # Now get rid of the File_Type key and return the result
-         return [(f, f.pop("Asset_Type"))[0] for f in file_map.values()]
-
-     def list_workflows(self) -> list[Workflow]:
-         """Lists all workflows in the catalog.
-
-         Retrieves all workflow definitions, including their names, URLs, types, versions,
-         and descriptions.
-
-         Returns:
-             list[Workflow]: List of workflow objects, each containing:
-                 - name: Workflow name
-                 - url: Source code URL
-                 - workflow_type: Type of workflow
-                 - version: Version identifier
-                 - description: Workflow description
-                 - rid: Resource identifier
-                 - checksum: Source code checksum
-
-         Examples:
-             >>> workflows = ml.list_workflows()
-             >>> for w in workflows:
-                     print(f"{w.name} (v{w.version}): {w.description}")
-                     print(f"  Source: {w.url}")
+         Example:
+             >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+             >>> # Clean all execution dirs older than 30 days
+             >>> result = ml.clean_execution_dirs(older_than_days=30)
+             >>> print(f"Freed {result['bytes_freed'] / 1e9:.2f} GB")
+             >>>
+             >>> # Clean all except specific executions
+             >>> result = ml.clean_execution_dirs(exclude_rids=['1-ABC', '1-DEF'])
          """
-         # Get a workflow table path and fetch all workflows
-         workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
-         return [
-             Workflow(
-                 name=w["Name"],
-                 url=w["URL"],
-                 workflow_type=w["Workflow_Type"],
-                 version=w["Version"],
-                 description=w["Description"],
-                 rid=w["RID"],
-                 checksum=w["Checksum"],
-             )
-             for w in workflow_path.entities().fetch()
-         ]
+         import shutil
+         import time
+         from deriva_ml.dataset.upload import upload_root

-     def add_workflow(self, workflow: Workflow) -> RID:
-         """Adds a workflow to the catalog.
+         stats = {'dirs_removed': 0, 'bytes_freed': 0, 'errors': 0}
+         exclude_rids = set(exclude_rids or [])

-         Registers a new workflow in the catalog or returns the RID of an existing workflow with the same
-         URL or checksum.
+         exec_root = upload_root(self.working_dir) / "execution"
+         if not exec_root.exists():
+             return stats

-         Each workflow represents a specific computational process or analysis pipeline.
+         cutoff_time = None
+         if older_than_days is not None:
+             cutoff_time = time.time() - (older_than_days * 24 * 60 * 60)

-         Args:
-             workflow: Workflow object containing name, URL, type, version, and description.
+         for entry in exec_root.iterdir():
+             if not entry.is_dir():
+                 continue

-         Returns:
-             RID: Resource Identifier of the added or existing workflow.
+             # Skip excluded RIDs
+             if entry.name in exclude_rids:
+                 continue

-         Raises:
-             DerivaMLException: If workflow insertion fails or required fields are missing.
+             try:
+                 # Check age if filtering
+                 if cutoff_time is not None:
+                     entry_mtime = entry.stat().st_mtime
+                     if entry_mtime > cutoff_time:
+                         continue

-         Examples:
-             >>> workflow = Workflow(
-             ...     name="Gene Analysis",
-             ...     url="https://github.com/org/repo/workflows/gene_analysis.py",
-             ...     workflow_type="python_script",
-             ...     version="1.0.0",
-             ...     description="Analyzes gene expression patterns"
-             ... )
-             >>> workflow_rid = ml.add_workflow(workflow)
-         """
-         # Check if a workflow already exists by URL
-         if workflow_rid := self.lookup_workflow(workflow.checksum or workflow.url):
-             return workflow_rid
+                 # Calculate size before removal
+                 entry_size = sum(f.stat().st_size for f in entry.rglob('*') if f.is_file())
+                 shutil.rmtree(entry)
+                 stats['dirs_removed'] += 1
+                 stats['bytes_freed'] += entry_size

-         # Get an ML schema path for the workflow table
-         ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
+             except (OSError, PermissionError) as e:
+                 self._logger.warning(f"Failed to remove execution dir {entry}: {e}")
+                 stats['errors'] += 1

-         try:
-             # Create a workflow record
-             workflow_record = {
-                 "URL": workflow.url,
-                 "Name": workflow.name,
-                 "Description": workflow.description,
-                 "Checksum": workflow.checksum,
-                 "Version": workflow.version,
-                 MLVocab.workflow_type: self.lookup_term(MLVocab.workflow_type, workflow.workflow_type).name,
-             }
-             # Insert a workflow and get its RID
-             workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
-         except Exception as e:
-             error = format_exception(e)
-             raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
-         return workflow_rid
+         return stats

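The exclude_rids parameter makes simple retention policies possible on top of clean_execution_dirs. A minimal sketch that keeps the five most recent executions and removes anything older than two weeks; both numbers are illustrative choices, not package defaults:

dirs = ml.list_execution_dirs()  # newest first
keep = [d['execution_rid'] for d in dirs[:5]]  # assumed keep-latest-5 policy
result = ml.clean_execution_dirs(older_than_days=14, exclude_rids=keep)
print(f"Removed {result['dirs_removed']} dirs, "
      f"freed {result['bytes_freed'] / 1e9:.2f} GB")
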
-     def lookup_workflow(self, url_or_checksum: str) -> RID | None:
-         """Finds a workflow by URL.
+     def get_storage_summary(self) -> dict[str, any]:
+         """Get a summary of local storage usage.

-         Args:
-             url_or_checksum: URL or checksum of the workflow.
          Returns:
-             RID: Resource Identifier of the workflow if found, None otherwise.
+             dict with keys:
+             - 'working_dir': Path to working directory
+             - 'cache_dir': Path to cache directory
+             - 'cache_size_mb': Cache size in MB
+             - 'cache_file_count': Number of files in cache
+             - 'execution_dir_count': Number of execution directories
+             - 'execution_size_mb': Total size of execution directories in MB
+             - 'total_size_mb': Combined size in MB

          Example:
-             >>> rid = ml.lookup_workflow("https://github.com/org/repo/workflow.py")
-             >>> if rid:
-             ...     print(f"Found workflow: {rid}")
-         """
-         # Get a workflow table path
-         workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
-         try:
-             # Search for workflow by URL
-             url_column = workflow_path.URL
-             checksum_column = workflow_path.Checksum
-             return list(
-                 workflow_path.path.filter(
-                     (url_column == url_or_checksum) | (checksum_column == url_or_checksum)
-                 ).entities()
-             )[0]["RID"]
-         except IndexError:
-             return None
-
-     def create_workflow(self, name: str, workflow_type: str, description: str = "") -> Workflow:
-         """Creates a new workflow definition.
-
-         Creates a Workflow object that represents a computational process or analysis pipeline. The workflow type
-         must be a term from the controlled vocabulary. This method is typically used to define new analysis
-         workflows before execution.
-
-         Args:
-             name: Name of the workflow.
-             workflow_type: Type of workflow (must exist in workflow_type vocabulary).
-             description: Description of what the workflow does.
-
-         Returns:
-             Workflow: New workflow object ready for registration.
-
-         Raises:
-             DerivaMLException: If workflow_type is not in the vocabulary.
-
-         Examples:
-             >>> workflow = ml.create_workflow(
-             ...     name="RNA Analysis",
-             ...     workflow_type="python_notebook",
-             ...     description="RNA sequence analysis pipeline"
-             ... )
-             >>> rid = ml.add_workflow(workflow)
+             >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+             >>> summary = ml.get_storage_summary()
+             >>> print(f"Total storage: {summary['total_size_mb']:.1f} MB")
+             >>> print(f"  Cache: {summary['cache_size_mb']:.1f} MB")
+             >>> print(f"  Executions: {summary['execution_size_mb']:.1f} MB")
          """
-         # Validate workflow type exists in vocabulary
-         self.lookup_term(MLVocab.workflow_type, workflow_type)
-
-         # Create and return a new workflow object
-         return Workflow(name=name, workflow_type=workflow_type, description=description)
-
-     def create_execution(
-         self, configuration: ExecutionConfiguration, workflow: Workflow | RID | None = None, dry_run: bool = False
-     ) -> "Execution":
-         """Creates an execution environment.
-
-         Given an execution configuration, initialize the local compute environment to prepare for executing an
-         ML or analytic routine. This routine has a number of side effects.
-
-         1. The datasets specified in the configuration are downloaded and placed in the cache-dir. If a version is
-         not specified in the configuration, then a new minor version number is created for the dataset and downloaded.
-
-         2. If any execution assets are provided in the configuration, they are downloaded
-         and placed in the working directory.
-
-
-         Args:
-             configuration: ExecutionConfiguration:
-             workflow: Workflow object representing the workflow to execute if not present in the ExecutionConfiguration.
-             dry_run: Do not create an execution record or upload results.
-
-         Returns:
-             An execution object.
-         """
-         # Import here to avoid circular dependency
-         from deriva_ml.execution.execution import Execution
+         cache_stats = self.get_cache_size()
+         exec_dirs = self.list_execution_dirs()
+
+         exec_size_mb = sum(d['size_mb'] for d in exec_dirs)
+
+         return {
+             'working_dir': str(self.working_dir),
+             'cache_dir': str(self.cache_dir),
+             'cache_size_mb': cache_stats['total_mb'],
+             'cache_file_count': cache_stats['file_count'],
+             'execution_dir_count': len(exec_dirs),
+             'execution_size_mb': exec_size_mb,
+             'total_size_mb': cache_stats['total_mb'] + exec_size_mb,
+         }

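Since get_storage_summary composes the two measurements above, it is a natural trigger for cleanup. A minimal sketch gating cleanup on a local disk budget; the 10 GB limit and the cleanup thresholds are assumptions, not package defaults:

LIMIT_MB = 10_000  # assumed local disk budget (~10 GB)

summary = ml.get_storage_summary()
if summary['total_size_mb'] > LIMIT_MB:
    # Reclaim space from both the cache and old execution outputs.
    ml.clear_cache(older_than_days=7)
    ml.clean_execution_dirs(older_than_days=30)
    print(f"Cleanup triggered at {summary['total_size_mb']:.0f} MB")
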
-         # Create and store an execution instance
-         self._execution = Execution(configuration, self, workflow=workflow, dry_run=dry_run)
-         return self._execution
+     # =========================================================================
+     # Schema Validation
+     # =========================================================================

-     def restore_execution(self, execution_rid: RID | None = None) -> Execution:
-         """Restores a previous execution.
+     def validate_schema(self, strict: bool = False) -> "SchemaValidationReport":
+         """Validate that the catalog's ML schema matches the expected structure.

-         Given an execution RID, retrieves the execution configuration and restores the local compute environment.
-         This routine has a number of side effects.
+         This method inspects the catalog schema and verifies that it contains all
+         the required tables, columns, vocabulary terms, and relationships that are
+         created by the ML schema initialization routines in create_schema.py.

-         1. The datasets specified in the configuration are downloaded and placed in the cache-dir. If a version is
-         not specified in the configuration, then a new minor version number is created for the dataset and downloaded.
+         The validation checks:
+         - All required ML tables exist (Dataset, Execution, Workflow, etc.)
+         - All required columns exist with correct types
+         - All required vocabulary tables exist (Asset_Type, Dataset_Type, etc.)
+         - All required vocabulary terms are initialized
+         - All association tables exist for relationships

-         2. If any execution assets are provided in the configuration, they are downloaded and placed
-         in the working directory.
+         In strict mode, the validator also reports errors for:
+         - Extra tables not in the expected schema
+         - Extra columns not in the expected table definitions

          Args:
-             execution_rid: Resource Identifier (RID) of the execution to restore.
+             strict: If True, extra tables and columns are reported as errors.
+                 If False (default), they are reported as informational items.
+                 Use strict=True to verify a clean ML catalog matches exactly.
+                 Use strict=False to validate a catalog that may have domain extensions.

          Returns:
-             Execution: An execution object representing the restored execution environment.
-
-         Raises:
-             DerivaMLException: If execution_rid is not valid or execution cannot be restored.
+             SchemaValidationReport with validation results. Key attributes:
+             - is_valid: True if no errors were found
+             - errors: List of error-level issues
+             - warnings: List of warning-level issues
+             - info: List of informational items
+             - to_text(): Human-readable report
+             - to_dict(): JSON-serializable dictionary

          Example:
-             >>> execution = ml.restore_execution("1-abc123")
+             >>> ml = DerivaML('localhost', 'my_catalog')
+             >>> report = ml.validate_schema(strict=False)
+             >>> if report.is_valid:
+             ...     print("Schema is valid!")
+             ... else:
+             ...     print(report.to_text())
+
+             >>> # Strict validation for a fresh ML catalog
+             >>> report = ml.validate_schema(strict=True)
+             >>> print(f"Found {len(report.errors)} errors, {len(report.warnings)} warnings")
+
+             >>> # Get report as dictionary for JSON/logging
+             >>> import json
+             >>> print(json.dumps(report.to_dict(), indent=2))
+
+         Note:
+             This method validates the ML schema (typically 'deriva-ml'), not the
+             domain schema. Domain-specific tables and columns are not checked
+             unless they are part of the ML schema itself.
+
+         See Also:
+             - deriva_ml.schema.validation.SchemaValidationReport
+             - deriva_ml.schema.validation.validate_ml_schema
          """
-         # Import here to avoid circular dependency
-         from deriva_ml.execution.execution import Execution
-
-         # If no RID provided, try to find single execution in working directory
-         if not execution_rid:
-             e_rids = execution_rids(self.working_dir)
-             if len(e_rids) != 1:
-                 raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
-             execution_rid = e_rids[0]
-
-         # Try to load configuration from a file
-         cfile = asset_file_path(
-             prefix=self.working_dir,
-             exec_rid=execution_rid,
-             file_name="configuration.json",
-             asset_table=self.model.name_to_table("Execution_Metadata"),
-             metadata={},
-         )
+         from deriva_ml.schema.validation import SchemaValidationReport, validate_ml_schema
+         return validate_ml_schema(self, strict=strict)

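Because the report exposes is_valid, errors, and to_dict(), validate_schema drops naturally into a CI gate. A minimal sketch; the report file path and the exit-code convention are assumptions, not part of the package:

import json
import sys

report = ml.validate_schema(strict=False)
with open("schema_report.json", "w") as f:  # assumed artifact path
    json.dump(report.to_dict(), f, indent=2)
if not report.is_valid:
    print(report.to_text(), file=sys.stderr)
    sys.exit(1)  # fail the CI job on schema errors
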
-         # Load configuration from a file or create from an execution record
-         if cfile.exists():
-             configuration = ExecutionConfiguration.load_configuration(cfile)
-         else:
-             execution = self.retrieve_rid(execution_rid)
-             configuration = ExecutionConfiguration(
-                 workflow=execution["Workflow"],
-                 description=execution["Description"],
-             )
+     # Methods moved to mixins:
+     # - create_asset, list_assets -> AssetMixin
+     # - create_feature, feature_record_class, delete_feature, lookup_feature, list_feature_values -> FeatureMixin
+     # - find_datasets, create_dataset, lookup_dataset, delete_dataset, list_dataset_element_types,
+     #   add_dataset_element_type, download_dataset_bag -> DatasetMixin
+     # - _update_status, create_execution, restore_execution -> ExecutionMixin
+     # - add_files, list_files, _bootstrap_versions, _synchronize_dataset_versions, _set_version_snapshot -> FileMixin

-         # Create and return an execution instance
-         return Execution(configuration, self, reload=execution_rid)
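
The comment above summarizes the refactor: DerivaML now inherits this functionality from per-concern mixin classes under deriva_ml/core/mixins/. A schematic sketch of how such a composition typically looks; apart from the mixin names, the class bodies and methods shown here are assumptions for illustration, not the package's actual definitions:

# Hypothetical sketch of mixin composition; see deriva_ml/core/mixins/ for
# the real classes. Each mixin contributes one concern to the base class.
class AssetMixin:
    def create_asset(self, *args, **kwargs): ...

class FeatureMixin:
    def create_feature(self, *args, **kwargs): ...

class DatasetMixin:
    def create_dataset(self, *args, **kwargs): ...

class DerivaML(AssetMixin, FeatureMixin, DatasetMixin):
    """Methods resolve left-to-right along the MRO, so each concern
    stays in its own module while users see a single flat API."""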