deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
deriva_ml/core/base.py ADDED
@@ -0,0 +1,1527 @@
1
+ """Core module for the Deriva ML project.
2
+
3
+ This module implements the DerivaML class, which is the primary interface to Deriva-based catalogs. It provides
4
+ functionality for managing features, vocabularies, and other ML-related operations.
5
+
6
+ The module requires a catalog that implements a 'deriva-ml' schema with specific tables and relationships.
7
+
8
+ Typical usage example:
9
+ >>> ml = DerivaML('deriva.example.org', 'my_catalog')
10
+ >>> ml.create_feature('my_table', 'new_feature')
11
+ >>> ml.add_term('vocabulary_table', 'new_term', description='Description of term')
12
+ """
13
+
14
+ from __future__ import annotations # noqa: I001
15
+
16
+ # Standard library imports
17
+ from collections import defaultdict
18
+ import logging
19
+ from datetime import datetime
20
+ from itertools import chain
21
+ from pathlib import Path
22
+ from typing import Dict, Iterable, List, cast, TYPE_CHECKING, Any
23
+ from typing_extensions import Self
24
+ from urllib.parse import urlsplit
25
+
26
+
27
+ # Third-party imports
28
+ import requests
29
+ from pydantic import ConfigDict, validate_call
30
+
31
+ # Deriva imports
32
+ from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote
33
+
34
+ import deriva.core.datapath as datapath
35
+ from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
36
+ from deriva.core.deriva_server import DerivaServer
37
+ from deriva.core.ermrest_catalog import ResolveRidResult
38
+ from deriva.core.ermrest_model import Key, Table
39
+ from deriva.core.utils.core_utils import DEFAULT_LOGGER_OVERRIDES
40
+ from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
41
+
42
+ from deriva_ml.core.exceptions import DerivaMLInvalidTerm
43
+ from deriva_ml.core.definitions import (
44
+ ML_SCHEMA,
45
+ RID,
46
+ ColumnDefinition,
47
+ FileSpec,
48
+ MLVocab,
49
+ MLTable,
50
+ Status,
51
+ TableDefinition,
52
+ VocabularyTerm,
53
+ )
54
+ from deriva_ml.core.config import DerivaMLConfig
55
+ from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
56
+ from deriva_ml.dataset.aux_classes import DatasetSpec
57
+ from deriva_ml.dataset.dataset import Dataset
58
+ from deriva_ml.dataset.dataset_bag import DatasetBag
59
+ from deriva_ml.dataset.upload import asset_file_path, execution_rids, table_path
60
+
61
+ # Local imports
62
+ from deriva_ml.execution.execution_configuration import ExecutionConfiguration
63
+ from deriva_ml.execution.workflow import Workflow
64
+ from deriva_ml.feature import Feature, FeatureRecord
65
+ from deriva_ml.model.catalog import DerivaModel
66
+ from deriva_ml.schema.annotations import asset_annotation
67
+
68
+ # Optional debug imports
69
# IceCream is an optional debugging aid. When it is not installed, a stand-in with the same
# return convention is defined so that ic(...) call sites work unchanged.
try:
    from icecream import ic

    ic.configureOutput(includeContext=True)
except ImportError:

    def ic(*args):  # noqa
        """Pass-through stand-in: no args -> None, one arg -> that arg, several -> the args tuple."""
        if not args:
            return None
        if len(args) == 1:
            return args[0]
        return args
75
+
76
+ if TYPE_CHECKING:
77
+ from deriva_ml.execution.execution import Execution
78
+
79
+ # Stop pycharm from complaining about undefined references.
80
+ ml: DerivaML
81
+
82
+
83
+ class DerivaML(Dataset):
84
+ """Core class for machine learning operations on a Deriva catalog.
85
+
86
+ This class provides core functionality for managing ML workflows, features, and datasets in a Deriva catalog.
87
+ It handles data versioning, feature management, vocabulary control, and execution tracking.
88
+
89
+ Attributes:
90
+ host_name (str): Hostname of the Deriva server (e.g., 'deriva.example.org').
91
+ catalog_id (Union[str, int]): Catalog identifier or name.
92
+ domain_schema (str): Schema name for domain-specific tables and relationships.
93
+ model (DerivaModel): ERMRest model for the catalog.
94
+ working_dir (Path): Directory for storing computation data and results.
95
+ cache_dir (Path): Directory for caching downloaded datasets.
96
+ ml_schema (str): Schema name for ML-specific tables (default: 'deriva_ml').
97
+ configuration (ExecutionConfiguration): Current execution configuration.
98
+ project_name (str): Name of the current project.
99
+ start_time (datetime): Timestamp when this instance was created.
100
+ status (str): Current status of operations.
101
+
102
+ Example:
103
+ >>> ml = DerivaML('deriva.example.org', 'my_catalog')
104
+ >>> ml.create_feature('my_table', 'new_feature')
105
+ >>> ml.add_term('vocabulary_table', 'new_term', description='Description of term')
106
+ """
107
+
108
@classmethod
def instantiate(cls, config: DerivaMLConfig) -> Self:
    """Builds an instance from a configuration object.

    Args:
        config: Configuration whose ``model_dump()`` output is passed to the constructor as keyword arguments.

    Returns:
        Self: A new instance of the class this method is called on.
    """
    constructor_kwargs = config.model_dump()
    return cls(**constructor_kwargs)
111
+
112
def __init__(
    self,
    hostname: str,
    catalog_id: str | int,
    domain_schema: str | None = None,
    project_name: str | None = None,
    cache_dir: str | Path | None = None,
    working_dir: str | Path | None = None,
    hydra_runtime_output_dir: str | Path | None = None,
    ml_schema: str = ML_SCHEMA,
    logging_level=logging.WARNING,
    deriva_logging_level=logging.WARNING,
    credential=None,
    use_minid: bool = True,
    check_auth: bool = True,
):
    """Initializes a DerivaML instance.

    This method will connect to a catalog and initialize local configuration for the ML execution.
    This class is intended to be used as a base class on which domain-specific interfaces are built.

    Args:
        hostname: Hostname of the Deriva server.
        catalog_id: Catalog ID. Either an identifier or a catalog name.
        domain_schema: Schema name for domain-specific tables and relationships. Defaults to the name of the
            schema that is not one of the standard schemas. If there is more than one user-defined schema, then
            this argument must be provided a value.
        project_name: Project name. Defaults to name of domain schema.
        cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag. If not provided,
            a ``cache`` subdirectory of the working directory is used.
        working_dir: Directory path for storing data used by or generated by any computations. The effective
            location is computed by ``DerivaMLConfig.compute_workdir``.
        hydra_runtime_output_dir: Stored on the instance unchanged; this constructor does not otherwise use it.
        ml_schema: Schema name for ML schema. Used if you have a non-standard configuration of deriva-ml.
        logging_level: Level applied to the ``deriva_ml`` logger, to ``logging.basicConfig`` and to the
            ``deriva`` logger.
        deriva_logging_level: Level applied to the ``root``, ``bagit`` and ``bdbag`` loggers.
        credential: Credential used to talk to the server. When falsy, ``get_credential(hostname)`` is used.
        use_minid: Use the MINID service when downloading dataset bags.
        check_auth: Check if the user has access to the catalog.

    Raises:
        DerivaMLException: If ``check_auth`` is True and fetching the authentication session fails.
    """
    # Get or use provided credentials for server access
    self.credential = credential or get_credential(hostname)

    # Initialize server connection and catalog access
    server = DerivaServer(
        "https",
        hostname,
        credentials=self.credential,
        session_config=self._get_session_config(),
    )
    if check_auth:
        try:
            server.get_authn_session()
        except Exception as e:
            # Chain the original error so the underlying cause (network, expired token, ...) stays visible.
            raise DerivaMLException(
                "You are not authorized to access this catalog. "
                "Please check your credentials and make sure you have logged in."
            ) from e
    self.catalog = server.connect_ermrest(catalog_id)
    self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)

    # Set up working and cache directories
    self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
    self.working_dir.mkdir(parents=True, exist_ok=True)
    self.hydra_runtime_output_dir = hydra_runtime_output_dir

    self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Initialize dataset functionality from the parent class
    super().__init__(self.model, self.cache_dir, self.working_dir, use_minid=use_minid)

    # Set up logging
    self._logger = logging.getLogger("deriva_ml")
    self._logger.setLevel(logging_level)
    self._logging_level = logging_level
    self._deriva_logging_level = deriva_logging_level

    # Apply deriva's module-specific logger overrides, then the caller-selected level for the
    # bag-handling libraries.
    for name, level in DEFAULT_LOGGER_OVERRIDES.items():
        logging.getLogger(name).setLevel(level)
    logging.getLogger("root").setLevel(deriva_logging_level)
    logging.getLogger("bagit").setLevel(deriva_logging_level)
    logging.getLogger("bdbag").setLevel(deriva_logging_level)

    # Store instance configuration
    self.host_name = hostname
    self.catalog_id = catalog_id
    self.ml_schema = ml_schema
    self.configuration = None
    self._execution: Execution | None = None
    self.domain_schema = self.model.domain_schema
    self.project_name = project_name or self.domain_schema
    self.start_time = datetime.now()
    self.status = Status.pending.value

    # Configure logging format
    logging.basicConfig(
        level=logging_level,
        format="%(asctime)s - %(name)s.%(levelname)s - %(message)s",
    )

    # Set Deriva library logging level
    # NOTE(review): this uses logging_level, not deriva_logging_level — confirm that is intended.
    deriva_logger = logging.getLogger("deriva")
    deriva_logger.setLevel(logging_level)
214
+
215
def __del__(self):
    """Best-effort cleanup: marks a still-unfinished execution as aborted.

    AttributeError (for example when construction never reached the point where ``_execution`` is set)
    and ``requests.HTTPError`` are deliberately ignored.
    """
    try:
        current = self._execution
        unfinished = bool(current) and current.status != Status.completed
        if unfinished:
            current.update_status(Status.aborted, "Execution Aborted")
    except (AttributeError, requests.HTTPError):
        # Nothing useful can be done about a failed status update at this point.
        return
223
+
224
@staticmethod
def _get_session_config():
    """Builds the HTTP session configuration used for the server connection.

    Starts from a copy of ``DEFAULT_SESSION_CONFIG`` and overrides four retry-related settings:
        - ``allow_retry_on_all_methods``: True
        - ``retry_read``: 8
        - ``retry_connect``: 5
        - ``retry_backoff_factor``: 5

    Returns:
        dict: The copied configuration with the overrides applied. The shared default is not modified
        at its top level.

    Example:
        >>> config = DerivaML._get_session_config()
        >>> print(config['retry_read'])  # 8
    """
    tuned = DEFAULT_SESSION_CONFIG.copy()

    # Retry regardless of HTTP method.
    # NOTE(review): retrying writes presumes they are safe to repeat — confirm against the server.
    tuned["allow_retry_on_all_methods"] = True
    # More attempts than the defaults for reads and connection setup.
    tuned["retry_read"] = 8
    tuned["retry_connect"] = 5
    # Backoff factor applied between retries.
    tuned["retry_backoff_factor"] = 5
    return tuned
256
+
257
@property
def pathBuilder(self) -> SchemaWrapper:
    """Returns the catalog's path builder for constructing queries.

    Each access calls ``self.catalog.getPathBuilder()`` again, so the result is not cached on the instance.

    Returns:
        The object produced by ``catalog.getPathBuilder()``.

    Example:
        >>> path = ml.pathBuilder.schemas['my_schema'].tables['my_table']
        >>> results = path.entities().fetch()
    """
    builder = self.catalog.getPathBuilder()
    return builder
272
+
273
@property
def domain_path(self) -> datapath.DataPath:
    """Returns the path-builder entry for the domain schema.

    Looks up ``self.domain_schema`` in the ``schemas`` mapping of ``self.pathBuilder``.

    Returns:
        The path-builder object for the domain schema.

    Example:
        >>> domain = ml.domain_path
        >>> results = domain.my_table.entities().fetch()
    """
    all_schemas = self.pathBuilder.schemas
    return all_schemas[self.domain_schema]
287
+
288
def table_path(self, table: str | Table) -> Path:
    """Returns the local filesystem path for a table's CSV file.

    The table argument is resolved through the model, and the path is then produced by the module-level
    ``table_path`` helper from the working directory, the domain schema name and the resolved table name.

    Args:
        table: Name of the table or Table object to get the path for.

    Returns:
        Path: Filesystem path where the CSV file should be placed.

    Example:
        >>> path = ml.table_path("experiment_results")
        >>> df.to_csv(path)  # Save data for upload
    """
    resolved_name = self.model.name_to_table(table).name
    return table_path(self.working_dir, schema=self.domain_schema, table=resolved_name)
309
+
310
def download_dir(self, cached: bool = False) -> Path:
    """Selects the directory in which downloads should be stored.

    Args:
        cached: When truthy, the cache directory is chosen; otherwise the working directory.

    Returns:
        Path: ``self.cache_dir`` or ``self.working_dir``.

    Example:
        >>> cache_dir = ml.download_dir(cached=True)
        >>> work_dir = ml.download_dir(cached=False)
    """
    if cached:
        return self.cache_dir
    return self.working_dir
327
+
328
@staticmethod
def globus_login(host: str) -> None:
    """Logs in to a Deriva host through Globus, unless a login already exists.

    When ``GlobusNativeLogin`` reports an existing login for the host, a notice is printed and nothing
    else happens. Otherwise a login is started with the local server and browser disabled and with
    refresh tokens and bdbag keychain update enabled, and a confirmation is printed afterwards.

    Args:
        host: The hostname of the Deriva server to authenticate with (e.g., 'deriva.example.org').

    Example:
        >>> DerivaML.globus_login('deriva.example.org')
        Login Successful
    """
    login_client = GlobusNativeLogin(host=host)
    if login_client.is_logged_in([host]):
        print("You are already logged in.")
        return

    login_options = {
        "no_local_server": True,
        "no_browser": True,
        "refresh_tokens": True,
        "update_bdbag_keychain": True,
    }
    login_client.login([host], **login_options)
    print("Login Successful")
354
+
355
def chaise_url(self, table: RID | Table | str) -> str:
    """Generates Chaise web interface URL.

    Chaise is Deriva's web interface for data exploration. For a table (name or Table object) the result
    is a recordset URL. If the argument cannot be resolved to a table it is treated as a RID and the
    citation URL produced by ``cite`` is returned instead.

    Args:
        table: Table to generate URL for (name, Table object, or RID).

    Returns:
        str: For a table, a URL in format: https://{host}/chaise/recordset/#{catalog}/{schema}:{table}.
        For a RID, the value returned by ``cite``.

    Raises:
        DerivaMLException: If table or RID cannot be found.

    Examples:
        Using table name:
            >>> ml.chaise_url("experiment_table")
            'https://deriva.org/chaise/recordset/#1/schema:experiment_table'

        Using RID:
            >>> ml.chaise_url("1-abc123")
    """
    try:
        table_obj = self.model.name_to_table(table)
    except DerivaMLException:
        # The lookup is the step that fails for a RID, so the fallback has to guard it.
        # NOTE(review): assumes name_to_table raises DerivaMLException for unknown names — confirm.
        return self.cite(cast(str, table))

    # Turn the ERMrest catalog URI into the matching Chaise recordset URI.
    uri = self.catalog.get_server_uri().replace("ermrest/catalog/", "chaise/recordset/#")
    return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
386
+
387
def cite(self, entity: Dict[str, Any] | str) -> str:
    """Generates permanent citation URL.

    Creates a versioned URL that can be used to reference a specific entity in the catalog. The URL includes
    the snapshot time of the catalog's latest snapshot.

    Args:
        entity: Either a RID string or a dictionary containing entity data with a 'RID' key. A string that
            already starts with this catalog's citation prefix is returned unchanged.

    Returns:
        str: Permanent citation URL in format: https://{host}/id/{catalog}/{rid}@{snapshot_time}

    Raises:
        DerivaMLException: If an entity doesn't exist or lacks a RID.

    Examples:
        Using a RID string:
            >>> url = ml.cite("1-abc123")
            >>> print(url)
            'https://deriva.org/id/1/1-abc123@2024-01-01T12:00:00'

        Using a dictionary:
            >>> url = ml.cite({"RID": "1-abc123"})
    """
    prefix = f"https://{self.host_name}/id/{self.catalog_id}/"

    # Return if already a citation URL
    if isinstance(entity, str) and entity.startswith(prefix):
        return entity

    # Only the dictionary lookup is guarded, so a KeyError raised elsewhere is not mislabelled.
    try:
        rid = entity if isinstance(entity, str) else entity["RID"]
    except KeyError as e:
        raise DerivaMLException(f"Entity {entity} does not have RID column") from e

    # Make sure the RID exists before handing out a citation for it.
    try:
        self.resolve_rid(rid)
    except DerivaMLException as e:
        raise DerivaMLException("Entity RID does not exist") from e

    return f"{prefix}{rid}@{self.catalog.latest_snapshot().snaptime}"
423
+
424
def user_list(self) -> List[Dict[str, str]]:
    """Returns the ID and full name of every entry in the catalog's client table.

    Reads all rows of ``public.ERMrest_Client`` and keeps only two columns of each row.

    Returns:
        List[Dict[str, str]]: One dictionary per row, each containing:
            - 'ID': User identifier
            - 'Full_Name': User's full name

    Examples:

        >>> users = ml.user_list()
        >>> for user in users:
        ...     print(f"{user['Full_Name']} ({user['ID']})")
    """
    kept_columns = ("ID", "Full_Name")
    client_rows = self.pathBuilder.public.ERMrest_Client.path.entities().fetch()
    return [{column: row[column] for column in kept_columns} for row in client_rows]
444
+
445
def resolve_rid(self, rid: RID) -> ResolveRidResult:
    """Looks up where a RID lives in the catalog.

    Delegates to ``self.catalog.resolve_rid`` with the underlying catalog model and translates a
    ``KeyError`` from that call into a ``DerivaMLException``.

    Args:
        rid: Resource Identifier to resolve.

    Returns:
        ResolveRidResult: The value returned by the catalog's ``resolve_rid``.

    Raises:
        DerivaMLException: If the lookup raises ``KeyError``.

    Examples:
        >>> result = ml.resolve_rid("1-abc123")
        >>> print(f"Found in {result.schema}.{result.table}")
        >>> data = result.datapath.entities().fetch()
    """
    try:
        location = self.catalog.resolve_rid(rid, self.model.model)
    except KeyError:
        raise DerivaMLException(f"Invalid RID {rid}")
    else:
        return location
474
+
475
def retrieve_rid(self, rid: RID) -> dict[str, Any]:
    """Fetches the record identified by a RID.

    The RID is resolved first; the entities reachable through the resolved datapath are then fetched
    and the first one is returned.

    Args:
        rid: Resource Identifier of the record to retrieve.

    Returns:
        dict[str, Any]: The first fetched entity.

    Raises:
        DerivaMLException: If the RID cannot be resolved.

    Example:
        >>> record = ml.retrieve_rid("1-abc123")
        >>> print(f"Name: {record['name']}, Created: {record['creation_date']}")
    """
    location = self.resolve_rid(rid)
    matches = location.datapath.entities().fetch()
    return matches[0]
495
+
496
def add_page(self, title: str, content: str) -> None:
    """Adds a page record to the catalog's ``www`` schema.

    Inserts a single row with ``Title`` and ``Content`` columns into the table of the ``www`` schema
    that is named after the domain schema.

    Args:
        title: The title of the page.
        content: The main content of the page.

    Example:
        >>> ml.add_page(
        ...     title="Analysis Results",
        ...     content="<h1>Results</h1><p>Analysis completed successfully...</p>"
        ... )
    """
    page_table = self.pathBuilder.www.tables[self.domain_schema]
    new_page = {"Title": title, "Content": content}
    page_table.insert([new_page])
517
+
518
def create_vocabulary(self, vocab_name: str, comment: str = "", schema: str | None = None) -> Table:
    """Creates a controlled vocabulary table.

    The table is defined with ``Table.define_vocabulary`` using a ``{project_name}:{RID}`` URI pattern and
    created in the requested schema.

    Args:
        vocab_name: Name for the new vocabulary table.
        comment: Description of the vocabulary's purpose and usage. Defaults to empty string.
        schema: Schema name to create the table in. If None (or empty), uses domain_schema.

    Returns:
        Table: ERMRest table object representing the newly created vocabulary table.

    Raises:
        DerivaMLException: If defining or creating the table raises ``ValueError``.

    Examples:
        Create a vocabulary for tissue types:

            >>> table = ml.create_vocabulary(
            ...     vocab_name="tissue_types",
            ...     comment="Standard tissue classifications",
            ...     schema="bio_schema"
            ... )
    """
    target_schema = schema or self.domain_schema

    try:
        schema_obj = self.model.schemas[target_schema]
        created = schema_obj.create_table(
            Table.define_vocabulary(vocab_name, f"{self.project_name}:{{RID}}", comment=comment)
        )
    except ValueError:
        raise DerivaMLException(f"Table {vocab_name} already exist")
    else:
        return created
555
+
556
def create_table(self, table: TableDefinition) -> Table:
    """Creates a new table in the domain schema.

    The definition is serialized with ``model_dump()`` and handed to the domain schema's ``create_table``.

    Args:
        table: A TableDefinition object containing the complete specification of the table to create.

    Returns:
        Table: The newly created ERMRest table object.

    Example:

        >>> table_def = TableDefinition(
        ...     name="experiments",
        ...     column_definitions=[
        ...         ColumnDefinition(name="name", type=BuiltinTypes.text),
        ...         ColumnDefinition(name="date", type=BuiltinTypes.date)
        ...     ]
        ... )
        >>> new_table = ml.create_table(table_def)
    """
    domain = self.model.schemas[self.domain_schema]
    definition = table.model_dump()
    return domain.create_table(definition)
584
+
585
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def create_asset(
    self,
    asset_name: str,
    column_defs: Iterable[ColumnDefinition] | None = None,
    fkey_defs: Iterable[ColumnDefinition] | None = None,
    referenced_tables: Iterable[Table] | None = None,
    comment: str = "",
    schema: str | None = None,
) -> Table:
    """Creates an asset table together with its supporting association tables.

    Steps performed, in order:
        1. Adds ``asset_name`` as a term of the asset-type vocabulary.
        2. Creates the asset table in ``schema``.
        3. Creates an association table between the asset table and ``Asset_Type``.
        4. Adds a reference from the asset table to each table in ``referenced_tables``.
        5. Creates an association table between the asset table and ``Execution``, and adds a reference
           from that association to ``Asset_Role``.
        6. Applies ``asset_annotation`` to the asset table.

    Args:
        asset_name: Name of the asset table.
        column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
        fkey_defs: Iterable of foreign-key definition objects to provide additional metadata for asset.
        referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
        comment: Description of the asset table. (Default value = '')
        schema: Schema in which to create the asset table. Defaults to domain_schema.

    Returns:
        Table object for the asset table.
    """
    # NOTE(review): fkey_defs is annotated Iterable[ColumnDefinition] and this signature is validated by
    # pydantic; if foreign-key definitions are a different type, the annotation needs fixing — confirm.

    # Initialize empty collections if None provided
    column_defs = column_defs or []
    fkey_defs = fkey_defs or []
    referenced_tables = referenced_tables or []
    schema = schema or self.domain_schema

    # Add an asset type to vocabulary
    self.add_term(MLVocab.asset_type, asset_name, description=f"A {asset_name} asset")

    # Create the main asset table
    asset_table = self.model.schemas[schema].create_table(
        Table.define_asset(
            schema,
            asset_name,
            column_defs=[c.model_dump() for c in column_defs],
            fkey_defs=[fk.model_dump() for fk in fkey_defs],
            comment=comment,
        )
    )

    # Create an association table between asset and asset type
    # NOTE(review): created in self.domain_schema even when a different `schema` was requested — confirm.
    self.model.schemas[self.domain_schema].create_table(
        Table.define_association(
            [
                (asset_table.name, asset_table),
                ("Asset_Type", self.model.name_to_table("Asset_Type")),
            ]
        )
    )

    # Create references to other tables if specified
    for t in referenced_tables:
        asset_table.create_reference(self.model.name_to_table(t))

    # Create an association table for tracking execution (also placed in self.domain_schema)
    atable = self.model.schemas[self.domain_schema].create_table(
        Table.define_association(
            [
                (asset_name, asset_table),
                (
                    "Execution",
                    self.model.schemas[self.ml_schema].tables["Execution"],
                ),
            ]
        )
    )
    # The execution association additionally references Asset_Role
    atable.create_reference(self.model.name_to_table("Asset_Role"))

    # Add asset annotations
    asset_annotation(asset_table)
    return asset_table
659
+
660
def list_assets(self, asset_table: Table | str) -> list[dict[str, Any]]:
    """Lists the rows of an asset table together with their asset types.

    Every row of the asset table is fetched; for each row the asset-type association table is queried
    for the entries that refer to the row's RID.

    Args:
        asset_table: Table or name of the asset table to list assets for.

    Returns:
        list[dict[str, Any]]: One dictionary per asset row: the fetched row merged with one extra key,
        ``MLVocab.asset_type.value``, holding the list of asset-type values associated with that row.

    Raises:
        DerivaMLException: If the table is not an asset table.

    Example:
        >>> assets = ml.list_assets("tissue_types")
        >>> for asset in assets:
        ...     print(asset['RID'])
    """
    table = self.model.name_to_table(asset_table)
    if not self.model.is_asset(table):
        raise DerivaMLException(f"Table {table.name} is not an asset")

    # Path-builder handles for the asset table and for its asset-type association table
    builder = self._model.catalog.getPathBuilder()
    records = builder.schemas[table.schema.name].tables[table.name]
    association, _, _ = self._model.find_association(table, MLVocab.asset_type)
    links = builder.schemas[association.schema.name].tables[association.name]
    type_key = MLVocab.asset_type.value

    def types_of(rid):
        """Fetch the asset-type values linked to one asset RID (one query per call)."""
        linked = links.filter(links.columns[table.name] == rid).attributes(links.Asset_Type).fetch()
        return [row[type_key] for row in linked]

    return [record | {type_key: types_of(record["RID"])} for record in records.entities().fetch()]
711
+
712
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def create_feature(
    self,
    target_table: Table | str,
    feature_name: str,
    terms: list[Table | str] | None = None,
    assets: list[Table | str] | None = None,
    metadata: list[ColumnDefinition | Table | Key | str] | None = None,
    optional: list[str] | None = None,
    comment: str = "",
) -> type[FeatureRecord]:
    """Creates a new feature definition.

    A feature is implemented as an association table in the domain schema, named
    ``Execution_{target table}_{feature name}``, that links the Execution table, the target table and the
    Feature_Name vocabulary. Asset tables, vocabulary tables and additional metadata are attached to it.

    Args:
        target_table: Table to associate the feature with (name or Table object).
        feature_name: Name for the feature; added as a term of the Feature_Name vocabulary.
        terms: Optional vocabulary tables/names whose terms can be used as feature values.
        assets: Optional asset tables/names that can be referenced by this feature.
        metadata: Optional columns, tables, or keys to include in a feature definition.
        optional: Names of columns of the feature table that are altered to accept null values.
        comment: Description of the feature's purpose and usage.

    Returns:
        type[FeatureRecord]: Feature class for creating validated instances.

    Raises:
        DerivaMLException: If an entry of ``assets`` is not an asset table or an entry of ``terms`` is not
            a vocabulary table.

    Examples:
        Create a feature with confidence score:
            >>> feature_class = ml.create_feature(
            ...     target_table="samples",
            ...     feature_name="expression_level",
            ...     terms=["expression_values"],
            ...     metadata=[ColumnDefinition(name="confidence", type=BuiltinTypes.float4)],
            ...     comment="Gene expression measurement"
            ... )
    """
    # Initialize empty collections if None provided
    terms = terms or []
    assets = assets or []
    metadata = metadata or []
    optional = optional or []

    def normalize_metadata(m: Key | Table | ColumnDefinition | str):
        """Helper function to normalize metadata references."""
        if isinstance(m, str):
            # A bare string is taken to be a table name.
            return self.model.name_to_table(m)
        elif isinstance(m, ColumnDefinition):
            return m.model_dump()
        else:
            return m

    # Validate asset and term tables before anything is created in the catalog
    if not all(map(self.model.is_asset, assets)):
        raise DerivaMLException("Invalid create_feature asset table.")
    if not all(map(self.model.is_vocabulary, terms)):
        # This check is about vocabulary tables, so the message names terms, not assets.
        raise DerivaMLException("Invalid create_feature term table.")

    # Get references to required tables
    target_table = self.model.name_to_table(target_table)
    execution = self.model.schemas[self.ml_schema].tables["Execution"]
    feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]

    # Add feature name to vocabulary
    feature_name_term = self.add_term("Feature_Name", feature_name, description=comment)
    atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
    # Create an association table implementing the feature
    atable = self.model.schemas[self.domain_schema].create_table(
        target_table.define_association(
            table_name=atable_name,
            associates=[execution, target_table, feature_name_table],
            metadata=[normalize_metadata(m) for m in chain(assets, terms, metadata)],
            comment=comment,
        )
    )
    # Configure optional columns and default feature name
    for c in optional:
        atable.columns[c].alter(nullok=True)
    atable.columns["Feature_Name"].alter(default=feature_name_term.name)

    # Return feature record class for creating instances
    return self.feature_record_class(target_table, feature_name)
798
+
799
def feature_record_class(self, table: str | Table, feature_name: str) -> type[FeatureRecord]:
    """Return the record class used to create instances of an existing feature.

    The feature is resolved with ``lookup_feature`` and the resulting Feature object is asked for
    its record class.

    Args:
        table: Table the feature is attached to, given as a name or a Table object.
        feature_name: Name of the feature whose record class is wanted.

    Returns:
        type[FeatureRecord]: Whatever ``Feature.feature_record_class()`` returns for the feature.

    Raises:
        Any exception raised by ``lookup_feature`` for an unknown table or feature is propagated.

    Example:
        >>> ExpressionFeature = ml.feature_record_class("samples", "expression_level")
        >>> feature = ExpressionFeature(value="high", confidence=0.95)
    """
    feature = self.lookup_feature(table, feature_name)
    return feature.feature_record_class()
821
+
822
def delete_feature(self, table: Table | str, feature_name: str) -> bool:
    """Drop the implementation table of a feature.

    Searches the features found on ``table`` for one named ``feature_name`` and drops its
    feature table. Dropping the table discards the feature's stored values.

    Args:
        table: Table the feature is attached to, given as a name or a Table object.
        feature_name: Name of the feature to delete.

    Returns:
        bool: True when a matching feature was found and its table dropped, False when no
        feature with that name exists on the table.

    Example:
        >>> success = ml.delete_feature("samples", "obsolete_feature")
        >>> print("Deleted" if success else "Not found")
    """
    target = self.model.name_to_table(table)
    candidates = (feat for feat in self.model.find_features(target) if feat.feature_name == feature_name)
    try:
        # next() raises StopIteration when no feature carries the requested name.
        found = next(candidates)
        found.feature_table.drop()
    except StopIteration:
        return False
    return True
851
+
852
def lookup_feature(self, table: str | Table, feature_name: str) -> Feature:
    """Look up an existing feature definition.

    Thin wrapper that forwards the request to the catalog model's ``lookup_feature``.

    Args:
        table: Table the feature is attached to, given as a name or a Table object.
        feature_name: Name of the feature to look up.

    Returns:
        Feature: The object returned by the model for this table/feature pair.

    Raises:
        Any exception raised by the model's ``lookup_feature`` is propagated unchanged.

    Example:
        >>> feature = ml.lookup_feature("samples", "expression_level")
        >>> print(feature.feature_name)
        'expression_level'
    """
    model = self.model
    return model.lookup_feature(table, feature_name)
874
+
875
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def list_feature_values(self, table: Table | str, feature_name: str) -> datapath._ResultSet:
    """Fetch every stored row of a feature.

    Resolves the feature, then fetches all entities from the feature's implementation table
    through the catalog's path builder.

    Args:
        table: Table the feature is attached to, given as a name or a Table object.
        feature_name: Name of the feature whose values are wanted.

    Returns:
        datapath._ResultSet: Result of ``entities().fetch()`` on the feature table.

    Raises:
        Any exception raised by ``lookup_feature`` for an unknown feature is propagated.

    Example:
        >>> values = ml.list_feature_values("samples", "expression_level")
        >>> for value in values:
        ...     print(f"Sample {value['RID']}: {value['value']}")
    """
    resolved_table = self.model.name_to_table(table)
    feature_table = self.lookup_feature(resolved_table, feature_name).feature_table

    # Address the feature table by schema and table name in the datapath API.
    schema_path = self.catalog.getPathBuilder().schemas[feature_table.schema.name]
    return schema_path.tables[feature_table.name].entities().fetch()
904
+
905
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def add_term(
    self,
    table: str | Table,
    term_name: str,
    description: str,
    synonyms: list[str] | None = None,
    exists_ok: bool = True,
) -> VocabularyTerm:
    """Insert a term into a vocabulary table, or return the one already there.

    An insert of ``Name``/``Description``/``Synonyms`` is attempted first, with ``ID`` and ``URI``
    left to their defaults. If the insert raises ``DataPathException`` the term is looked up
    with ``lookup_term`` instead.

    Args:
        table: Vocabulary table to add the term to (name or Table object).
        term_name: Primary name of the term.
        description: Explanation of the term's meaning and usage.
        synonyms: Alternative names for the term; ``None`` is stored as an empty list.
        exists_ok: If True, return the existing term when the insert fails. If False, raise.

    Returns:
        VocabularyTerm: The newly inserted term, or the existing one when ``exists_ok`` is True.

    Raises:
        DerivaMLTableTypeError: If ``table`` is not a vocabulary table.
        DerivaMLInvalidTerm: If the insert fails and ``exists_ok`` is False, or if the fallback
            lookup cannot find the term.

    Examples:
        Add a new tissue type:
            >>> term = ml.add_term(
            ...     table="tissue_types",
            ...     term_name="epithelial",
            ...     description="Epithelial tissue type",
            ...     synonyms=["epithelium"]
            ... )

        Attempt to add an existing term:
            >>> term = ml.add_term("tissue_types", "epithelial", "...", exists_ok=True)
    """
    vocab = self.model.name_to_table(table)
    path_builder = self.catalog.getPathBuilder()
    if not self.model.is_vocabulary(vocab):
        raise DerivaMLTableTypeError("vocabulary", vocab.name)

    new_row = {
        "Name": term_name,
        "Description": description,
        "Synonyms": synonyms or [],
    }

    try:
        vocab_path = path_builder.schemas[vocab.schema.name].tables[vocab.name]
        inserted = vocab_path.insert([new_row], defaults={"ID", "URI"})
        return VocabularyTerm.model_validate(inserted[0])
    except DataPathException:
        # The insert was rejected; treat it as "term already present" and fetch it.
        existing = self.lookup_term(vocab, term_name)
        if exists_ok:
            return existing
        raise DerivaMLInvalidTerm(vocab.name, term_name, msg="term already exists")
979
+
980
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
    """Find a term in a vocabulary table by name or synonym.

    All rows of the vocabulary table are fetched and scanned in order; the first row whose
    ``Name`` equals ``term_name`` or whose ``Synonyms`` contain it is returned.

    Args:
        table: Vocabulary table to search (name or Table object).
        term_name: Name or synonym of the term to find.

    Returns:
        VocabularyTerm: The first matching term, validated from the fetched row.

    Raises:
        DerivaMLException: If ``table`` is not a vocabulary table.
        DerivaMLInvalidTerm: If no row matches ``term_name``.

    Examples:
        Look up by primary name:
            >>> term = ml.lookup_term("tissue_types", "epithelial")
            >>> print(term.description)

        Look up by synonym:
            >>> term = ml.lookup_term("tissue_types", "epithelium")
    """
    vocab_table = self.model.name_to_table(table)
    if not self.model.is_vocabulary(vocab_table):
        raise DerivaMLException(f"The table {table} is not a controlled vocabulary")

    schema_name = vocab_table.schema.name
    table_name = vocab_table.name
    rows = self.catalog.getPathBuilder().schemas[schema_name].tables[table_name].entities().fetch()

    # Synonyms may be empty/None, so guard before the membership test.
    hit = next(
        (row for row in rows if term_name == row["Name"] or (row["Synonyms"] and term_name in row["Synonyms"])),
        None,
    )
    if hit is None:
        raise DerivaMLInvalidTerm(table_name, term_name)
    return VocabularyTerm.model_validate(hit)
1021
+
1022
def list_vocabulary_terms(self, table: str | Table) -> list[VocabularyTerm]:
    """Return every term stored in a vocabulary table.

    Args:
        table: Vocabulary table to list (name or Table object). An ``MLVocab`` member is also
            accepted; its ``value`` is used as the table name.

    Returns:
        list[VocabularyTerm]: One VocabularyTerm per row fetched from the table.

    Raises:
        DerivaMLException: If the resolved table is not a vocabulary table.

    Examples:
        >>> terms = ml.list_vocabulary_terms("tissue_types")
        >>> for term in terms:
        ...     print(f"{term.name}: {term.description}")
        ...     if term.synonyms:
        ...         print(f"  Synonyms: {', '.join(term.synonyms)}")
    """
    path_builder = self.catalog.getPathBuilder()

    # MLVocab members carry the table name in their value.
    table_ref = table.value if isinstance(table, MLVocab) else table
    vocab = self.model.name_to_table(table_ref)

    if not self.model.is_vocabulary(vocab):
        raise DerivaMLException(f"The table {vocab} is not a controlled vocabulary")

    rows = path_builder.schemas[vocab.schema.name].tables[vocab.name].entities().fetch()
    return [VocabularyTerm(**row) for row in rows]
1053
+
1054
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def download_dataset_bag(
    self,
    dataset: DatasetSpec,
    execution_rid: RID | None = None,
) -> DatasetBag:
    """Download a dataset bag for the requested dataset version.

    Validates that ``dataset.rid`` refers to a dataset, opens a second ``DerivaML`` instance on
    the catalog snapshot returned by ``_version_snapshot(dataset)``, and delegates the actual
    download to ``_download_dataset_bag``.

    NOTE(review): the original documentation states that a MINID is created when the dataset
    does not have one; that happens inside the helper and is not visible in this method.

    Args:
        dataset: Specification of the dataset to download (RID, version, materialization options).
        execution_rid: Optional execution RID passed through to the download helper.

    Returns:
        DatasetBag: The bag object produced by ``_download_dataset_bag``.

    Raises:
        DerivaMLTableTypeError: If ``dataset.rid`` is not a dataset RID.

    Examples:
        Download with default options:
            >>> spec = DatasetSpec(rid="1-abc123")
            >>> bag = ml.download_dataset_bag(dataset=spec)
            >>> print(f"Downloaded to {bag.path}")

        Download with execution tracking:
            >>> bag = ml.download_dataset_bag(
            ...     dataset=DatasetSpec(rid="1-abc123", materialize=True),
            ...     execution_rid="1-xyz789"
            ... )
    """
    if not self._is_dataset_rid(dataset.rid):
        raise DerivaMLTableTypeError("Dataset", dataset.rid)

    # Read the data from the catalog snapshot that corresponds to the requested version.
    snapshot = DerivaML(
        self.host_name,
        self._version_snapshot(dataset),
        logging_level=self._logging_level,
        deriva_logging_level=self._deriva_logging_level,
    )
    return self._download_dataset_bag(
        dataset=dataset,
        execution_rid=execution_rid,
        snapshot_catalog=snapshot,
    )
1100
+
1101
def _update_status(self, new_status: Status, status_detail: str, execution_rid: RID):
    """Record a new status for an execution, locally and in the catalog.

    Stores ``new_status.value`` on ``self.status`` and updates the ``Status`` and
    ``Status_Detail`` columns of the matching row in the ML schema's ``Execution`` table.

    Args:
        new_status: New status; its ``value`` is what gets stored.
        status_detail: Free-text details accompanying the status.
        execution_rid: Resource Identifier (RID) of the execution row to update.
    """
    self.status = new_status.value
    status_row = {
        "RID": execution_rid,
        "Status": self.status,
        "Status_Detail": status_detail,
    }
    execution_table = self.pathBuilder.schemas[self.ml_schema].Execution
    execution_table.update([status_row])
1125
+
1126
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def add_files(
    self,
    files: Iterable[FileSpec],
    dataset_types: str | list[str] | None = None,
    description: str = "",
    execution_rid: RID | None = None,
) -> RID:
    """Register files in the catalog and group them into datasets by directory.

    Each FileSpec is inserted into the ``File`` table and linked to its file types through the
    File/Asset_Type association table. When ``execution_rid`` is given, every file is also
    linked to that execution with the ``Output`` asset role. Finally one dataset is created per
    distinct parent directory of the file URLs, deepest directories first.

    Args:
        files: File specifications (URL, MD5, length and file types).
        dataset_types: One or more dataset type terms applied to every created dataset.
            ``"File"`` is always included.
        description: Description used for the created datasets.
        execution_rid: Optional execution RID to associate files and datasets with.

    Returns:
        RID: RID of the dataset created last, i.e. the one for the directory with the fewest
        path components. ``None`` is returned when ``files`` is empty, because no dataset is
        created in that case.

    Raises:
        DerivaMLTableTypeError: If ``execution_rid`` does not resolve to a row of ``Execution``.
        DerivaMLInvalidTerm: If a file type used by a FileSpec is not a defined asset type
            (name or synonym), or if a dataset type cannot be found by ``lookup_term``.

    Examples:
        Add files with a single dataset type:
            >>> files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
            >>> dataset_rid = ml.add_files(files, dataset_types="text")

        Add files with several dataset types and link them to an execution:
            >>> dataset_rid = ml.add_files(
            ...     files=[FileSpec(url="image.png", md5="def456", length=2000)],
            ...     dataset_types=["image", "png"],
            ...     execution_rid="1-xyz789"
            ... )
    """
    if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
        raise DerivaMLTableTypeError("Execution", execution_rid)

    # Materialize once: the iterable is traversed several times below.
    filespec_list = list(files)

    # All defined asset types, including their synonyms.
    defined_types = set(
        chain.from_iterable([[t.name] + t.synonyms for t in self.list_vocabulary_terms(MLVocab.asset_type)])
    )

    # All file types used by the file specs.
    spec_types = set(chain.from_iterable(filespec.file_types for filespec in filespec_list))

    # Every file type used by a spec must be a defined asset type.
    undefined_types = spec_types - defined_types
    if undefined_types:
        raise DerivaMLInvalidTerm(MLVocab.asset_type.name, f"{undefined_types}")

    # Normalize dataset_types to a list that always contains "File".
    if isinstance(dataset_types, list):
        dataset_types = ["File"] + dataset_types if "File" not in dataset_types else dataset_types
    else:
        dataset_types = ["File", dataset_types] if dataset_types else ["File"]
    # lookup_term raises if a dataset type is not in the vocabulary.
    for ds_type in dataset_types:
        self.lookup_term(MLVocab.dataset_type, ds_type)

    # Insert the files into the File table.
    pb = self._model.catalog.getPathBuilder()
    file_records = list(
        pb.schemas[self.ml_schema].tables["File"].insert([f.model_dump(by_alias=True) for f in filespec_list])
    )

    # Name of the association table between File and Asset_Type.
    atable = self.model.find_association(MLTable.file, MLVocab.asset_type)[0].name

    # Map each file's MD5 to its file types so inserted records can be joined back to their specs.
    # NOTE(review): the previous expression appended `[] if "File" in file_types else []`, a
    # conditional with two identical branches. It may have been meant to add "File" when it is
    # missing; behavior is kept unchanged here (types are used exactly as given) - confirm intent.
    type_map = {file_spec.md5: list(file_spec.file_types) for file_spec in filespec_list}
    file_type_records = [
        {MLVocab.asset_type.value: file_type, "File": file_record["RID"]}
        for file_record in file_records
        for file_type in type_map[file_record["MD5"]]
    ]
    pb.schemas[self._ml_schema].tables[atable].insert(file_type_records)

    if execution_rid:
        # Link every file to the execution as an output asset.
        pb.schemas[self._ml_schema].File_Execution.insert(
            [
                {"File": file_record["RID"], "Execution": execution_rid, "Asset_Role": "Output"}
                for file_record in file_records
            ]
        )

    # Group file RIDs by the parent directory of their URL path.
    dir_rid_map = defaultdict(list)
    for e in file_records:
        dir_rid_map[Path(urlsplit(e["URL"]).path).parent].append(e["RID"])

    nested_datasets = []
    path_length = 0
    dataset = None
    # Start with the longest path so subdirectories are handled before their parents.
    for p, rids in sorted(dir_rid_map.items(), key=lambda kv: len(kv[0].parts), reverse=True):
        dataset = self.create_dataset(
            dataset_types=dataset_types, execution_rid=execution_rid, description=description
        )
        members = rids
        if len(p.parts) < path_length:
            # Moving up a directory level: the datasets collected so far become nested members.
            members = nested_datasets + rids
            nested_datasets = []
        self.add_dataset_members(dataset_rid=dataset, members=members, execution_rid=execution_rid)
        nested_datasets.append(dataset)
        path_length = len(p.parts)

    return dataset
1240
+
1241
def list_files(self, file_types: list[str] | None = None) -> list[dict[str, Any]]:
    """List the files in the catalog together with their file types.

    The ``File`` table is left-joined with the File/Asset_Type association table, so a file
    appears once per associated type in the raw result. The rows are then folded into one
    record per file RID.

    Args:
        file_types: When given, the query is filtered to rows whose asset type is one of these.

    Returns:
        list[dict[str, Any]]: One record per file containing the fetched attributes
        (RID, URL, MD5, Length, Description) plus:
            - File_Types: List of the asset types collected for that file.

    Examples:
        List all files:
            >>> files = ml.list_files()
            >>> for f in files:
            ...     print(f"{f['RID']}: {f['URL']}")

        Filter by file type:
            >>> image_files = ml.list_files(["image", "png"])
    """
    assoc_table, file_fk, asset_type_fk = self.model.find_association("File", "Asset_Type")
    ml_path = self.pathBuilder.schemas[self._ml_schema]
    file_path = ml_path.File
    assoc_path = ml_path.tables[assoc_table.name]

    # Left join so that files without any asset type are still returned.
    query = file_path.path.link(
        assoc_path.alias("AT"), on=file_path.RID == assoc_path.columns[file_fk], join_type="left"
    )
    if file_types:
        query = query.filter(assoc_path.columns[asset_type_fk] == datapath.Any(*file_types))
    query = query.attributes(
        query.File.RID,
        query.File.URL,
        query.File.MD5,
        query.File.Length,
        query.File.Description,
        query.AT.columns[asset_type_fk],
    )

    # Fold the one-row-per-type result into one record per file.
    files_by_rid = {}
    for row in query.fetch():
        record = files_by_rid.setdefault(row["RID"], {**row, "File_Types": []})
        asset_type = row.get("Asset_Type")
        if asset_type:
            record["File_Types"].append(asset_type)

    # Drop the per-row Asset_Type key; File_Types carries the collected values.
    results = []
    for record in files_by_rid.values():
        record.pop("Asset_Type")
        results.append(record)
    return results
1295
+
1296
def list_workflows(self) -> list[Workflow]:
    """List every workflow recorded in the catalog.

    Fetches all rows of the ML schema's ``Workflow`` table and converts each into a
    ``Workflow`` object.

    Returns:
        list[Workflow]: One object per row, populated as follows:
            - name: ``Name`` column
            - url: ``URL`` column
            - workflow_type: ``Workflow_Type`` column
            - version: ``Version`` column
            - description: ``Description`` column
            - rid: ``RID`` column
            - checksum: ``Checksum`` column

    Examples:
        >>> workflows = ml.list_workflows()
        >>> for w in workflows:
        ...     print(f"{w.name} (v{w.version}): {w.description}")
        ...     print(f"  Source: {w.url}")
    """
    # Workflow keyword argument -> catalog column it is read from.
    field_to_column = {
        "name": "Name",
        "url": "URL",
        "workflow_type": "Workflow_Type",
        "version": "Version",
        "description": "Description",
        "rid": "RID",
        "checksum": "Checksum",
    }
    rows = self.pathBuilder.schemas[self.ml_schema].Workflow.entities().fetch()
    return [Workflow(**{field: row[column] for field, column in field_to_column.items()}) for row in rows]
1332
+
1333
def add_workflow(self, workflow: Workflow) -> RID:
    """Add a workflow to the catalog, reusing an existing record when one matches.

    The workflow is first looked up with ``lookup_workflow`` using its checksum, or its URL
    when no checksum is set. If a record is found its RID is returned and nothing is inserted.
    Otherwise a new row is inserted into the ML schema's ``Workflow`` table.

    Args:
        workflow: Workflow object providing name, URL, type, version, checksum and description.

    Returns:
        RID: Resource Identifier of the existing or newly inserted workflow.

    Raises:
        DerivaMLException: If building or inserting the record fails for any reason (including
            an unknown workflow type). The original exception is attached as ``__cause__``.

    Examples:
        >>> workflow = Workflow(
        ...     name="Gene Analysis",
        ...     url="https://github.com/org/repo/workflows/gene_analysis.py",
        ...     workflow_type="python_script",
        ...     version="1.0.0",
        ...     description="Analyzes gene expression patterns"
        ... )
        >>> workflow_rid = ml.add_workflow(workflow)
    """
    # Reuse an existing record: the lookup key is the checksum when set, otherwise the URL.
    if workflow_rid := self.lookup_workflow(workflow.checksum or workflow.url):
        return workflow_rid

    ml_schema_path = self.pathBuilder.schemas[self.ml_schema]

    try:
        workflow_record = {
            "URL": workflow.url,
            "Name": workflow.name,
            "Description": workflow.description,
            "Checksum": workflow.checksum,
            "Version": workflow.version,
            # Store the canonical term name, so a synonym given by the caller is normalized.
            MLVocab.workflow_type: self.lookup_term(MLVocab.workflow_type, workflow.workflow_type).name,
        }
        workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
    except Exception as e:
        error = format_exception(e)
        # Chain explicitly so the underlying failure is reported as the direct cause.
        raise DerivaMLException(f"Failed to insert workflow. Error: {error}") from e
    return workflow_rid
1383
+
1384
def lookup_workflow(self, url_or_checksum: str) -> RID | None:
    """Find a workflow by URL or checksum.

    The given value is compared against both the ``URL`` and the ``Checksum`` column of the
    ``Workflow`` table; the RID of the first matching row is returned.

    Args:
        url_or_checksum: URL or checksum of the workflow.

    Returns:
        RID: Resource Identifier of the first matching workflow, or None when nothing matches.

    Example:
        >>> rid = ml.lookup_workflow("https://github.com/org/repo/workflow.py")
        >>> if rid:
        ...     print(f"Found workflow: {rid}")
    """
    workflow_table = self.pathBuilder.schemas[self.ml_schema].Workflow
    try:
        matches_url = workflow_table.URL == url_or_checksum
        matches_checksum = workflow_table.Checksum == url_or_checksum
        matching_rows = list(workflow_table.path.filter(matches_url | matches_checksum).entities())
        # An empty result makes the index below raise IndexError, handled as "not found".
        return matching_rows[0]["RID"]
    except IndexError:
        return None
1410
+
1411
def create_workflow(self, name: str, workflow_type: str, description: str = "") -> Workflow:
    """Build a Workflow object after checking its type against the vocabulary.

    The workflow type is verified with ``lookup_term`` on the workflow-type vocabulary. The
    returned object is only constructed here; this method itself inserts nothing into the
    catalog (use ``add_workflow`` to register it).

    Args:
        name: Name of the workflow.
        workflow_type: Type of workflow; must be found in the workflow-type vocabulary.
        description: Description of what the workflow does.

    Returns:
        Workflow: New workflow object built from the given name, type and description.

    Raises:
        DerivaMLInvalidTerm: If ``workflow_type`` is not found by ``lookup_term``.

    Examples:
        >>> workflow = ml.create_workflow(
        ...     name="RNA Analysis",
        ...     workflow_type="python_notebook",
        ...     description="RNA sequence analysis pipeline"
        ... )
        >>> rid = ml.add_workflow(workflow)
    """
    # Called only for its validation side effect: it raises when the term is unknown.
    self.lookup_term(MLVocab.workflow_type, workflow_type)

    workflow = Workflow(name=name, workflow_type=workflow_type, description=description)
    return workflow
1442
+
1443
def create_execution(
    self, configuration: ExecutionConfiguration, workflow: Workflow | RID | None = None, dry_run: bool = False
) -> "Execution":
    """Create an Execution for the given configuration and remember it on this instance.

    The Execution object is constructed with this DerivaML instance as its catalog handle and
    stored in ``self._execution`` before being returned.

    NOTE(review): per the original documentation, constructing the Execution downloads the
    configured datasets into the cache directory (creating a new minor version when none is
    specified) and downloads any configured execution assets into the working directory. That
    happens inside ``Execution`` and is not visible in this method.

    Args:
        configuration: Execution configuration to initialize from.
        workflow: Workflow object (or RID) to execute if not present in the configuration.
        dry_run: Do not create an execution record or upload results.

    Returns:
        Execution: The newly created execution object.
    """
    # Imported locally to avoid a circular dependency between the modules.
    from deriva_ml.execution.execution import Execution

    new_execution = Execution(configuration, self, workflow=workflow, dry_run=dry_run)
    self._execution = new_execution
    return self._execution
1472
+
1473
def restore_execution(self, execution_rid: RID | None = None) -> "Execution":
    """Restore a previous execution.

    When no RID is given, the working directory must contain exactly one execution, whose RID
    is then used. The execution configuration is loaded from the ``configuration.json`` asset
    file of that execution if it exists locally; otherwise it is rebuilt from the ``Workflow``
    and ``Description`` fields of the execution record in the catalog.

    NOTE(review): per the original documentation, constructing the Execution downloads the
    configured datasets and execution assets; that happens inside ``Execution`` and is not
    visible in this method.

    Args:
        execution_rid: Resource Identifier (RID) of the execution to restore. If omitted, the
            single execution found in the working directory is used.

    Returns:
        Execution: An execution object created with ``reload`` set to the execution RID.

    Raises:
        DerivaMLException: If no RID is given and the working directory contains no execution
            or more than one execution.

    Example:
        >>> execution = ml.restore_execution("1-abc123")
    """
    # Imported locally to avoid a circular dependency between the modules.
    from deriva_ml.execution.execution import Execution

    # Without an explicit RID, fall back to the single execution in the working directory.
    if not execution_rid:
        e_rids = execution_rids(self.working_dir)
        if len(e_rids) == 0:
            raise DerivaMLException(f"No execution RIDs were found in {self.working_dir}.")
        if len(e_rids) > 1:
            raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
        execution_rid = e_rids[0]

    # Location where the execution's configuration file would be stored locally.
    cfile = asset_file_path(
        prefix=self.working_dir,
        exec_rid=execution_rid,
        file_name="configuration.json",
        asset_table=self.model.name_to_table("Execution_Metadata"),
        metadata={},
    )

    # Prefer the saved configuration; otherwise rebuild a minimal one from the catalog record.
    if cfile.exists():
        configuration = ExecutionConfiguration.load_configuration(cfile)
    else:
        execution = self.retrieve_rid(execution_rid)
        configuration = ExecutionConfiguration(
            workflow=execution["Workflow"],
            description=execution["Description"],
        )

    return Execution(configuration, self, reload=execution_rid)