deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.26.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
@@ -1,1046 +0,0 @@
1
- """
2
- `deriva_ml_base.py` is the core module for the Deriva ML project. This module implements the DerivaML class, which is
3
- the primary interface to the Deriva based catalogs. The module also implements the Feature and Vocabulary functions
4
- in the DerivaML.
5
-
6
- DerivaML and its associated classes all depend on a catalog that implements a `deriva-ml` schema with tables and
7
- relationships that follow a specific data model.
8
-
9
- """
10
-
11
- from __future__ import annotations
12
-
13
- import getpass
14
- import logging
15
- from datetime import datetime
16
- from itertools import chain
17
- from pathlib import Path
18
- import requests
19
-
20
- from typing import Optional, Any, Iterable, TYPE_CHECKING
21
-
22
- from deriva.core import (
23
- get_credential,
24
- urlquote,
25
- format_exception,
26
- DEFAULT_SESSION_CONFIG,
27
- )
28
- import deriva.core.datapath as datapath
29
- from deriva.core.datapath import DataPathException
30
- from deriva.core.deriva_server import DerivaServer
31
- from deriva.core.ermrest_catalog import ResolveRidResult
32
- from deriva.core.ermrest_model import Key, Table
33
- from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
34
- from pydantic import validate_call, ConfigDict
35
-
36
- from .execution_configuration import ExecutionConfiguration, Workflow
37
- from .feature import Feature, FeatureRecord
38
- from .dataset import Dataset
39
- from .dataset_aux_classes import DatasetSpec
40
- from .dataset_bag import DatasetBag
41
- from .deriva_model import DerivaModel
42
- from .upload import table_path, execution_rids, asset_file_path
43
- from .deriva_definitions import ColumnDefinition
44
- from .deriva_definitions import (
45
- RID,
46
- Status,
47
- DerivaMLException,
48
- ML_SCHEMA,
49
- VocabularyTerm,
50
- MLVocab,
51
- FileSpec,
52
- TableDefinition,
53
- )
54
- from .schema_setup.annotations import asset_annotation
55
-
56
- try:
57
- from icecream import ic
58
- except ImportError: # Graceful fallback if IceCream isn't installed.
59
- ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
60
-
61
-
62
- if TYPE_CHECKING:
63
- from .execution import Execution
64
-
65
-
66
- class DerivaML(Dataset):
67
- """Base class for ML operations on a Deriva catalog.
68
-
69
- This class is intended to be used as a base class on which more domain specific interfaces are built.
70
-
71
- Attributes:
72
- host_name: Hostname of the Deriva server.
73
- catalog_id: Catalog ID. Either and identifier, or a catalog name.
74
- domain_schema: Schema name for domain specific tables and relationships.
75
- model: ERMRest model for the catalog
76
- """
77
-
78
- def __init__(
79
- self,
80
- hostname: str,
81
- catalog_id: str | int,
82
- domain_schema: Optional[str] = None,
83
- project_name: Optional[str] = None,
84
- cache_dir: Optional[str] = None,
85
- working_dir: Optional[str] = None,
86
- ml_schema: str = ML_SCHEMA,
87
- logging_level=logging.INFO,
88
- credential=None,
89
- use_minid=True,
90
- ):
91
- """Create and initialize a DerivaML instance.
92
-
93
- This method will connect to a catalog, and initialize local configuration for the ML execution.
94
- This class is intended to be used as a base class on which domain-specific interfaces are built.
95
-
96
- Args:
97
- hostname: Hostname of the Deriva server.
98
- catalog_id: Catalog ID. Either an identifier or a catalog name.
99
- domain_schema: Schema name for domain-specific tables and relationships.
100
- project_name: Project name. Defaults to name of domain schema.
101
- cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag.
102
- working_dir: Directory path for storing data used by or generated by any computations.
103
- use_minid: Use the MINID serice when downloading dataset bags.
104
- """
105
- self.credential = credential or get_credential(hostname)
106
- server = DerivaServer(
107
- "https",
108
- hostname,
109
- credentials=self.credential,
110
- session_config=self._get_session_config(),
111
- )
112
- self.catalog = server.connect_ermrest(catalog_id)
113
- self.model = DerivaModel(
114
- self.catalog.getCatalogModel(), domain_schema=domain_schema
115
- )
116
-
117
- default_workdir = self.__class__.__name__ + "_working"
118
- self.working_dir = (
119
- Path(working_dir) / getpass.getuser()
120
- if working_dir
121
- else Path.home() / "deriva-ml"
122
- ) / default_workdir
123
-
124
- self.working_dir.mkdir(parents=True, exist_ok=True)
125
- self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
126
-
127
- self.cache_dir.mkdir(parents=True, exist_ok=True)
128
-
129
- # Initialize dataset class.
130
- super().__init__(
131
- self.model, self.cache_dir, self.working_dir, use_minid=use_minid
132
- )
133
- self._logger = logging.getLogger("deriva_ml")
134
- self._logger.setLevel(logging_level)
135
-
136
- self.host_name = hostname
137
- self.catalog_id = catalog_id
138
- self.ml_schema = ml_schema
139
- self.configuration = None
140
- self._execution: Optional[Execution] = None
141
- self.domain_schema = self.model.domain_schema
142
- self.project_name = project_name or self.domain_schema
143
- self.start_time = datetime.now()
144
- self.status = Status.pending.value
145
-
146
- logging.basicConfig(
147
- level=logging_level,
148
- format="%(asctime)s - %(name)s.%(levelname)s - %(message)s",
149
- )
150
-
151
- # Set logging level for Deriva library
152
- deriva_logger = logging.getLogger("deriva")
153
- deriva_logger.setLevel(logging_level)
154
-
155
- def __del__(self):
156
- try:
157
- if self._execution and self._execution.status != Status.completed:
158
- self._execution.update_status(Status.aborted, "Execution Aborted")
159
- except (AttributeError, requests.HTTPError):
160
- pass
161
-
162
- @staticmethod
163
- def _get_session_config():
164
- """ """
165
- session_config = DEFAULT_SESSION_CONFIG.copy()
166
- session_config.update(
167
- {
168
- # our PUT/POST to ermrest is idempotent
169
- "allow_retry_on_all_methods": True,
170
- # do more retries before aborting
171
- "retry_read": 8,
172
- "retry_connect": 5,
173
- # increase delay factor * 2**(n-1) for Nth retry
174
- "retry_backoff_factor": 5,
175
- }
176
- )
177
- return session_config
178
-
179
- # noinspection PyProtectedMember
180
- @property
181
- def pathBuilder(self) -> datapath._CatalogWrapper:
182
- """Get a new instance of a pathBuilder object."""
183
- return self.catalog.getPathBuilder()
184
-
185
- @property
186
- def domain_path(self):
187
- """Get a new instance of a pathBuilder object to the domain schema"""
188
-
189
- return self.pathBuilder.schemas[self.domain_schema]
190
-
191
- def table_path(self, table: str | Table) -> Path:
192
- """Return a local file path in which to place a CSV to add values to a table on upload.
193
-
194
- Args:
195
- table: str | Table:
196
-
197
- Returns:
198
- Path to a CSV file in which to add values to a table on upload.
199
- """
200
- return table_path(
201
- self.working_dir,
202
- schema=self.domain_schema,
203
- table=self.model.name_to_table(table).name,
204
- )
205
-
206
- def download_dir(self, cached: bool = False) -> Path:
207
- """Location where downloaded files are placed.
208
-
209
- Args:
210
- cached: bool: (Default value = False)
211
-
212
- Returns:
213
-
214
- """
215
- return self.cache_dir if cached else self.working_dir
216
-
217
- @staticmethod
218
- def globus_login(host: str) -> None:
219
- """Log into the specified host using Globus.
220
-
221
- Args:
222
- host:
223
-
224
- Returns:
225
-
226
- """
227
- gnl = GlobusNativeLogin(host=host)
228
- if gnl.is_logged_in([host]):
229
- print("You are already logged in.")
230
- else:
231
- gnl.login(
232
- [host],
233
- no_local_server=True,
234
- no_browser=True,
235
- refresh_tokens=True,
236
- update_bdbag_keychain=True,
237
- )
238
- print("Login Successful")
239
-
240
- def chaise_url(self, table: RID | Table) -> str:
241
- """Return a Chaise URL to the specified table.
242
-
243
- Args:
244
- table: Table or RID to be visited
245
- table: str | Table:
246
-
247
- Returns:
248
- URL to the table in Chaise format.
249
- """
250
- table_obj = self.model.name_to_table(table)
251
- try:
252
- uri = self.catalog.get_server_uri().replace(
253
- "ermrest/catalog/", "chaise/recordset/#"
254
- )
255
- except DerivaMLException:
256
- # Perhaps we have a RID....
257
- uri = self.cite(table)
258
- return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
259
-
260
- def cite(self, entity: dict | str) -> str:
261
- """Return a citation URL for the provided entity.
262
-
263
- Args:
264
- entity: A dict that contains the column values for a specific entity or a RID.
265
-
266
- Returns:
267
- The URI for the provided entity.
268
-
269
- Raises:
270
- DerivaMLException: if provided RID does not exist.
271
- """
272
- if isinstance(entity, str) and entity.startswith(
273
- f"https://{self.host_name}/id/{self.catalog_id}/"
274
- ):
275
- # Already got a citation...
276
- return entity
277
- try:
278
- self.resolve_rid(
279
- rid := entity if isinstance(entity, str) else entity["RID"]
280
- )
281
- return f"https://{self.host_name}/id/{self.catalog_id}/{rid}@{self.catalog.latest_snapshot().snaptime}"
282
- except KeyError as e:
283
- raise DerivaMLException(f"Entity {e} does not have RID column")
284
- except DerivaMLException as _e:
285
- raise DerivaMLException("Entity RID does not exist")
286
-
287
- def user_list(self) -> list[dict[str, str]]:
288
- """List of users in the catalog
289
-
290
- Args:
291
-
292
- Returns:
293
- A list of dictionaries containing user information.
294
-
295
- """
296
- user_path = self.pathBuilder.public.ERMrest_Client.path
297
- return [
298
- {"ID": u["ID"], "Full_Name": u["Full_Name"]}
299
- for u in user_path.entities().fetch()
300
- ]
301
-
302
- def resolve_rid(self, rid: RID) -> ResolveRidResult:
303
- """Return a named tuple with information about the specified RID.
304
-
305
- Args:
306
- rid: RID of the object of interest
307
-
308
- Returns:
309
- ResolveRidResult which has information about the specified RID.
310
-
311
- Raises:
312
- DerivaMLException: if the RID doesn't exist.
313
- """
314
- try:
315
- return self.catalog.resolve_rid(rid, self.model.model)
316
- except KeyError as _e:
317
- raise DerivaMLException(f"Invalid RID {rid}")
318
-
319
- def retrieve_rid(self, rid: RID) -> dict[str, Any]:
320
- """Return a dictionary that represents the values of the specified RID.
321
-
322
- Args:
323
- rid: RID of the object of interest
324
-
325
- Returns:
326
- A dictionary that represents the values of the specified RID.
327
-
328
- Raises:
329
- DerivaMLException: if the RID doesn't exist.
330
- """
331
- return self.resolve_rid(rid).datapath.entities().fetch()[0]
332
-
333
- def add_page(self, title: str, content: str) -> None:
334
- """
335
-
336
- Args:
337
- title: str:
338
- content: str:
339
-
340
- Returns:
341
-
342
- """
343
- self.pathBuilder.www.tables[self.domain_schema].insert(
344
- [{"Title": title, "Content": content}]
345
- )
346
-
347
- def create_vocabulary(
348
- self, vocab_name: str, comment: str = "", schema: Optional[str] = None
349
- ) -> Table:
350
- """Create a controlled vocabulary table with the given vocab name.
351
-
352
- Args:
353
- vocab_name: Name of the controlled vocabulary table.
354
- comment: Description of the vocabulary table. (Default value = '')
355
- schema: Schema in which to create the controlled vocabulary table. Defaults to domain_schema.
356
- vocab_name: str:
357
-
358
- Returns:
359
- An ERMRest table object for the newly created vocabulary table.
360
- """
361
- schema = schema or self.domain_schema
362
- return self.model.schemas[schema].create_table(
363
- Table.define_vocabulary(
364
- vocab_name, f"{self.project_name}:{{RID}}", comment=comment
365
- )
366
- )
367
-
368
- def create_table(self, table: TableDefinition) -> Table:
369
- """Create a table from a table definition."""
370
- return self.model.schemas[self.domain_schema].create_table(table.model_dump())
371
-
372
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
373
- def create_asset(
374
- self,
375
- asset_name: str,
376
- column_defs: Optional[Iterable[ColumnDefinition]] = None,
377
- fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
378
- referenced_tables: Optional[Iterable[Table]] = None,
379
- comment: str = "",
380
- schema: Optional[str] = None,
381
- ) -> Table:
382
- """Create an asset table with the given asset name.
383
-
384
- Args:
385
- asset_name: Name of the asset table.
386
- column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
387
- fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
388
- referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
389
- comment: Description of the asset table. (Default value = '')
390
- schema: Schema in which to create the asset table. Defaults to domain_schema.
391
- asset_name: str:
392
- schema: str: (Default value = None)
393
-
394
- Returns:
395
- Table object for the asset table.
396
- """
397
- column_defs = column_defs or []
398
- fkey_defs = fkey_defs or []
399
- referenced_tables = referenced_tables or []
400
- schema = schema or self.domain_schema
401
-
402
- self.add_term(
403
- MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
404
- )
405
- asset_table = self.model.schemas[schema].create_table(
406
- Table.define_asset(
407
- schema,
408
- asset_name,
409
- column_defs=[c.model_dump() for c in column_defs],
410
- fkey_defs=[fk.model_dump() for fk in fkey_defs],
411
- comment=comment,
412
- )
413
- )
414
-
415
- self.model.schemas[self.domain_schema].create_table(
416
- Table.define_association(
417
- [
418
- (asset_table.name, asset_table),
419
- ("Asset_Type", self.model.name_to_table("Asset_Type")),
420
- ]
421
- )
422
- )
423
- for t in referenced_tables:
424
- asset_table.create_reference(self.model.name_to_table(t))
425
- # Create a table to track execution that creates the asset
426
- atable = self.model.schemas[self.domain_schema].create_table(
427
- Table.define_association(
428
- [
429
- (asset_name, asset_table),
430
- (
431
- "Execution",
432
- self.model.schemas[self.ml_schema].tables["Execution"],
433
- ),
434
- ]
435
- )
436
- )
437
- atable.create_reference(self.model.name_to_table("Asset_Role"))
438
-
439
- asset_annotation(asset_table)
440
- return asset_table
441
-
442
- # @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
443
- def list_assets(self, asset_table: Table | str):
444
- """Return the contents of an asset table"""
445
-
446
- if not self.model.is_asset(asset_table):
447
- raise DerivaMLException(f"Table {asset_table.name} is not an asset")
448
- asset_table = self.model.name_to_table(asset_table)
449
- pb = self._model.catalog.getPathBuilder()
450
- asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
451
-
452
- asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
453
- type_path = pb.schemas[asset_type_table.schema.name].tables[
454
- asset_type_table.name
455
- ]
456
-
457
- # Get a list of all the asset_type values associated with this dataset_table.
458
- assets = []
459
- for asset in asset_path.entities().fetch():
460
- asset_types = (
461
- type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
462
- .attributes(type_path.Asset_Type)
463
- .fetch()
464
- )
465
- assets.append(
466
- asset
467
- | {
468
- MLVocab.asset_type.value: [
469
- asset_type[MLVocab.asset_type.value]
470
- for asset_type in asset_types
471
- ]
472
- }
473
- )
474
- return assets
475
-
476
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
477
- def create_feature(
478
- self,
479
- target_table: Table | str,
480
- feature_name: str,
481
- terms: Optional[list[Table | str]] = None,
482
- assets: Optional[list[Table | str]] = None,
483
- metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
484
- optional: Optional[list[str]] = None,
485
- comment: str = "",
486
- ) -> type[FeatureRecord]:
487
- """Create a new feature that can be associated with a table.
488
-
489
- The feature can associate a controlled vocabulary term, an asset, or any other values with a
490
- specific instance of an object and execution.
491
-
492
- Args:
493
- feature_name: Name of the new feature to be defined
494
- target_table: table name or object on which the feature is to be associated
495
- terms: List of controlled vocabulary terms that will be part of the feature value
496
- assets: List of asset table names or objects that will be part of the feature value
497
- metadata: List of other value types that are associated with the feature
498
- optional: List of columns that are optional in the feature
499
- comment: return: A Feature class that can be used to create instances of the feature.
500
-
501
- Returns:
502
- A Feature class that can be used to create instances of the feature.
503
-
504
- Raises:
505
- DerivaException: If the feature cannot be created.
506
- """
507
-
508
- terms = terms or []
509
- assets = assets or []
510
- metadata = metadata or []
511
- optional = optional or []
512
-
513
- def normalize_metadata(m: Key | Table | ColumnDefinition | str):
514
- """
515
-
516
- Args:
517
- m: Key | Table | ColumnDefinition | str:
518
-
519
- Returns:
520
-
521
- """
522
- if isinstance(m, str):
523
- return self.model.name_to_table(m)
524
- elif isinstance(m, ColumnDefinition):
525
- return m.model_dump()
526
- else:
527
- return m
528
-
529
- # Make sure that the provided assets or terms are actually assets or terms.
530
- if not all(map(self.model.is_asset, assets)):
531
- raise DerivaMLException("Invalid create_feature asset table.")
532
- if not all(map(self.model.is_vocabulary, terms)):
533
- raise DerivaMLException("Invalid create_feature asset table.")
534
-
535
- # Get references to the necessary tables and make sure that the
536
- # provided feature name exists.
537
- target_table = self.model.name_to_table(target_table)
538
- execution = self.model.schemas[self.ml_schema].tables["Execution"]
539
- feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]
540
- feature_name_term = self.add_term(
541
- "Feature_Name", feature_name, description=comment
542
- )
543
- atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
544
-
545
- # Now create the association table that implements the feature.
546
- atable = self.model.schemas[self.domain_schema].create_table(
547
- target_table.define_association(
548
- table_name=atable_name,
549
- associates=[execution, target_table, feature_name_table],
550
- metadata=[
551
- normalize_metadata(m) for m in chain(assets, terms, metadata)
552
- ],
553
- comment=comment,
554
- )
555
- )
556
- # Now set optional terms.
557
- for c in optional:
558
- atable.columns[c].alter(nullok=True)
559
- atable.columns["Feature_Name"].alter(default=feature_name_term.name)
560
- return self.feature_record_class(target_table, feature_name)
561
-
562
- def feature_record_class(
563
- self, table: str | Table, feature_name: str
564
- ) -> type[FeatureRecord]:
565
- """Create a pydantic model for entries into the specified feature table.
566
-
567
- For information on how to
568
- See the pydantic documentation for more details about the pydantic model.
569
-
570
- Args:
571
- table: table name or object on which the feature is to be associated
572
- feature_name: name of the feature to be created
573
- table: str | Table:
574
- feature_name: str:
575
-
576
- Returns:
577
- A Feature class that can be used to create instances of the feature.
578
- """
579
- return self.lookup_feature(table, feature_name).feature_record_class()
580
-
581
- def delete_feature(self, table: Table | str, feature_name: str) -> bool:
582
- """
583
-
584
- Args:
585
- table: Table | str:
586
- feature_name: str:
587
-
588
- Returns:
589
- """
590
- table = self.model.name_to_table(table)
591
- try:
592
- feature = next(
593
- f for f in self.find_features(table) if f.feature_name == feature_name
594
- )
595
- feature.feature_table.drop()
596
- return True
597
- except StopIteration:
598
- return False
599
-
600
- def lookup_feature(self, table: str | Table, feature_name: str) -> Feature:
601
- """Lookup the named feature associated with the provided table.
602
-
603
- Args:
604
- table: param feature_name:
605
- table: str | Table:
606
- feature_name: str:
607
-
608
- Returns:
609
- A Feature class that represents the requested feature.
610
-
611
- Raises:
612
- DerivaMLException: If the feature cannot be found.
613
- """
614
- return self.model.lookup_feature(table, feature_name)
615
-
616
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
617
- def find_features(self, table: Table | str) -> Iterable[Feature]:
618
- """List the names of the features in the specified table.
619
-
620
- Args:
621
- table: The table to find features for.
622
- table: Table | str:
623
-
624
- Returns:
625
- An iterable of FeatureResult instances that describe the current features in the table.
626
- """
627
- return self.model.find_features(table)
628
-
629
- # noinspection PyProtectedMember
630
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
631
- def list_feature_values(
632
- self, table: Table | str, feature_name: str
633
- ) -> datapath._ResultSet:
634
- """Return a datapath ResultSet containing all values of a feature associated with a table.
635
-
636
- Args:
637
- table: param feature_name:
638
- table: Table | str:
639
- feature_name: str:
640
-
641
- Returns:
642
-
643
- """
644
- table = self.model.name_to_table(table)
645
- feature = self.lookup_feature(table, feature_name)
646
- pb = self.catalog.getPathBuilder()
647
- return (
648
- pb.schemas[feature.feature_table.schema.name]
649
- .tables[feature.feature_table.name]
650
- .entities()
651
- .fetch()
652
- )
653
-
654
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
655
- def add_term(
656
- self,
657
- table: str | Table,
658
- term_name: str,
659
- description: str,
660
- synonyms: Optional[list[str]] = None,
661
- exists_ok: bool = True,
662
- ) -> VocabularyTerm:
663
- """Creates a new control vocabulary term in the control vocabulary table.
664
-
665
- Args:
666
-
667
- Args:
668
- table: The name of the control vocabulary table.
669
- term_name: The name of the new control vocabulary.
670
- description: The description of the new control vocabulary.
671
- synonyms: Optional list of synonyms for the new control vocabulary. Defaults to an empty list.
672
- exists_ok: Optional flag indicating whether to allow creation if the control vocabulary name
673
- already exists. Defaults to True.
674
-
675
- Returns:
676
- The RID of the newly created control vocabulary.
677
-
678
- Raises:
679
- DerivaException: If the control vocabulary name already exists and exist_ok is False.
680
- """
681
- synonyms = synonyms or []
682
- table = self.model.name_to_table(table)
683
- pb = self.catalog.getPathBuilder()
684
- if not (self.model.is_vocabulary(table)):
685
- raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
686
-
687
- schema_name = table.schema.name
688
- table_name = table.name
689
- try:
690
- term_id = VocabularyTerm.model_validate(
691
- pb.schemas[schema_name]
692
- .tables[table_name]
693
- .insert(
694
- [
695
- {
696
- "Name": term_name,
697
- "Description": description,
698
- "Synonyms": synonyms,
699
- }
700
- ],
701
- defaults={"ID", "URI"},
702
- )[0]
703
- )
704
- except DataPathException:
705
- term_id = self.lookup_term(table, term_name)
706
- if not exists_ok:
707
- raise DerivaMLException(f"{term_name} already exists")
708
- # Check vocabulary
709
- return term_id
710
-
711
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
712
- def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
713
- """Given a term name, return the vocabulary record. Can provide either the term name
714
- or a synonym for the term. Generate an exception if the term is not in the vocabulary.
715
-
716
- Args:
717
- table: The name of the controlled vocabulary table or a ERMRest table object.
718
- term_name: The name of the term to look up.
719
-
720
- Returns:
721
- The entry the associated term or synonym.
722
-
723
- Raises:
724
- DerivaException: If the schema or vocabulary table doesn't exist, or if the term is not
725
- found in the vocabulary.
726
- """
727
- vocab_table = self.model.name_to_table(table)
728
- if not self.model.is_vocabulary(vocab_table):
729
- raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
730
- schema_name, table_name = vocab_table.schema.name, vocab_table.name
731
- schema_path = self.catalog.getPathBuilder().schemas[schema_name]
732
-
733
- for term in schema_path.tables[table_name].entities().fetch():
734
- if term_name == term["Name"] or (
735
- term["Synonyms"] and term_name in term["Synonyms"]
736
- ):
737
- return VocabularyTerm.model_validate(term)
738
- raise DerivaMLException(f"Term {term_name} is not in vocabulary {table_name}")
739
-
740
- def list_vocabulary_terms(self, table: str | Table) -> list[VocabularyTerm]:
741
- """Return a list of terms that are in a vocabulary table.
742
-
743
- Args:
744
- table: The name of the controlled vocabulary table or a ERMRest table object.
745
- table: str | Table:
746
-
747
- Returns:
748
- The list of terms that are in a vocabulary table.
749
-
750
- Raises:
751
- DerivaMLException: If the schema or vocabulary table doesn't exist, or if the table is not
752
- a controlled vocabulary.
753
- """
754
- pb = self.catalog.getPathBuilder()
755
- table = self.model.name_to_table(table)
756
- if not (self.model.is_vocabulary(table)):
757
- raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
758
-
759
- return [
760
- VocabularyTerm(**v)
761
- for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
762
- ]
763
-
764
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
765
- def download_dataset_bag(
766
- self,
767
- dataset: DatasetSpec,
768
- execution_rid: Optional[RID] = None,
769
- ) -> DatasetBag:
770
- """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
771
-
772
- Args:
773
- dataset: Specification of the dataset to be downloaded.
774
- execution_rid: Execution RID for the dataset.
775
-
776
- Returns:
777
- Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
778
- for the dataset.
779
- """
780
- return self._download_dataset_bag(
781
- dataset=dataset,
782
- execution_rid=execution_rid,
783
- snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
784
- )
785
-
786
- def _update_status(
787
- self, new_status: Status, status_detail: str, execution_rid: RID
788
- ):
789
- """Update the status of an execution in the catalog.
790
-
791
- Args:
792
- new_status: New status.
793
- status_detail: Details of the status.
794
- execution_rid: Resource Identifier (RID) of the execution.
795
- new_status: Status:
796
- status_detail: str:
797
- execution_rid: RID:
798
-
799
- Returns:
800
-
801
- """
802
- self.status = new_status.value
803
- self.pathBuilder.schemas[self.ml_schema].Execution.update(
804
- [
805
- {
806
- "RID": execution_rid,
807
- "Status": self.status,
808
- "Status_Detail": status_detail,
809
- }
810
- ]
811
- )
812
-
813
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def add_files(
    self,
    files: Iterable[FileSpec],
    file_types: str | list[str],
    execution_rid: Optional[RID] = None,
) -> Iterable[RID]:
    """Add new files to the File table in the catalog.

    The input is an iterable of FileSpec objects which provide the MD5
    checksum, length, and URL of each file.

    Args:
        files: A sequence of file specifications that describe the files to add.
        file_types: One or more file types. Each must be the name (or a
            synonym) of a term from the File_Type controlled vocabulary.
        execution_rid: Optional Resource Identifier (RID) of the execution to
            associate with the files.

    Returns:
        Iterable of the RIDs of the files that were added.

    Raises:
        DerivaMLException: If execution_rid does not refer to an Execution
            record, or if a file type is not a known vocabulary term.
    """
    defined_types = self.list_vocabulary_terms(MLVocab.file_type)
    if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
        raise DerivaMLException(
            f"RID {execution_rid} is not for an execution table."
        )

    def check_file_type(dtype: str) -> bool:
        """Return True if dtype is the name or a synonym of a defined file type."""
        for term in defined_types:
            # BUG FIX: the synonym test previously compared against the
            # enclosing loop variable `file_type` instead of the parameter
            # `dtype`; it only worked because the caller passed the same value.
            if dtype == term.name or (term.synonyms and dtype in term.synonyms):
                return True
        return False

    file_types = [file_types] if isinstance(file_types, str) else file_types
    pb = self._model.catalog.getPathBuilder()
    for file_type in file_types:
        if not check_file_type(file_type):
            raise DerivaMLException("File type must be a vocabulary term.")

    file_table_path = pb.schemas[self.ml_schema].tables["File"]
    file_rids = [
        e["RID"] for e in file_table_path.insert([f.model_dump() for f in files])
    ]

    # Get the name of the association table between File and File_Type and
    # record every (file, type) pairing.
    atable = next(
        self._model.schemas[self._ml_schema]
        .tables[MLVocab.file_type]
        .find_associations()
    ).name
    pb.schemas[self._ml_schema].tables[atable].insert(
        [
            {"File_Type": file_type, "File": file_rid}
            for file_rid in file_rids
            for file_type in file_types
        ]
    )

    if execution_rid:
        # Link each new file to the provided execution.
        pb.schemas[self._ml_schema].File_Execution.insert(
            [
                {"File": file_rid, "Execution": execution_rid}
                for file_rid in file_rids
            ]
        )
    return file_rids
878
-
879
def list_files(
    self, file_types: Optional[list[str]] = None
) -> list[dict[str, Any]]:
    """Return the contents of the File table with file types denormalized into each record.

    Args:
        file_types: Accepted for interface compatibility.
            NOTE(review): this parameter is never applied as a filter in the
            current implementation — confirm intended behavior.

    Returns:
        One dict per file containing RID, URL, MD5, Length, Description and a
        "File_Types" list aggregating all associated file-type terms.
    """
    ml_path = self.pathBuilder.schemas[self._ml_schema]
    file_path = ml_path.File
    type_path = ml_path.File_File_Type

    # Left outer join so files with no associated type still appear once.
    path = file_path.link(
        type_path, on=file_path.RID == type_path.File, join_type="left"
    )
    path = path.File.attributes(
        path.File.RID,
        path.File.URL,
        path.File.MD5,
        path.File.Length,
        path.File.Description,
        path.File_File_Type.File_Type,
    )
    file_map: dict[str, dict[str, Any]] = {}
    for f in path.fetch():
        # One row per (file, type) pair; collapse to one entry per file RID.
        entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
        if ft := f.get("File_Type"):
            entry["File_Types"].append(ft)

    # Drop the singular File_Type key left over from the join. Use a default
    # so a missing key cannot raise KeyError (robustness fix over the
    # original `(f, f.pop("File_Type"))[0]` idiom, which also obscured intent).
    for entry in file_map.values():
        entry.pop("File_Type", None)
    return list(file_map.values())
906
-
907
def list_workflows(self) -> list[Workflow]:
    """Return a list of all the workflows in the catalog."""
    workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
    workflows: list[Workflow] = []
    # Materialize one Workflow object per row of the Workflow table.
    for row in workflow_path.entities().fetch():
        workflows.append(
            Workflow(
                name=row["Name"],
                url=row["URL"],
                workflow_type=row["Workflow_Type"],
                version=row["Version"],
                description=row["Description"],
                rid=row["RID"],
                checksum=row["Checksum"],
            )
        )
    return workflows
922
-
923
def add_workflow(self, workflow: Workflow) -> RID:
    """Add a workflow to the Workflow table.

    If a workflow with the same URL already exists, its RID is returned
    and no new record is created.

    Args:
        workflow: An instance of a Workflow object.

    Returns:
        Resource Identifier (RID) of the added (or pre-existing) workflow.

    Raises:
        DerivaMLException: If the insert into the Workflow table fails.
    """
    # Check to make sure that the workflow is not already in the table.
    if workflow_rid := self.lookup_workflow(workflow.url):
        return workflow_rid

    ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
    try:
        # Record doesn't exist already; insert a new one.
        workflow_record = {
            "URL": workflow.url,
            "Name": workflow.name,
            "Description": workflow.description,
            "Checksum": workflow.checksum,
            "Version": workflow.version,
            MLVocab.workflow_type: self.lookup_term(
                MLVocab.workflow_type, workflow.workflow_type
            ).name,
        }
        workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
    except Exception as e:
        error = format_exception(e)
        # Chain the original exception so the underlying cause is preserved
        # in tracebacks (was previously dropped).
        raise DerivaMLException(f"Failed to insert workflow. Error: {error}") from e
    return workflow_rid
957
-
958
def lookup_workflow(self, url: str) -> Optional[RID]:
    """Given a URL, look in the workflow table to find a matching workflow."""
    workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
    # Filter on the URL column; an empty result means no matching workflow.
    matches = list(workflow_path.filter(workflow_path.URL == url).entities())
    if not matches:
        return None
    return matches[0]["RID"]
966
-
967
def create_workflow(
    self, name: str, workflow_type: str, description: str = ""
) -> Workflow:
    """Identify the currently executing program and return a Workflow for it.

    Determine the notebook or script that is currently being executed,
    assuming it runs from a cloned GitHub repository, and build a Workflow
    object describing it.

    Args:
        name: The name of the workflow.
        workflow_type: The type of the workflow.
        description: The description of the workflow.

    Returns:
        A workflow object.
    """
    # Validate the workflow type against the controlled vocabulary;
    # lookup_term raises if the term is unknown.
    self.lookup_term(MLVocab.workflow_type, workflow_type)

    return Workflow.create_workflow(name, workflow_type, description)
989
-
990
- # @validate_call
991
def create_execution(
    self, configuration: ExecutionConfiguration, dry_run: bool = False
) -> "Execution":
    """Create an execution object.

    Given an execution configuration, initialize the local compute
    environment to prepare for executing an ML or analytic routine.
    This routine has side effects:

    1. The datasets named in the configuration are downloaded into the
       cache directory. If a dataset's version is unspecified, a new minor
       version is created for it and that version is downloaded.
    2. Any execution assets listed in the configuration are downloaded
       into the working directory.

    Args:
        configuration: The ExecutionConfiguration describing the run.
        dry_run: Do not create an execution record or upload results.

    Returns:
        An execution object.
    """
    # Imported lazily to avoid a circular import with the execution module.
    from .execution import Execution

    self._execution = Execution(configuration, self, dry_run=dry_run)
    return self._execution
1016
-
1017
- # @validate_call
1018
def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
    """Return an Execution object for a previously started execution.

    Args:
        execution_rid: RID of the execution to restore. If omitted, the
            working directory must contain exactly one execution.

    Returns:
        An Execution object reloaded from the saved configuration if one
        exists, otherwise reconstructed from the catalog record.

    Raises:
        DerivaMLException: If execution_rid is omitted and the working
            directory does not contain exactly one execution RID.
    """
    # Imported lazily to avoid a circular import with the execution module.
    from .execution import Execution

    # Find path to execution.
    if not execution_rid:
        e_rids = execution_rids(self.working_dir)
        # BUG FIX: the original raised "Multiple execution RIDs were found []"
        # even when zero RIDs were present; report the two cases distinctly.
        if not e_rids:
            raise DerivaMLException("No execution RIDs were found.")
        if len(e_rids) > 1:
            raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")

        execution_rid = e_rids[0]
    cfile = asset_file_path(
        prefix=self.working_dir,
        exec_rid=execution_rid,
        file_name="configuration.json",
        asset_table=self.model.name_to_table("Execution_Metadata"),
        metadata={},
    )

    if cfile.exists():
        configuration = ExecutionConfiguration.load_configuration(cfile)
    else:
        # No saved configuration locally; rebuild it from the catalog record.
        execution = self.retrieve_rid(execution_rid)
        configuration = ExecutionConfiguration(
            workflow=execution["Workflow"],
            description=execution["Description"],
        )
    return Execution(configuration, self, reload=execution_rid)