deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
deriva_ml/deriva_ml_base.py
DELETED
|
@@ -1,1046 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
`deriva_ml_base.py` is the core module for the Deriva ML project. This module implements the DerivaML class, which is
|
|
3
|
-
the primary interface to the Deriva based catalogs. The module also implements the Feature and Vocabulary functions
|
|
4
|
-
in the DerivaML.
|
|
5
|
-
|
|
6
|
-
DerivaML and its associated classes all depend on a catalog that implements a `deriva-ml` schema with tables and
|
|
7
|
-
relationships that follow a specific data model.
|
|
8
|
-
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
from __future__ import annotations
|
|
12
|
-
|
|
13
|
-
import getpass
|
|
14
|
-
import logging
|
|
15
|
-
from datetime import datetime
|
|
16
|
-
from itertools import chain
|
|
17
|
-
from pathlib import Path
|
|
18
|
-
import requests
|
|
19
|
-
|
|
20
|
-
from typing import Optional, Any, Iterable, TYPE_CHECKING
|
|
21
|
-
|
|
22
|
-
from deriva.core import (
|
|
23
|
-
get_credential,
|
|
24
|
-
urlquote,
|
|
25
|
-
format_exception,
|
|
26
|
-
DEFAULT_SESSION_CONFIG,
|
|
27
|
-
)
|
|
28
|
-
import deriva.core.datapath as datapath
|
|
29
|
-
from deriva.core.datapath import DataPathException
|
|
30
|
-
from deriva.core.deriva_server import DerivaServer
|
|
31
|
-
from deriva.core.ermrest_catalog import ResolveRidResult
|
|
32
|
-
from deriva.core.ermrest_model import Key, Table
|
|
33
|
-
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
|
|
34
|
-
from pydantic import validate_call, ConfigDict
|
|
35
|
-
|
|
36
|
-
from .execution_configuration import ExecutionConfiguration, Workflow
|
|
37
|
-
from .feature import Feature, FeatureRecord
|
|
38
|
-
from .dataset import Dataset
|
|
39
|
-
from .dataset_aux_classes import DatasetSpec
|
|
40
|
-
from .dataset_bag import DatasetBag
|
|
41
|
-
from .deriva_model import DerivaModel
|
|
42
|
-
from .upload import table_path, execution_rids, asset_file_path
|
|
43
|
-
from .deriva_definitions import ColumnDefinition
|
|
44
|
-
from .deriva_definitions import (
|
|
45
|
-
RID,
|
|
46
|
-
Status,
|
|
47
|
-
DerivaMLException,
|
|
48
|
-
ML_SCHEMA,
|
|
49
|
-
VocabularyTerm,
|
|
50
|
-
MLVocab,
|
|
51
|
-
FileSpec,
|
|
52
|
-
TableDefinition,
|
|
53
|
-
)
|
|
54
|
-
from .schema_setup.annotations import asset_annotation
|
|
55
|
-
|
|
56
|
-
try:
|
|
57
|
-
from icecream import ic
|
|
58
|
-
except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
59
|
-
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
if TYPE_CHECKING:
|
|
63
|
-
from .execution import Execution
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
class DerivaML(Dataset):
|
|
67
|
-
"""Base class for ML operations on a Deriva catalog.
|
|
68
|
-
|
|
69
|
-
This class is intended to be used as a base class on which more domain specific interfaces are built.
|
|
70
|
-
|
|
71
|
-
Attributes:
|
|
72
|
-
host_name: Hostname of the Deriva server.
|
|
73
|
-
catalog_id: Catalog ID. Either and identifier, or a catalog name.
|
|
74
|
-
domain_schema: Schema name for domain specific tables and relationships.
|
|
75
|
-
model: ERMRest model for the catalog
|
|
76
|
-
"""
|
|
77
|
-
|
|
78
|
-
def __init__(
|
|
79
|
-
self,
|
|
80
|
-
hostname: str,
|
|
81
|
-
catalog_id: str | int,
|
|
82
|
-
domain_schema: Optional[str] = None,
|
|
83
|
-
project_name: Optional[str] = None,
|
|
84
|
-
cache_dir: Optional[str] = None,
|
|
85
|
-
working_dir: Optional[str] = None,
|
|
86
|
-
ml_schema: str = ML_SCHEMA,
|
|
87
|
-
logging_level=logging.INFO,
|
|
88
|
-
credential=None,
|
|
89
|
-
use_minid=True,
|
|
90
|
-
):
|
|
91
|
-
"""Create and initialize a DerivaML instance.
|
|
92
|
-
|
|
93
|
-
This method will connect to a catalog, and initialize local configuration for the ML execution.
|
|
94
|
-
This class is intended to be used as a base class on which domain-specific interfaces are built.
|
|
95
|
-
|
|
96
|
-
Args:
|
|
97
|
-
hostname: Hostname of the Deriva server.
|
|
98
|
-
catalog_id: Catalog ID. Either an identifier or a catalog name.
|
|
99
|
-
domain_schema: Schema name for domain-specific tables and relationships.
|
|
100
|
-
project_name: Project name. Defaults to name of domain schema.
|
|
101
|
-
cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag.
|
|
102
|
-
working_dir: Directory path for storing data used by or generated by any computations.
|
|
103
|
-
use_minid: Use the MINID serice when downloading dataset bags.
|
|
104
|
-
"""
|
|
105
|
-
self.credential = credential or get_credential(hostname)
|
|
106
|
-
server = DerivaServer(
|
|
107
|
-
"https",
|
|
108
|
-
hostname,
|
|
109
|
-
credentials=self.credential,
|
|
110
|
-
session_config=self._get_session_config(),
|
|
111
|
-
)
|
|
112
|
-
self.catalog = server.connect_ermrest(catalog_id)
|
|
113
|
-
self.model = DerivaModel(
|
|
114
|
-
self.catalog.getCatalogModel(), domain_schema=domain_schema
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
default_workdir = self.__class__.__name__ + "_working"
|
|
118
|
-
self.working_dir = (
|
|
119
|
-
Path(working_dir) / getpass.getuser()
|
|
120
|
-
if working_dir
|
|
121
|
-
else Path.home() / "deriva-ml"
|
|
122
|
-
) / default_workdir
|
|
123
|
-
|
|
124
|
-
self.working_dir.mkdir(parents=True, exist_ok=True)
|
|
125
|
-
self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
|
|
126
|
-
|
|
127
|
-
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
128
|
-
|
|
129
|
-
# Initialize dataset class.
|
|
130
|
-
super().__init__(
|
|
131
|
-
self.model, self.cache_dir, self.working_dir, use_minid=use_minid
|
|
132
|
-
)
|
|
133
|
-
self._logger = logging.getLogger("deriva_ml")
|
|
134
|
-
self._logger.setLevel(logging_level)
|
|
135
|
-
|
|
136
|
-
self.host_name = hostname
|
|
137
|
-
self.catalog_id = catalog_id
|
|
138
|
-
self.ml_schema = ml_schema
|
|
139
|
-
self.configuration = None
|
|
140
|
-
self._execution: Optional[Execution] = None
|
|
141
|
-
self.domain_schema = self.model.domain_schema
|
|
142
|
-
self.project_name = project_name or self.domain_schema
|
|
143
|
-
self.start_time = datetime.now()
|
|
144
|
-
self.status = Status.pending.value
|
|
145
|
-
|
|
146
|
-
logging.basicConfig(
|
|
147
|
-
level=logging_level,
|
|
148
|
-
format="%(asctime)s - %(name)s.%(levelname)s - %(message)s",
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
# Set logging level for Deriva library
|
|
152
|
-
deriva_logger = logging.getLogger("deriva")
|
|
153
|
-
deriva_logger.setLevel(logging_level)
|
|
154
|
-
|
|
155
|
-
def __del__(self):
|
|
156
|
-
try:
|
|
157
|
-
if self._execution and self._execution.status != Status.completed:
|
|
158
|
-
self._execution.update_status(Status.aborted, "Execution Aborted")
|
|
159
|
-
except (AttributeError, requests.HTTPError):
|
|
160
|
-
pass
|
|
161
|
-
|
|
162
|
-
@staticmethod
|
|
163
|
-
def _get_session_config():
|
|
164
|
-
""" """
|
|
165
|
-
session_config = DEFAULT_SESSION_CONFIG.copy()
|
|
166
|
-
session_config.update(
|
|
167
|
-
{
|
|
168
|
-
# our PUT/POST to ermrest is idempotent
|
|
169
|
-
"allow_retry_on_all_methods": True,
|
|
170
|
-
# do more retries before aborting
|
|
171
|
-
"retry_read": 8,
|
|
172
|
-
"retry_connect": 5,
|
|
173
|
-
# increase delay factor * 2**(n-1) for Nth retry
|
|
174
|
-
"retry_backoff_factor": 5,
|
|
175
|
-
}
|
|
176
|
-
)
|
|
177
|
-
return session_config
|
|
178
|
-
|
|
179
|
-
# noinspection PyProtectedMember
|
|
180
|
-
@property
|
|
181
|
-
def pathBuilder(self) -> datapath._CatalogWrapper:
|
|
182
|
-
"""Get a new instance of a pathBuilder object."""
|
|
183
|
-
return self.catalog.getPathBuilder()
|
|
184
|
-
|
|
185
|
-
@property
|
|
186
|
-
def domain_path(self):
|
|
187
|
-
"""Get a new instance of a pathBuilder object to the domain schema"""
|
|
188
|
-
|
|
189
|
-
return self.pathBuilder.schemas[self.domain_schema]
|
|
190
|
-
|
|
191
|
-
def table_path(self, table: str | Table) -> Path:
|
|
192
|
-
"""Return a local file path in which to place a CSV to add values to a table on upload.
|
|
193
|
-
|
|
194
|
-
Args:
|
|
195
|
-
table: str | Table:
|
|
196
|
-
|
|
197
|
-
Returns:
|
|
198
|
-
Path to a CSV file in which to add values to a table on upload.
|
|
199
|
-
"""
|
|
200
|
-
return table_path(
|
|
201
|
-
self.working_dir,
|
|
202
|
-
schema=self.domain_schema,
|
|
203
|
-
table=self.model.name_to_table(table).name,
|
|
204
|
-
)
|
|
205
|
-
|
|
206
|
-
def download_dir(self, cached: bool = False) -> Path:
|
|
207
|
-
"""Location where downloaded files are placed.
|
|
208
|
-
|
|
209
|
-
Args:
|
|
210
|
-
cached: bool: (Default value = False)
|
|
211
|
-
|
|
212
|
-
Returns:
|
|
213
|
-
|
|
214
|
-
"""
|
|
215
|
-
return self.cache_dir if cached else self.working_dir
|
|
216
|
-
|
|
217
|
-
@staticmethod
|
|
218
|
-
def globus_login(host: str) -> None:
|
|
219
|
-
"""Log into the specified host using Globus.
|
|
220
|
-
|
|
221
|
-
Args:
|
|
222
|
-
host:
|
|
223
|
-
|
|
224
|
-
Returns:
|
|
225
|
-
|
|
226
|
-
"""
|
|
227
|
-
gnl = GlobusNativeLogin(host=host)
|
|
228
|
-
if gnl.is_logged_in([host]):
|
|
229
|
-
print("You are already logged in.")
|
|
230
|
-
else:
|
|
231
|
-
gnl.login(
|
|
232
|
-
[host],
|
|
233
|
-
no_local_server=True,
|
|
234
|
-
no_browser=True,
|
|
235
|
-
refresh_tokens=True,
|
|
236
|
-
update_bdbag_keychain=True,
|
|
237
|
-
)
|
|
238
|
-
print("Login Successful")
|
|
239
|
-
|
|
240
|
-
def chaise_url(self, table: RID | Table) -> str:
|
|
241
|
-
"""Return a Chaise URL to the specified table.
|
|
242
|
-
|
|
243
|
-
Args:
|
|
244
|
-
table: Table or RID to be visited
|
|
245
|
-
table: str | Table:
|
|
246
|
-
|
|
247
|
-
Returns:
|
|
248
|
-
URL to the table in Chaise format.
|
|
249
|
-
"""
|
|
250
|
-
table_obj = self.model.name_to_table(table)
|
|
251
|
-
try:
|
|
252
|
-
uri = self.catalog.get_server_uri().replace(
|
|
253
|
-
"ermrest/catalog/", "chaise/recordset/#"
|
|
254
|
-
)
|
|
255
|
-
except DerivaMLException:
|
|
256
|
-
# Perhaps we have a RID....
|
|
257
|
-
uri = self.cite(table)
|
|
258
|
-
return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
|
|
259
|
-
|
|
260
|
-
def cite(self, entity: dict | str) -> str:
|
|
261
|
-
"""Return a citation URL for the provided entity.
|
|
262
|
-
|
|
263
|
-
Args:
|
|
264
|
-
entity: A dict that contains the column values for a specific entity or a RID.
|
|
265
|
-
|
|
266
|
-
Returns:
|
|
267
|
-
The URI for the provided entity.
|
|
268
|
-
|
|
269
|
-
Raises:
|
|
270
|
-
DerivaMLException: if provided RID does not exist.
|
|
271
|
-
"""
|
|
272
|
-
if isinstance(entity, str) and entity.startswith(
|
|
273
|
-
f"https://{self.host_name}/id/{self.catalog_id}/"
|
|
274
|
-
):
|
|
275
|
-
# Already got a citation...
|
|
276
|
-
return entity
|
|
277
|
-
try:
|
|
278
|
-
self.resolve_rid(
|
|
279
|
-
rid := entity if isinstance(entity, str) else entity["RID"]
|
|
280
|
-
)
|
|
281
|
-
return f"https://{self.host_name}/id/{self.catalog_id}/{rid}@{self.catalog.latest_snapshot().snaptime}"
|
|
282
|
-
except KeyError as e:
|
|
283
|
-
raise DerivaMLException(f"Entity {e} does not have RID column")
|
|
284
|
-
except DerivaMLException as _e:
|
|
285
|
-
raise DerivaMLException("Entity RID does not exist")
|
|
286
|
-
|
|
287
|
-
def user_list(self) -> list[dict[str, str]]:
|
|
288
|
-
"""List of users in the catalog
|
|
289
|
-
|
|
290
|
-
Args:
|
|
291
|
-
|
|
292
|
-
Returns:
|
|
293
|
-
A list of dictionaries containing user information.
|
|
294
|
-
|
|
295
|
-
"""
|
|
296
|
-
user_path = self.pathBuilder.public.ERMrest_Client.path
|
|
297
|
-
return [
|
|
298
|
-
{"ID": u["ID"], "Full_Name": u["Full_Name"]}
|
|
299
|
-
for u in user_path.entities().fetch()
|
|
300
|
-
]
|
|
301
|
-
|
|
302
|
-
def resolve_rid(self, rid: RID) -> ResolveRidResult:
|
|
303
|
-
"""Return a named tuple with information about the specified RID.
|
|
304
|
-
|
|
305
|
-
Args:
|
|
306
|
-
rid: RID of the object of interest
|
|
307
|
-
|
|
308
|
-
Returns:
|
|
309
|
-
ResolveRidResult which has information about the specified RID.
|
|
310
|
-
|
|
311
|
-
Raises:
|
|
312
|
-
DerivaMLException: if the RID doesn't exist.
|
|
313
|
-
"""
|
|
314
|
-
try:
|
|
315
|
-
return self.catalog.resolve_rid(rid, self.model.model)
|
|
316
|
-
except KeyError as _e:
|
|
317
|
-
raise DerivaMLException(f"Invalid RID {rid}")
|
|
318
|
-
|
|
319
|
-
def retrieve_rid(self, rid: RID) -> dict[str, Any]:
|
|
320
|
-
"""Return a dictionary that represents the values of the specified RID.
|
|
321
|
-
|
|
322
|
-
Args:
|
|
323
|
-
rid: RID of the object of interest
|
|
324
|
-
|
|
325
|
-
Returns:
|
|
326
|
-
A dictionary that represents the values of the specified RID.
|
|
327
|
-
|
|
328
|
-
Raises:
|
|
329
|
-
DerivaMLException: if the RID doesn't exist.
|
|
330
|
-
"""
|
|
331
|
-
return self.resolve_rid(rid).datapath.entities().fetch()[0]
|
|
332
|
-
|
|
333
|
-
def add_page(self, title: str, content: str) -> None:
|
|
334
|
-
"""
|
|
335
|
-
|
|
336
|
-
Args:
|
|
337
|
-
title: str:
|
|
338
|
-
content: str:
|
|
339
|
-
|
|
340
|
-
Returns:
|
|
341
|
-
|
|
342
|
-
"""
|
|
343
|
-
self.pathBuilder.www.tables[self.domain_schema].insert(
|
|
344
|
-
[{"Title": title, "Content": content}]
|
|
345
|
-
)
|
|
346
|
-
|
|
347
|
-
def create_vocabulary(
|
|
348
|
-
self, vocab_name: str, comment: str = "", schema: Optional[str] = None
|
|
349
|
-
) -> Table:
|
|
350
|
-
"""Create a controlled vocabulary table with the given vocab name.
|
|
351
|
-
|
|
352
|
-
Args:
|
|
353
|
-
vocab_name: Name of the controlled vocabulary table.
|
|
354
|
-
comment: Description of the vocabulary table. (Default value = '')
|
|
355
|
-
schema: Schema in which to create the controlled vocabulary table. Defaults to domain_schema.
|
|
356
|
-
vocab_name: str:
|
|
357
|
-
|
|
358
|
-
Returns:
|
|
359
|
-
An ERMRest table object for the newly created vocabulary table.
|
|
360
|
-
"""
|
|
361
|
-
schema = schema or self.domain_schema
|
|
362
|
-
return self.model.schemas[schema].create_table(
|
|
363
|
-
Table.define_vocabulary(
|
|
364
|
-
vocab_name, f"{self.project_name}:{{RID}}", comment=comment
|
|
365
|
-
)
|
|
366
|
-
)
|
|
367
|
-
|
|
368
|
-
def create_table(self, table: TableDefinition) -> Table:
|
|
369
|
-
"""Create a table from a table definition."""
|
|
370
|
-
return self.model.schemas[self.domain_schema].create_table(table.model_dump())
|
|
371
|
-
|
|
372
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
373
|
-
def create_asset(
|
|
374
|
-
self,
|
|
375
|
-
asset_name: str,
|
|
376
|
-
column_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
377
|
-
fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
378
|
-
referenced_tables: Optional[Iterable[Table]] = None,
|
|
379
|
-
comment: str = "",
|
|
380
|
-
schema: Optional[str] = None,
|
|
381
|
-
) -> Table:
|
|
382
|
-
"""Create an asset table with the given asset name.
|
|
383
|
-
|
|
384
|
-
Args:
|
|
385
|
-
asset_name: Name of the asset table.
|
|
386
|
-
column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
|
|
387
|
-
fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
|
|
388
|
-
referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
|
|
389
|
-
comment: Description of the asset table. (Default value = '')
|
|
390
|
-
schema: Schema in which to create the asset table. Defaults to domain_schema.
|
|
391
|
-
asset_name: str:
|
|
392
|
-
schema: str: (Default value = None)
|
|
393
|
-
|
|
394
|
-
Returns:
|
|
395
|
-
Table object for the asset table.
|
|
396
|
-
"""
|
|
397
|
-
column_defs = column_defs or []
|
|
398
|
-
fkey_defs = fkey_defs or []
|
|
399
|
-
referenced_tables = referenced_tables or []
|
|
400
|
-
schema = schema or self.domain_schema
|
|
401
|
-
|
|
402
|
-
self.add_term(
|
|
403
|
-
MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
|
|
404
|
-
)
|
|
405
|
-
asset_table = self.model.schemas[schema].create_table(
|
|
406
|
-
Table.define_asset(
|
|
407
|
-
schema,
|
|
408
|
-
asset_name,
|
|
409
|
-
column_defs=[c.model_dump() for c in column_defs],
|
|
410
|
-
fkey_defs=[fk.model_dump() for fk in fkey_defs],
|
|
411
|
-
comment=comment,
|
|
412
|
-
)
|
|
413
|
-
)
|
|
414
|
-
|
|
415
|
-
self.model.schemas[self.domain_schema].create_table(
|
|
416
|
-
Table.define_association(
|
|
417
|
-
[
|
|
418
|
-
(asset_table.name, asset_table),
|
|
419
|
-
("Asset_Type", self.model.name_to_table("Asset_Type")),
|
|
420
|
-
]
|
|
421
|
-
)
|
|
422
|
-
)
|
|
423
|
-
for t in referenced_tables:
|
|
424
|
-
asset_table.create_reference(self.model.name_to_table(t))
|
|
425
|
-
# Create a table to track execution that creates the asset
|
|
426
|
-
atable = self.model.schemas[self.domain_schema].create_table(
|
|
427
|
-
Table.define_association(
|
|
428
|
-
[
|
|
429
|
-
(asset_name, asset_table),
|
|
430
|
-
(
|
|
431
|
-
"Execution",
|
|
432
|
-
self.model.schemas[self.ml_schema].tables["Execution"],
|
|
433
|
-
),
|
|
434
|
-
]
|
|
435
|
-
)
|
|
436
|
-
)
|
|
437
|
-
atable.create_reference(self.model.name_to_table("Asset_Role"))
|
|
438
|
-
|
|
439
|
-
asset_annotation(asset_table)
|
|
440
|
-
return asset_table
|
|
441
|
-
|
|
442
|
-
# @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
443
|
-
def list_assets(self, asset_table: Table | str):
|
|
444
|
-
"""Return the contents of an asset table"""
|
|
445
|
-
|
|
446
|
-
if not self.model.is_asset(asset_table):
|
|
447
|
-
raise DerivaMLException(f"Table {asset_table.name} is not an asset")
|
|
448
|
-
asset_table = self.model.name_to_table(asset_table)
|
|
449
|
-
pb = self._model.catalog.getPathBuilder()
|
|
450
|
-
asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
|
|
451
|
-
|
|
452
|
-
asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
|
|
453
|
-
type_path = pb.schemas[asset_type_table.schema.name].tables[
|
|
454
|
-
asset_type_table.name
|
|
455
|
-
]
|
|
456
|
-
|
|
457
|
-
# Get a list of all the asset_type values associated with this dataset_table.
|
|
458
|
-
assets = []
|
|
459
|
-
for asset in asset_path.entities().fetch():
|
|
460
|
-
asset_types = (
|
|
461
|
-
type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
|
|
462
|
-
.attributes(type_path.Asset_Type)
|
|
463
|
-
.fetch()
|
|
464
|
-
)
|
|
465
|
-
assets.append(
|
|
466
|
-
asset
|
|
467
|
-
| {
|
|
468
|
-
MLVocab.asset_type.value: [
|
|
469
|
-
asset_type[MLVocab.asset_type.value]
|
|
470
|
-
for asset_type in asset_types
|
|
471
|
-
]
|
|
472
|
-
}
|
|
473
|
-
)
|
|
474
|
-
return assets
|
|
475
|
-
|
|
476
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
477
|
-
def create_feature(
|
|
478
|
-
self,
|
|
479
|
-
target_table: Table | str,
|
|
480
|
-
feature_name: str,
|
|
481
|
-
terms: Optional[list[Table | str]] = None,
|
|
482
|
-
assets: Optional[list[Table | str]] = None,
|
|
483
|
-
metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
|
|
484
|
-
optional: Optional[list[str]] = None,
|
|
485
|
-
comment: str = "",
|
|
486
|
-
) -> type[FeatureRecord]:
|
|
487
|
-
"""Create a new feature that can be associated with a table.
|
|
488
|
-
|
|
489
|
-
The feature can associate a controlled vocabulary term, an asset, or any other values with a
|
|
490
|
-
specific instance of an object and execution.
|
|
491
|
-
|
|
492
|
-
Args:
|
|
493
|
-
feature_name: Name of the new feature to be defined
|
|
494
|
-
target_table: table name or object on which the feature is to be associated
|
|
495
|
-
terms: List of controlled vocabulary terms that will be part of the feature value
|
|
496
|
-
assets: List of asset table names or objects that will be part of the feature value
|
|
497
|
-
metadata: List of other value types that are associated with the feature
|
|
498
|
-
optional: List of columns that are optional in the feature
|
|
499
|
-
comment: return: A Feature class that can be used to create instances of the feature.
|
|
500
|
-
|
|
501
|
-
Returns:
|
|
502
|
-
A Feature class that can be used to create instances of the feature.
|
|
503
|
-
|
|
504
|
-
Raises:
|
|
505
|
-
DerivaException: If the feature cannot be created.
|
|
506
|
-
"""
|
|
507
|
-
|
|
508
|
-
terms = terms or []
|
|
509
|
-
assets = assets or []
|
|
510
|
-
metadata = metadata or []
|
|
511
|
-
optional = optional or []
|
|
512
|
-
|
|
513
|
-
def normalize_metadata(m: Key | Table | ColumnDefinition | str):
|
|
514
|
-
"""
|
|
515
|
-
|
|
516
|
-
Args:
|
|
517
|
-
m: Key | Table | ColumnDefinition | str:
|
|
518
|
-
|
|
519
|
-
Returns:
|
|
520
|
-
|
|
521
|
-
"""
|
|
522
|
-
if isinstance(m, str):
|
|
523
|
-
return self.model.name_to_table(m)
|
|
524
|
-
elif isinstance(m, ColumnDefinition):
|
|
525
|
-
return m.model_dump()
|
|
526
|
-
else:
|
|
527
|
-
return m
|
|
528
|
-
|
|
529
|
-
# Make sure that the provided assets or terms are actually assets or terms.
|
|
530
|
-
if not all(map(self.model.is_asset, assets)):
|
|
531
|
-
raise DerivaMLException("Invalid create_feature asset table.")
|
|
532
|
-
if not all(map(self.model.is_vocabulary, terms)):
|
|
533
|
-
raise DerivaMLException("Invalid create_feature asset table.")
|
|
534
|
-
|
|
535
|
-
# Get references to the necessary tables and make sure that the
|
|
536
|
-
# provided feature name exists.
|
|
537
|
-
target_table = self.model.name_to_table(target_table)
|
|
538
|
-
execution = self.model.schemas[self.ml_schema].tables["Execution"]
|
|
539
|
-
feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]
|
|
540
|
-
feature_name_term = self.add_term(
|
|
541
|
-
"Feature_Name", feature_name, description=comment
|
|
542
|
-
)
|
|
543
|
-
atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
|
|
544
|
-
|
|
545
|
-
# Now create the association table that implements the feature.
|
|
546
|
-
atable = self.model.schemas[self.domain_schema].create_table(
|
|
547
|
-
target_table.define_association(
|
|
548
|
-
table_name=atable_name,
|
|
549
|
-
associates=[execution, target_table, feature_name_table],
|
|
550
|
-
metadata=[
|
|
551
|
-
normalize_metadata(m) for m in chain(assets, terms, metadata)
|
|
552
|
-
],
|
|
553
|
-
comment=comment,
|
|
554
|
-
)
|
|
555
|
-
)
|
|
556
|
-
# Now set optional terms.
|
|
557
|
-
for c in optional:
|
|
558
|
-
atable.columns[c].alter(nullok=True)
|
|
559
|
-
atable.columns["Feature_Name"].alter(default=feature_name_term.name)
|
|
560
|
-
return self.feature_record_class(target_table, feature_name)
|
|
561
|
-
|
|
562
|
-
def feature_record_class(
|
|
563
|
-
self, table: str | Table, feature_name: str
|
|
564
|
-
) -> type[FeatureRecord]:
|
|
565
|
-
"""Create a pydantic model for entries into the specified feature table.
|
|
566
|
-
|
|
567
|
-
For information on how to
|
|
568
|
-
See the pydantic documentation for more details about the pydantic model.
|
|
569
|
-
|
|
570
|
-
Args:
|
|
571
|
-
table: table name or object on which the feature is to be associated
|
|
572
|
-
feature_name: name of the feature to be created
|
|
573
|
-
table: str | Table:
|
|
574
|
-
feature_name: str:
|
|
575
|
-
|
|
576
|
-
Returns:
|
|
577
|
-
A Feature class that can be used to create instances of the feature.
|
|
578
|
-
"""
|
|
579
|
-
return self.lookup_feature(table, feature_name).feature_record_class()
|
|
580
|
-
|
|
581
|
-
def delete_feature(self, table: Table | str, feature_name: str) -> bool:
|
|
582
|
-
"""
|
|
583
|
-
|
|
584
|
-
Args:
|
|
585
|
-
table: Table | str:
|
|
586
|
-
feature_name: str:
|
|
587
|
-
|
|
588
|
-
Returns:
|
|
589
|
-
"""
|
|
590
|
-
table = self.model.name_to_table(table)
|
|
591
|
-
try:
|
|
592
|
-
feature = next(
|
|
593
|
-
f for f in self.find_features(table) if f.feature_name == feature_name
|
|
594
|
-
)
|
|
595
|
-
feature.feature_table.drop()
|
|
596
|
-
return True
|
|
597
|
-
except StopIteration:
|
|
598
|
-
return False
|
|
599
|
-
|
|
600
|
-
def lookup_feature(self, table: str | Table, feature_name: str) -> Feature:
|
|
601
|
-
"""Lookup the named feature associated with the provided table.
|
|
602
|
-
|
|
603
|
-
Args:
|
|
604
|
-
table: param feature_name:
|
|
605
|
-
table: str | Table:
|
|
606
|
-
feature_name: str:
|
|
607
|
-
|
|
608
|
-
Returns:
|
|
609
|
-
A Feature class that represents the requested feature.
|
|
610
|
-
|
|
611
|
-
Raises:
|
|
612
|
-
DerivaMLException: If the feature cannot be found.
|
|
613
|
-
"""
|
|
614
|
-
return self.model.lookup_feature(table, feature_name)
|
|
615
|
-
|
|
616
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
617
|
-
def find_features(self, table: Table | str) -> Iterable[Feature]:
|
|
618
|
-
"""List the names of the features in the specified table.
|
|
619
|
-
|
|
620
|
-
Args:
|
|
621
|
-
table: The table to find features for.
|
|
622
|
-
table: Table | str:
|
|
623
|
-
|
|
624
|
-
Returns:
|
|
625
|
-
An iterable of FeatureResult instances that describe the current features in the table.
|
|
626
|
-
"""
|
|
627
|
-
return self.model.find_features(table)
|
|
628
|
-
|
|
629
|
-
# noinspection PyProtectedMember
|
|
630
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
631
|
-
def list_feature_values(
|
|
632
|
-
self, table: Table | str, feature_name: str
|
|
633
|
-
) -> datapath._ResultSet:
|
|
634
|
-
"""Return a datapath ResultSet containing all values of a feature associated with a table.
|
|
635
|
-
|
|
636
|
-
Args:
|
|
637
|
-
table: param feature_name:
|
|
638
|
-
table: Table | str:
|
|
639
|
-
feature_name: str:
|
|
640
|
-
|
|
641
|
-
Returns:
|
|
642
|
-
|
|
643
|
-
"""
|
|
644
|
-
table = self.model.name_to_table(table)
|
|
645
|
-
feature = self.lookup_feature(table, feature_name)
|
|
646
|
-
pb = self.catalog.getPathBuilder()
|
|
647
|
-
return (
|
|
648
|
-
pb.schemas[feature.feature_table.schema.name]
|
|
649
|
-
.tables[feature.feature_table.name]
|
|
650
|
-
.entities()
|
|
651
|
-
.fetch()
|
|
652
|
-
)
|
|
653
|
-
|
|
654
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
655
|
-
def add_term(
|
|
656
|
-
self,
|
|
657
|
-
table: str | Table,
|
|
658
|
-
term_name: str,
|
|
659
|
-
description: str,
|
|
660
|
-
synonyms: Optional[list[str]] = None,
|
|
661
|
-
exists_ok: bool = True,
|
|
662
|
-
) -> VocabularyTerm:
|
|
663
|
-
"""Creates a new control vocabulary term in the control vocabulary table.
|
|
664
|
-
|
|
665
|
-
Args:
|
|
666
|
-
|
|
667
|
-
Args:
|
|
668
|
-
table: The name of the control vocabulary table.
|
|
669
|
-
term_name: The name of the new control vocabulary.
|
|
670
|
-
description: The description of the new control vocabulary.
|
|
671
|
-
synonyms: Optional list of synonyms for the new control vocabulary. Defaults to an empty list.
|
|
672
|
-
exists_ok: Optional flag indicating whether to allow creation if the control vocabulary name
|
|
673
|
-
already exists. Defaults to True.
|
|
674
|
-
|
|
675
|
-
Returns:
|
|
676
|
-
The RID of the newly created control vocabulary.
|
|
677
|
-
|
|
678
|
-
Raises:
|
|
679
|
-
DerivaException: If the control vocabulary name already exists and exist_ok is False.
|
|
680
|
-
"""
|
|
681
|
-
synonyms = synonyms or []
|
|
682
|
-
table = self.model.name_to_table(table)
|
|
683
|
-
pb = self.catalog.getPathBuilder()
|
|
684
|
-
if not (self.model.is_vocabulary(table)):
|
|
685
|
-
raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
|
|
686
|
-
|
|
687
|
-
schema_name = table.schema.name
|
|
688
|
-
table_name = table.name
|
|
689
|
-
try:
|
|
690
|
-
term_id = VocabularyTerm.model_validate(
|
|
691
|
-
pb.schemas[schema_name]
|
|
692
|
-
.tables[table_name]
|
|
693
|
-
.insert(
|
|
694
|
-
[
|
|
695
|
-
{
|
|
696
|
-
"Name": term_name,
|
|
697
|
-
"Description": description,
|
|
698
|
-
"Synonyms": synonyms,
|
|
699
|
-
}
|
|
700
|
-
],
|
|
701
|
-
defaults={"ID", "URI"},
|
|
702
|
-
)[0]
|
|
703
|
-
)
|
|
704
|
-
except DataPathException:
|
|
705
|
-
term_id = self.lookup_term(table, term_name)
|
|
706
|
-
if not exists_ok:
|
|
707
|
-
raise DerivaMLException(f"{term_name} already exists")
|
|
708
|
-
# Check vocabulary
|
|
709
|
-
return term_id
|
|
710
|
-
|
|
711
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
712
|
-
def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
|
|
713
|
-
"""Given a term name, return the vocabulary record. Can provide either the term name
|
|
714
|
-
or a synonym for the term. Generate an exception if the term is not in the vocabulary.
|
|
715
|
-
|
|
716
|
-
Args:
|
|
717
|
-
table: The name of the controlled vocabulary table or a ERMRest table object.
|
|
718
|
-
term_name: The name of the term to look up.
|
|
719
|
-
|
|
720
|
-
Returns:
|
|
721
|
-
The entry the associated term or synonym.
|
|
722
|
-
|
|
723
|
-
Raises:
|
|
724
|
-
DerivaException: If the schema or vocabulary table doesn't exist, or if the term is not
|
|
725
|
-
found in the vocabulary.
|
|
726
|
-
"""
|
|
727
|
-
vocab_table = self.model.name_to_table(table)
|
|
728
|
-
if not self.model.is_vocabulary(vocab_table):
|
|
729
|
-
raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
|
|
730
|
-
schema_name, table_name = vocab_table.schema.name, vocab_table.name
|
|
731
|
-
schema_path = self.catalog.getPathBuilder().schemas[schema_name]
|
|
732
|
-
|
|
733
|
-
for term in schema_path.tables[table_name].entities().fetch():
|
|
734
|
-
if term_name == term["Name"] or (
|
|
735
|
-
term["Synonyms"] and term_name in term["Synonyms"]
|
|
736
|
-
):
|
|
737
|
-
return VocabularyTerm.model_validate(term)
|
|
738
|
-
raise DerivaMLException(f"Term {term_name} is not in vocabulary {table_name}")
|
|
739
|
-
|
|
740
|
-
def list_vocabulary_terms(self, table: str | Table) -> list[VocabularyTerm]:
    """Return every term defined in a controlled vocabulary table.

    Args:
        table: Name of the controlled vocabulary table or an ERMRest table object.

    Returns:
        A list of VocabularyTerm objects, one per row of the vocabulary table.

    Raises:
        DerivaMLException: If the schema or vocabulary table doesn't exist, or
            the table is not a controlled vocabulary.
    """
    pb = self.catalog.getPathBuilder()
    table = self.model.name_to_table(table)
    if not self.model.is_vocabulary(table):
        raise DerivaMLException(f"The table {table} is not a controlled vocabulary")

    entity_path = pb.schemas[table.schema.name].tables[table.name]
    return [VocabularyTerm(**row) for row in entity_path.entities().fetch()]
|
|
763
|
-
|
|
764
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def download_dataset_bag(
    self,
    dataset: DatasetSpec,
    execution_rid: Optional[RID] = None,
) -> DatasetBag:
    """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.

    Args:
        dataset: Specification of the dataset to be downloaded.
        execution_rid: Execution RID to associate with the download, if any.

    Returns:
        A DatasetBag object for the locally materialized dataset.
    """
    # Bind the download to a catalog snapshot at the dataset's version so the
    # bag contents are reproducible even if the live catalog changes.
    return self._download_dataset_bag(
        dataset=dataset,
        execution_rid=execution_rid,
        snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
    )
|
|
785
|
-
|
|
786
|
-
def _update_status(
    self, new_status: Status, status_detail: str, execution_rid: RID
):
    """Update the status of an execution in the catalog.

    Records the new status locally on ``self.status`` and writes it, with the
    detail text, to the Execution table row identified by ``execution_rid``.

    Args:
        new_status: New status value.
        status_detail: Human-readable detail for the status change.
        execution_rid: Resource Identifier (RID) of the execution to update.
    """
    self.status = new_status.value
    update_record = {
        "RID": execution_rid,
        "Status": self.status,
        "Status_Detail": status_detail,
    }
    self.pathBuilder.schemas[self.ml_schema].Execution.update([update_record])
|
|
812
|
-
|
|
813
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def add_files(
    self,
    files: Iterable[FileSpec],
    file_types: str | list[str],
    execution_rid: Optional[RID] = None,
) -> Iterable[RID]:
    """Add a new file to the File table in the catalog.

    The input is an iterator of FileSpec objects which provide the MD5 checksum, length, and URL.

    Args:
        files: A sequence of file specifications that describe the files to add.
        file_types: One or more file types. Must be a term from the File_Type controlled vocabulary.
        execution_rid: Resource Identifier (RID) of the execution to associate with the file.

    Returns:
        Iterable of the RIDs of the files that were added.

    Raises:
        DerivaMLException: If execution_rid does not refer to an Execution row,
            or if any file type is not a File_Type vocabulary term.
    """
    defined_types = self.list_vocabulary_terms(MLVocab.file_type)
    if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
        raise DerivaMLException(
            f"RID {execution_rid} is not for an execution table."
        )

    def check_file_type(dtype: str) -> bool:
        """Make sure that the specified string is either the name or synonym for a file type term."""
        for term in defined_types:
            # Bug fix: compare the argument, not the enclosing loop variable
            # `file_type` (the old closure only worked because the sole caller
            # happened to pass `file_type` as `dtype`).
            if dtype == term.name or (term.synonyms and dtype in term.synonyms):
                return True
        return False

    file_types = [file_types] if isinstance(file_types, str) else file_types
    pb = self._model.catalog.getPathBuilder()
    for file_type in file_types:
        if not check_file_type(file_type):
            raise DerivaMLException("File type must be a vocabulary term.")

    # Insert the file records and remember their new RIDs.
    file_table_path = pb.schemas[self.ml_schema].tables["File"]
    file_rids = [
        e["RID"] for e in file_table_path.insert([f.model_dump() for f in files])
    ]

    # Get the name of the association table between file_table and file_type,
    # then link every new file to every requested file type.
    atable = next(
        self._model.schemas[self._ml_schema]
        .tables[MLVocab.file_type]
        .find_associations()
    ).name
    pb.schemas[self._ml_schema].tables[atable].insert(
        [
            {"File_Type": file_type, "File": file_rid}
            for file_rid in file_rids
            for file_type in file_types
        ]
    )

    if execution_rid:
        # Associate each new file with the provided execution.
        pb.schemas[self._ml_schema].File_Execution.insert(
            [
                {"File": file_rid, "Execution": execution_rid}
                for file_rid in file_rids
            ]
        )
    return file_rids
|
|
878
|
-
|
|
879
|
-
def list_files(
    self, file_types: Optional[list[str]] = None
) -> list[dict[str, Any]]:
    """Return the contents of the file table, denormalizing file types into each record.

    Args:
        file_types: If given, only files tagged with at least one of these file
            types are returned. Previously this parameter was silently ignored;
            it is now honored. Default (None) returns every file, as before.

    Returns:
        One dict per file with a "File_Types" list collecting all of its types.
    """
    ml_path = self.pathBuilder.schemas[self._ml_schema]
    file_path = ml_path.File
    type_path = ml_path.File_File_Type

    # Left join so files with no type association still appear (File_Type None).
    path = file_path.link(
        type_path, on=file_path.RID == type_path.File, join_type="left"
    )
    path = path.File.attributes(
        path.File.RID,
        path.File.URL,
        path.File.MD5,
        path.File.Length,
        path.File.Description,
        path.File_File_Type.File_Type,
    )
    # Collapse one-row-per-type into one-record-per-file.
    file_map: dict[str, dict[str, Any]] = {}
    for f in path.fetch():
        entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
        if ft := f.get("File_Type"):  # assign-and-test in one go
            entry["File_Types"].append(ft)

    # Drop the per-row File_Type key and apply the optional type filter.
    results = []
    for record in file_map.values():
        record.pop("File_Type", None)
        if file_types is None or set(record["File_Types"]) & set(file_types):
            results.append(record)
    return results
|
|
906
|
-
|
|
907
|
-
def list_workflows(self) -> list[Workflow]:
    """Return a list of all the workflows in the catalog."""
    workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
    workflows = []
    for row in workflow_path.entities().fetch():
        workflows.append(
            Workflow(
                name=row["Name"],
                url=row["URL"],
                workflow_type=row["Workflow_Type"],
                version=row["Version"],
                description=row["Description"],
                rid=row["RID"],
                checksum=row["Checksum"],
            )
        )
    return workflows
|
|
922
|
-
|
|
923
|
-
def add_workflow(self, workflow: Workflow) -> RID:
    """Add a workflow to the Workflow table.

    If a workflow with the same URL already exists, its RID is returned and
    nothing is inserted.

    Args:
        workflow: An instance of a Workflow object.

    Returns:
        Resource Identifier (RID) of the added (or pre-existing) workflow.

    Raises:
        DerivaMLException: If the workflow record could not be inserted.
    """
    # Check to make sure that the workflow is not already in the table. If it's not, add it.
    if workflow_rid := self.lookup_workflow(workflow.url):
        return workflow_rid

    ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
    try:
        # Record doesn't exist already
        workflow_record = {
            "URL": workflow.url,
            "Name": workflow.name,
            "Description": workflow.description,
            "Checksum": workflow.checksum,
            "Version": workflow.version,
            # Normalize the type through the vocabulary so synonyms resolve
            # to the canonical term name.
            MLVocab.workflow_type: self.lookup_term(
                MLVocab.workflow_type, workflow.workflow_type
            ).name,
        }
        workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
    except Exception as e:
        error = format_exception(e)
        # Chain the original exception so the root cause is preserved.
        raise DerivaMLException(f"Failed to insert workflow. Error: {error}") from e
    return workflow_rid
|
|
957
|
-
|
|
958
|
-
def lookup_workflow(self, url: str) -> Optional[RID]:
    """Given a URL, look in the workflow table to find a matching workflow."""
    workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
    matches = list(workflow_path.filter(workflow_path.URL == url).entities())
    # No match means the workflow has not been registered yet.
    return matches[0]["RID"] if matches else None
|
|
966
|
-
|
|
967
|
-
def create_workflow(
    self, name: str, workflow_type: str, description: str = ""
) -> Workflow:
    """Identify the currently executing program and return a Workflow object for it.

    Determine the notebook or script that is currently being executed, assuming
    it is run from a cloned GitHub repository, and build a workflow for that
    executable.

    Args:
        name: The name of the workflow.
        workflow_type: The type of the workflow; must be a Workflow_Type
            vocabulary term.
        description: The description of the workflow.

    Returns:
        A workflow object.
    """
    # Validate the type up front; lookup_term raises if it is unknown.
    self.lookup_term(MLVocab.workflow_type, workflow_type)
    return Workflow.create_workflow(name, workflow_type, description)
|
|
989
|
-
|
|
990
|
-
# @validate_call
def create_execution(
    self, configuration: ExecutionConfiguration, dry_run: bool = False
) -> "Execution":
    """Create an execution object.

    Given an execution configuration, initialize the local compute environment to
    prepare for executing an ML or analytic routine. This routine has side effects:

    1. The datasets specified in the configuration are downloaded and placed in the
       cache-dir. If a version is not specified in the configuration, a new minor
       version number is created for the dataset and downloaded.
    2. Any execution assets in the configuration are downloaded into the working
       directory.

    Args:
        configuration: The execution configuration to initialize from.
        dry_run: Do not create an execution record or upload results.

    Returns:
        An execution object.
    """
    # Imported lazily to avoid a circular import with the execution module.
    from .execution import Execution

    execution = Execution(configuration, self, dry_run=dry_run)
    self._execution = execution
    return execution
|
|
1016
|
-
|
|
1017
|
-
# @validate_call
def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
    """Return an Execution object for a previously started execution with the specified RID.

    If no RID is given, the single execution recorded in the working directory is
    used; zero or multiple candidates is an error.

    Args:
        execution_rid: RID of the execution to restore, or None to auto-detect.

    Returns:
        An Execution object reloaded from the stored configuration (or, if no
        configuration file exists, reconstructed from the catalog record).

    Raises:
        DerivaMLException: If no RID is given and the working directory does not
            contain exactly one execution.
    """
    # Imported lazily to avoid a circular import with the execution module.
    from .execution import Execution

    # Find path to execution
    if not execution_rid:
        e_rids = execution_rids(self.working_dir)
        # Distinguish "none found" from "ambiguous" so the error is actionable
        # (previously both cases reported "Multiple execution RIDs").
        if not e_rids:
            raise DerivaMLException(
                "No execution RID was found in the working directory."
            )
        if len(e_rids) > 1:
            raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
        execution_rid = e_rids[0]

    cfile = asset_file_path(
        prefix=self.working_dir,
        exec_rid=execution_rid,
        file_name="configuration.json",
        asset_table=self.model.name_to_table("Execution_Metadata"),
        metadata={},
    )

    if cfile.exists():
        configuration = ExecutionConfiguration.load_configuration(cfile)
    else:
        # No local configuration file; rebuild a minimal one from the catalog.
        execution = self.retrieve_rid(execution_rid)
        configuration = ExecutionConfiguration(
            workflow=execution["Workflow"],
            description=execution["Description"],
        )
    return Execution(configuration, self, reload=execution_rid)