deriva-ml 1.8.10__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +27 -4
- deriva_ml/dataset.py +14 -9
- deriva_ml/dataset_bag.py +1 -1
- deriva_ml/demo_catalog.py +9 -8
- deriva_ml/deriva_definitions.py +8 -3
- deriva_ml/deriva_ml_base.py +142 -50
- deriva_ml/deriva_model.py +2 -2
- deriva_ml/execution.py +9 -16
- deriva_ml/execution_configuration.py +20 -23
- deriva_ml/schema_setup/annotations.py +1 -1
- deriva_ml/schema_setup/create_schema.py +3 -2
- deriva_ml/upload.py +1 -1
- {deriva_ml-1.8.10.dist-info → deriva_ml-1.9.0.dist-info}/METADATA +1 -1
- deriva_ml-1.9.0.dist-info/RECORD +27 -0
- deriva_ml/build/lib/schema_setup/__init__.py +0 -0
- deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -36
- deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -255
- deriva_ml/build/lib/schema_setup/create_schema.py +0 -165
- deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/schema_setup/alter_annotation.py +0 -55
- deriva_ml-1.8.10.dist-info/RECORD +0 -33
- {deriva_ml-1.8.10.dist-info → deriva_ml-1.9.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.8.10.dist-info → deriva_ml-1.9.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.10.dist-info → deriva_ml-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.8.10.dist-info → deriva_ml-1.9.0.dist-info}/top_level.txt +0 -0
deriva_ml/database_model.py
CHANGED
@@ -1,12 +1,15 @@
-"""Ths module
+"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
 of a dataset and a sqllite database in which the contents of the bag are stored.
 """
+
+from __future__ import annotations
+
 import logging
 import sqlite3
 
 from csv import reader
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Generator
 from urllib.parse import urlparse
 
 from deriva.core.ermrest_model import Model
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
 class DatabaseModelMeta(type):
     """Use metaclass to ensure that there is onl one instance per path"""
 
-    _paths_loaded: dict[Path
+    _paths_loaded: dict[Path, "DatabaseModel"] = {}
 
     def __call__(cls, *args, **kwargs):
         logger = logging.getLogger("deriva_ml")
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
    Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
    appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
    into DatabaseModels, is kept in the class variable `_rid_map`.
-
+
    Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
    sqllite instance is created for every new dataset version present.
 
@@ -315,6 +318,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         )
         return datasets
 
+    def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
+        """Retrieve the contents of the specified table as a dictionary.
+
+        Args:
+            table: Table to retrieve data from. f schema is not provided as part of the table name,
+                the method will attempt to locate the schema for the table.
+
+        Returns:
+            A generator producing dictionaries containing the contents of the specified table as name/value pairs.
+        """
+        table_name = self.normalize_table_name(table)
+        with self.dbase as dbase:
+            col_names = [
+                c[1]
+                for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
+            ]
+        result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
+        while row := result.fetchone():
+            yield dict(zip(col_names, row))
+
     def normalize_table_name(self, table: str) -> str:
         """Attempt to insert the schema into a table name if it's not provided.
 
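The main addition in this file is the `get_table_as_dict` generator. A minimal usage sketch follows; it is not taken from the package docs, and the `bag_model` object, table name "Image", and `RID` column are illustrative assumptions.

```python
# Sketch: stream rows from a table stored in a downloaded BDBag via the new
# DatabaseModel.get_table_as_dict generator (names below are hypothetical).
def first_rids(bag_model, table: str = "Image", limit: int = 5) -> list[str]:
    rids = []
    for row in bag_model.get_table_as_dict(table):  # yields one dict per row
        rids.append(row["RID"])                     # column names become dict keys
        if len(rids) >= limit:
            break
    return rids
```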
deriva_ml/dataset.py
CHANGED
@@ -92,7 +92,7 @@ class Dataset:
         dataset_list: list[DatasetSpec],
         description: Optional[str] = "",
         execution_rid: Optional[RID] = None,
-    ) ->
+    ) -> list[dict[str, Any]]:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
 
         # Construct version records for insert
@@ -245,7 +245,7 @@ class Dataset:
             DerivaMLException: if provided RID is not to a dataset_table.
         """
 
-        # Find all
+        # Find all the datasets that are reachable from this dataset and determine their new version numbers.
         related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
         version_update_list = [
             DatasetSpec(
@@ -254,7 +254,7 @@ class Dataset:
             )
             for ds_rid in related_datasets
         ]
-
+        self._insert_dataset_versions(
             version_update_list, description=description, execution_rid=execution_rid
         )
         return [d.version for d in version_update_list if d.rid == dataset_rid][0]
@@ -751,9 +751,10 @@ class Dataset:
         ]
 
     def _table_paths(
-        self,
+        self,
+        dataset: Optional[DatasetSpec] = None,
+        snapshot_catalog: Optional[DerivaML] = None,
     ) -> Iterator[tuple[str, str, Table]]:
-
         paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
 
         def source_path(path: tuple[Table, ...]):
@@ -779,17 +780,20 @@ class Dataset:
     def _collect_paths(
         self,
         dataset_rid: Optional[RID] = None,
-
+        snapshot: Optional[Dataset] = None,
         dataset_nesting_depth: Optional[int] = None,
     ) -> set[tuple[Table, ...]]:
 
-        snapshot_catalog =
+        snapshot_catalog = snapshot if snapshot else self
+
         dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
             "Dataset"
         ]
         dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
             "Dataset_Dataset"
         ]
+
+        # Figure out what types of elements the dataset contains.
         dataset_associations = [
             a
             for a in self.dataset_table.find_associations()
@@ -812,7 +816,8 @@ class Dataset:
             ]
         else:
             included_associations = dataset_associations
-
+
+        # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
         paths = {
             tuple(p)
             for p in snapshot_catalog._model._schema_to_paths()
@@ -827,7 +832,7 @@ class Dataset:
         if dataset_rid:
             for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
                 nested_paths |= self._collect_paths(
-                    c,
+                    c, snapshot=snapshot_catalog
                 )
         else:
             # Initialize nesting depth if not already provided.
deriva_ml/dataset_bag.py
CHANGED
@@ -109,7 +109,7 @@ class DatasetBag:
         for ts, on in paths:
             tables = " JOIN ".join(ts)
             on_expression = " and ".join(
-                [f"{column_name(
+                [f"{column_name(left)}={column_name(right)}" for left, right in on]
             )
             sql.append(
                 f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"
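The one-line fix above completes the comprehension that builds the SQL `ON` clause. A standalone illustration of the string it produces is shown below; the column pairs are made up, and the real code maps each side through its `column_name` helper rather than using raw strings.

```python
# Illustration only: what the join predicate looks like for two column pairs.
on = [("Image.Dataset", "Dataset.RID"), ("Dataset.RID", "Dataset_Dataset.Nested_Dataset")]
on_expression = " and ".join([f"{left}={right}" for left, right in on])
print(on_expression)
# Image.Dataset=Dataset.RID and Dataset.RID=Dataset_Dataset.Nested_Dataset
```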
deriva_ml/demo_catalog.py
CHANGED
@@ -5,6 +5,7 @@ import logging
 from random import random, randint
 import tempfile
 from tempfile import TemporaryDirectory
+from typing import Optional
 import itertools
 
 from deriva.config.acl_config import AclConfig
@@ -18,7 +19,6 @@ from requests import HTTPError
 from deriva_ml import (
     DerivaML,
     ExecutionConfiguration,
-    Workflow,
     MLVocab,
     BuiltinTypes,
     ColumnDefinition,
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
         description="Model for our API workflow",
     )
 
-    api_workflow = ml_instance.
-        Workflow
-
-        url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
-        workflow_type="API Workflow",
-    )
+    api_workflow = ml_instance.create_workflow(
+        name="API Workflow",
+        workflow_type="API Workflow",
     )
 
     api_execution = ml_instance.create_execution(
@@ -322,7 +319,11 @@ def create_demo_catalog(
 
 class DemoML(DerivaML):
     def __init__(
-        self,
+        self,
+        hostname,
+        catalog_id,
+        cache_dir: Optional[str] = None,
+        working_dir: Optional[str] = None,
    ):
        super().__init__(
            hostname=hostname,
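The demo now asks the DerivaML instance to create the workflow instead of constructing a `Workflow` record by hand. A hedged sketch of the new pattern is below; `ml_instance` is assumed to be an already-connected `DemoML`/`DerivaML` object and "API Workflow" is assumed to name an existing workflow-type vocabulary term.

```python
# Sketch of the calling convention create_demo_features() switches to in 1.9.0.
api_workflow = ml_instance.create_workflow(
    name="API Workflow",
    workflow_type="API Workflow",  # assumed pre-defined workflow-type term
)
# create_workflow derives the script URL/checksum itself and returns a workflow
# RID (or None); see the create_workflow hunks in deriva_ml_base.py below.
```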
deriva_ml/deriva_definitions.py
CHANGED
@@ -8,7 +8,7 @@ from enum import Enum
 from typing import Any, Iterable, Optional, Annotated
 
 import deriva.core.ermrest_model as em
-from urllib.parse import urlparse
+from urllib.parse import urlparse
 from deriva.core.ermrest_model import builtin_types
 from pydantic import (
     BaseModel,
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
         if url_parts.scheme == "tag":
             return v
         elif not url_parts.scheme:
-            return f
+            return f"tag://{gethostname()},{date.today()}:file://{v}"
         else:
             raise ValidationError("url is not a file URL")
 
     @model_serializer()
     def serialize_filespec(self):
-        return {
+        return {
+            "URL": self.url,
+            "Description": self.description,
+            "MD5": self.md5,
+            "Length": self.length,
+        }
 
 
 class VocabularyTerm(BaseModel):
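With the validator and serializer completed, a schemeless path is rewritten into a `tag:` URI and `model_dump()` emits the capitalized catalog column names. A rough sketch follows; the lowercase field names and all values are assumptions, not taken from the package documentation.

```python
# Assumed constructor field names (url, description, md5, length); values made up.
spec = FileSpec(
    url="/data/demo/image.png",  # no scheme, so the validator wraps it in a tag: URI
    description="demo image",
    md5="d41d8cd98f00b204e9800998ecf8427e",
    length=1024,
)
print(spec.url)           # e.g. tag://<hostname>,<today>:file:///data/demo/image.png
print(spec.model_dump())  # {"URL": ..., "Description": ..., "MD5": ..., "Length": 1024}
```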
deriva_ml/deriva_ml_base.py
CHANGED
@@ -19,7 +19,6 @@ import setuptools_scm
 from pathlib import Path
 import requests
 import subprocess
-import shutil
 from typing import Optional, Any, Iterable, TYPE_CHECKING
 from deriva.core import (
     get_credential,
@@ -33,7 +32,9 @@ from deriva.core.deriva_server import DerivaServer
 from deriva.core.ermrest_catalog import ResolveRidResult
 from deriva.core.ermrest_model import Key, Table
 from deriva.core.hatrac_store import HatracStore
+from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
 from pydantic import validate_call, ConfigDict
+from requests import RequestException
 
 from .execution_configuration import ExecutionConfiguration, Workflow
 from .feature import Feature, FeatureRecord
@@ -70,7 +71,28 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
 try:
     from IPython import get_ipython
 except ImportError: # Graceful fallback if IPython isn't installed.
-
+
+    def get_ipython():
+        """Dummy routine in case you are not running in IPython."""
+        return None
+
+
+try:
+    from jupyter_server.serverapp import list_running_servers
+except ImportError:
+
+    def list_running_servers():
+        """Dummy routine in case you are not running in Jupyter."""
+        return []
+
+
+try:
+    from ipykernel import get_connection_file
+except ImportError:
+
+    def get_connection_file():
+        """Dummy routine in case you are not running in Jupyter."""
+        return ""
 
 
 if TYPE_CHECKING:
@@ -93,8 +115,8 @@ class DerivaML(Dataset):
         self,
         hostname: str,
         catalog_id: str | int,
-        domain_schema: str = None,
-        project_name: str = None,
+        domain_schema: Optional[str] = None,
+        project_name: Optional[str] = None,
         cache_dir: Optional[str] = None,
         working_dir: Optional[str] = None,
         model_version: str = "1",
@@ -151,8 +173,7 @@ class DerivaML(Dataset):
         self.version = model_version
         self.configuration = None
         self._execution: Optional[Execution] = None
-        self.
-        self._notebook = self._get_python_notebook()
+        self.executable_path, self._is_notebook = self._get_python_script()
         self.domain_schema = self.model.domain_schema
         self.project_name = project_name or self.domain_schema
         self.start_time = datetime.now()
@@ -179,38 +200,76 @@ class DerivaML(Dataset):
         except (AttributeError, requests.HTTPError):
             pass
 
-    def
+    def _check_nbstrip_status(self) -> None:
         """Figure out if you are running in a Jupyter notebook
 
         Returns:
             A Path to the notebook file that is currently being executed.
         """
-        notebook = None
         try:
-
-
-
-
-
+            if subprocess.run(
+                ["nbstripout", "--is-installed"],
+                check=False,
+                capture_output=True,
+            ).returncode:
+                self._logger.warning(
+                    "nbstripout is not installed in repository. Please run nbstripout --install"
+                )
+        except subprocess.CalledProcessError:
+            self._logger.error("nbstripout is not found.")
+
+    @staticmethod
+    def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+        """Return the absolute path of the current notebook."""
+        # Get the kernel's connection file and extract the kernel ID
+        try:
+            if not (connection_file := Path(get_connection_file()).name):
+                return None, None
+        except RuntimeError:
+            return None, None
+
+        kernel_id = connection_file.split("-", 1)[1].split(".")[0]
+
+        # Look through the running server sessions to find the matching kernel ID
+        for server in list_running_servers():
+            try:
+                # If a token is required for authentication, include it in headers
+                token = server.get("token", "")
+                headers = {}
+                if token:
+                    headers["Authorization"] = f"token {token}"
+
                 try:
-
-
-
-
-
-
-
-
-
-
-
-                    return
+                    sessions_url = server["url"] + "api/sessions"
+                    response = requests.get(sessions_url, headers=headers)
+                    response.raise_for_status()
+                    sessions = response.json()
+                except RequestException as e:
+                    raise e
+                for sess in sessions:
+                    if sess["kernel"]["id"] == kernel_id:
+                        return server, sess
+            except Exception as _e:
+                # Ignore servers we can't connect to.
+                pass
+        return None, None
+
+    def _get_notebook_path(self) -> Path | None:
+        """Return the absolute path of the current notebook."""
+
+        server, session = self._get_notebook_session()
+        if server and session:
+            self._check_nbstrip_status()
+            relative_path = session["notebook"]["path"]
+            # Join the notebook directory with the relative path
+            return Path(server["root_dir"]) / relative_path
+        else:
+            return None
 
     def _get_python_script(self) -> tuple[Path, bool]:
         """Return the path to the currently executing script"""
         is_notebook = False
-        if filename := self.
+        if filename := self._get_notebook_path():
             is_notebook = True
         else:
             stack = inspect.stack()
@@ -220,7 +279,7 @@ class DerivaML(Dataset):
             ) # Get the caller's filename, which is two up the stack from here.
         else:
             raise DerivaMLException(
-
+                "Looking for caller failed"
             ) # Stack is too shallow
         return filename, is_notebook
 
@@ -228,11 +287,11 @@ class DerivaML(Dataset):
         try:
             result = subprocess.run(
                 ["git", "rev-parse", "--show-toplevel"],
-                cwd=self.
+                cwd=self.executable_path.parent,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.DEVNULL,
                 text=True,
-                check=True
+                check=True,
             )
             return result.stdout.strip()
         except subprocess.CalledProcessError:
@@ -262,6 +321,7 @@ class DerivaML(Dataset):
         return self.catalog.getPathBuilder()
 
     def get_version(self) -> str:
+        """Return the version number of the executable"""
         return setuptools_scm.get_version(root=self._get_git_root())
 
     @property
@@ -287,7 +347,7 @@ class DerivaML(Dataset):
         )
 
     def asset_dir(
-        self, table: str | Table, prefix: str | Path = None
+        self, table: str | Table, prefix: Optional[str | Path] = None
     ) -> UploadAssetDirectory:
         """Return a local file path in which to place a files for an asset table. T
 
@@ -321,6 +381,29 @@ class DerivaML(Dataset):
         """
         return self.cache_dir if cached else self.working_dir
 
+    @staticmethod
+    def globus_login(host: str) -> None:
+        """Log into the specified host using Globus.
+
+        Args:
+            host:
+
+        Returns:
+
+        """
+        gnl = GlobusNativeLogin(host=host)
+        if gnl.is_logged_in([host]):
+            print("You are already logged in.")
+        else:
+            gnl.login(
+                [host],
+                no_local_server=True,
+                no_browser=True,
+                refresh_tokens=True,
+                update_bdbag_keychain=True,
+            )
+            print("Login Successful")
+
     def chaise_url(self, table: RID | Table) -> str:
         """Return a Chaise URL to the specified table.
 
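The new static `globus_login` helper wraps deriva-py's `GlobusNativeLogin`. A minimal sketch of how it might be invoked is below; the hostname is a placeholder.

```python
from deriva_ml import DerivaML

# Hypothetical hostname; performs a browserless/device-style Globus login and
# refreshes tokens plus the bdbag keychain so bag downloads are authorized.
DerivaML.globus_login(host="ml.example.org")
```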
@@ -331,15 +414,15 @@ class DerivaML(Dataset):
         Returns:
             URL to the table in Chaise format.
         """
+        table_obj = self.model.name_to_table(table)
         try:
-            table = self.model.name_to_table(table)
             uri = self.catalog.get_server_uri().replace(
                 "ermrest/catalog/", "chaise/recordset/#"
             )
         except DerivaMLException:
             # Perhaps we have a RID....
             uri = self.cite(table)
-        return f"{uri}/{urlquote(
+        return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
 
     def cite(self, entity: dict | str) -> str:
         """Return a citation URL for the provided entity.
@@ -353,7 +436,9 @@ class DerivaML(Dataset):
         Raises:
             DerivaMLException: if provided RID does not exist.
         """
-        if entity.startswith(
+        if isinstance(entity, str) and entity.startswith(
+            f"https://{self.host_name}/id/{self.catalog_id}/"
+        ):
             # Already got a citation...
             return entity
         try:
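The guard added to `cite` returns a value unchanged when it is already a citation URL for this catalog, so callers can pass either a RID or a previously minted citation. A small sketch, assuming `cite()` mints URLs of the form `https://<host>/id/<catalog>/...` and using a hypothetical RID:

```python
# Sketch: cite() is now tolerant of being handed its own output.
citation = ml_instance.cite("1-ABC4")          # hypothetical dataset RID
assert ml_instance.cite(citation) == citation  # an existing citation URL passes through
```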
@@ -450,9 +535,9 @@ class DerivaML(Dataset):
     def create_asset(
         self,
         asset_name: str,
-        column_defs: Iterable[ColumnDefinition] = None,
+        column_defs: Optional[Iterable[ColumnDefinition]] = None,
         comment: str = "",
-        schema: str = None,
+        schema: Optional[str] = None,
     ) -> Table:
         """Create an asset table with the given asset name.
 
@@ -484,9 +569,9 @@ class DerivaML(Dataset):
         self,
         target_table: Table | str,
         feature_name: str,
-        terms: list[Table | str] = None,
-        assets: list[Table | str] = None,
-        metadata: Iterable[ColumnDefinition | Table | Key | str] = None,
+        terms: Optional[list[Table | str]] = None,
+        assets: Optional[list[Table | str]] = None,
+        metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
         optional: Optional[list[str]] = None,
         comment: str = "",
     ) -> type[FeatureRecord]:
@@ -851,6 +936,7 @@ class DerivaML(Dataset):
         """
 
         def path_to_asset(path: str) -> str:
+            """Pull the asset name out of a path to that asset in the filesystem"""
             components = path.split("/")
             return components[
                 components.index("asset") + 2
@@ -915,6 +1001,7 @@ class DerivaML(Dataset):
         )
 
         def check_file_type(dtype: str) -> bool:
+            """Make sure that the specified string is either the name or synonym for a file type term."""
             for term in defined_types:
                 if dtype == term.name or (term.synonyms and file_type in term.synonyms):
                     return True
@@ -1040,6 +1127,7 @@ class DerivaML(Dataset):
         return workflow_rid
 
     def lookup_workflow(self, url: str) -> Optional[RID]:
+        """Given a URL, look in the workflow table to find a matching workflow."""
         workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
         try:
             url_column = workflow_path.URL
@@ -1049,7 +1137,7 @@ class DerivaML(Dataset):
 
     def create_workflow(
         self, name: str, workflow_type: str, description: str = "", create: bool = True
-    ) -> RID:
+    ) -> RID | None:
         """Identify current executing program and return a workflow RID for it
 
         Determine the notebook or script that is currently being executed. Assume that this is
@@ -1069,20 +1157,21 @@ class DerivaML(Dataset):
 
         if is_dirty:
             self._logger.warning(
-                f"File {self.
+                f"File {self.executable_path} has been modified since last commit. Consider commiting before executing"
             )
 
         # If you are in a notebook, strip out the outputs before computing the checksum.
         cmd = (
-            f"nbstripout {self.
+            f"nbstripout {self.executable_path} | git hash-object --stdin"
             if self._is_notebook
-            else f"git hash-object {self.
+            else f"git hash-object {self.executable_path}"
         )
         checksum = subprocess.run(
             cmd,
             capture_output=True,
             text=True,
             check=True,
+            shell=True,
         ).stdout.strip()
 
         workflow = Workflow(
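The `shell=True` addition matters because the notebook branch of `cmd` is a shell pipeline rather than a single executable. An equivalent standalone sketch, with a placeholder notebook name:

```python
import subprocess

# Pipe a stripped notebook through `git hash-object --stdin`; the pipeline only
# works when the command string is interpreted by a shell (shell=True).
cmd = "nbstripout analysis.ipynb | git hash-object --stdin"  # hypothetical file
checksum = subprocess.run(
    cmd, capture_output=True, text=True, check=True, shell=True
).stdout.strip()
print(checksum)
```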
@@ -1109,12 +1198,14 @@ class DerivaML(Dataset):
         # Get repo URL from local github repo.
         try:
             result = subprocess.run(
-                ["git", "remote", "get-url", "origin"],
-
+                ["git", "remote", "get-url", "origin"],
+                capture_output=True,
+                text=True,
+                cwd=self.executable_path.parent,
             )
             github_url = result.stdout.strip().removesuffix(".git")
         except subprocess.CalledProcessError:
-            raise DerivaMLException(
+            raise DerivaMLException("No GIT remote found")
 
         # Find the root directory for the repository
         repo_root = self._get_git_root()
@@ -1123,7 +1214,7 @@ class DerivaML(Dataset):
         try:
             result = subprocess.run(
                 ["git", "status", "--porcelain"],
-                cwd=self.
+                cwd=self.executable_path.parent,
                 capture_output=True,
                 text=True,
                 check=True,
@@ -1136,14 +1227,14 @@ class DerivaML(Dataset):
 
         """Get SHA-1 hash of latest commit of the file in the repository"""
         result = subprocess.run(
-            ["git", "log", "-n", "1", "--pretty=format:%H
-            cwd=self.
+            ["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
+            cwd=self.executable_path.parent,
             capture_output=True,
             text=True,
             check=True,
         )
         sha = result.stdout.strip()
-        url = f"{github_url}/blob/{sha}/{self.
+        url = f"{github_url}/blob/{sha}/{self.executable_path.relative_to(repo_root)}"
         return url, is_dirty
 
     # @validate_call
@@ -1174,6 +1265,7 @@ class DerivaML(Dataset):
 
     # @validate_call
     def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
+        """Return an Execution object for a previously started execution with the specified RID."""
         from .execution import Execution
 
         # Find path to execution
deriva_ml/deriva_model.py
CHANGED
@@ -21,7 +21,7 @@ from .deriva_definitions import (
 
 from collections import Counter
 from pydantic import validate_call, ConfigDict
-from typing import Iterable
+from typing import Iterable, Optional
 
 
 class DerivaModel:
@@ -267,7 +267,7 @@ class DerivaModel:
     def _schema_to_paths(
         self,
         root: Table = None,
-        path: list[Table] = None,
+        path: Optional[list[Table]] = None,
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 