deriva-ml 1.8.11__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +29 -7
- deriva_ml/dataset.py +16 -13
- deriva_ml/dataset_bag.py +1 -1
- deriva_ml/demo_catalog.py +9 -8
- deriva_ml/deriva_definitions.py +8 -3
- deriva_ml/deriva_ml_base.py +62 -23
- deriva_ml/deriva_model.py +2 -2
- deriva_ml/execution.py +5 -4
- deriva_ml/execution_configuration.py +20 -23
- deriva_ml/schema_setup/annotations.py +1 -1
- deriva_ml/schema_setup/create_schema.py +3 -2
- deriva_ml/upload.py +1 -1
- {deriva_ml-1.8.11.dist-info → deriva_ml-1.9.1.dist-info}/METADATA +11 -2
- deriva_ml-1.9.1.dist-info/RECORD +27 -0
- deriva_ml/build/lib/schema_setup/__init__.py +0 -0
- deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -36
- deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -255
- deriva_ml/build/lib/schema_setup/create_schema.py +0 -165
- deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/schema_setup/alter_annotation.py +0 -55
- deriva_ml-1.8.11.dist-info/RECORD +0 -33
- {deriva_ml-1.8.11.dist-info → deriva_ml-1.9.1.dist-info}/WHEEL +0 -0
- {deriva_ml-1.8.11.dist-info → deriva_ml-1.9.1.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.11.dist-info → deriva_ml-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.8.11.dist-info → deriva_ml-1.9.1.dist-info}/top_level.txt +0 -0
deriva_ml/database_model.py
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
|
-
"""Ths module
|
|
1
|
+
"""This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
|
|
2
2
|
of a dataset and a SQLite database in which the contents of the bag are stored.
|
|
3
3
|
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
4
7
|
import logging
|
|
5
8
|
import sqlite3
|
|
6
9
|
|
|
7
10
|
from csv import reader
|
|
8
11
|
from pathlib import Path
|
|
9
|
-
from typing import Any, Optional
|
|
12
|
+
from typing import Any, Optional, Generator
|
|
10
13
|
from urllib.parse import urlparse
|
|
11
14
|
|
|
12
15
|
from deriva.core.ermrest_model import Model
|
|
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
|
|
|
20
23
|
class DatabaseModelMeta(type):
|
|
21
24
|
"""Use metaclass to ensure that there is only one instance per path"""
|
|
22
25
|
|
|
23
|
-
_paths_loaded: dict[Path
|
|
26
|
+
_paths_loaded: dict[Path, "DatabaseModel"] = {}
|
|
24
27
|
|
|
25
28
|
def __call__(cls, *args, **kwargs):
|
|
26
29
|
logger = logging.getLogger("deriva_ml")
|
|
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
47
50
|
Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
|
|
48
51
|
appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
|
|
49
52
|
into DatabaseModels, is kept in the class variable `_rid_map`.
|
|
50
|
-
|
|
53
|
+
|
|
51
54
|
Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
|
|
52
55
|
SQLite instance is created for every new dataset version present.
|
|
53
56
|
|
|
@@ -81,7 +84,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
81
84
|
except KeyError:
|
|
82
85
|
raise DerivaMLException(f"Dataset {dataset_rid} not found")
|
|
83
86
|
|
|
84
|
-
def __init__(self, minid: DatasetMinid, bag_path: Path):
|
|
87
|
+
def __init__(self, minid: DatasetMinid, bag_path: Path, dbase_path: Path):
|
|
85
88
|
"""Create a new DatabaseModel.
|
|
86
89
|
|
|
87
90
|
Args:
|
|
@@ -92,8 +95,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
92
95
|
self.bag_path = bag_path
|
|
93
96
|
self.minid = minid
|
|
94
97
|
self.dataset_rid = minid.dataset_rid
|
|
95
|
-
|
|
96
|
-
self.dbase_file = dir_path / f"{minid.version_rid}.db"
|
|
98
|
+
self.dbase_file = dbase_path / f"{minid.version_rid}.db"
|
|
97
99
|
self.dbase = sqlite3.connect(self.dbase_file)
|
|
98
100
|
|
|
99
101
|
super().__init__(
|
|
@@ -315,6 +317,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
315
317
|
)
|
|
316
318
|
return datasets
|
|
317
319
|
|
|
320
|
+
def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
|
|
321
|
+
"""Retrieve the contents of the specified table as a dictionary.
|
|
322
|
+
|
|
323
|
+
Args:
|
|
324
|
+
table: Table to retrieve data from. If schema is not provided as part of the table name,
|
|
325
|
+
the method will attempt to locate the schema for the table.
|
|
326
|
+
|
|
327
|
+
Returns:
|
|
328
|
+
A generator producing dictionaries containing the contents of the specified table as name/value pairs.
|
|
329
|
+
"""
|
|
330
|
+
table_name = self.normalize_table_name(table)
|
|
331
|
+
with self.dbase as dbase:
|
|
332
|
+
col_names = [
|
|
333
|
+
c[1]
|
|
334
|
+
for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
|
|
335
|
+
]
|
|
336
|
+
result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
|
|
337
|
+
while row := result.fetchone():
|
|
338
|
+
yield dict(zip(col_names, row))
|
|
339
|
+
|
|
318
340
|
def normalize_table_name(self, table: str) -> str:
|
|
319
341
|
"""Attempt to insert the schema into a table name if it's not provided.
|
|
320
342
|
|
deriva_ml/dataset.py
CHANGED
|
@@ -67,11 +67,12 @@ class Dataset:
|
|
|
67
67
|
|
|
68
68
|
_Logger = logging.getLogger("deriva_ml")
|
|
69
69
|
|
|
70
|
-
def __init__(self, model: DerivaModel, cache_dir: Path):
|
|
70
|
+
def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
|
|
71
71
|
self._model = model
|
|
72
72
|
self._ml_schema = ML_SCHEMA
|
|
73
73
|
self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
|
|
74
74
|
self._cache_dir = cache_dir
|
|
75
|
+
self._working_dir = working_dir
|
|
75
76
|
self._logger = logging.getLogger("deriva_ml")
|
|
76
77
|
|
|
77
78
|
def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
|
|
@@ -92,7 +93,7 @@ class Dataset:
|
|
|
92
93
|
dataset_list: list[DatasetSpec],
|
|
93
94
|
description: Optional[str] = "",
|
|
94
95
|
execution_rid: Optional[RID] = None,
|
|
95
|
-
) ->
|
|
96
|
+
) -> list[dict[str, Any]]:
|
|
96
97
|
schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
|
|
97
98
|
|
|
98
99
|
# Construct version records for insert
|
|
@@ -245,7 +246,7 @@ class Dataset:
|
|
|
245
246
|
DerivaMLException: if provided RID is not to a dataset_table.
|
|
246
247
|
"""
|
|
247
248
|
|
|
248
|
-
# Find all
|
|
249
|
+
# Find all the datasets that are reachable from this dataset and determine their new version numbers.
|
|
249
250
|
related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
|
|
250
251
|
version_update_list = [
|
|
251
252
|
DatasetSpec(
|
|
@@ -254,7 +255,7 @@ class Dataset:
|
|
|
254
255
|
)
|
|
255
256
|
for ds_rid in related_datasets
|
|
256
257
|
]
|
|
257
|
-
|
|
258
|
+
self._insert_dataset_versions(
|
|
258
259
|
version_update_list, description=description, execution_rid=execution_rid
|
|
259
260
|
)
|
|
260
261
|
return [d.version for d in version_update_list if d.rid == dataset_rid][0]
|
|
@@ -751,9 +752,10 @@ class Dataset:
|
|
|
751
752
|
]
|
|
752
753
|
|
|
753
754
|
def _table_paths(
|
|
754
|
-
self,
|
|
755
|
+
self,
|
|
756
|
+
dataset: Optional[DatasetSpec] = None,
|
|
757
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
755
758
|
) -> Iterator[tuple[str, str, Table]]:
|
|
756
|
-
|
|
757
759
|
paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
|
|
758
760
|
|
|
759
761
|
def source_path(path: tuple[Table, ...]):
|
|
@@ -779,17 +781,19 @@ class Dataset:
|
|
|
779
781
|
def _collect_paths(
|
|
780
782
|
self,
|
|
781
783
|
dataset_rid: Optional[RID] = None,
|
|
782
|
-
|
|
784
|
+
snapshot: Optional[Dataset] = None,
|
|
783
785
|
dataset_nesting_depth: Optional[int] = None,
|
|
784
786
|
) -> set[tuple[Table, ...]]:
|
|
787
|
+
snapshot_catalog = snapshot if snapshot else self
|
|
785
788
|
|
|
786
|
-
snapshot_catalog = snapshot_catalog or self
|
|
787
789
|
dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
788
790
|
"Dataset"
|
|
789
791
|
]
|
|
790
792
|
dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
791
793
|
"Dataset_Dataset"
|
|
792
794
|
]
|
|
795
|
+
|
|
796
|
+
# Figure out what types of elements the dataset contains.
|
|
793
797
|
dataset_associations = [
|
|
794
798
|
a
|
|
795
799
|
for a in self.dataset_table.find_associations()
|
|
@@ -812,7 +816,8 @@ class Dataset:
|
|
|
812
816
|
]
|
|
813
817
|
else:
|
|
814
818
|
included_associations = dataset_associations
|
|
815
|
-
|
|
819
|
+
|
|
820
|
+
# Get the paths through the schema and filter out all the dataset paths not used by this dataset.
|
|
816
821
|
paths = {
|
|
817
822
|
tuple(p)
|
|
818
823
|
for p in snapshot_catalog._model._schema_to_paths()
|
|
@@ -826,9 +831,7 @@ class Dataset:
|
|
|
826
831
|
nested_paths = set()
|
|
827
832
|
if dataset_rid:
|
|
828
833
|
for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
|
|
829
|
-
nested_paths |= self._collect_paths(
|
|
830
|
-
c, snapshot_catalog=snapshot_catalog
|
|
831
|
-
)
|
|
834
|
+
nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
|
|
832
835
|
else:
|
|
833
836
|
# Initialize nesting depth if not already provided.
|
|
834
837
|
dataset_nesting_depth = (
|
|
@@ -974,7 +977,7 @@ class Dataset:
|
|
|
974
977
|
if dataset.materialize
|
|
975
978
|
else self._download_dataset_minid(minid)
|
|
976
979
|
)
|
|
977
|
-
return DatabaseModel(minid, bag_path).get_dataset()
|
|
980
|
+
return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
|
|
978
981
|
|
|
979
982
|
def _version_snapshot(self, dataset: DatasetSpec) -> str:
|
|
980
983
|
"""Return a catalog with snapshot for the specified dataset version"""
|
deriva_ml/dataset_bag.py
CHANGED
|
@@ -109,7 +109,7 @@ class DatasetBag:
|
|
|
109
109
|
for ts, on in paths:
|
|
110
110
|
tables = " JOIN ".join(ts)
|
|
111
111
|
on_expression = " and ".join(
|
|
112
|
-
[f"{column_name(
|
|
112
|
+
[f"{column_name(left)}={column_name(right)}" for left, right in on]
|
|
113
113
|
)
|
|
114
114
|
sql.append(
|
|
115
115
|
f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"
|
deriva_ml/demo_catalog.py
CHANGED
|
@@ -5,6 +5,7 @@ import logging
|
|
|
5
5
|
from random import random, randint
|
|
6
6
|
import tempfile
|
|
7
7
|
from tempfile import TemporaryDirectory
|
|
8
|
+
from typing import Optional
|
|
8
9
|
import itertools
|
|
9
10
|
|
|
10
11
|
from deriva.config.acl_config import AclConfig
|
|
@@ -18,7 +19,6 @@ from requests import HTTPError
|
|
|
18
19
|
from deriva_ml import (
|
|
19
20
|
DerivaML,
|
|
20
21
|
ExecutionConfiguration,
|
|
21
|
-
Workflow,
|
|
22
22
|
MLVocab,
|
|
23
23
|
BuiltinTypes,
|
|
24
24
|
ColumnDefinition,
|
|
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
|
|
|
169
169
|
description="Model for our API workflow",
|
|
170
170
|
)
|
|
171
171
|
|
|
172
|
-
api_workflow = ml_instance.
|
|
173
|
-
Workflow
|
|
174
|
-
|
|
175
|
-
url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
|
|
176
|
-
workflow_type="API Workflow",
|
|
177
|
-
)
|
|
172
|
+
api_workflow = ml_instance.create_workflow(
|
|
173
|
+
name="API Workflow",
|
|
174
|
+
workflow_type="API Workflow",
|
|
178
175
|
)
|
|
179
176
|
|
|
180
177
|
api_execution = ml_instance.create_execution(
|
|
@@ -322,7 +319,11 @@ def create_demo_catalog(
|
|
|
322
319
|
|
|
323
320
|
class DemoML(DerivaML):
|
|
324
321
|
def __init__(
|
|
325
|
-
self,
|
|
322
|
+
self,
|
|
323
|
+
hostname,
|
|
324
|
+
catalog_id,
|
|
325
|
+
cache_dir: Optional[str] = None,
|
|
326
|
+
working_dir: Optional[str] = None,
|
|
326
327
|
):
|
|
327
328
|
super().__init__(
|
|
328
329
|
hostname=hostname,
|
deriva_ml/deriva_definitions.py
CHANGED
|
@@ -8,7 +8,7 @@ from enum import Enum
|
|
|
8
8
|
from typing import Any, Iterable, Optional, Annotated
|
|
9
9
|
|
|
10
10
|
import deriva.core.ermrest_model as em
|
|
11
|
-
from urllib.parse import urlparse
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
12
|
from deriva.core.ermrest_model import builtin_types
|
|
13
13
|
from pydantic import (
|
|
14
14
|
BaseModel,
|
|
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
|
|
|
139
139
|
if url_parts.scheme == "tag":
|
|
140
140
|
return v
|
|
141
141
|
elif not url_parts.scheme:
|
|
142
|
-
return f
|
|
142
|
+
return f"tag://{gethostname()},{date.today()}:file://{v}"
|
|
143
143
|
else:
|
|
144
144
|
raise ValidationError("url is not a file URL")
|
|
145
145
|
|
|
146
146
|
@model_serializer()
|
|
147
147
|
def serialize_filespec(self):
|
|
148
|
-
return {
|
|
148
|
+
return {
|
|
149
|
+
"URL": self.url,
|
|
150
|
+
"Description": self.description,
|
|
151
|
+
"MD5": self.md5,
|
|
152
|
+
"Length": self.length,
|
|
153
|
+
}
|
|
149
154
|
|
|
150
155
|
|
|
151
156
|
class VocabularyTerm(BaseModel):
|
deriva_ml/deriva_ml_base.py
CHANGED
|
@@ -32,6 +32,7 @@ from deriva.core.deriva_server import DerivaServer
|
|
|
32
32
|
from deriva.core.ermrest_catalog import ResolveRidResult
|
|
33
33
|
from deriva.core.ermrest_model import Key, Table
|
|
34
34
|
from deriva.core.hatrac_store import HatracStore
|
|
35
|
+
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
|
|
35
36
|
from pydantic import validate_call, ConfigDict
|
|
36
37
|
from requests import RequestException
|
|
37
38
|
|
|
@@ -70,17 +71,29 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
70
71
|
try:
|
|
71
72
|
from IPython import get_ipython
|
|
72
73
|
except ImportError: # Graceful fallback if IPython isn't installed.
|
|
73
|
-
|
|
74
|
+
|
|
75
|
+
def get_ipython():
|
|
76
|
+
"""Dummy routine in case you are not running in IPython."""
|
|
77
|
+
return None
|
|
78
|
+
|
|
74
79
|
|
|
75
80
|
try:
|
|
76
81
|
from jupyter_server.serverapp import list_running_servers
|
|
77
82
|
except ImportError:
|
|
78
|
-
|
|
83
|
+
|
|
84
|
+
def list_running_servers():
|
|
85
|
+
"""Dummy routine in case you are not running in Jupyter."""
|
|
86
|
+
return []
|
|
87
|
+
|
|
79
88
|
|
|
80
89
|
try:
|
|
81
90
|
from ipykernel import get_connection_file
|
|
82
91
|
except ImportError:
|
|
83
|
-
|
|
92
|
+
|
|
93
|
+
def get_connection_file():
|
|
94
|
+
"""Dummy routine in case you are not running in Jupyter."""
|
|
95
|
+
return ""
|
|
96
|
+
|
|
84
97
|
|
|
85
98
|
if TYPE_CHECKING:
|
|
86
99
|
from .execution import Execution
|
|
@@ -102,8 +115,8 @@ class DerivaML(Dataset):
|
|
|
102
115
|
self,
|
|
103
116
|
hostname: str,
|
|
104
117
|
catalog_id: str | int,
|
|
105
|
-
domain_schema: str = None,
|
|
106
|
-
project_name: str = None,
|
|
118
|
+
domain_schema: Optional[str] = None,
|
|
119
|
+
project_name: Optional[str] = None,
|
|
107
120
|
cache_dir: Optional[str] = None,
|
|
108
121
|
working_dir: Optional[str] = None,
|
|
109
122
|
model_version: str = "1",
|
|
@@ -150,7 +163,7 @@ class DerivaML(Dataset):
|
|
|
150
163
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
151
164
|
|
|
152
165
|
# Initialize dataset class.
|
|
153
|
-
super().__init__(self.model, self.cache_dir)
|
|
166
|
+
super().__init__(self.model, self.cache_dir, self.working_dir)
|
|
154
167
|
self._logger = logging.getLogger("deriva_ml")
|
|
155
168
|
self._logger.setLevel(logging_level)
|
|
156
169
|
|
|
@@ -205,9 +218,8 @@ class DerivaML(Dataset):
|
|
|
205
218
|
except subprocess.CalledProcessError:
|
|
206
219
|
self._logger.error("nbstripout is not found.")
|
|
207
220
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
221
|
+
@staticmethod
|
|
222
|
+
def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
211
223
|
"""Return the (server, session) pair for the current notebook; either element may be None."""
|
|
212
224
|
# Get the kernel's connection file and extract the kernel ID
|
|
213
225
|
try:
|
|
@@ -245,7 +257,7 @@ class DerivaML(Dataset):
|
|
|
245
257
|
def _get_notebook_path(self) -> Path | None:
|
|
246
258
|
"""Return the absolute path of the current notebook."""
|
|
247
259
|
|
|
248
|
-
server, session =
|
|
260
|
+
server, session = DerivaML._get_notebook_session()
|
|
249
261
|
if server and session:
|
|
250
262
|
self._check_nbstrip_status()
|
|
251
263
|
relative_path = session["notebook"]["path"]
|
|
@@ -267,7 +279,7 @@ class DerivaML(Dataset):
|
|
|
267
279
|
) # Get the caller's filename, which is two up the stack from here.
|
|
268
280
|
else:
|
|
269
281
|
raise DerivaMLException(
|
|
270
|
-
|
|
282
|
+
"Looking for caller failed"
|
|
271
283
|
) # Stack is too shallow
|
|
272
284
|
return filename, is_notebook
|
|
273
285
|
|
|
@@ -335,7 +347,7 @@ class DerivaML(Dataset):
|
|
|
335
347
|
)
|
|
336
348
|
|
|
337
349
|
def asset_dir(
|
|
338
|
-
self, table: str | Table, prefix: str | Path = None
|
|
350
|
+
self, table: str | Table, prefix: Optional[str | Path] = None
|
|
339
351
|
) -> UploadAssetDirectory:
|
|
340
352
|
"""Return a local file path in which to place a files for an asset table. T
|
|
341
353
|
|
|
@@ -369,6 +381,29 @@ class DerivaML(Dataset):
|
|
|
369
381
|
"""
|
|
370
382
|
return self.cache_dir if cached else self.working_dir
|
|
371
383
|
|
|
384
|
+
@staticmethod
|
|
385
|
+
def globus_login(host: str) -> None:
|
|
386
|
+
"""Log into the specified host using Globus.
|
|
387
|
+
|
|
388
|
+
Args:
|
|
389
|
+
host:
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
|
|
393
|
+
"""
|
|
394
|
+
gnl = GlobusNativeLogin(host=host)
|
|
395
|
+
if gnl.is_logged_in([host]):
|
|
396
|
+
print("You are already logged in.")
|
|
397
|
+
else:
|
|
398
|
+
gnl.login(
|
|
399
|
+
[host],
|
|
400
|
+
no_local_server=True,
|
|
401
|
+
no_browser=True,
|
|
402
|
+
refresh_tokens=True,
|
|
403
|
+
update_bdbag_keychain=True,
|
|
404
|
+
)
|
|
405
|
+
print("Login Successful")
|
|
406
|
+
|
|
372
407
|
def chaise_url(self, table: RID | Table) -> str:
|
|
373
408
|
"""Return a Chaise URL to the specified table.
|
|
374
409
|
|
|
@@ -379,15 +414,15 @@ class DerivaML(Dataset):
|
|
|
379
414
|
Returns:
|
|
380
415
|
URL to the table in Chaise format.
|
|
381
416
|
"""
|
|
417
|
+
table_obj = self.model.name_to_table(table)
|
|
382
418
|
try:
|
|
383
|
-
table = self.model.name_to_table(table)
|
|
384
419
|
uri = self.catalog.get_server_uri().replace(
|
|
385
420
|
"ermrest/catalog/", "chaise/recordset/#"
|
|
386
421
|
)
|
|
387
422
|
except DerivaMLException:
|
|
388
423
|
# Perhaps we have a RID....
|
|
389
424
|
uri = self.cite(table)
|
|
390
|
-
return f"{uri}/{urlquote(
|
|
425
|
+
return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
|
|
391
426
|
|
|
392
427
|
def cite(self, entity: dict | str) -> str:
|
|
393
428
|
"""Return a citation URL for the provided entity.
|
|
@@ -401,7 +436,9 @@ class DerivaML(Dataset):
|
|
|
401
436
|
Raises:
|
|
402
437
|
DerivaMLException: if provided RID does not exist.
|
|
403
438
|
"""
|
|
404
|
-
if entity.startswith(
|
|
439
|
+
if isinstance(entity, str) and entity.startswith(
|
|
440
|
+
f"https://{self.host_name}/id/{self.catalog_id}/"
|
|
441
|
+
):
|
|
405
442
|
# Already got a citation...
|
|
406
443
|
return entity
|
|
407
444
|
try:
|
|
@@ -498,9 +535,9 @@ class DerivaML(Dataset):
|
|
|
498
535
|
def create_asset(
|
|
499
536
|
self,
|
|
500
537
|
asset_name: str,
|
|
501
|
-
column_defs: Iterable[ColumnDefinition] = None,
|
|
538
|
+
column_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
502
539
|
comment: str = "",
|
|
503
|
-
schema: str = None,
|
|
540
|
+
schema: Optional[str] = None,
|
|
504
541
|
) -> Table:
|
|
505
542
|
"""Create an asset table with the given asset name.
|
|
506
543
|
|
|
@@ -532,9 +569,9 @@ class DerivaML(Dataset):
|
|
|
532
569
|
self,
|
|
533
570
|
target_table: Table | str,
|
|
534
571
|
feature_name: str,
|
|
535
|
-
terms: list[Table | str] = None,
|
|
536
|
-
assets: list[Table | str] = None,
|
|
537
|
-
metadata: Iterable[ColumnDefinition | Table | Key | str] = None,
|
|
572
|
+
terms: Optional[list[Table | str]] = None,
|
|
573
|
+
assets: Optional[list[Table | str]] = None,
|
|
574
|
+
metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
|
|
538
575
|
optional: Optional[list[str]] = None,
|
|
539
576
|
comment: str = "",
|
|
540
577
|
) -> type[FeatureRecord]:
|
|
@@ -899,6 +936,7 @@ class DerivaML(Dataset):
|
|
|
899
936
|
"""
|
|
900
937
|
|
|
901
938
|
def path_to_asset(path: str) -> str:
|
|
939
|
+
"""Pull the asset name out of a path to that asset in the filesystem"""
|
|
902
940
|
components = path.split("/")
|
|
903
941
|
return components[
|
|
904
942
|
components.index("asset") + 2
|
|
@@ -963,6 +1001,7 @@ class DerivaML(Dataset):
|
|
|
963
1001
|
)
|
|
964
1002
|
|
|
965
1003
|
def check_file_type(dtype: str) -> bool:
|
|
1004
|
+
"""Make sure that the specified string is either the name or synonym for a file type term."""
|
|
966
1005
|
for term in defined_types:
|
|
967
1006
|
if dtype == term.name or (term.synonyms and file_type in term.synonyms):
|
|
968
1007
|
return True
|
|
@@ -1098,7 +1137,7 @@ class DerivaML(Dataset):
|
|
|
1098
1137
|
|
|
1099
1138
|
def create_workflow(
|
|
1100
1139
|
self, name: str, workflow_type: str, description: str = "", create: bool = True
|
|
1101
|
-
) -> RID:
|
|
1140
|
+
) -> RID | None:
|
|
1102
1141
|
"""Identify current executing program and return a workflow RID for it
|
|
1103
1142
|
|
|
1104
1143
|
Determine the notebook or script that is currently being executed. Assume that this is
|
|
@@ -1166,7 +1205,7 @@ class DerivaML(Dataset):
|
|
|
1166
1205
|
)
|
|
1167
1206
|
github_url = result.stdout.strip().removesuffix(".git")
|
|
1168
1207
|
except subprocess.CalledProcessError:
|
|
1169
|
-
raise DerivaMLException(
|
|
1208
|
+
raise DerivaMLException("No GIT remote found")
|
|
1170
1209
|
|
|
1171
1210
|
# Find the root directory for the repository
|
|
1172
1211
|
repo_root = self._get_git_root()
|
|
@@ -1188,7 +1227,7 @@ class DerivaML(Dataset):
|
|
|
1188
1227
|
|
|
1189
1228
|
"""Get SHA-1 hash of latest commit of the file in the repository"""
|
|
1190
1229
|
result = subprocess.run(
|
|
1191
|
-
["git", "log", "-n", "1", "--pretty=format:%H
|
|
1230
|
+
["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
|
|
1192
1231
|
cwd=self.executable_path.parent,
|
|
1193
1232
|
capture_output=True,
|
|
1194
1233
|
text=True,
|
deriva_ml/deriva_model.py
CHANGED
|
@@ -21,7 +21,7 @@ from .deriva_definitions import (
|
|
|
21
21
|
|
|
22
22
|
from collections import Counter
|
|
23
23
|
from pydantic import validate_call, ConfigDict
|
|
24
|
-
from typing import Iterable
|
|
24
|
+
from typing import Iterable, Optional
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class DerivaModel:
|
|
@@ -267,7 +267,7 @@ class DerivaModel:
|
|
|
267
267
|
def _schema_to_paths(
|
|
268
268
|
self,
|
|
269
269
|
root: Table = None,
|
|
270
|
-
path: list[Table] = None,
|
|
270
|
+
path: Optional[list[Table]] = None,
|
|
271
271
|
) -> list[list[Table]]:
|
|
272
272
|
"""Recursively walk over the domain schema graph and extend the current path.
|
|
273
273
|
|
deriva_ml/execution.py
CHANGED
|
@@ -54,7 +54,9 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
54
54
|
try:
|
|
55
55
|
from jupyter_server.serverapp import list_running_servers
|
|
56
56
|
except ImportError:
|
|
57
|
-
|
|
57
|
+
|
|
58
|
+
def list_running_servers():
|
|
59
|
+
return []
|
|
58
60
|
|
|
59
61
|
|
|
60
62
|
class Execution:
|
|
@@ -155,7 +157,6 @@ class Execution:
|
|
|
155
157
|
self._initialize_execution(reload)
|
|
156
158
|
|
|
157
159
|
def _save_runtime_environment(self):
|
|
158
|
-
|
|
159
160
|
runtime_env_path = ExecMetadataVocab.runtime_env.value
|
|
160
161
|
runtime_env_dir = self.execution_metadata_path(runtime_env_path)
|
|
161
162
|
with NamedTemporaryFile(
|
|
@@ -267,7 +268,7 @@ class Execution:
|
|
|
267
268
|
# Execution metadata cannot be in a directory, so map path into filename.
|
|
268
269
|
checkpoint_path = (
|
|
269
270
|
self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
|
|
270
|
-
/ f"{notebook_name.replace('/','_')}.checkpoint"
|
|
271
|
+
/ f"{notebook_name.replace('/', '_')}.checkpoint"
|
|
271
272
|
)
|
|
272
273
|
with open(checkpoint_path, "w", encoding="utf-8") as f:
|
|
273
274
|
json.dump(notebook_content, f)
|
|
@@ -359,7 +360,7 @@ class Execution:
|
|
|
359
360
|
if m := is_feature_asset_dir(p):
|
|
360
361
|
try:
|
|
361
362
|
self.update_status(
|
|
362
|
-
Status.running, f
|
|
363
|
+
Status.running, f"Uploading feature {m['feature_name']}..."
|
|
363
364
|
)
|
|
364
365
|
feature_assets[m["target_table"], m["feature_name"]] = (
|
|
365
366
|
self._ml_object.upload_assets(p)
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Optional
|
|
5
5
|
|
|
6
6
|
from pydantic import (
|
|
7
7
|
BaseModel,
|
|
8
8
|
conlist,
|
|
9
|
-
ConfigDict,
|
|
9
|
+
ConfigDict,
|
|
10
10
|
)
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
|
|
@@ -36,7 +36,6 @@ class Workflow(BaseModel):
|
|
|
36
36
|
checksum: Optional[str]
|
|
37
37
|
|
|
38
38
|
|
|
39
|
-
|
|
40
39
|
class ExecutionConfiguration(BaseModel):
|
|
41
40
|
"""Define the parameters that are used to configure a specific execution.
|
|
42
41
|
|
|
@@ -69,23 +68,21 @@ class ExecutionConfiguration(BaseModel):
|
|
|
69
68
|
config = json.load(fd)
|
|
70
69
|
return ExecutionConfiguration.model_validate(config)
|
|
71
70
|
|
|
72
|
-
def download_execution_configuration(
|
|
73
|
-
|
|
74
|
-
) -> ExecutionConfiguration:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
# hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
|
|
91
|
-
# return ExecutionConfiguration.load_configuration(Path(dest_file.name))
|
|
71
|
+
# def download_execution_configuration(
|
|
72
|
+
# self, configuration_rid: RID
|
|
73
|
+
# ) -> ExecutionConfiguration:
|
|
74
|
+
# """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
|
|
75
|
+
# configuration in hatrac
|
|
76
|
+
#
|
|
77
|
+
# Args:
|
|
78
|
+
# configuration_rid: RID that should be to an asset table that refers to an execution configuration
|
|
79
|
+
#
|
|
80
|
+
# Returns:
|
|
81
|
+
# An ExecutionConfiguration object configured by the parameters in the configuration file.
|
|
82
|
+
# """
|
|
83
|
+
# AssertionError("Not Implemented")
|
|
84
|
+
# configuration = self.retrieve_rid(configuration_rid)
|
|
85
|
+
# with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
|
|
86
|
+
# hs = HatracStore("https", self.host_name, self.credential)
|
|
87
|
+
# hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
|
|
88
|
+
# return ExecutionConfiguration.load_configuration(Path(dest_file.name))
|
|
@@ -240,7 +240,7 @@ def main():
|
|
|
240
240
|
parser.add_argument("--catalog_id", type=str, required=True)
|
|
241
241
|
parser.add_argument("--schema_name", type=str, required=True)
|
|
242
242
|
args = parser.parse_args()
|
|
243
|
-
generate_annotation(args.catalog_id
|
|
243
|
+
generate_annotation(args.catalog_id)
|
|
244
244
|
|
|
245
245
|
|
|
246
246
|
if __name__ == "__main__":
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import sys
|
|
3
|
+
from typing import Optional
|
|
3
4
|
|
|
4
5
|
from deriva.core import DerivaServer, get_credential
|
|
5
6
|
from deriva.core.ermrest_model import Model
|
|
@@ -32,7 +33,7 @@ def define_table_workflow(workflow_annotation: dict):
|
|
|
32
33
|
)
|
|
33
34
|
|
|
34
35
|
|
|
35
|
-
def define_table_dataset(dataset_annotation: dict = None):
|
|
36
|
+
def define_table_dataset(dataset_annotation: Optional[dict] = None):
|
|
36
37
|
return Table.define(
|
|
37
38
|
tname="Dataset",
|
|
38
39
|
column_defs=[
|
|
@@ -154,7 +155,7 @@ def create_www_schema(model: Model):
|
|
|
154
155
|
|
|
155
156
|
|
|
156
157
|
def create_ml_schema(
|
|
157
|
-
model: Model, schema_name: str = "deriva-ml", project_name: str = None
|
|
158
|
+
model: Model, schema_name: str = "deriva-ml", project_name: Optional[str] = None
|
|
158
159
|
):
|
|
159
160
|
if model.schemas.get(schema_name):
|
|
160
161
|
model.schemas[schema_name].drop(cascade=True)
|
deriva_ml/upload.py
CHANGED
|
@@ -483,7 +483,7 @@ def upload_directory(
|
|
|
483
483
|
|
|
484
484
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
485
485
|
def upload_asset(
|
|
486
|
-
model: DerivaModel, file: Path | str, table: Table
|
|
486
|
+
model: DerivaModel, file: Path | str, table: Table, **kwargs: Any
|
|
487
487
|
) -> dict:
|
|
488
488
|
"""Upload the specified file into Hatrac and update the associated asset table.
|
|
489
489
|
|