deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
- deriva_ml-1.14.27.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
deriva_ml/{database_model.py → model/database.py}
CHANGED

@@ -1,23 +1,25 @@
-"""
-of a dataset and a
+"""This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface
+between the BDBag representation of a dataset and a sqlite database in which the contents of the bag are stored.
 """
 
 from __future__ import annotations
 
+import json
 import logging
 import sqlite3
-
 from csv import reader
 from pathlib import Path
-from typing import Any,
+from typing import Any, Generator, Optional
 from urllib.parse import urlparse
 
 from deriva.core.ermrest_model import Model
 
-from .
-from .
-from .
-from .dataset_bag import DatasetBag
+from deriva_ml.core.definitions import ML_SCHEMA, RID, MLVocab
+from deriva_ml.core.exceptions import DerivaMLException
+from deriva_ml.dataset.aux_classes import DatasetMinid, DatasetVersion
+from deriva_ml.dataset.dataset_bag import DatasetBag
+from deriva_ml.model.catalog import DerivaModel
+from deriva_ml.model.sql_mapper import SQLMapper
 
 try:
     from icecream import ic

@@ -42,13 +44,13 @@ class DatabaseModelMeta(type):
 class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     """Read in the contents of a BDBag and create a local SQLite database.
 
-    As part of its initialization, this routine will create a sqlite database that has the contents of all the
-    in the dataset_table. In addition, any asset tables will the `Filename` column remapped to have the path
-    copy of the file. In addition, a local version of the ERMRest model that as used to generate the
-    available.
+    As part of its initialization, this routine will create a sqlite database that has the contents of all the
+    tables in the dataset_table. In addition, any asset tables will the `Filename` column remapped to have the path
+    of the local copy of the file. In addition, a local version of the ERMRest model that as used to generate the
+    dataset_table is available.
 
     The sqlite database will not have any foreign key constraints applied, however, foreign-key relationships can be
-    found by looking in the ERMRest model. In addition, as
+    found by looking in the ERMRest model. In addition, as sqlite doesn't support schema, Ermrest schema are added
     to the table name using the convention SchemaName:TableName. Methods in DatasetBag that have table names as the
     argument will perform the appropriate name mappings.
 

@@ -56,8 +58,8 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
 
-    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked,
-
+    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked,
+    and a new sqlite instance is created for every new dataset version present.
 
     Attributes:
         bag_path (Path): path to the local copy of the BDBag

@@ -103,10 +105,11 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         self.dbase_file = dbase_path / f"{minid.version_rid}.db"
         self.dbase = sqlite3.connect(self.dbase_file)
 
-
-
-
+        schema_file = self.bag_path / "data/schema.json"
+        with schema_file.open("r") as f:
+            self.snaptime = json.load(f)["snaptime"]
 
+        super().__init__(Model.fromfile("file-system", self.bag_path / "data/schema.json"))
         self._logger = logging.getLogger("deriva_ml")
         self._load_model()
         self.ml_schema = ML_SCHEMA

@@ -121,28 +124,22 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         sql_dataset = self.normalize_table_name("Dataset_Version")
         with self.dbase:
             dataset_versions = [
-                t
-                for t in self.dbase.execute(
-                    f'SELECT "Dataset", "Version" FROM "{sql_dataset}"'
-                ).fetchall()
+                t for t in self.dbase.execute(f'SELECT "Dataset", "Version" FROM "{sql_dataset}"').fetchall()
             ]
-            dataset_versions = [
-                (v[0], DatasetVersion.parse(v[1])) for v in dataset_versions
-            ]
+            dataset_versions = [(v[0], DatasetVersion.parse(v[1])) for v in dataset_versions]
 
             # Get most current version of each rid
             self.bag_rids = {}
             for rid, version in dataset_versions:
-                self.bag_rids[rid] = max(
-                    self.bag_rids.get(rid, DatasetVersion(0, 1, 0)), version
-                )
+                self.bag_rids[rid] = max(self.bag_rids.get(rid, DatasetVersion(0, 1, 0)), version)
 
         for dataset_rid, dataset_version in self.bag_rids.items():
             version_list = DatabaseModel._rid_map.setdefault(dataset_rid, [])
             version_list.append((dataset_version, self))
 
     def _load_model(self) -> None:
-        """Create a sqlite database schema that contains all the tables within the catalog from which the BDBag
+        """Create a sqlite database schema that contains all the tables within the catalog from which the BDBag
+        was created."""
         with self.dbase:
             for t in self.model.schemas[self.domain_schema].tables.values():
                 self.dbase.execute(t.sqlite3_ddl())

@@ -153,7 +150,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         """Load a SQLite database from a bdbag. THis is done by looking for all the CSV files in the bdbag directory.
 
         If the file is for an asset table, update the FileName column of the table to have the local file path for
-        the materialized file. Then load into the
+        the materialized file. Then load into the sqlite database.
         Note: none of the foreign key constraints are included in the database.
         """
         dpath = self.bag_path / "data"

@@ -162,11 +159,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         # Find all the CSV files in the subdirectory and load each file into the database.
         for csv_file in Path(dpath).rglob("*.csv"):
             table = csv_file.stem
-            schema = (
-                self.domain_schema
-                if table in self.model.schemas[self.domain_schema].tables
-                else self.ml_schema
-            )
+            schema = self.domain_schema if table in self.model.schemas[self.domain_schema].tables else self.ml_schema
 
             with csv_file.open(newline="") as csvfile:
                 csv_reader = reader(csvfile)

@@ -174,19 +167,14 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
 
                 # Determine which columns in the table has the Filename and the URL
                 asset_indexes = (
-                    (column_names.index("Filename"), column_names.index("URL"))
-                    if self._is_asset(table)
-                    else None
+                    (column_names.index("Filename"), column_names.index("URL")) if self._is_asset(table) else None
                 )
 
-                value_template = ",".join(
-                    ["?"] * len(column_names)
-                )  # SQL placeholder for row (?,?..)
+                value_template = ",".join(["?"] * len(column_names))  # SQL placeholder for row (?,?..)
                 column_list = ",".join([f'"{c}"' for c in column_names])
                 with self.dbase:
                     object_table = (
-                        self._localize_asset(o, asset_indexes, asset_map)
-                        for o in csv_reader
+                        self._localize_asset(o, asset_indexes, asset_map, table == "Dataset") for o in csv_reader
                     )
                     self.dbase.executemany(
                         f'INSERT OR REPLACE INTO "{schema}:{table}" ({column_list}) VALUES ({value_template})',
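The hunk above keeps the module's bulk-load idiom: build one "?" placeholder per column, then stream rows into executemany through a generator so the CSV contents are never fully materialized. A minimal standalone sketch of that idiom, with an invented table and rows rather than the real catalog model:

import sqlite3

# Hypothetical two-column table, named with the Schema:Table convention.
conn = sqlite3.connect(":memory:")
conn.execute('CREATE TABLE "demo:T" ("RID" TEXT, "Name" TEXT)')

column_names = ["RID", "Name"]
value_template = ",".join(["?"] * len(column_names))  # SQL placeholder for row (?,?)
rows = (r for r in [("1-a", "x"), ("1-b", "y")])  # lazy generator, as in the loading code above
conn.executemany(f'INSERT OR REPLACE INTO "demo:T" VALUES ({value_template})', rows)
print(conn.execute('SELECT COUNT(*) FROM "demo:T"').fetchone())  # -> (2,)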
@@ -202,7 +190,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         """
         fetch_map = {}
         try:
-            with open(self.bag_path / "fetch.txt", newline="\n") as fetch_file:
+            with Path.open(self.bag_path / "fetch.txt", newline="\n") as fetch_file:
                 for row in fetch_file:
                     # Rows in fetch.text are tab seperated with URL filename.
                     fields = row.split("\t")

@@ -224,18 +212,12 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
             Boolean that is true if the table looks like an asset table.
         """
         asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-        sname = (
-            self.domain_schema
-            if table_name in self.model.schemas[self.domain_schema].tables
-            else self.ml_schema
-        )
+        sname = self.domain_schema if table_name in self.model.schemas[self.domain_schema].tables else self.ml_schema
         asset_table = self.model.schemas[sname].tables[table_name]
         return asset_columns.issubset({c.name for c in asset_table.columns})
 
     @staticmethod
-    def _localize_asset(
-        o: list, indexes: tuple[int, int], asset_map: dict[str, str]
-    ) -> tuple:
+    def _localize_asset(o: list, indexes: tuple[int, int], asset_map: dict[str, str], debug: bool = False) -> tuple:
         """Given a list of column values for a table, replace the FileName column with the local file name based on
         the URL value.
 

@@ -295,24 +277,21 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         Returns:
             list of currently available datasets.
         """
-        atable = next(
-            self.model.schemas[ML_SCHEMA]
-            .tables[MLVocab.dataset_type]
-            .find_associations()
-        ).name
+        atable = next(self.model.schemas[ML_SCHEMA].tables[MLVocab.dataset_type].find_associations()).name
 
         # Get a list of all the dataset_type values associated with this dataset_table.
         datasets = []
-        ds_types = list(self.
-        for dataset in self.
+        ds_types = list(self._get_table(atable))
+        for dataset in self._get_table("Dataset"):
             my_types = [t for t in ds_types if t["Dataset"] == dataset["RID"]]
-            datasets.append(
-                dataset
-                | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in my_types]}
-            )
+            datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in my_types]})
         return datasets
 
-    def
+    def list_dataset_members(self, dataset_rid: RID) -> dict[str, Any]:
+        """Returns a list of all the dataset_table entries associated with a dataset."""
+        return self.get_dataset(dataset_rid).list_dataset_members()
+
+    def _get_table(self, table: str) -> Generator[dict[str, Any], None, None]:
         """Retrieve the contents of the specified table as a dictionary.
 
         Args:

@@ -323,14 +302,14 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
             A generator producing dictionaries containing the contents of the specified table as name/value pairs.
         """
         table_name = self.normalize_table_name(table)
-
-
-
-
-        ]
+        table = self.name_to_table(table)
+
+        with self.dbase as _dbase:
+            mapper = SQLMapper(self, table.name)
             result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-
-
+
+            while (row := result.fetchone()) is not None:
+                yield mapper.transform_tuple(row)
 
     def normalize_table_name(self, table: str) -> str:
         """Attempt to insert the schema into a table name if it's not provided.

@@ -342,13 +321,12 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
             table name with schema included.
 
         """
-        sname = ""
         try:
             [sname, tname] = table.split(":")
         except ValueError:
             tname = table
-            for sname
-                if table in
+            for sname in [self.domain_schema, self.ml_schema, "WWW"]:  # Be careful of File table.
+                if table in self.model.schemas[sname].tables:
                     break
         try:
             _ = self.model.schemas[sname].tables[tname]
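normalize_table_name implements the convention described in the class docstring: SQLite has no notion of schemas, so an unqualified table name is resolved by probing the domain schema, the ML schema, and "WWW" in order, then folded into a single SchemaName:TableName identifier. A self-contained sketch of that lookup, using a hypothetical schema map in place of the real ERMrest model:

# Hypothetical stand-in for self.model.schemas[...].tables
schemas = {
    "demo-domain": {"Image", "Subject"},
    "deriva-ml": {"Dataset", "Dataset_Version"},
    "WWW": {"File"},
}

def normalize(table: str) -> str:
    if ":" in table:
        return table  # already qualified as Schema:Table
    for sname in ["demo-domain", "deriva-ml", "WWW"]:  # same probe order as the method
        if table in schemas[sname]:
            return f"{sname}:{table}"
    raise KeyError(table)

print(normalize("Dataset"))  # -> deriva-ml:Dataset
print(normalize("File"))     # -> WWW:File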
deriva_ml/model/sql_mapper.py
ADDED

@@ -0,0 +1,44 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, Sequence
+
+if TYPE_CHECKING:
+    from deriva_ml.model.database import DatabaseModel
+
+try:
+    from icecream import ic
+except ImportError:  # Graceful fallback if IceCream isn't installed.
+    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+
+class SQLMapper:
+    def __init__(self, database: "DatabaseModel", table: str) -> None:
+        table_name = database.normalize_table_name(table)
+        schema, table = table_name.split(":")
+
+        with database.dbase as dbase:
+            self.col_names = [c[1] for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
+
+        self.boolean_columns = [
+            self.col_names.index(c.name)
+            for c in database.model.schemas[schema].tables[table].columns
+            if c.type.typename == "boolean"
+        ]
+        self.time_columns = [
+            self.col_names.index(c.name)
+            for c in database.model.schemas[schema].tables[table].columns
+            if c.type.typename in ["ermrest_rct", "ermrest_rmt"]
+        ]
+
+    def _map_value(self, idx: int, v: Any) -> Any:
+        """
+        Return a new value based on `data` where, for each index in `idxs`,
+        """
+        tf_map = {"t": True, "f": False}
+        if idx in self.boolean_columns:
+            return tf_map.get(v, v)
+        if idx in self.time_columns:
+            return datetime.strptime(v, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc).isoformat()
+        return v
+
+    def transform_tuple(self, data: Sequence[Any]) -> Any:
+        return dict(zip(self.col_names, tuple(self._map_value(i, v) for i, v in enumerate(data))))
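transform_tuple exists because rows loaded from the bag's CSVs come back from SQLite as plain strings; _map_value converts ERMrest-style booleans ("t"/"f") and ermrest_rct/ermrest_rmt timestamps on the way out. A self-contained restatement of those two conversions (the sample timestamp is invented):

from datetime import datetime, timezone

tf_map = {"t": True, "f": False}
print(tf_map.get("t", "t"))  # -> True; non-boolean values pass through unchanged

raw = "2024-05-01 12:30:45.123456+00"  # shape expected by the "+00" format string
ts = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc)
print(ts.isoformat())  # -> 2024-05-01T12:30:45.123456+00:00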
deriva_ml/run_notebook.py
CHANGED
@@ -1,16 +1,16 @@
 """Module to run a notebook using papermill"""
 
-from datetime import datetime
 import json
 import os
-import papermill as pm
-from pathlib import Path
-import regex as re
 import tempfile
+from datetime import datetime
+from pathlib import Path
 
-
+import papermill as pm
+import regex as re
 from deriva.core import BaseCLI
-
+
+from deriva_ml import DerivaML, ExecAssetType, MLAsset, Workflow
 
 
 class DerivaMLRunNotebookCLI(BaseCLI):

@@ -91,7 +91,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
         if not (parameter_file.is_file() and parameter_file.suffix == ".json"):
             print("Parameter file must be an json file.")
             exit(1)
-        with open(
+        with Path(parameter_file).open("r") as f:
             parameters |= json.load(f)
 
         if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):

@@ -101,7 +101,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
         os.environ["DERIVA_HOST"] = args.host
         os.environ["DERIVA_CATALOG_ID"] = args.catalog
 
-        # Create a workflow instance for this specific version of the script.
+        # Create a workflow instance for this specific version of the script.
+        # Return an existing workflow if one is found.
         notebook_parameters = pm.inspect_notebook(notebook_file)
         if args.inspect:
             for param, value in notebook_parameters.items():

@@ -133,8 +134,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             parameters=parameters,
             kernel_name=kernel,
         )
-
-        with open(
+        catalog_id = execution_rid = None
+        with Path(notebook_output).open("r") as f:
             for line in f:
                 if m := re.search(
                     r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",

@@ -161,7 +162,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             file_name=f"notebook-parameters-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
             asset_types=ExecAssetType.input_file.value,
         )
-        with open(
+        with Path(parameter_file).open("w") as f:
             json.dump(parameters, f)
 
         execution.upload_execution_outputs()

@@ -169,6 +170,13 @@ class DerivaMLRunNotebookCLI(BaseCLI):
 
 
 def main():
+    """Main entry point for the notebook runner CLI.
+
+    Creates and runs the DerivaMLRunNotebookCLI instance.
+
+    Returns:
+        None. Executes the CLI.
+    """
     cli = DerivaMLRunNotebookCLI(
         description="Deriva ML Execution Script Demo", epilog=""
     )
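After papermill finishes, the runner scrapes the executed notebook's output for the "Execution RID: ..." line to recover the catalog id and execution RID; the pattern below is the one added in the diff. A short sketch against an invented output line (stdlib re is sufficient for this pattern, though the module itself imports regex as re):

import re

line = "Execution RID: https://demo.example.org/id/55/1-2345"  # hypothetical output line
m = re.search(
    r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
    line,
)
if m:
    print(m["host"], m["catalog_id"], m["execution_rid"])  # -> demo.example.org 55 1-2345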
deriva_ml/{schema_setup → schema}/annotations.py
CHANGED

@@ -1,10 +1,11 @@
 import argparse
-import sys
 
 from deriva.core.ermrest_model import Model, Table
 from deriva.core.utils.core_utils import tag as deriva_tags
-
-from
+
+from deriva_ml.core.constants import DerivaAssetColumns
+from deriva_ml.dataset.upload import bulk_upload_configuration
+from deriva_ml.model.catalog import DerivaModel
 
 
 def catalog_annotation(model: DerivaModel) -> None:

@@ -106,17 +107,12 @@ def catalog_annotation(model: DerivaModel) -> None:
                 }
                 for tname in model.schemas[model.domain_schema].tables
                 # Don't include controlled vocabularies, association tables, or feature tables.
-                if not (
-                    model.is_vocabulary(tname)
-                    or model.is_association(tname, pure=False, max_arity=3)
-                )
+                if not (model.is_vocabulary(tname) or model.is_association(tname, pure=False, max_arity=3))
             ],
         },
         {  # Vocabulary menu which will list all the controlled vocabularies in deriva-ml and domain.
             "name": "Vocabulary",
-            "children": [
-                {"name": f"{ml_schema} Vocabularies", "header": True}
-            ]
+            "children": [{"name": f"{ml_schema} Vocabularies", "header": True}]
             + [
                 {
                     "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",

@@ -186,9 +182,18 @@ def catalog_annotation(model: DerivaModel) -> None:
 
 
 def asset_annotation(asset_table: Table):
+    """Generate annotations for an asset table.
+
+    Args:
+        asset_table: The Table object representing the asset table.
+
+    Returns:
+        A dictionary containing the annotations for the asset table.
+    """
+
     schema = asset_table.schema.name
     asset_name = asset_table.name
-
+    asset_metadata = {c.name for c in asset_table.columns} - DerivaAssetColumns
 
     def fkey_column(column):
         """Map the column name to a FK if a constraint exists on the column"""

@@ -202,9 +207,7 @@ def asset_annotation(asset_table: Table):
     )
 
     annotations = {
-        deriva_tags.table_display: {
-            "row_name": {"row_markdown_pattern": "{{{Filename}}}"}
-        },
+        deriva_tags.table_display: {"row_name": {"row_markdown_pattern": "{{{Filename}}}"}},
         deriva_tags.visible_columns: {
             "*": [
                 "RID",

@@ -236,11 +239,11 @@ def asset_annotation(asset_table: Table):
                     "markdown_name": "Asset Types",
                 },
             ]
-            + [fkey_column(c) for c in
+            + [fkey_column(c) for c in asset_metadata],
         },
     }
     asset_table.annotations.update(annotations)
-    model.apply()
+    asset_table.schema.model.apply()
 
 
 def generate_annotation(model: Model, schema: str) -> dict:

@@ -435,9 +438,7 @@ def generate_annotation(model: Model, schema: str) -> dict:
         },
         deriva_tags.visible_foreign_keys: {"*": []},
         deriva_tags.table_display: {
-            "row_name": {
-                "row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"
-            }
+            "row_name": {"row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"}
         },
     }
 

@@ -451,9 +452,17 @@ def generate_annotation(model: Model, schema: str) -> dict:
 
 
 def main():
-
-
-
+    """Main entry point for the annotations CLI.
+
+    Applies annotations to the ML schema based on command line arguments.
+
+    Returns:
+        None. Executes the CLI.
+    """
+    parser = argparse.ArgumentParser(description="Apply annotations to ML schema")
+    parser.add_argument("hostname", help="Hostname for the catalog")
+    parser.add_argument("catalog_id", help="Catalog ID")
+    parser.add_argument("schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
     args = parser.parse_args()
     generate_annotation(args.catalog_id, args.schema_name)
 
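The new asset_metadata line works by set difference: every column of the asset table that is not one of the built-in asset columns is treated as user metadata and routed through fkey_column. A toy version of that step (the contents of DerivaAssetColumns below are a guess at the built-in set, not the actual constant from deriva_ml.core.constants):

# Hypothetical built-in column set and asset table columns.
DerivaAssetColumns = {"RID", "RCT", "RMT", "RCB", "RMB", "Filename", "URL", "Length", "MD5", "Description"}
columns = {"RID", "Filename", "URL", "Length", "MD5", "Description", "Image_Quality", "Acquisition_Date"}

asset_metadata = columns - DerivaAssetColumns
print(sorted(asset_metadata))  # -> ['Acquisition_Date', 'Image_Quality']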
deriva_ml/schema/check_schema.py
ADDED

@@ -0,0 +1,104 @@
+import json
+import re
+from importlib.resources import files
+from pathlib import Path
+from pprint import pprint
+
+from deepdiff import DeepDiff
+from deriva.core import AttrDict, BaseCLI, get_credential
+from deriva.core.ermrest_catalog import ErmrestCatalog
+
+from deriva_ml.core.definitions import ML_SCHEMA
+from deriva_ml.schema.create_schema import create_ml_catalog
+
+
+def normalize_schema(d):
+    if isinstance(d, dict) or isinstance(d, AttrDict):
+        m = {}
+        for k, v in d.items():
+            if k == "acl_bindings" or k == "annotations" or k == "comment":
+                continue
+            m[k] = normalize_schema(v)
+        return m
+    elif isinstance(d, list):
+        return [normalize_schema(i) for i in d]
+    elif isinstance(d, str):
+        # ID templates for controlled vocabulary
+        if m := re.match("(?P<s>.*):{RID}", d):
+            d = d if m["s"] == "deriva-ml" else "reference-catalog:{RID}" if re.match(".*:{RID}", d) else d
+        return d
+    else:
+        return d
+
+
+def check_ml_schema(hostname, catalog_id, schema_file: Path | None = None):
+    """Check the ML schema against a reference schema file.
+
+    Args:
+        hostname: The hostname of the Deriva catalog.
+        catalog_id: The catalog ID to check.
+        schema_file: Optional path to reference schema file. If None, uses default reference.
+
+    Returns:
+        None. Prints the diff between target and reference schemas.
+    """
+    # schema_file = schema_file or files("deriva-ml.data").joinpath("deriva-ml-reference.json")
+    schema_file = schema_file or files("deriva_ml.schema").joinpath("deriva-ml-reference.json")
+
+    # Now map
+
+    with Path(schema_file).open("r") as f:
+        reference_schema = normalize_schema(json.load(f)["schemas"][ML_SCHEMA])
+
+    catalog = ErmrestCatalog("https", hostname, catalog_id, credentials=get_credential(hostname))
+    target_schema = normalize_schema(catalog.getCatalogModel().schemas[ML_SCHEMA].prejson())
+
+    # Compute the diff
+    diff = DeepDiff(reference_schema, target_schema, ignore_order=True, view="tree")
+    print(f"Diff between {schema_file} and {ML_SCHEMA} schema:")
+    # Pretty-print as JSON
+    pprint(diff, indent=2)
+    return diff
+
+
+def dump_ml_schema(hostname: str, filename: str = "deriva-ml-reference.json") -> None:
+    """Dump the schema of the ML catalog to stdout."""
+    catalog = create_ml_catalog(hostname, "reference-catalog")
+    try:
+        model = catalog.getCatalogModel()
+        print(f"Dumping ML schema to {Path(filename).resolve()}...")
+        with Path(filename).open("w") as f:
+            json.dump(model.prejson(), f, indent=2)
+    finally:
+        catalog.delete_ermrest_catalog(really=True)
+
+
+class CheckMLSchemaCLI(BaseCLI):
+    """Main class to part command line arguments and call model"""
+
+    def __init__(self, description, epilog, **kwargs):
+        BaseCLI.__init__(self, description, epilog, **kwargs)
+
+        self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
+        self.parser.add_argument("--dump", action="store_true", help="Perform execution in dry-run mode.")
+
+    def main(self):
+        """Parse arguments and set up execution environment."""
+        args = self.parse_cli()
+        hostname = args.host
+        catalog_id = args.catalog
+
+        if args.dump:
+            dump_ml_schema(hostname, catalog_id)
+            return
+
+        check_ml_schema(hostname, catalog_id)
+
+
+def main():
+    cli = CheckMLSchemaCLI(description="Check DerivaML Catalog for Compliance", epilog="")
+    cli.main()
+
+
+if __name__ == "__main__":
+    main()
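normalize_schema makes the DeepDiff comparison meaningful by stripping per-catalog noise (acl_bindings, annotations, comment) at every nesting level before the reference and target schemas are compared. A trimmed-down restatement of just the dict/list walk, omitting the vocabulary-ID string rewriting, applied to a toy document:

# Toy schema fragment; keys and values are invented for illustration.
doc = {
    "tables": {
        "Dataset": {
            "comment": "per-catalog text, dropped before diffing",
            "annotations": {"tag:example,2024:display": {}},
            "columns": [{"name": "RID", "acl_bindings": {}}],
        }
    }
}

def strip_noise(d):
    if isinstance(d, dict):
        return {k: strip_noise(v) for k, v in d.items() if k not in ("acl_bindings", "annotations", "comment")}
    if isinstance(d, list):
        return [strip_noise(i) for i in d]
    return d

print(strip_noise(doc))  # -> {'tables': {'Dataset': {'columns': [{'name': 'RID'}]}}}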