deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.27.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
deriva_ml/{database_model.py → model/database.py} CHANGED
@@ -1,23 +1,25 @@
- """Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
- of a dataset and a sqllite database in which the contents of the bag are stored.
+ """This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface
+ between the BDBag representation of a dataset and a sqlite database in which the contents of the bag are stored.
  """

  from __future__ import annotations

+ import json
  import logging
  import sqlite3
-
  from csv import reader
  from pathlib import Path
- from typing import Any, Optional, Generator
+ from typing import Any, Generator, Optional
  from urllib.parse import urlparse

  from deriva.core.ermrest_model import Model

- from .deriva_definitions import ML_SCHEMA, MLVocab, RID, DerivaMLException
- from .dataset_aux_classes import DatasetVersion, DatasetMinid
- from .deriva_model import DerivaModel
- from .dataset_bag import DatasetBag
+ from deriva_ml.core.definitions import ML_SCHEMA, RID, MLVocab
+ from deriva_ml.core.exceptions import DerivaMLException
+ from deriva_ml.dataset.aux_classes import DatasetMinid, DatasetVersion
+ from deriva_ml.dataset.dataset_bag import DatasetBag
+ from deriva_ml.model.catalog import DerivaModel
+ from deriva_ml.model.sql_mapper import SQLMapper

  try:
      from icecream import ic
@@ -42,13 +44,13 @@ class DatabaseModelMeta(type):
  class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
      """Read in the contents of a BDBag and create a local SQLite database.

-     As part of its initialization, this routine will create a sqlite database that has the contents of all the tables
-     in the dataset_table. In addition, any asset tables will the `Filename` column remapped to have the path of the local
-     copy of the file. In addition, a local version of the ERMRest model that as used to generate the dataset_table is
-     available.
+     As part of its initialization, this routine will create a sqlite database that has the contents of all the
+     tables in the dataset_table. In addition, any asset tables will have the `Filename` column remapped to the path
+     of the local copy of the file. In addition, a local version of the ERMRest model that was used to generate the
+     dataset_table is available.

      The sqlite database will not have any foreign key constraints applied, however, foreign-key relationships can be
-     found by looking in the ERMRest model. In addition, as sqllite doesn't support schema, Ermrest schema are added
+     found by looking in the ERMRest model. In addition, as sqlite doesn't support schemas, ERMRest schemas are added
      to the table name using the convention SchemaName:TableName. Methods in DatasetBag that have table names as the
      argument will perform the appropriate name mappings.

@@ -56,8 +58,8 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
      appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
      into DatabaseModels, is kept in the class variable `_rid_map`.

-     Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
-     sqllite instance is created for every new dataset version present.
+     Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked,
+     and a new sqlite instance is created for every new dataset version present.

      Attributes:
          bag_path (Path): path to the local copy of the BDBag
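
The version bookkeeping described in this docstring is easy to see in miniature. A sketch using plain tuples in place of DatasetVersion (the names and the (0, 1, 0) default mirror the loading code further down):

```python
# Sketch of the "newest version per dataset RID" bookkeeping; plain tuples
# stand in for DatasetVersion (an assumption for illustration).
rows = [("1-ab", (1, 0, 0)), ("1-ab", (1, 2, 0)), ("2-cd", (0, 1, 0))]
bag_rids: dict[str, tuple[int, int, int]] = {}
for rid, version in rows:
    bag_rids[rid] = max(bag_rids.get(rid, (0, 1, 0)), version)
print(bag_rids)  # {'1-ab': (1, 2, 0), '2-cd': (0, 1, 0)}
```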
@@ -103,10 +105,11 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          self.dbase_file = dbase_path / f"{minid.version_rid}.db"
          self.dbase = sqlite3.connect(self.dbase_file)

-         super().__init__(
-             Model.fromfile("file-system", self.bag_path / "data/schema.json")
-         )
+         schema_file = self.bag_path / "data/schema.json"
+         with schema_file.open("r") as f:
+             self.snaptime = json.load(f)["snaptime"]

+         super().__init__(Model.fromfile("file-system", self.bag_path / "data/schema.json"))
          self._logger = logging.getLogger("deriva_ml")
          self._load_model()
          self.ml_schema = ML_SCHEMA
@@ -121,28 +124,22 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          sql_dataset = self.normalize_table_name("Dataset_Version")
          with self.dbase:
              dataset_versions = [
-                 t
-                 for t in self.dbase.execute(
-                     f'SELECT "Dataset", "Version" FROM "{sql_dataset}"'
-                 ).fetchall()
+                 t for t in self.dbase.execute(f'SELECT "Dataset", "Version" FROM "{sql_dataset}"').fetchall()
              ]
-             dataset_versions = [
-                 (v[0], DatasetVersion.parse(v[1])) for v in dataset_versions
-             ]
+             dataset_versions = [(v[0], DatasetVersion.parse(v[1])) for v in dataset_versions]

          # Get most current version of each rid
          self.bag_rids = {}
          for rid, version in dataset_versions:
-             self.bag_rids[rid] = max(
-                 self.bag_rids.get(rid, DatasetVersion(0, 1, 0)), version
-             )
+             self.bag_rids[rid] = max(self.bag_rids.get(rid, DatasetVersion(0, 1, 0)), version)

          for dataset_rid, dataset_version in self.bag_rids.items():
              version_list = DatabaseModel._rid_map.setdefault(dataset_rid, [])
              version_list.append((dataset_version, self))

      def _load_model(self) -> None:
-         """Create a sqlite database schema that contains all the tables within the catalog from which the BDBag was created."""
+         """Create a sqlite database schema that contains all the tables within the catalog from which the BDBag
+         was created."""
          with self.dbase:
              for t in self.model.schemas[self.domain_schema].tables.values():
                  self.dbase.execute(t.sqlite3_ddl())
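Because SQLite has no schema support, the DDL generated here bakes the ERMRest schema into a single quoted table name. A minimal sketch of that convention (the table and columns are invented for illustration; this is not the literal `sqlite3_ddl()` output):

```python
import sqlite3

# "Schema:Table" is one quoted SQLite identifier, not a schema qualifier.
db = sqlite3.connect(":memory:")
db.execute('CREATE TABLE "deriva-ml:Dataset" ("RID" TEXT, "Version" TEXT)')
db.execute('INSERT INTO "deriva-ml:Dataset" VALUES (?, ?)', ("1-abc", "1.0.0"))
print(db.execute('SELECT * FROM "deriva-ml:Dataset"').fetchall())
```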
@@ -153,7 +150,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          """Load a SQLite database from a bdbag. This is done by looking for all the CSV files in the bdbag directory.

          If the file is for an asset table, update the FileName column of the table to have the local file path for
-         the materialized file. Then load into the sqllite database.
+         the materialized file. Then load into the sqlite database.
          Note: none of the foreign key constraints are included in the database.
          """
          dpath = self.bag_path / "data"
@@ -162,11 +159,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          # Find all the CSV files in the subdirectory and load each file into the database.
          for csv_file in Path(dpath).rglob("*.csv"):
              table = csv_file.stem
-             schema = (
-                 self.domain_schema
-                 if table in self.model.schemas[self.domain_schema].tables
-                 else self.ml_schema
-             )
+             schema = self.domain_schema if table in self.model.schemas[self.domain_schema].tables else self.ml_schema

              with csv_file.open(newline="") as csvfile:
                  csv_reader = reader(csvfile)
@@ -174,19 +167,14 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):

                  # Determine which columns in the table has the Filename and the URL
                  asset_indexes = (
-                     (column_names.index("Filename"), column_names.index("URL"))
-                     if self._is_asset(table)
-                     else None
+                     (column_names.index("Filename"), column_names.index("URL")) if self._is_asset(table) else None
                  )

-                 value_template = ",".join(
-                     ["?"] * len(column_names)
-                 )  # SQL placeholder for row (?,?..)
+                 value_template = ",".join(["?"] * len(column_names))  # SQL placeholder for row (?,?..)
                  column_list = ",".join([f'"{c}"' for c in column_names])
                  with self.dbase:
                      object_table = (
-                         self._localize_asset(o, asset_indexes, asset_map)
-                         for o in csv_reader
+                         self._localize_asset(o, asset_indexes, asset_map, table == "Dataset") for o in csv_reader
                      )
                      self.dbase.executemany(
                          f'INSERT OR REPLACE INTO "{schema}:{table}" ({column_list}) VALUES ({value_template})',
@@ -202,7 +190,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          """
          fetch_map = {}
          try:
-             with open(self.bag_path / "fetch.txt", newline="\n") as fetch_file:
+             with Path.open(self.bag_path / "fetch.txt", newline="\n") as fetch_file:
                  for row in fetch_file:
                      # Rows in fetch.txt are tab separated: URL, length, filename.
                      fields = row.split("\t")
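For reference, each fetch.txt row in a BDBag is tab separated: remote URL, byte length, bag-relative path. A small sketch of that parse with an invented row:

```python
# fetch.txt row format (BDBag convention): URL <tab> length <tab> path.
line = "https://example.org/hatrac/img.png\t1024\tdata/assets/img.png\n"
url, length, path = line.rstrip("\n").split("\t")
fetch_map = {url: path}
print(fetch_map)  # {'https://example.org/hatrac/img.png': 'data/assets/img.png'}
```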
@@ -224,18 +212,12 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
              Boolean that is true if the table looks like an asset table.
          """
          asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-         sname = (
-             self.domain_schema
-             if table_name in self.model.schemas[self.domain_schema].tables
-             else self.ml_schema
-         )
+         sname = self.domain_schema if table_name in self.model.schemas[self.domain_schema].tables else self.ml_schema
          asset_table = self.model.schemas[sname].tables[table_name]
          return asset_columns.issubset({c.name for c in asset_table.columns})

      @staticmethod
-     def _localize_asset(
-         o: list, indexes: tuple[int, int], asset_map: dict[str, str]
-     ) -> tuple:
+     def _localize_asset(o: list, indexes: tuple[int, int], asset_map: dict[str, str], debug: bool = False) -> tuple:
          """Given a list of column values for a table, replace the FileName column with the local file name based on
          the URL value.

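The localization step then swaps a row's Filename for the materialized path, keyed by its URL. A sketch with hypothetical row data and column indexes:

```python
# Hypothetical row for an asset table: (RID, Filename, URL, Length, MD5).
row = ["1-abc", "img.png", "https://example.org/hatrac/img.png", 1024, "d41d..."]
asset_map = {"https://example.org/hatrac/img.png": "/bag/data/assets/img.png"}
filename_idx, url_idx = 1, 2
row[filename_idx] = asset_map[row[url_idx]]  # Filename now points at the local copy
print(tuple(row))
```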
@@ -295,24 +277,21 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          Returns:
              list of currently available datasets.
          """
-         atable = next(
-             self.model.schemas[ML_SCHEMA]
-             .tables[MLVocab.dataset_type]
-             .find_associations()
-         ).name
+         atable = next(self.model.schemas[ML_SCHEMA].tables[MLVocab.dataset_type].find_associations()).name

          # Get a list of all the dataset_type values associated with this dataset_table.
          datasets = []
-         ds_types = list(self.get_table_as_dict(atable))
-         for dataset in self.get_table_as_dict("Dataset"):
+         ds_types = list(self._get_table(atable))
+         for dataset in self._get_table("Dataset"):
              my_types = [t for t in ds_types if t["Dataset"] == dataset["RID"]]
-             datasets.append(
-                 dataset
-                 | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in my_types]}
-             )
+             datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in my_types]})
          return datasets

-     def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
+     def list_dataset_members(self, dataset_rid: RID) -> dict[str, Any]:
+         """Returns a list of all the dataset_table entries associated with a dataset."""
+         return self.get_dataset(dataset_rid).list_dataset_members()
+
+     def _get_table(self, table: str) -> Generator[dict[str, Any], None, None]:
          """Retrieve the contents of the specified table as a dictionary.

          Args:
@@ -323,14 +302,14 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
              A generator producing dictionaries containing the contents of the specified table as name/value pairs.
          """
          table_name = self.normalize_table_name(table)
-         with self.dbase as dbase:
-             col_names = [
-                 c[1]
-                 for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
-             ]
+         table = self.name_to_table(table)
+
+         with self.dbase as _dbase:
+             mapper = SQLMapper(self, table.name)
              result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-             while row := result.fetchone():
-                 yield dict(zip(col_names, row))
+
+             while (row := result.fetchone()) is not None:
+                 yield mapper.transform_tuple(row)

      def normalize_table_name(self, table: str) -> str:
          """Attempt to insert the schema into a table name if it's not provided.
@@ -342,13 +321,12 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
              table name with schema included.

          """
-         sname = ""
          try:
              [sname, tname] = table.split(":")
          except ValueError:
              tname = table
-             for sname, s in self.model.schemas.items():
-                 if table in s.tables:
+             for sname in [self.domain_schema, self.ml_schema, "WWW"]:  # Be careful of File table.
+                 if table in self.model.schemas[sname].tables:
                      break
          try:
              _ = self.model.schemas[sname].tables[tname]
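To make the lookup order concrete, here is a standalone mimic of normalize_table_name (schema names are placeholders; the real method validates the result against the ERMRest model rather than raising KeyError):

```python
def normalize(table: str, schemas: dict[str, set[str]]) -> str:
    """Illustrative stand-in for DatabaseModel.normalize_table_name."""
    if ":" in table:
        return table  # already schema-qualified
    for sname in ("my-domain", "deriva-ml", "WWW"):  # lookup order, as above
        if table in schemas.get(sname, set()):
            return f"{sname}:{table}"
    raise KeyError(table)

schemas = {"my-domain": {"Image"}, "deriva-ml": {"Dataset"}}
print(normalize("Image", schemas))    # my-domain:Image
print(normalize("Dataset", schemas))  # deriva-ml:Dataset
```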
deriva_ml/model/sql_mapper.py ADDED
@@ -0,0 +1,44 @@
+ from datetime import datetime, timezone
+ from typing import TYPE_CHECKING, Any, Sequence
+
+ if TYPE_CHECKING:
+     from deriva_ml.model.database import DatabaseModel
+
+ try:
+     from icecream import ic
+ except ImportError:  # Graceful fallback if IceCream isn't installed.
+     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+
+ class SQLMapper:
+     def __init__(self, database: "DatabaseModel", table: str) -> None:
+         table_name = database.normalize_table_name(table)
+         schema, table = table_name.split(":")
+
+         with database.dbase as dbase:
+             self.col_names = [c[1] for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
+
+         self.boolean_columns = [
+             self.col_names.index(c.name)
+             for c in database.model.schemas[schema].tables[table].columns
+             if c.type.typename == "boolean"
+         ]
+         self.time_columns = [
+             self.col_names.index(c.name)
+             for c in database.model.schemas[schema].tables[table].columns
+             if c.type.typename in ["ermrest_rct", "ermrest_rmt"]
+         ]
+
+     def _map_value(self, idx: int, v: Any) -> Any:
+         """Map a raw SQLite value to a Python value, converting booleans and
+         timestamps for the column indexes identified in __init__.
+         """
+         tf_map = {"t": True, "f": False}
+         if idx in self.boolean_columns:
+             return tf_map.get(v, v)
+         if idx in self.time_columns:
+             return datetime.strptime(v, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc).isoformat()
+         return v
+
+     def transform_tuple(self, data: Sequence[Any]) -> Any:
+         return dict(zip(self.col_names, tuple(self._map_value(i, v) for i, v in enumerate(data))))
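
The two conversions SQLMapper performs are easy to check in isolation: ERMRest booleans arrive from the CSV dump as 't'/'f', and ermrest_rct/ermrest_rmt timestamps arrive in PostgreSQL text form. A self-contained sketch with sample values:

```python
from datetime import datetime, timezone

tf_map = {"t": True, "f": False}
print(tf_map.get("t", "t"))  # True; unknown values pass through unchanged

raw = "2024-05-01 12:30:45.123456+00"  # PostgreSQL-style timestamp text
ts = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc)
print(ts.isoformat())  # 2024-05-01T12:30:45.123456+00:00
```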
deriva_ml/run_notebook.py CHANGED
@@ -1,16 +1,16 @@
  """Module to run a notebook using papermill"""

- from datetime import datetime
  import json
  import os
- import papermill as pm
- from pathlib import Path
- import regex as re
  import tempfile
+ from datetime import datetime
+ from pathlib import Path

- from deriva_ml import Workflow, DerivaML
+ import papermill as pm
+ import regex as re
  from deriva.core import BaseCLI
- from deriva_ml import MLAsset, ExecAssetType
+
+ from deriva_ml import DerivaML, ExecAssetType, MLAsset, Workflow


  class DerivaMLRunNotebookCLI(BaseCLI):
@@ -91,7 +91,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
          if not (parameter_file.is_file() and parameter_file.suffix == ".json"):
              print("Parameter file must be an json file.")
              exit(1)
-         with open(parameter_file, "r") as f:
+         with Path(parameter_file).open("r") as f:
              parameters |= json.load(f)

          if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
@@ -101,7 +101,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
          os.environ["DERIVA_HOST"] = args.host
          os.environ["DERIVA_CATALOG_ID"] = args.catalog

-         # Create a workflow instance for this specific version of the script. Return an existing workflow if one is found.
+         # Create a workflow instance for this specific version of the script.
+         # Return an existing workflow if one is found.
          notebook_parameters = pm.inspect_notebook(notebook_file)
          if args.inspect:
              for param, value in notebook_parameters.items():
@@ -133,8 +134,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
              parameters=parameters,
              kernel_name=kernel,
          )
-         host = catalog_id = execution_rid = None
-         with open(notebook_output, "r") as f:
+         catalog_id = execution_rid = None
+         with Path(notebook_output).open("r") as f:
              for line in f:
                  if m := re.search(
                      r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
@@ -161,7 +162,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                  file_name=f"notebook-parameters-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
                  asset_types=ExecAssetType.input_file.value,
              )
-             with open(parameter_file, "w") as f:
+             with Path(parameter_file).open("w") as f:
                  json.dump(parameters, f)

          execution.upload_execution_outputs()
@@ -169,6 +170,13 @@ class DerivaMLRunNotebookCLI(BaseCLI):


  def main():
+     """Main entry point for the notebook runner CLI.
+
+     Creates and runs the DerivaMLRunNotebookCLI instance.
+
+     Returns:
+         None. Executes the CLI.
+     """
      cli = DerivaMLRunNotebookCLI(
          description="Deriva ML Execution Script Demo", epilog=""
      )
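
The runner identifies the execution by scanning the executed notebook's output for a line DerivaML prints. That pattern match can be exercised standalone (sample URL invented; stdlib re stands in for the regex package the script imports):

```python
import re

line = "Execution RID: https://demo.derivacloud.org/id/55/1-XYZ"
m = re.search(
    r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
    line,
)
print(m["host"], m["catalog_id"], m["execution_rid"])  # demo.derivacloud.org 55 1-XYZ
```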
deriva_ml/schema/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from deriva_ml.schema.create_schema import create_ml_catalog, reset_ml_schema
+
+ __all__ = ["create_ml_catalog", "reset_ml_schema"]
deriva_ml/{schema_setup → schema}/annotations.py CHANGED
@@ -1,10 +1,11 @@
  import argparse
- import sys

  from deriva.core.ermrest_model import Model, Table
  from deriva.core.utils.core_utils import tag as deriva_tags
- from ..deriva_model import DerivaModel
- from ..upload import bulk_upload_configuration
+
+ from deriva_ml.core.constants import DerivaAssetColumns
+ from deriva_ml.dataset.upload import bulk_upload_configuration
+ from deriva_ml.model.catalog import DerivaModel


  def catalog_annotation(model: DerivaModel) -> None:
@@ -106,17 +107,12 @@ def catalog_annotation(model: DerivaModel) -> None:
                  }
                  for tname in model.schemas[model.domain_schema].tables
                  # Don't include controlled vocabularies, association tables, or feature tables.
-                 if not (
-                     model.is_vocabulary(tname)
-                     or model.is_association(tname, pure=False, max_arity=3)
-                 )
+                 if not (model.is_vocabulary(tname) or model.is_association(tname, pure=False, max_arity=3))
              ],
          },
          {  # Vocabulary menu which will list all the controlled vocabularies in deriva-ml and domain.
              "name": "Vocabulary",
-             "children": [
-                 {"name": f"{ml_schema} Vocabularies", "header": True}
-             ]
+             "children": [{"name": f"{ml_schema} Vocabularies", "header": True}]
              + [
                  {
                      "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
@@ -186,9 +182,18 @@ def catalog_annotation(model: DerivaModel) -> None:


  def asset_annotation(asset_table: Table):
+     """Generate annotations for an asset table.
+
+     Args:
+         asset_table: The Table object representing the asset table.
+
+     Returns:
+         A dictionary containing the annotations for the asset table.
+     """
+
      schema = asset_table.schema.name
      asset_name = asset_table.name
-     model = DerivaModel(asset_table.schema.model)
+     asset_metadata = {c.name for c in asset_table.columns} - DerivaAssetColumns

      def fkey_column(column):
          """Map the column name to a FK if a constraint exists on the column"""
@@ -202,9 +207,7 @@ def asset_annotation(asset_table: Table):
          )

      annotations = {
-         deriva_tags.table_display: {
-             "row_name": {"row_markdown_pattern": "{{{Filename}}}"}
-         },
+         deriva_tags.table_display: {"row_name": {"row_markdown_pattern": "{{{Filename}}}"}},
          deriva_tags.visible_columns: {
              "*": [
                  "RID",
@@ -236,11 +239,11 @@ def asset_annotation(asset_table: Table):
                      "markdown_name": "Asset Types",
                  },
              ]
-             + [fkey_column(c) for c in model.asset_metadata(asset_table)],
+             + [fkey_column(c) for c in asset_metadata],
          },
      }
      asset_table.annotations.update(annotations)
-     model.apply()
+     asset_table.schema.model.apply()


  def generate_annotation(model: Model, schema: str) -> dict:
@@ -435,9 +438,7 @@ def generate_annotation(model: Model, schema: str) -> dict:
          },
          deriva_tags.visible_foreign_keys: {"*": []},
          deriva_tags.table_display: {
-             "row_name": {
-                 "row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"
-             }
+             "row_name": {"row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"}
          },
      }

@@ -451,9 +452,17 @@ def generate_annotation(model: Model, schema: str) -> dict:


  def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--catalog_id", type=str, required=True)
-     parser.add_argument("--schema_name", type=str, required=True)
+     """Main entry point for the annotations CLI.
+
+     Applies annotations to the ML schema based on command line arguments.
+
+     Returns:
+         None. Executes the CLI.
+     """
+     parser = argparse.ArgumentParser(description="Apply annotations to ML schema")
+     parser.add_argument("hostname", help="Hostname for the catalog")
+     parser.add_argument("catalog_id", help="Catalog ID")
+     parser.add_argument("schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
      args = parser.parse_args()
      generate_annotation(args.catalog_id, args.schema_name)

deriva_ml/schema/check_schema.py ADDED
@@ -0,0 +1,104 @@
+ import json
+ import re
+ from importlib.resources import files
+ from pathlib import Path
+ from pprint import pprint
+
+ from deepdiff import DeepDiff
+ from deriva.core import AttrDict, BaseCLI, get_credential
+ from deriva.core.ermrest_catalog import ErmrestCatalog
+
+ from deriva_ml.core.definitions import ML_SCHEMA
+ from deriva_ml.schema.create_schema import create_ml_catalog
+
+
+ def normalize_schema(d):
+     if isinstance(d, dict) or isinstance(d, AttrDict):
+         m = {}
+         for k, v in d.items():
+             if k == "acl_bindings" or k == "annotations" or k == "comment":
+                 continue
+             m[k] = normalize_schema(v)
+         return m
+     elif isinstance(d, list):
+         return [normalize_schema(i) for i in d]
+     elif isinstance(d, str):
+         # ID templates for controlled vocabulary
+         if m := re.match("(?P<s>.*):{RID}", d):
+             d = d if m["s"] == "deriva-ml" else "reference-catalog:{RID}" if re.match(".*:{RID}", d) else d
+         return d
+     else:
+         return d
+
+
+ def check_ml_schema(hostname, catalog_id, schema_file: Path | None = None):
+     """Check the ML schema against a reference schema file.
+
+     Args:
+         hostname: The hostname of the Deriva catalog.
+         catalog_id: The catalog ID to check.
+         schema_file: Optional path to a reference schema file. If None, uses the default reference.
+
+     Returns:
+         The DeepDiff between the reference and target schemas (also printed).
+     """
+     schema_file = schema_file or files("deriva_ml.schema").joinpath("deriva-ml-reference.json")
+
+     with Path(schema_file).open("r") as f:
+         reference_schema = normalize_schema(json.load(f)["schemas"][ML_SCHEMA])
+
+     catalog = ErmrestCatalog("https", hostname, catalog_id, credentials=get_credential(hostname))
+     target_schema = normalize_schema(catalog.getCatalogModel().schemas[ML_SCHEMA].prejson())
+
+     # Compute and pretty-print the diff.
+     diff = DeepDiff(reference_schema, target_schema, ignore_order=True, view="tree")
+     print(f"Diff between {schema_file} and {ML_SCHEMA} schema:")
+     pprint(diff, indent=2)
+     return diff
+
+
+ def dump_ml_schema(hostname: str, filename: str = "deriva-ml-reference.json") -> None:
+     """Dump the schema of a freshly created reference ML catalog to a file."""
+     catalog = create_ml_catalog(hostname, "reference-catalog")
+     try:
+         model = catalog.getCatalogModel()
+         print(f"Dumping ML schema to {Path(filename).resolve()}...")
+         with Path(filename).open("w") as f:
+             json.dump(model.prejson(), f, indent=2)
+     finally:
+         catalog.delete_ermrest_catalog(really=True)
+
+
+ class CheckMLSchemaCLI(BaseCLI):
+     """Main class to parse command line arguments and call the schema checker."""
+
+     def __init__(self, description, epilog, **kwargs):
+         BaseCLI.__init__(self, description, epilog, **kwargs)
+
+         self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
+         self.parser.add_argument("--dump", action="store_true", help="Dump the reference schema instead of checking.")
+
+     def main(self):
+         """Parse arguments and set up execution environment."""
+         args = self.parse_cli()
+         hostname = args.host
+         catalog_id = args.catalog
+
+         if args.dump:
+             dump_ml_schema(hostname, catalog_id)
+             return
+
+         check_ml_schema(hostname, catalog_id)
+
+
+ def main():
+     cli = CheckMLSchemaCLI(description="Check DerivaML Catalog for Compliance", epilog="")
+     cli.main()
+
+
+ if __name__ == "__main__":
+     main()
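
A hypothetical programmatic use of the checker (host and catalog id are placeholders; deriva credentials are assumed to be configured):

```python
from deriva_ml.schema.check_schema import check_ml_schema

# Compare a live catalog's deriva-ml schema against the bundled reference.
diff = check_ml_schema("demo.derivacloud.org", "1")
if not diff:
    print("Catalog matches the bundled deriva-ml reference schema.")
```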