deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.27.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
deriva_ml/{database_model.py → model/database.py} CHANGED
@@ -1,23 +1,25 @@
- """Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
- of a dataset and a sqllite database in which the contents of the bag are stored.
+ """This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface
+ between the BDBag representation of a dataset and a sqlite database in which the contents of the bag are stored.
  """

  from __future__ import annotations

+ import json
  import logging
  import sqlite3
-
  from csv import reader
  from pathlib import Path
- from typing import Any, Optional, Generator
+ from typing import Any, Generator, Optional
  from urllib.parse import urlparse

  from deriva.core.ermrest_model import Model

- from .deriva_definitions import ML_SCHEMA, MLVocab, RID, DerivaMLException
- from .dataset_aux_classes import DatasetVersion, DatasetMinid
- from .deriva_model import DerivaModel
- from .dataset_bag import DatasetBag
+ from deriva_ml.core.definitions import ML_SCHEMA, RID, MLVocab
+ from deriva_ml.core.exceptions import DerivaMLException
+ from deriva_ml.dataset.aux_classes import DatasetMinid, DatasetVersion
+ from deriva_ml.dataset.dataset_bag import DatasetBag
+ from deriva_ml.model.catalog import DerivaModel
+ from deriva_ml.model.sql_mapper import SQLMapper

  try:
      from icecream import ic
@@ -42,13 +44,13 @@ class DatabaseModelMeta(type):
  class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
      """Read in the contents of a BDBag and create a local SQLite database.

-     As part of its initialization, this routine will create a sqlite database that has the contents of all the tables
-     in the dataset_table. In addition, any asset tables will the `Filename` column remapped to have the path of the local
-     copy of the file. In addition, a local version of the ERMRest model that as used to generate the dataset_table is
-     available.
+     As part of its initialization, this routine will create a sqlite database that has the contents of all the
+     tables in the dataset_table. In addition, any asset tables will have the `Filename` column remapped to the path
+     of the local copy of the file. In addition, a local version of the ERMRest model that was used to generate the
+     dataset_table is available.

      The sqlite database will not have any foreign key constraints applied, however, foreign-key relationships can be
-     found by looking in the ERMRest model. In addition, as sqllite doesn't support schema, Ermrest schema are added
+     found by looking in the ERMRest model. In addition, as sqlite doesn't support schemas, ERMRest schemas are added
      to the table name using the convention SchemaName:TableName. Methods in DatasetBag that have table names as the
      argument will perform the appropriate name mappings.

@@ -56,8 +58,8 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
      appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
      into DatabaseModels, is kept in the class variable `_rid_map`.

-     Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
-     sqllite instance is created for every new dataset version present.
+     Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked,
+     and a new sqlite instance is created for every new dataset version present.

      Attributes:
          bag_path (Path): path to the local copy of the BDBag
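
The version bookkeeping described in this docstring is easy to see in miniature. A sketch using plain tuples in place of DatasetVersion (the names and the (0, 1, 0) default mirror the loading code further down):

```python
# Sketch of the "newest version per dataset RID" bookkeeping; plain tuples
# stand in for DatasetVersion (an assumption for illustration).
rows = [("1-ab", (1, 0, 0)), ("1-ab", (1, 2, 0)), ("2-cd", (0, 1, 0))]
bag_rids: dict[str, tuple[int, int, int]] = {}
for rid, version in rows:
    bag_rids[rid] = max(bag_rids.get(rid, (0, 1, 0)), version)
print(bag_rids)  # {'1-ab': (1, 2, 0), '2-cd': (0, 1, 0)}
```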
@@ -103,10 +105,11 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          self.dbase_file = dbase_path / f"{minid.version_rid}.db"
          self.dbase = sqlite3.connect(self.dbase_file)

-         super().__init__(
-             Model.fromfile("file-system", self.bag_path / "data/schema.json")
-         )
+         schema_file = self.bag_path / "data/schema.json"
+         with schema_file.open("r") as f:
+             self.snaptime = json.load(f)["snaptime"]

+         super().__init__(Model.fromfile("file-system", self.bag_path / "data/schema.json"))
          self._logger = logging.getLogger("deriva_ml")
          self._load_model()
          self.ml_schema = ML_SCHEMA
@@ -121,28 +124,22 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          sql_dataset = self.normalize_table_name("Dataset_Version")
          with self.dbase:
              dataset_versions = [
-                 t
-                 for t in self.dbase.execute(
-                     f'SELECT "Dataset", "Version" FROM "{sql_dataset}"'
-                 ).fetchall()
+                 t for t in self.dbase.execute(f'SELECT "Dataset", "Version" FROM "{sql_dataset}"').fetchall()
              ]
-             dataset_versions = [
-                 (v[0], DatasetVersion.parse(v[1])) for v in dataset_versions
-             ]
+             dataset_versions = [(v[0], DatasetVersion.parse(v[1])) for v in dataset_versions]

          # Get most current version of each rid
          self.bag_rids = {}
          for rid, version in dataset_versions:
-             self.bag_rids[rid] = max(
-                 self.bag_rids.get(rid, DatasetVersion(0, 1, 0)), version
-             )
+             self.bag_rids[rid] = max(self.bag_rids.get(rid, DatasetVersion(0, 1, 0)), version)

          for dataset_rid, dataset_version in self.bag_rids.items():
              version_list = DatabaseModel._rid_map.setdefault(dataset_rid, [])
              version_list.append((dataset_version, self))

      def _load_model(self) -> None:
-         """Create a sqlite database schema that contains all the tables within the catalog from which the BDBag was created."""
+         """Create a sqlite database schema that contains all the tables within the catalog from which the BDBag
+         was created."""
          with self.dbase:
              for t in self.model.schemas[self.domain_schema].tables.values():
                  self.dbase.execute(t.sqlite3_ddl())
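Because SQLite has no schema support, the DDL generated here bakes the ERMRest schema into a single quoted table name. A minimal sketch of that convention (the table and columns are invented for illustration; this is not the literal `sqlite3_ddl()` output):

```python
import sqlite3

# "Schema:Table" is one quoted SQLite identifier, not a schema qualifier.
db = sqlite3.connect(":memory:")
db.execute('CREATE TABLE "deriva-ml:Dataset" ("RID" TEXT, "Version" TEXT)')
db.execute('INSERT INTO "deriva-ml:Dataset" VALUES (?, ?)', ("1-abc", "1.0.0"))
print(db.execute('SELECT * FROM "deriva-ml:Dataset"').fetchall())
```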
@@ -153,7 +150,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          """Load a SQLite database from a bdbag. This is done by looking for all the CSV files in the bdbag directory.

          If the file is for an asset table, update the FileName column of the table to have the local file path for
-         the materialized file. Then load into the sqllite database.
+         the materialized file. Then load into the sqlite database.
          Note: none of the foreign key constraints are included in the database.
          """
          dpath = self.bag_path / "data"
@@ -162,11 +159,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          # Find all the CSV files in the subdirectory and load each file into the database.
          for csv_file in Path(dpath).rglob("*.csv"):
              table = csv_file.stem
-             schema = (
-                 self.domain_schema
-                 if table in self.model.schemas[self.domain_schema].tables
-                 else self.ml_schema
-             )
+             schema = self.domain_schema if table in self.model.schemas[self.domain_schema].tables else self.ml_schema

              with csv_file.open(newline="") as csvfile:
                  csv_reader = reader(csvfile)
@@ -174,19 +167,14 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):

                  # Determine which columns in the table has the Filename and the URL
                  asset_indexes = (
-                     (column_names.index("Filename"), column_names.index("URL"))
-                     if self._is_asset(table)
-                     else None
+                     (column_names.index("Filename"), column_names.index("URL")) if self._is_asset(table) else None
                  )

-                 value_template = ",".join(
-                     ["?"] * len(column_names)
-                 )  # SQL placeholder for row (?,?..)
+                 value_template = ",".join(["?"] * len(column_names))  # SQL placeholder for row (?,?..)
                  column_list = ",".join([f'"{c}"' for c in column_names])
                  with self.dbase:
                      object_table = (
-                         self._localize_asset(o, asset_indexes, asset_map)
-                         for o in csv_reader
+                         self._localize_asset(o, asset_indexes, asset_map, table == "Dataset") for o in csv_reader
                      )
                      self.dbase.executemany(
                          f'INSERT OR REPLACE INTO "{schema}:{table}" ({column_list}) VALUES ({value_template})',
@@ -202,7 +190,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          """
          fetch_map = {}
          try:
-             with open(self.bag_path / "fetch.txt", newline="\n") as fetch_file:
+             with Path.open(self.bag_path / "fetch.txt", newline="\n") as fetch_file:
                  for row in fetch_file:
                      # Rows in fetch.txt are tab separated: URL, length, filename.
                      fields = row.split("\t")
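For reference, each fetch.txt row in a BDBag is tab separated: remote URL, byte length, bag-relative path. A small sketch of that parse with an invented row:

```python
# fetch.txt row format (BDBag convention): URL <tab> length <tab> path.
line = "https://example.org/hatrac/img.png\t1024\tdata/assets/img.png\n"
url, length, path = line.rstrip("\n").split("\t")
fetch_map = {url: path}
print(fetch_map)  # {'https://example.org/hatrac/img.png': 'data/assets/img.png'}
```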
@@ -224,18 +212,12 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
              Boolean that is true if the table looks like an asset table.
          """
          asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-         sname = (
-             self.domain_schema
-             if table_name in self.model.schemas[self.domain_schema].tables
-             else self.ml_schema
-         )
+         sname = self.domain_schema if table_name in self.model.schemas[self.domain_schema].tables else self.ml_schema
          asset_table = self.model.schemas[sname].tables[table_name]
          return asset_columns.issubset({c.name for c in asset_table.columns})

      @staticmethod
-     def _localize_asset(
-         o: list, indexes: tuple[int, int], asset_map: dict[str, str]
-     ) -> tuple:
+     def _localize_asset(o: list, indexes: tuple[int, int], asset_map: dict[str, str], debug: bool = False) -> tuple:
          """Given a list of column values for a table, replace the FileName column with the local file name based on
          the URL value.

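The localization step then swaps a row's Filename for the materialized path, keyed by its URL. A sketch with hypothetical row data and column indexes:

```python
# Hypothetical row for an asset table: (RID, Filename, URL, Length, MD5).
row = ["1-abc", "img.png", "https://example.org/hatrac/img.png", 1024, "d41d..."]
asset_map = {"https://example.org/hatrac/img.png": "/bag/data/assets/img.png"}
filename_idx, url_idx = 1, 2
row[filename_idx] = asset_map[row[url_idx]]  # Filename now points at the local copy
print(tuple(row))
```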
@@ -295,24 +277,21 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
          Returns:
              list of currently available datasets.
          """
-         atable = next(
-             self.model.schemas[ML_SCHEMA]
-             .tables[MLVocab.dataset_type]
-             .find_associations()
-         ).name
+         atable = next(self.model.schemas[ML_SCHEMA].tables[MLVocab.dataset_type].find_associations()).name

          # Get a list of all the dataset_type values associated with this dataset_table.
          datasets = []
-         ds_types = list(self.get_table_as_dict(atable))
-         for dataset in self.get_table_as_dict("Dataset"):
+         ds_types = list(self._get_table(atable))
+         for dataset in self._get_table("Dataset"):
              my_types = [t for t in ds_types if t["Dataset"] == dataset["RID"]]
-             datasets.append(
-                 dataset
-                 | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in my_types]}
-             )
+             datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in my_types]})
          return datasets

-     def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
+     def list_dataset_members(self, dataset_rid: RID) -> dict[str, Any]:
+         """Returns a list of all the dataset_table entries associated with a dataset."""
+         return self.get_dataset(dataset_rid).list_dataset_members()
+
+     def _get_table(self, table: str) -> Generator[dict[str, Any], None, None]:
          """Retrieve the contents of the specified table as a dictionary.

          Args:
@@ -323,14 +302,14 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
              A generator producing dictionaries containing the contents of the specified table as name/value pairs.
          """
          table_name = self.normalize_table_name(table)
-         with self.dbase as dbase:
-             col_names = [
-                 c[1]
-                 for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
-             ]
+         table = self.name_to_table(table)
+
+         with self.dbase as _dbase:
+             mapper = SQLMapper(self, table.name)
              result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-             while row := result.fetchone():
-                 yield dict(zip(col_names, row))
+
+             while (row := result.fetchone()) is not None:
+                 yield mapper.transform_tuple(row)

      def normalize_table_name(self, table: str) -> str:
          """Attempt to insert the schema into a table name if it's not provided.
@@ -342,13 +321,12 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
              table name with schema included.

          """
-         sname = ""
          try:
              [sname, tname] = table.split(":")
          except ValueError:
              tname = table
-             for sname, s in self.model.schemas.items():
-                 if table in s.tables:
+             for sname in [self.domain_schema, self.ml_schema, "WWW"]:  # Be careful of File table.
+                 if table in self.model.schemas[sname].tables:
                      break
          try:
              _ = self.model.schemas[sname].tables[tname]
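To make the lookup order concrete, here is a standalone mimic of normalize_table_name (schema names are placeholders; the real method validates the result against the ERMRest model rather than raising KeyError):

```python
def normalize(table: str, schemas: dict[str, set[str]]) -> str:
    """Illustrative stand-in for DatabaseModel.normalize_table_name."""
    if ":" in table:
        return table  # already schema-qualified
    for sname in ("my-domain", "deriva-ml", "WWW"):  # lookup order, as above
        if table in schemas.get(sname, set()):
            return f"{sname}:{table}"
    raise KeyError(table)

schemas = {"my-domain": {"Image"}, "deriva-ml": {"Dataset"}}
print(normalize("Image", schemas))    # my-domain:Image
print(normalize("Dataset", schemas))  # deriva-ml:Dataset
```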
deriva_ml/model/sql_mapper.py ADDED
@@ -0,0 +1,44 @@
+ from datetime import datetime, timezone
+ from typing import TYPE_CHECKING, Any, Sequence
+
+ if TYPE_CHECKING:
+     from deriva_ml.model.database import DatabaseModel
+
+ try:
+     from icecream import ic
+ except ImportError:  # Graceful fallback if IceCream isn't installed.
+     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+
+ class SQLMapper:
+     def __init__(self, database: "DatabaseModel", table: str) -> None:
+         table_name = database.normalize_table_name(table)
+         schema, table = table_name.split(":")
+
+         with database.dbase as dbase:
+             self.col_names = [c[1] for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
+
+         self.boolean_columns = [
+             self.col_names.index(c.name)
+             for c in database.model.schemas[schema].tables[table].columns
+             if c.type.typename == "boolean"
+         ]
+         self.time_columns = [
+             self.col_names.index(c.name)
+             for c in database.model.schemas[schema].tables[table].columns
+             if c.type.typename in ["ermrest_rct", "ermrest_rmt"]
+         ]
+
+     def _map_value(self, idx: int, v: Any) -> Any:
+         """Map a raw SQLite value to a Python value, converting booleans and
+         timestamps for the column indexes identified in __init__.
+         """
+         tf_map = {"t": True, "f": False}
+         if idx in self.boolean_columns:
+             return tf_map.get(v, v)
+         if idx in self.time_columns:
+             return datetime.strptime(v, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc).isoformat()
+         return v
+
+     def transform_tuple(self, data: Sequence[Any]) -> Any:
+         return dict(zip(self.col_names, tuple(self._map_value(i, v) for i, v in enumerate(data))))
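
The two conversions SQLMapper performs are easy to check in isolation: ERMRest booleans arrive from the CSV dump as 't'/'f', and ermrest_rct/ermrest_rmt timestamps arrive in PostgreSQL text form. A self-contained sketch with sample values:

```python
from datetime import datetime, timezone

tf_map = {"t": True, "f": False}
print(tf_map.get("t", "t"))  # True; unknown values pass through unchanged

raw = "2024-05-01 12:30:45.123456+00"  # PostgreSQL-style timestamp text
ts = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc)
print(ts.isoformat())  # 2024-05-01T12:30:45.123456+00:00
```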
deriva_ml/run_notebook.py CHANGED
@@ -1,16 +1,16 @@
  """Module to run a notebook using papermill"""

- from datetime import datetime
  import json
  import os
- import papermill as pm
- from pathlib import Path
- import regex as re
  import tempfile
+ from datetime import datetime
+ from pathlib import Path

- from deriva_ml import Workflow, DerivaML
+ import papermill as pm
+ import regex as re
  from deriva.core import BaseCLI
- from deriva_ml import MLAsset, ExecAssetType
+
+ from deriva_ml import DerivaML, ExecAssetType, MLAsset, Workflow


  class DerivaMLRunNotebookCLI(BaseCLI):
@@ -91,7 +91,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
          if not (parameter_file.is_file() and parameter_file.suffix == ".json"):
              print("Parameter file must be an json file.")
              exit(1)
-         with open(parameter_file, "r") as f:
+         with Path(parameter_file).open("r") as f:
              parameters |= json.load(f)

          if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
@@ -101,7 +101,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
          os.environ["DERIVA_HOST"] = args.host
          os.environ["DERIVA_CATALOG_ID"] = args.catalog

-         # Create a workflow instance for this specific version of the script. Return an existing workflow if one is found.
+         # Create a workflow instance for this specific version of the script.
+         # Return an existing workflow if one is found.
          notebook_parameters = pm.inspect_notebook(notebook_file)
          if args.inspect:
              for param, value in notebook_parameters.items():
@@ -133,8 +134,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
              parameters=parameters,
              kernel_name=kernel,
          )
-         host = catalog_id = execution_rid = None
-         with open(notebook_output, "r") as f:
+         catalog_id = execution_rid = None
+         with Path(notebook_output).open("r") as f:
              for line in f:
                  if m := re.search(
                      r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
@@ -161,7 +162,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                  file_name=f"notebook-parameters-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
                  asset_types=ExecAssetType.input_file.value,
              )
-             with open(parameter_file, "w") as f:
+             with Path(parameter_file).open("w") as f:
                  json.dump(parameters, f)

          execution.upload_execution_outputs()
@@ -169,6 +170,13 @@ class DerivaMLRunNotebookCLI(BaseCLI):


  def main():
+     """Main entry point for the notebook runner CLI.
+
+     Creates and runs the DerivaMLRunNotebookCLI instance.
+
+     Returns:
+         None. Executes the CLI.
+     """
      cli = DerivaMLRunNotebookCLI(
          description="Deriva ML Execution Script Demo", epilog=""
      )
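
The runner identifies the execution by scanning the executed notebook's output for a line DerivaML prints. That pattern match can be exercised standalone (sample URL invented; stdlib re stands in for the regex package the script imports):

```python
import re

line = "Execution RID: https://demo.derivacloud.org/id/55/1-XYZ"
m = re.search(
    r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
    line,
)
print(m["host"], m["catalog_id"], m["execution_rid"])  # demo.derivacloud.org 55 1-XYZ
```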
deriva_ml/schema/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from deriva_ml.schema.create_schema import create_ml_catalog, reset_ml_schema
+
+ __all__ = ["create_ml_catalog", "reset_ml_schema"]
deriva_ml/{schema_setup → schema}/annotations.py CHANGED
@@ -1,10 +1,11 @@
  import argparse
- import sys

  from deriva.core.ermrest_model import Model, Table
  from deriva.core.utils.core_utils import tag as deriva_tags
- from ..deriva_model import DerivaModel
- from ..upload import bulk_upload_configuration
+
+ from deriva_ml.core.constants import DerivaAssetColumns
+ from deriva_ml.dataset.upload import bulk_upload_configuration
+ from deriva_ml.model.catalog import DerivaModel


  def catalog_annotation(model: DerivaModel) -> None:
@@ -106,17 +107,12 @@ def catalog_annotation(model: DerivaModel) -> None:
                  }
                  for tname in model.schemas[model.domain_schema].tables
                  # Don't include controlled vocabularies, association tables, or feature tables.
-                 if not (
-                     model.is_vocabulary(tname)
-                     or model.is_association(tname, pure=False, max_arity=3)
-                 )
+                 if not (model.is_vocabulary(tname) or model.is_association(tname, pure=False, max_arity=3))
              ],
          },
          {  # Vocabulary menu which will list all the controlled vocabularies in deriva-ml and domain.
              "name": "Vocabulary",
-             "children": [
-                 {"name": f"{ml_schema} Vocabularies", "header": True}
-             ]
+             "children": [{"name": f"{ml_schema} Vocabularies", "header": True}]
              + [
                  {
                      "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
@@ -186,9 +182,18 @@ def catalog_annotation(model: DerivaModel) -> None:


  def asset_annotation(asset_table: Table):
+     """Generate annotations for an asset table.
+
+     Args:
+         asset_table: The Table object representing the asset table.
+
+     Returns:
+         A dictionary containing the annotations for the asset table.
+     """
+
      schema = asset_table.schema.name
      asset_name = asset_table.name
-     model = DerivaModel(asset_table.schema.model)
+     asset_metadata = {c.name for c in asset_table.columns} - DerivaAssetColumns

      def fkey_column(column):
          """Map the column name to a FK if a constraint exists on the column"""
@@ -202,9 +207,7 @@ def asset_annotation(asset_table: Table):
          )

      annotations = {
-         deriva_tags.table_display: {
-             "row_name": {"row_markdown_pattern": "{{{Filename}}}"}
-         },
+         deriva_tags.table_display: {"row_name": {"row_markdown_pattern": "{{{Filename}}}"}},
          deriva_tags.visible_columns: {
              "*": [
                  "RID",
@@ -236,11 +239,11 @@ def asset_annotation(asset_table: Table):
                      "markdown_name": "Asset Types",
                  },
              ]
-             + [fkey_column(c) for c in model.asset_metadata(asset_table)],
+             + [fkey_column(c) for c in asset_metadata],
          },
      }
      asset_table.annotations.update(annotations)
-     model.apply()
+     asset_table.schema.model.apply()


  def generate_annotation(model: Model, schema: str) -> dict:
@@ -435,9 +438,7 @@ def generate_annotation(model: Model, schema: str) -> dict:
          },
          deriva_tags.visible_foreign_keys: {"*": []},
          deriva_tags.table_display: {
-             "row_name": {
-                 "row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"
-             }
+             "row_name": {"row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"}
          },
      }

@@ -451,9 +452,17 @@ def generate_annotation(model: Model, schema: str) -> dict:


  def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--catalog_id", type=str, required=True)
-     parser.add_argument("--schema_name", type=str, required=True)
+     """Main entry point for the annotations CLI.
+
+     Applies annotations to the ML schema based on command line arguments.
+
+     Returns:
+         None. Executes the CLI.
+     """
+     parser = argparse.ArgumentParser(description="Apply annotations to ML schema")
+     parser.add_argument("hostname", help="Hostname for the catalog")
+     parser.add_argument("catalog_id", help="Catalog ID")
+     parser.add_argument("schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
      args = parser.parse_args()
      generate_annotation(args.catalog_id, args.schema_name)

deriva_ml/schema/check_schema.py ADDED
@@ -0,0 +1,104 @@
+ import json
+ import re
+ from importlib.resources import files
+ from pathlib import Path
+ from pprint import pprint
+
+ from deepdiff import DeepDiff
+ from deriva.core import AttrDict, BaseCLI, get_credential
+ from deriva.core.ermrest_catalog import ErmrestCatalog
+
+ from deriva_ml.core.definitions import ML_SCHEMA
+ from deriva_ml.schema.create_schema import create_ml_catalog
+
+
+ def normalize_schema(d):
+     if isinstance(d, dict) or isinstance(d, AttrDict):
+         m = {}
+         for k, v in d.items():
+             if k == "acl_bindings" or k == "annotations" or k == "comment":
+                 continue
+             m[k] = normalize_schema(v)
+         return m
+     elif isinstance(d, list):
+         return [normalize_schema(i) for i in d]
+     elif isinstance(d, str):
+         # ID templates for controlled vocabulary
+         if m := re.match("(?P<s>.*):{RID}", d):
+             d = d if m["s"] == "deriva-ml" else "reference-catalog:{RID}" if re.match(".*:{RID}", d) else d
+         return d
+     else:
+         return d
+
+
+ def check_ml_schema(hostname, catalog_id, schema_file: Path | None = None):
+     """Check the ML schema against a reference schema file.
+
+     Args:
+         hostname: The hostname of the Deriva catalog.
+         catalog_id: The catalog ID to check.
+         schema_file: Optional path to a reference schema file. If None, uses the default reference.
+
+     Returns:
+         The DeepDiff between the reference and target schemas (also printed).
+     """
+     schema_file = schema_file or files("deriva_ml.schema").joinpath("deriva-ml-reference.json")
+
+     with Path(schema_file).open("r") as f:
+         reference_schema = normalize_schema(json.load(f)["schemas"][ML_SCHEMA])
+
+     catalog = ErmrestCatalog("https", hostname, catalog_id, credentials=get_credential(hostname))
+     target_schema = normalize_schema(catalog.getCatalogModel().schemas[ML_SCHEMA].prejson())
+
+     # Compute and pretty-print the diff.
+     diff = DeepDiff(reference_schema, target_schema, ignore_order=True, view="tree")
+     print(f"Diff between {schema_file} and {ML_SCHEMA} schema:")
+     pprint(diff, indent=2)
+     return diff
+
+
+ def dump_ml_schema(hostname: str, filename: str = "deriva-ml-reference.json") -> None:
+     """Dump the schema of a freshly created reference ML catalog to a file."""
+     catalog = create_ml_catalog(hostname, "reference-catalog")
+     try:
+         model = catalog.getCatalogModel()
+         print(f"Dumping ML schema to {Path(filename).resolve()}...")
+         with Path(filename).open("w") as f:
+             json.dump(model.prejson(), f, indent=2)
+     finally:
+         catalog.delete_ermrest_catalog(really=True)
+
+
+ class CheckMLSchemaCLI(BaseCLI):
+     """Main class to parse command line arguments and call the schema checker."""
+
+     def __init__(self, description, epilog, **kwargs):
+         BaseCLI.__init__(self, description, epilog, **kwargs)
+
+         self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
+         self.parser.add_argument("--dump", action="store_true", help="Dump the reference schema instead of checking.")
+
+     def main(self):
+         """Parse arguments and set up execution environment."""
+         args = self.parse_cli()
+         hostname = args.host
+         catalog_id = args.catalog
+
+         if args.dump:
+             dump_ml_schema(hostname, catalog_id)
+             return
+
+         check_ml_schema(hostname, catalog_id)
+
+
+ def main():
+     cli = CheckMLSchemaCLI(description="Check DerivaML Catalog for Compliance", epilog="")
+     cli.main()
+
+
+ if __name__ == "__main__":
+     main()
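
A hypothetical programmatic use of the checker (host and catalog id are placeholders; deriva credentials are assumed to be configured):

```python
from deriva_ml.schema.check_schema import check_ml_schema

# Compare a live catalog's deriva-ml schema against the bundled reference.
diff = check_ml_schema("demo.derivacloud.org", "1")
if not diff:
    print("Catalog matches the bundled deriva-ml reference schema.")
```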