PyPI - deriva-ml - Versions diffs - 1.17.10__py3-none-any.whl - Mend

deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

deriva_ml/.DS_Store +0 -0
deriva_ml/__init__.py +79 -0
deriva_ml/bump_version.py +142 -0
deriva_ml/core/__init__.py +39 -0
deriva_ml/core/base.py +1527 -0
deriva_ml/core/config.py +69 -0
deriva_ml/core/constants.py +36 -0
deriva_ml/core/definitions.py +74 -0
deriva_ml/core/enums.py +222 -0
deriva_ml/core/ermrest.py +288 -0
deriva_ml/core/exceptions.py +28 -0
deriva_ml/core/filespec.py +116 -0
deriva_ml/dataset/__init__.py +12 -0
deriva_ml/dataset/aux_classes.py +225 -0
deriva_ml/dataset/dataset.py +1519 -0
deriva_ml/dataset/dataset_bag.py +450 -0
deriva_ml/dataset/history.py +109 -0
deriva_ml/dataset/upload.py +439 -0
deriva_ml/demo_catalog.py +495 -0
deriva_ml/execution/__init__.py +26 -0
deriva_ml/execution/environment.py +290 -0
deriva_ml/execution/execution.py +1180 -0
deriva_ml/execution/execution_configuration.py +147 -0
deriva_ml/execution/workflow.py +413 -0
deriva_ml/feature.py +228 -0
deriva_ml/install_kernel.py +71 -0
deriva_ml/model/__init__.py +0 -0
deriva_ml/model/catalog.py +485 -0
deriva_ml/model/database.py +719 -0
deriva_ml/protocols/dataset.py +19 -0
deriva_ml/run_notebook.py +228 -0
deriva_ml/schema/__init__.py +3 -0
deriva_ml/schema/annotations.py +473 -0
deriva_ml/schema/check_schema.py +104 -0
deriva_ml/schema/create_schema.py +393 -0
deriva_ml/schema/deriva-ml-reference.json +8525 -0
deriva_ml/schema/policy.json +81 -0
deriva_ml/schema/table_comments_utils.py +57 -0
deriva_ml/test.py +94 -0
deriva_ml-1.17.10.dist-info/METADATA +38 -0
deriva_ml-1.17.10.dist-info/RECORD +45 -0
deriva_ml-1.17.10.dist-info/WHEEL +5 -0
deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
deriva_ml-1.17.10.dist-info/top_level.txt +1 -0

deriva_ml/schema/annotations.py ADDED Viewed

@@ -0,0 +1,473 @@
+import argparse
+import sys
+from deriva.core.ermrest_model import Model, Table
+from deriva.core.utils.core_utils import tag as deriva_tags
+from deriva_ml.core.constants import DerivaAssetColumns
+from deriva_ml.dataset.upload import bulk_upload_configuration
+from deriva_ml.model.catalog import DerivaModel
+def catalog_annotation(model: DerivaModel) -> None:
+    """Set the annotations for a catalog.
+    This routine will dynamically walk the domain schema and create menu bar for the catalog based on the current
+    configuration.  A side effect is that the annotation attribute of the catalog will be updated and the result
+    pushed to the catalog.
+    Args:
+        model: A deriva model to the current catalog.
+    """
+    catalog_id = model.catalog.catalog_id
+    ml_schema = model.ml_schema
+    catalog_annotation = {
+        deriva_tags.display: {"name_style": {"underline_space": True}},
+        deriva_tags.chaise_config: {
+            "headTitle": "Catalog ML",
+            "navbarBrandText": "ML Data Browser",
+            "systemColumnsDisplayEntry": ["RID"],
+            "systemColumnsDisplayCompact": ["RID"],
+            "defaultTable": {"table": "Dataset", "schema": "deriva-ml"},
+            "deleteRecord": True,
+            "showFaceting": True,
+            "shareCiteAcls": True,
+            "exportConfigsSubmenu": {"acls": {"show": ["*"], "enable": ["*"]}},
+            "resolverImplicitCatalog": False,
+            "navbarMenu": {
+                "newTab": False,
+                "children": [
+                    {
+                        "name": "User Info",
+                        "children": [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_Client",
+                                "name": "Users",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_Group",
+                                "name": "Groups",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/public:ERMrest_RID_Lease",
+                                "name": "ERMrest RID Lease",
+                            },
+                        ],
+                    },
+                    {  # All the primary tables in deriva-ml schema.
+                        "name": "Deriva-ML",
+                        "children": [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Workflow",
+                                "name": "Workflow",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution",
+                                "name": "Execution",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution_Metadata",
+                                "name": "Execution Metadata",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Execution_Asset",
+                                "name": "Execution Asset",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Dataset",
+                                "name": "Dataset",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:Dataset_Version",
+                                "name": "Dataset Version",
+                            },
+                        ],
+                    },
+                    {  # All the primary tables in deriva-ml schema.
+                        "name": "WWW",
+                        "children": [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/WWW:Page",
+                                "name": "Page",
+                            },
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/WWW:File",
+                                "name": "File",
+                            },
+                        ],
+                    },
+                    {
+                        "name": model.domain_schema,
+                        "children": [
+                            {
+                                "name": tname,
+                                "url": f"/chaise/recordset/#{catalog_id}/{model.domain_schema}:{tname}",
+                            }
+                            for tname in model.schemas[model.domain_schema].tables
+                            # Don't include controlled vocabularies, association tables, or feature tables.
+                            if not (model.is_vocabulary(tname) or model.is_association(tname, pure=False, max_arity=3))
+                        ],
+                    },
+                    {  # Vocabulary menu which will list all the controlled vocabularies in deriva-ml and domain.
+                        "name": "Vocabulary",
+                        "children": [{"name": f"{ml_schema} Vocabularies", "header": True}]
+                        + [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
+                                "name": tname,
+                            }
+                            for tname in model.schemas[model.ml_schema].tables
+                            if model.is_vocabulary(tname)
+                        ]
+                        + [
+                            {
+                                "name": f"{model.domain_schema} Vocabularies",
+                                "header": True,
+                            }
+                        ]
+                        + [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{model.domain_schema}:{tname}",
+                                "name": tname,
+                            }
+                            for tname in model.schemas[model.domain_schema].tables
+                            if model.is_vocabulary(tname)
+                        ],
+                    },
+                    {  # List of all of the asset tables in deriva-ml and domain schemas.
+                        "name": "Assets",
+                        "children": [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{ml_schema}:{tname}",
+                                "name": tname,
+                            }
+                            for tname in model.schemas[model.ml_schema].tables
+                            if model.is_asset(tname)
+                        ]
+                        + [
+                            {
+                                "url": f"/chaise/recordset/#{catalog_id}/{model.domain_schema}:{tname}",
+                                "name": tname,
+                            }
+                            for tname in model.schemas[model.domain_schema].tables
+                            if model.is_asset(tname)
+                        ],
+                    },
+                    {
+                        "url": "/chaise/recordset/#0/ermrest:registry@sort(RID)",
+                        "name": "Catalog Registry",
+                    },
+                    {
+                        "name": "Documentation",
+                        "children": [
+                            {
+                                "url": "https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/ml_workflow_instruction.md",
+                                "name": "ML Notebook Instruction",
+                            },
+                            {
+                                "url": "https://informatics-isi-edu.github.io/deriva-ml/",
+                                "name": "Deriva-ML Documentation",
+                            },
+                        ],
+                    },
+                ],
+            },
+        },
+        deriva_tags.bulk_upload: bulk_upload_configuration(model=model),
+    }
+    model.annotations.update(catalog_annotation)
+    model.apply()
+def asset_annotation(asset_table: Table):
+    """Generate annotations for an asset table.
+    Args:
+        asset_table: The Table object representing the asset table.
+    Returns:
+        A dictionary containing the annotations for the asset table.
+    """
+    schema = asset_table.schema.name
+    asset_name = asset_table.name
+    asset_metadata = {c.name for c in asset_table.columns} - DerivaAssetColumns
+    def fkey_column(column):
+        """Map the column name to a FK if a constraint exists on the column"""
+        return next(
+            (
+                (fk.name[0].name, fk.name[1])
+                for fk in asset_table.foreign_keys
+                if asset_table.columns[column] in fk.column_map
+            ),
+            column,
+        )
+    annotations = {
+        deriva_tags.table_display: {"row_name": {"row_markdown_pattern": "{{{Filename}}}"}},
+        deriva_tags.visible_columns: {
+            "*": [
+                "RID",
+                "RCT",
+                "RMT",
+                [schema, f"{asset_name}_RCB_fkey"],
+                [schema, f"{asset_name}_RMB_fkey"],
+                "URL",
+                "Filename",
+                "Description",
+                "Length",
+                "MD5",
+                {
+                    "source": [
+                        {
+                            "inbound": [
+                                schema,
+                                f"{asset_name}_Asset_Type_{asset_name}_fkey",
+                            ]
+                        },
+                        {
+                            "outbound": [
+                                schema,
+                                f"{asset_name}_Asset_Type_Asset_Type_fkey",
+                            ]
+                        },
+                        "RID",
+                    ],
+                    "markdown_name": "Asset Types",
+                },
+            ]
+            + [fkey_column(c) for c in asset_metadata],
+        },
+    }
+    asset_table.annotations.update(annotations)
+    asset_table.schema.model.apply()
+def generate_annotation(model: Model, schema: str) -> dict:
+    catalog_id = model.catalog.catalog_id
+    workflow_annotation = {
+        deriva_tags.visible_columns: {
+            "*": [
+                "RID",
+                [schema, "Workflow_RCB_fkey"],
+                [schema, "Workflow_RMB_fkey"],
+                "Name",
+                "Description",
+                {
+                    "display": {"markdown_pattern": "[{{{URL}}}]({{{URL}}})"},
+                    "markdown_name": "URL",
+                },
+                "Checksum",
+                "Version",
+                {
+                    "source": [
+                        {"outbound": [schema, "Workflow_Workflow_Type_fkey"]},
+                        "RID",
+                    ]
+                },
+            ]
+        }
+    }
+    execution_annotation = {
+        deriva_tags.visible_columns: {
+            "*": [
+                "RID",
+                [schema, "Execution_RCB_fkey"],
+                [schema, "Execution_RMB_fkey"],
+                "RCT",
+                "Description",
+                {"source": [{"outbound": [schema, "Execution_Workflow_fkey"]}, "RID"]},
+                "Duration",
+                "Status",
+                "Status_Detail",
+            ]
+        },
+        "tag:isrd.isi.edu,2016:visible-foreign-keys": {
+            "detailed": [
+                {
+                    "source": [
+                        {"inbound": [schema, "Dataset_Execution_Execution_fkey"]},
+                        {"outbound": [schema, "Dataset_Execution_Dataset_fkey"]},
+                        "RID",
+                    ],
+                    "markdown_name": "Dataset",
+                },
+                {
+                    "source": [
+                        {
+                            "inbound": [
+                                schema,
+                                "Execution_Asset_Execution_Execution_fkey",
+                            ]
+                        },
+                        {
+                            "outbound": [
+                                schema,
+                                "Execution_Asset_Execution_Execution_Asset_fkey",
+                            ]
+                        },
+                        "RID",
+                    ],
+                    "markdown_name": "Execution Asset",
+                },
+                {
+                    "source": [
+                        {"inbound": [schema, "Execution_Metadata_Execution_Execution_fkey"]},
+                        {"outbound": [schema, "Execution_Metadata_Execution_Execution_Metadata_fkey"]},
+                        "RID",
+                    ],
+                    "markdown_name": "Execution Metadata",
+                },
+            ]
+        },
+    }
+    dataset_annotation = {
+        deriva_tags.visible_columns: {
+            "*": [
+                "RID",
+                "Description",
+                [schema, "Dataset_RCB_fkey"],
+                [schema, "Dataset_RMB_fkey"],
+                {
+                    "source": [
+                        {"outbound": ["deriva-ml", "Dataset_Version_fkey"]},
+                        "Version",
+                    ],
+                    "markdown_name": "Dataset Version",
+                },
+            ],
+            "detailed": [
+                "RID",
+                "Description",
+                {
+                    "source": [
+                        {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
+                        {
+                            "outbound": [
+                                "deriva-ml",
+                                "Dataset_Dataset_Type_Dataset_Type_fkey",
+                            ]
+                        },
+                        "RID",
+                    ],
+                    "markdown_name": "Dataset Types",
+                },
+                {
+                    "source": [
+                        {"outbound": ["deriva-ml", "Dataset_Version_fkey"]},
+                        "Version",
+                    ],
+                    "markdown_name": "Dataset Version",
+                },
+                [schema, "Dataset_RCB_fkey"],
+                [schema, "Dataset_RMB_fkey"],
+            ],
+            "filter": {
+                "and": [
+                    {"source": "RID"},
+                    {"source": "Description"},
+                    {
+                        "source": [
+                            {
+                                "inbound": [
+                                    "deriva-ml",
+                                    "Dataset_Dataset_Type_Dataset_fkey",
+                                ]
+                            },
+                            {
+                                "outbound": [
+                                    "deriva-ml",
+                                    "Dataset_Dataset_Type_Dataset_Type_fkey",
+                                ]
+                            },
+                            "RID",
+                        ],
+                        "markdown_name": "Dataset Types",
+                    },
+                    {
+                        "source": [{"outbound": [schema, "Dataset_RCB_fkey"]}, "RID"],
+                        "markdown_name": "Created By",
+                    },
+                    {
+                        "source": [{"outbound": [schema, "Dataset_RMB_fkey"]}, "RID"],
+                        "markdown_name": "Modified By",
+                    },
+                ]
+            },
+        }
+    }
+    schema_annotation = {
+        "name_style": {"underline_space": True},
+    }
+    dataset_version_annotation = {
+        deriva_tags.visible_columns: {
+            "*": [
+                "RID",
+                "RCT",
+                "RMT",
+                [schema, "Dataset_Version_RCB_fkey"],
+                [schema, "Dataset_Version_RMB_fkey"],
+                {
+                    "source": [
+                        {"outbound": [schema, "Dataset_Version_Dataset_fkey"]},
+                        "RID",
+                    ]
+                },
+                "Description",
+                {
+                    "display": {
+                        "template_engine": "handlebars",
+                        "markdown_pattern": "[{{{Version}}}](https://{{{$location.host}}}/id/{{{$catalog.id}}}/{{{Dataset}}}@{{{Snapshot}}})",
+                    },
+                    "markdown_name": "Version",
+                },
+                "Minid",
+                {
+                    "source": [
+                        {"outbound": [schema, "Dataset_Version_Execution_fkey"]},
+                        "RID",
+                    ]
+                },
+            ]
+        },
+        deriva_tags.visible_foreign_keys: {"*": []},
+        deriva_tags.table_display: {
+            "row_name": {"row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"}
+        },
+    }
+    return {
+        "workflow_annotation": workflow_annotation,
+        "dataset_annotation": dataset_annotation,
+        "execution_annotation": execution_annotation,
+        "schema_annotation": schema_annotation,
+        "dataset_version_annotation": dataset_version_annotation,
+    }
+def main():
+    """Main entry point for the annotations CLI.
+    Applies annotations to the ML schema based on command line arguments.
+    Returns:
+        None. Executes the CLI.
+    """
+    parser = argparse.ArgumentParser(description="Apply annotations to ML schema")
+    parser.add_argument("hostname", help="Hostname for the catalog")
+    parser.add_argument("catalog_id", help="Catalog ID")
+    parser.add_argument("schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
+    args = parser.parse_args()
+    generate_annotation(args.catalog_id, args.schema_name)
+if __name__ == "__main__":
+    sys.exit(main())

deriva_ml/schema/check_schema.py ADDED Viewed

@@ -0,0 +1,104 @@
+import json
+import re
+from importlib.resources import files
+from pathlib import Path
+from pprint import pprint
+from deepdiff import DeepDiff
+from deriva.core import AttrDict, BaseCLI, get_credential
+from deriva.core.ermrest_catalog import ErmrestCatalog
+from deriva_ml.core.definitions import ML_SCHEMA
+from deriva_ml.schema.create_schema import create_ml_catalog
+def normalize_schema(d):
+    if isinstance(d, dict) or isinstance(d, AttrDict):
+        m = {}
+        for k, v in d.items():
+            if k == "acl_bindings" or k == "annotations" or k == "comment":
+                continue
+            m[k] = normalize_schema(v)
+        return m
+    elif isinstance(d, list):
+        return [normalize_schema(i) for i in d]
+    elif isinstance(d, str):
+        # ID templates for controlled vocabulary
+        if m := re.match("(?P<s>.*):{RID}", d):
+            d = d if m["s"] == "deriva-ml" else "reference-catalog:{RID}" if re.match(".*:{RID}", d) else d
+        return d
+    else:
+        return d
+def check_ml_schema(hostname, catalog_id, schema_file: Path | None = None):
+    """Check the ML schema against a reference schema file.
+    Args:
+        hostname: The hostname of the Deriva catalog.
+        catalog_id: The catalog ID to check.
+        schema_file: Optional path to reference schema file. If None, uses default reference.
+    Returns:
+        None. Prints the diff between target and reference schemas.
+    """
+    # schema_file = schema_file or files("deriva-ml.data").joinpath("deriva-ml-reference.json")
+    schema_file = schema_file or files("deriva_ml.schema").joinpath("deriva-ml-reference.json")
+    # Now map
+    with Path(schema_file).open("r") as f:
+        reference_schema = normalize_schema(json.load(f)["schemas"][ML_SCHEMA])
+    catalog = ErmrestCatalog("https", hostname, catalog_id, credentials=get_credential(hostname))
+    target_schema = normalize_schema(catalog.getCatalogModel().schemas[ML_SCHEMA].prejson())
+    # Compute the diff
+    diff = DeepDiff(reference_schema, target_schema, ignore_order=True, view="tree")
+    print(f"Diff between {schema_file} and {ML_SCHEMA} schema:")
+    # Pretty‐print as JSON
+    pprint(diff, indent=2)
+    return diff
+def dump_ml_schema(hostname: str, filename: str = "deriva-ml-reference.json") -> None:
+    """Dump the schema of the ML catalog to stdout."""
+    catalog = create_ml_catalog(hostname, "reference-catalog")
+    try:
+        model = catalog.getCatalogModel()
+        print(f"Dumping ML schema to {Path(filename).resolve()}...")
+        with Path(filename).open("w") as f:
+            json.dump(model.prejson(), f, indent=2)
+    finally:
+        catalog.delete_ermrest_catalog(really=True)
+class CheckMLSchemaCLI(BaseCLI):
+    """Main class to part command line arguments and call model"""
+    def __init__(self, description, epilog, **kwargs):
+        BaseCLI.__init__(self, description, epilog, **kwargs)
+        self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
+        self.parser.add_argument("--dump", action="store_true", help="Perform execution in dry-run mode.")
+    def main(self):
+        """Parse arguments and set up execution environment."""
+        args = self.parse_cli()
+        hostname = args.host
+        catalog_id = args.catalog
+        if args.dump:
+            dump_ml_schema(hostname, catalog_id)
+            return
+        check_ml_schema(hostname, catalog_id)
+def main():
+    cli = CheckMLSchemaCLI(description="Check DerivaML Catalog for Compliance", epilog="")
+    cli.main()
+if __name__ == "__main__":
+    main()