PyPI - deriva-ml - Versions diffs - 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl - Mend

deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

deriva_ml/__init__.py +25 -30
deriva_ml/core/__init__.py +39 -0
deriva_ml/core/base.py +1489 -0
deriva_ml/core/constants.py +36 -0
deriva_ml/core/definitions.py +74 -0
deriva_ml/core/enums.py +222 -0
deriva_ml/core/ermrest.py +288 -0
deriva_ml/core/exceptions.py +28 -0
deriva_ml/core/filespec.py +116 -0
deriva_ml/dataset/__init__.py +4 -0
deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
deriva_ml/{history.py → dataset/history.py} +51 -33
deriva_ml/{upload.py → dataset/upload.py} +48 -70
deriva_ml/demo_catalog.py +233 -183
deriva_ml/execution/environment.py +290 -0
deriva_ml/{execution.py → execution/execution.py} +365 -252
deriva_ml/execution/execution_configuration.py +163 -0
deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
deriva_ml/feature.py +83 -46
deriva_ml/model/__init__.py +0 -0
deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
deriva_ml/{database_model.py → model/database.py} +52 -74
deriva_ml/model/sql_mapper.py +44 -0
deriva_ml/run_notebook.py +19 -11
deriva_ml/schema/__init__.py +3 -0
deriva_ml/{schema_setup → schema}/annotations.py +31 -22
deriva_ml/schema/check_schema.py +104 -0
deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
deriva_ml/schema/deriva-ml-reference.json +8525 -0
deriva_ml/schema/table_comments_utils.py +57 -0
{deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
deriva_ml-1.14.27.dist-info/RECORD +40 -0
{deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
deriva_ml/deriva_definitions.py +0 -391
deriva_ml/deriva_ml_base.py +0 -1046
deriva_ml/execution_environment.py +0 -139
deriva_ml/schema_setup/table_comments_utils.py +0 -56
deriva_ml/test-files/execution-parameters.json +0 -1
deriva_ml/test-files/notebook-parameters.json +0 -5
deriva_ml/test_functions.py +0 -141
deriva_ml/test_notebook.ipynb +0 -197
deriva_ml-1.14.0.dist-info/RECORD +0 -31
deriva_ml/{schema_setup → execution}/__init__.py +0 -0
deriva_ml/{schema_setup → schema}/policy.json +0 -0
{deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
{deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0

deriva_ml/{schema_setup → schema}/create_schema.py RENAMED Viewed

@@ -1,20 +1,27 @@
 import argparse
+import subprocess
 import sys
-from typing import Optional, Any
+from importlib.resources import files
+from typing import Any, Optional
-from deriva.core import DerivaServer, get_credential, ErmrestCatalog
-from deriva.core.ermrest_model import Model
+from deriva.core import DerivaServer, ErmrestCatalog, get_credential
 from deriva.core.ermrest_model import (
-    builtin_types,
-    Schema,
-    Table,
     Column,
     ForeignKey,
     Key,
+    Model,
+    Schema,
+    Table,
+    builtin_types,
 )
-from deriva_ml import MLVocab
-from deriva_ml.schema_setup.annotations import generate_annotation, asset_annotation
+from deriva_ml.core.definitions import ML_SCHEMA, MLTable, MLVocab
+from deriva_ml.schema.annotations import asset_annotation, generate_annotation
+try:
+    from icecream import ic
+except ImportError:  # Graceful fallback if IceCream isn't installed.
+    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
 def create_dataset_table(
@@ -23,10 +30,10 @@ def create_dataset_table(
     project_name: str,
     dataset_annotation: Optional[dict] = None,
     version_annotation: Optional[dict] = None,
-):
+) -> Table:
     dataset_table = schema.create_table(
         Table.define(
-            tname="Dataset",
+            tname=MLTable.dataset,
             column_defs=[
                 Column.define("Description", builtin_types.markdown),
                 Column.define("Deleted", builtin_types.boolean),
@@ -35,9 +42,8 @@ def create_dataset_table(
         )
     )
-    dataset_type = schema.create_table(
-        Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
-    )
+    dataset_type = schema.create_table(Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}"))
     schema.create_table(
         Table.define_association(
             associates=[
@@ -47,27 +53,31 @@ def create_dataset_table(
         )
     )
-    dataset_version = schema.create_table(
-        define_table_dataset_version(schema.name, version_annotation)
-    )
+    dataset_version = schema.create_table(define_table_dataset_version(schema.name, version_annotation))
     dataset_table.create_reference(("Version", True, dataset_version))
     # Nested datasets.
     schema.create_table(
-        Table.define_association(
-            associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)]
-        )
+        Table.define_association(associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)])
     )
     schema.create_table(
-        Table.define_association(
-            associates=[("Dataset", dataset_table), ("Execution", execution_table)]
-        )
+        Table.define_association(associates=[("Dataset", dataset_table), ("Execution", execution_table)])
     )
+    return dataset_table
 def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
-    return Table.define(
-        tname="Dataset_Version",
+    """Define the dataset version table in the specified schema.
+    Args:
+        sname: The schema name where the table should be created.
+        annotation: Optional annotation dictionary for the table.
+    Returns:
+        The created Table object.
+    """
+    table = Table.define(
+        tname=MLTable.dataset_version,
         column_defs=[
             Column.define(
                 "Version",
@@ -78,9 +88,7 @@ def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
             Column.define("Description", builtin_types.markdown),
             Column.define("Dataset", builtin_types.text, comment="RID of dataset"),
             Column.define("Execution", builtin_types.text, comment="RID of execution"),
-            Column.define(
-                "Minid", builtin_types.text, comment="URL to MINID for dataset"
-            ),
+            Column.define("Minid", builtin_types.text, comment="URL to MINID for dataset"),
             Column.define(
                 "Snapshot",
                 builtin_types.text,
@@ -94,13 +102,23 @@ def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
             ForeignKey.define(["Execution"], sname, "Execution", ["RID"]),
         ],
     )
+    return table
 def create_execution_table(schema, annotation: Optional[dict] = None):
+    """Create the execution table in the specified schema.
+    Args:
+        schema: The schema where the table should be created.
+        annotation: Optional annotation dictionary for the table.
+    Returns:
+        The created Table object.
+    """
     annotation = annotation if annotation is not None else {}
     execution = schema.create_table(
         Table.define(
-            "Execution",
+            MLTable.execution,
             column_defs=[
                 Column.define("Workflow", builtin_types.text),
                 Column.define("Description", builtin_types.markdown),
@@ -108,9 +126,7 @@ def create_execution_table(schema, annotation: Optional[dict] = None):
                 Column.define("Status", builtin_types.text),
                 Column.define("Status_Detail", builtin_types.text),
             ],
-            fkey_defs=[
-                ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])
-            ],
+            fkey_defs=[ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])],
             annotations=annotation,
         )
     )
@@ -123,6 +139,7 @@ def create_asset_table(
     execution_table,
     asset_type_table,
     asset_role_table,
+    use_hatrac: bool = True,
 ):
     asset_table = schema.create_table(
         Table.define_asset(
@@ -153,45 +170,19 @@ def create_asset_table(
     return asset_table
-def create_file_table(
-    schema: Schema,
-    execution_table: Table,
-    project_name: str,
-    annotation: Optional[dict] = None,
-):
-    """Define files table structure"""
-    annotation = annotation or {}
-    file_table = schema.create_table(
-        Table.define_asset(sname=schema.name, tname="File")
-    )
-    file_type = schema.create_table(
-        Table.define_vocabulary(MLVocab.file_type, f"{project_name}:{{RID}}")
-    )
-    schema.create_table(
-        Table.define_association(
-            associates=[
-                ("File", file_table),
-                (MLVocab.file_type, file_type),
-            ]
-        )
-    )
-    schema.create_table(
-        Table.define_association(
-            [
-                ("File", file_table),
-                ("Execution", execution_table),
-            ]
-        )
-    )
+def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
+    """Create the workflow table in the specified schema.
+    Args:
+        schema: The schema where the table should be created.
+        annotations: Optional annotation dictionary for the table.
-def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
-    annotations = annotations or {}
+    Returns:
+        The created Table object.
+    """
     workflow_table = schema.create_table(
         Table.define(
-            "Workflow",
+            tname=MLTable.workflow,
             column_defs=[
                 Column.define("Name", builtin_types.text),
                 Column.define("Description", builtin_types.markdown),
@@ -203,9 +194,7 @@ def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]]
         )
     )
     workflow_table.create_reference(
-        schema.create_table(
-            Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}")
-        )
+        schema.create_table(Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}"))
     )
     return workflow_table
@@ -226,39 +215,23 @@ def create_ml_schema(
     client_annotation = {
         "tag:misd.isi.edu,2015:display": {"name": "Users"},
-        "tag:isrd.isi.edu,2016:table-display": {
-            "row_name": {"row_markdown_pattern": "{{{Full_Name}}}"}
-        },
-        "tag:isrd.isi.edu,2016:visible-columns": {
-            "compact": ["Full_Name", "Display_Name", "Email", "ID"]
-        },
+        "tag:isrd.isi.edu,2016:table-display": {"row_name": {"row_markdown_pattern": "{{{Full_Name}}}"}},
+        "tag:isrd.isi.edu,2016:visible-columns": {"compact": ["Full_Name", "Display_Name", "Email", "ID"]},
     }
-    model.schemas["public"].tables["ERMrest_Client"].annotations.update(
-        client_annotation
-    )
+    model.schemas["public"].tables["ERMrest_Client"].annotations.update(client_annotation)
     model.apply()
-    schema = model.create_schema(
-        Schema.define(schema_name, annotations=annotations["schema_annotation"])
-    )
+    schema = model.create_schema(Schema.define(schema_name, annotations=annotations["schema_annotation"]))
     # Create workflow and execution table.
-    schema.create_table(
-        Table.define_vocabulary("Feature_Name", f"{project_name}:{{RID}}")
-    )
-    asset_type_table = schema.create_table(
-        Table.define_vocabulary("Asset_Type", f"{project_name}:{{RID}}")
-    )
-    asset_role_table = schema.create_table(
-        Table.define_vocabulary("Asset_Role", f"{project_name}:{{RID}}")
-    )
+    schema.create_table(Table.define_vocabulary(MLVocab.feature_name, f"{project_name}:{{RID}}"))
+    asset_type_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_type, f"{project_name}:{{RID}}"))
+    asset_role_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_role, f"{project_name}:{{RID}}"))
     create_workflow_table(schema, annotations["workflow_annotation"])
-    execution_table = create_execution_table(
-        schema, annotations["execution_annotation"]
-    )
-    create_dataset_table(
+    execution_table = create_execution_table(schema, annotations["execution_annotation"])
+    dataset_table = create_dataset_table(
         schema,
         execution_table,
         project_name,
@@ -268,7 +241,7 @@ def create_ml_schema(
     create_asset_table(
         schema,
-        "Execution_Metadata",
+        MLTable.execution_metadata,
         execution_table,
         asset_type_table,
         asset_role_table,
@@ -276,21 +249,47 @@ def create_ml_schema(
     create_asset_table(
         schema,
-        "Execution_Asset",
+        MLTable.execution_asset,
         execution_table,
         asset_type_table,
         asset_role_table,
     )
     # File table
-    create_file_table(schema, execution_table, project_name)
+    file_table = create_asset_table(
+        schema,
+        MLTable.file,
+        execution_table,
+        asset_type_table,
+        asset_role_table,
+        use_hatrac=False,
+    )
+    # And make Files be part of a dataset.
+    schema.create_table(
+        Table.define_association(
+            associates=[
+                ("Dataset", dataset_table),
+                (MLTable.file, file_table),
+            ]
+        )
+    )
     initialize_ml_schema(model, schema_name)
 def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
+    """Initialize the ML schema with all required tables.
+    Args:
+        model: The ERMrest model to add the schema to.
+        schema_name: The name of the schema to create. Defaults to "deriva-ml".
+    Returns:
+        None. Modifies the model in place.
+    """
     catalog = model.catalog
-    asset_type = catalog.getPathBuilder().schemas[schema_name].tables["Asset_Type"]
+    asset_type = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.asset_type]
     asset_type.insert(
         [
             {
@@ -309,10 +308,13 @@ def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
                 "Name": "Execution_Asset",
                 "Description": "A file generated by an execution",
             },
+            {"Name": "File", "Description": "A file that is not managed by Hatrac"},
+            {"Name": "Model_File", "Description": "The ML model."},
         ],
         defaults={"ID", "URI"},
     )
-    asset_role = catalog.getPathBuilder().schemas[schema_name].tables["Asset_Role"]
+    asset_role = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.asset_role]
     asset_role.insert(
         [
             {"Name": "Input", "Description": "Asset used for input of an execution."},
@@ -320,21 +322,66 @@ def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
         ],
         defaults={"ID", "URI"},
     )
+    dataset_type = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.dataset_type]
+    dataset_type.insert(
+        [{"Name": "File", "Description": "A dataset that contains file assets."}],
+        defaults={"ID", "URI"},
+    )
+def create_ml_catalog(hostname: str, project_name: str) -> ErmrestCatalog:
+    server = DerivaServer("https", hostname, credentials=get_credential(hostname))
+    catalog = server.create_ermrest_catalog()
+    model = catalog.getCatalogModel()
+    model.configure_baseline_catalog()
+    policy_file = files("deriva_ml.schema").joinpath("policy.json")
+    subprocess.run(
+        [
+            "deriva-acl-config",
+            "--host",
+            catalog.deriva_server.server,
+            "--config-file",
+            policy_file,
+            catalog.catalog_id,
+        ]
+    )
+    create_ml_schema(catalog, project_name=project_name)
+    return catalog
+def reset_ml_schema(catalog: ErmrestCatalog, ml_schema=ML_SCHEMA) -> None:
+    model = catalog.getCatalogModel()
+    schemas = [schema for sname, schema in model.schemas.items() if sname not in ["public", "WWW"]]
+    for s in schemas:
+        s.drop(cascade=True)
+    model = catalog.getCatalogModel()
+    create_ml_schema(catalog, ml_schema)
 def main():
+    """Main entry point for the schema creation CLI.
+    Creates ML schema and catalog based on command line arguments.
+    Returns:
+        None. Executes the CLI.
+    """
     scheme = "https"
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hostname", type=str, required=True)
-    parser.add_argument("--schema_name", type=str, required=True)
-    parser.add_argument("--catalog_id", type=str, required=True)
-    parser.add_argument("--curie_prefix", type=str, required=True)
+    parser = argparse.ArgumentParser(description="Create ML schema and catalog")
+    parser.add_argument("hostname", help="Hostname for the catalog")
+    parser.add_argument("project_name", help="Project name for the catalog")
+    parser.add_argument("schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
+    parser.add_argument("curie_prefix", type=str, required=True)
     args = parser.parse_args()
     credentials = get_credential(args.hostname)
     server = DerivaServer(scheme, args.hostname, credentials)
     model = server.connect_ermrest(args.catalog_id).getCatalogModel()
     create_ml_schema(model, args.schema_name)
+    print(f"Created ML catalog at {args.hostname} with project {args.project_name}")
+    print(f"Schema '{args.schema_name}' initialized successfully")
 if __name__ == "__main__":
     sys.exit(main())

deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl

deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl