deriva-ml 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/dataset.py +1 -1
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +87 -128
- deriva_ml/deriva_model.py +25 -0
- deriva_ml/execution.py +389 -309
- deriva_ml/execution_configuration.py +16 -6
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +95 -232
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/METADATA +2 -1
- deriva_ml-1.11.0.dist-info/RECORD +27 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.10.0.dist-info/RECORD +0 -27
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED

@@ -805,7 +805,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid,
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
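The loop above builds `dataset_elements` from whatever `list_dataset_members()` returns for the given dataset RID; the new comment records that passing `limit=1` appeared to slow the query, so no limit is passed. A condensed sketch of the same call shape, with names taken from the hunk (`snapshot_catalog` and `dataset_rid` are assumed to already be in scope, as they are inside `Dataset`):

```python
# Sketch only: mirrors the hunk above; `snapshot_catalog` and `dataset_rid`
# are assumed to exist, as they do inside Dataset.
members = snapshot_catalog.list_dataset_members(dataset_rid=dataset_rid)
# `members` maps element-type name -> member records; element types with no
# members are skipped before resolving names to table objects.
dataset_elements = [
    snapshot_catalog._model.name_to_table(name)
    for name, rows in members.items()
    if rows
]
```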
deriva_ml/dataset_bag.py
CHANGED

@@ -168,7 +168,7 @@ class DatasetBag:
             yield dict(zip(col_names, row))

     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str,
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, Any]]:
         """Return a list of entities associated with a specific _dataset_table.

         Args:
@@ -206,12 +206,19 @@ class DatasetBag:
             )

             with self.database as db:
+                col_names = [
+                    c[1]
+                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
+                ]
+                select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
-                    f'SELECT
+                    f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-                target_entities =
+                target_entities = [
+                    dict(zip(col_names, e)) for e in db.execute(sql_cmd).fetchall()
+                ]
                 members[target_table.name].extend(target_entities)

             target_entities = []  # path.entities().fetch()
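The net effect of these two hunks is that `DatasetBag.list_dataset_members()` now discovers the target table's column names with `PRAGMA table_info`, selects them explicitly, and returns each member as a dict keyed by column name instead of a bare cursor row (the new annotation reads `dict[str, dict[str, Any]]`, while the body accumulates a list of row dicts per member table). A minimal sketch of consuming the result, assuming `bag` is an already-constructed `DatasetBag` for a downloaded dataset:

```python
# Sketch only: `bag` is assumed to be an existing DatasetBag instance.
members = bag.list_dataset_members()
for table_name, rows in members.items():
    for row in rows:
        # Each row is a plain dict keyed by the member table's column names.
        print(table_name, row.get("RID"), row)
```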
deriva_ml/demo_catalog.py
CHANGED

@@ -2,9 +2,7 @@ import atexit
 from importlib.metadata import version
 from importlib.resources import files
 import logging
-from random import
-import tempfile
-from tempfile import TemporaryDirectory
+from random import randint, random
 from typing import Optional
 import itertools

@@ -12,7 +10,6 @@ from deriva.config.acl_config import AclConfig
 from deriva.core import DerivaServer
 from deriva.core import ErmrestCatalog, get_credential
 from deriva.core.datapath import DataPathException
-from deriva.core.ermrest_model import Model
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError

@@ -35,48 +32,51 @@ TEST_DATASET_SIZE = 4
 def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
     model = deriva_ml.model
     for trial in range(3):
-        for t in [
-            v
-            for v in model.schemas[sname].tables.values()
-            if v.name not in {"Subject", "Image"}
-        ]:
+        for t in [v for v in model.schemas[sname].tables.values()]:
             try:
                 t.drop()
             except HTTPError:
                 pass
-
+    model.schemas[sname].drop()
     # Empty out remaining tables.
     pb = deriva_ml.pathBuilder
     retry = True
     while retry:
-
-
-
-
-
-
-            except DataPathException:  # FK constraint.
-                retry = True
-
+        for t in pb.schemas["deriva-ml"].tables.values():
+            for e in t.entities().fetch():
+                try:
+                    t.filter(t.RID == e["RID"]).delete()
+                except DataPathException:  # FK constraint.
+                    retry = True
     initialize_ml_schema(model, "deriva-ml")
+    create_domain_schema(deriva_ml, sname)


 def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
     # Delete any vocabularies and features.
-    reset_demo_catalog(deriva_ml, sname)
     domain_schema = deriva_ml.catalog.getPathBuilder().schemas[sname]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
-
-
+    deriva_ml.add_term(
+        MLVocab.workflow_type,
+        "Demo Catalog Creation",
+        description="A workflow demonstrating how to create a demo catalog.",
+    )
+    execution = deriva_ml.create_execution(
+        ExecutionConfiguration(
+            workflow=deriva_ml.create_workflow(
+                name="Demo Catalog", workflow_type="Demo Catalog Creation"
+            )
+        )
+    )
+    with execution.execute() as e:
         for s in ss:
-            image_file =
-                f"test_{s['RID']}.txt",
+            image_file = e.asset_file_path(
+                "Image", f"test_{s['RID']}.txt", Subject=s["RID"]
+            )
             with open(image_file, "w") as f:
                 f.write(f"Hello there {random()}\n")
-
+    execution.upload_execution_outputs()


 def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
@@ -84,6 +84,13 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     ml_instance.add_dataset_element_type("Image")

     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
+    training_rid = ml_instance.add_term(
+        "Dataset_Type", "Training", description="A traing set"
+    )
+    testing_rid = ml_instance.add_term(
+        "Dataset_Type", "Testing", description="A testing set"
+    )
+
     table_path = (
         ml_instance.catalog.getPathBuilder()
         .schemas[ml_instance.domain_schema]
@@ -94,7 +101,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     dataset_rids = []
     for r in subject_rids[0:4]:
         d = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Testing"],
             description=f"Dataset {r}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -104,7 +111,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     nested_datasets = []
     for i in range(0, 4, 2):
         nested_dataset = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Training"],
             description=f"Nested Dataset {i}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -132,13 +139,11 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-
     ml_instance.create_vocabulary(
         "ImageQuality", "Controlled vocabulary for image quality"
     )
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-
     box_asset = ml_instance.create_asset(
         "BoundingBox", comment="A file that contains a cropped version of a image"
     )
@@ -150,7 +155,6 @@ def create_demo_features(ml_instance):
         metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
         optional=["Scale"],
     )
-
     ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
     ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])

@@ -158,78 +162,88 @@
     ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
     SubjectWellnessFeature = ml_instance.feature_record_class("Subject", "Health")

+    # Get the workflow for this notebook
+
     ml_instance.add_term(
         MLVocab.workflow_type,
-        "
+        "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
     ml_instance.add_term(
-        MLVocab.
-        "API_Model",
-        description="Model for our API workflow",
+        MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow"
     )
-
-
-        name="API Workflow",
-        workflow_type="API Workflow",
+    notebook_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Feature Notebook Workflow"
     )

-
+    feature_execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=
+            workflow=notebook_workflow, description="Our Sample Workflow instance"
         )
     )

-    with tempfile.TemporaryDirectory() as temp_dir:
-        assetdir = ml_instance.asset_dir("BoundingBox", prefix=temp_dir)
-        for i in range(10):
-            with open(assetdir.path / f"box{i}.txt", "w") as fp:
-                fp.write(f"Hi there {i}")
-        bounding_box_assets = ml_instance.upload_assets(assetdir)
-        bounding_box_rids = [a.result["RID"] for a in bounding_box_assets.values()]
-
-    # Get the IDs of al of the things that we are going to want to attach features to.
     subject_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()
     ]
     image_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
     ]
-
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            Execution=
+            Execution=feature_execution.execution_rid,
             SubjectHealth=["Well", "Sick"][randint(0, 1)],
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]

+    # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
+    bounding_box_files = []
+    for i in range(10):
+        bounding_box_file = feature_execution.asset_file_path(
+            "BoundingBox", f"box{i}.txt"
+        )
+        with open(bounding_box_file, "w") as fp:
+            fp.write(f"Hi there {i}")
+        bounding_box_files.append(bounding_box_file)
+
+    image_bounding_box_feature_list = [
+        ImageBoundingboxFeature(
+            Image=image_rid,
+            BoundingBox=asset_name,
+        )
+        for image_rid, asset_name in zip(
+            image_rids, itertools.cycle(bounding_box_files)
+        )
+    ]
+
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            Execution=api_execution.execution_rid,
             ImageQuality=["Good", "Bad"][randint(0, 1)],
         )
         for image_rid in image_rids
     ]

-
-
-
-
-
+    subject_feature_list = [
+        SubjectWellnessFeature(
+            Subject=subject_rid,
+            SubjectHealth=["Well", "Sick"][randint(0, 1)],
+            Scale=randint(1, 10),
         )
-        for
+        for subject_rid in subject_rids
     ]

-
-
-
+    with feature_execution.execute() as execution:
+        feature_execution.add_features(image_bounding_box_feature_list)
+        feature_execution.add_features(image_quality_feature_list)
+        feature_execution.add_features(subject_feature_list)
+
+    feature_execution.upload_execution_outputs()


-def create_domain_schema(
+def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
     :param model:
@@ -238,28 +252,19 @@ def create_domain_schema(model: Model, sname: str) -> None:
     """

     # Make sure that we have a ml schema
-    _ = model.schemas["deriva-ml"]
+    _ = ml_instance.model.schemas["deriva-ml"]

-    if model.schemas.get(sname):
+    if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
-        model.schemas[sname].drop()
+        ml_instance.model.schemas[sname].drop()

-    domain_schema = model.create_schema(
+    domain_schema = ml_instance.model.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-    image_table = domain_schema.create_table(
-        Table.define_asset(
-            sname=sname,
-            tname="Image",
-            hatrac_template="/hatrac/image_asset/{{MD5}}.{{Filename}}",
-            column_defs=[Column.define("Name", builtin_types.text)],
-        )
-    )
-    image_table.create_reference(subject_table)
+    ml_instance.create_asset("Image", referenced_tables=[subject_table])


 def destroy_demo_catalog(catalog):
@@ -284,13 +289,14 @@ def create_demo_catalog(

     try:
         create_ml_schema(model, project_name=project_name)
-        create_domain_schema(model, domain_schema)
         deriva_ml = DerivaML(
             hostname=hostname,
             catalog_id=test_catalog.catalog_id,
             project_name=project_name,
+            domain_schema=domain_schema,
             logging_level=logging.WARN,
         )
+        create_domain_schema(deriva_ml, domain_schema)
         working_dir = deriva_ml.working_dir
         dataset_table = deriva_ml.dataset_table
         dataset_table.annotations.update(
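The dominant change in demo_catalog.py is that demo data is now produced through the Execution API rather than by writing files into a temporary asset directory and calling `upload_assets`: a workflow type term is registered, a workflow and execution are created, files are allocated with `asset_file_path`, features are attached with `add_features`, and everything is uploaded with `upload_execution_outputs`. A condensed sketch of that pattern, distilled from the hunks above (assume `ml` is a connected `DerivaML` instance and `subject_rid` is the RID of an existing Subject; class and term names follow the diff):

```python
# Sketch only: condenses the execution-centric pattern shown in the diff above.
ml.add_term(
    MLVocab.workflow_type,
    "Demo Catalog Creation",
    description="A workflow demonstrating how to create a demo catalog.",
)
execution = ml.create_execution(
    ExecutionConfiguration(
        workflow=ml.create_workflow(
            name="Demo Catalog", workflow_type="Demo Catalog Creation"
        )
    )
)
with execution.execute() as e:
    # asset_file_path() reserves a local file that is uploaded as an "Image"
    # asset linked to the given Subject when the outputs are uploaded.
    image_file = e.asset_file_path("Image", "example.txt", Subject=subject_rid)
    with open(image_file, "w") as f:
        f.write("Hello there\n")
execution.upload_execution_outputs()
```

Datasets created by `create_demo_datasets` likewise now pass `type=` a list of `Dataset_Type` terms (for example `type=[type_rid.name, "Testing"]`), so one dataset can carry both the generic "TestSet" type and a "Training" or "Testing" role, and the domain Image asset table is now built with `create_asset("Image", referenced_tables=[subject_table])` instead of a hand-rolled `Table.define_asset` call.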
deriva_ml/deriva_definitions.py
CHANGED

@@ -186,9 +186,9 @@ class MLVocab(StrEnum):

     dataset_type = "Dataset_Type"
     workflow_type = "Workflow_Type"
-    execution_asset_type = "Execution_Asset_Type"
-    execution_metadata_type = "Execution_Metadata_Type"
     file_type = "File_Type"
+    asset_type = "Asset_Type"
+    asset_role = "Asset_Role"


 class ExecMetadataVocab(StrEnum):