deriva-ml 1.10.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
-"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
+"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
 of a dataset and a sqllite database in which the contents of the bag are stored.
 """
 
@@ -51,7 +51,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
 
-    Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
+    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
     sqllite instance is created for every new dataset version present.
 
     Attributes:
@@ -290,6 +290,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         return DatasetBag(self, dataset_rid or self.dataset_rid)
 
     def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
+        """Return the version of the specified dataset."""
         if dataset_rid and dataset_rid not in self.bag_rids:
             DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
         return self.bag_rids[dataset_rid]
deriva_ml/dataset.py CHANGED
@@ -232,12 +232,10 @@ class Dataset:
         """Increment the version of the specified dataset_table.
 
         Args:
-            dataset_rid: RID to a dataset_table
-            component: Which version of the dataset_table to increment.
-            dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Major, Minor or Patch
-            description: Description of the version update of the dataset_table.
-            execution_rid: Which execution is performing increment.
+            dataset_rid: RID of the dataset whose version is to be incremented.
+            component: Which version of the dataset_table to increment. Major, Minor or Patch
+            description: Description of the version update of the dataset_table.
+            execution_rid: Which execution is performing increment.
 
         Returns:
             new semantic version of the dataset_table as a 3-tuple
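The Major/Minor/Patch components follow semantic-versioning conventions. A minimal sketch of what such a bump does, assuming a plain (major, minor, patch) 3-tuple with an illustrative helper rather than deriva-ml's actual DatasetVersion class:

    # Illustration only: semantic-version bump over a (major, minor, patch) 3-tuple.
    def bump(version: tuple[int, int, int], component: str) -> tuple[int, int, int]:
        major, minor, patch = version
        if component == "Major":
            return (major + 1, 0, 0)      # breaking change: reset minor and patch
        if component == "Minor":
            return (major, minor + 1, 0)  # additive change: reset patch
        if component == "Patch":
            return (major, minor, patch + 1)
        raise ValueError(f"Unknown version component: {component}")

    assert bump((1, 0, 0), "Minor") == (1, 1, 0)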
@@ -275,9 +273,6 @@ class Dataset:
             description: Description of the dataset_table.
             execution_rid: Execution under which the dataset_table will be created.
             version: Version of the dataset_table.
-            type: str | list[str]:
-            description: str:
-
 
         Returns:
             New dataset_table RID.
@@ -349,7 +344,6 @@ class Dataset:
         Args:
             dataset_rid: RID of the dataset_table to delete.
             recurse: If True, delete the dataset_table along with any nested datasets. (Default value = False)
-            dataset_rid: RID:
         """
         # Get association table entries for this dataset_table
         # Delete association table entries
@@ -397,7 +391,7 @@ class Dataset:
             filtered_path = dataset_path
         else:
             filtered_path = dataset_path.filter(
-                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)
+                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)  # noqa: E712
             )
 
         # Get a list of all the dataset_type values associated with this dataset_table.
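The added `# noqa: E712` is needed because flake8 flags `== False` / `== None` comparisons, yet datapath-style query builders overload `__eq__` to construct a filter predicate, so the `is` comparisons flake8 would suggest could not build a filter at all. A generic illustration of the pattern (a made-up `Col` class, not deriva's actual datapath objects):

    # Expression-builder APIs return a predicate object from __eq__, not a bool,
    # which is why "== False" is intentional here.
    class Col:
        def __init__(self, name: str):
            self.name = name

        def __eq__(self, other):  # returns a predicate object, not a bool
            return f"{self.name} = {other!r}"

    print(Col("Deleted") == False)  # noqa: E712  -> prints: Deleted = False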
@@ -439,8 +433,7 @@ class Dataset:
         routine makes it possible to add objects from the specified table to a dataset_table.
 
         Args:
-            element: Name or the table or table object that is to be added to the dataset_table.
-            element: str | Table:
+            element: Name of the table or table object that is to be added to the dataset_table.
 
         Returns:
             The table object that was added to the dataset_table.
@@ -464,7 +457,6 @@ class Dataset:
 
         Args:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
-            dataset_rid: RID:
             recurse: (Default value = False)
             limit: If provided, the maximum number of members to return for each element type.
@@ -677,7 +669,6 @@ class Dataset:
 
         Args:
             dataset_rid: return: RID of the parent dataset_table.
-            dataset_rid: RID:
 
         Returns:
             RID of the parent dataset_table.
@@ -805,7 +796,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid, limit=1
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
deriva_ml/dataset_bag.py CHANGED
@@ -168,7 +168,7 @@ class DatasetBag:
             yield dict(zip(col_names, row))
 
     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str, list[tuple]]:
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, list]]:
         """Return a list of entities associated with a specific _dataset_table.
 
         Args:
@@ -206,12 +206,19 @@
             )
 
             with self.database as db:
+                col_names = [
+                    c[1]
+                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
+                ]
+                select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
-                    f'SELECT * FROM "{sql_member}" '
+                    f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-                target_entities = db.execute(sql_cmd).fetchall()
+                target_entities = [
+                    dict(zip(col_names, e)) for e in db.execute(sql_cmd).fetchall()
+                ]
                 members[target_table.name].extend(target_entities)
 
         target_entities = []  # path.entities().fetch()
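The rewritten query enumerates the target table's columns with `PRAGMA table_info` and selects exactly those columns, so each joined row can be zipped into a dict keyed by column name; a `SELECT *` over the join would also pull in the member table's columns. A self-contained sketch of the same technique against an in-memory SQLite database (table and column names here are illustrative, not deriva-ml's schema):

    import sqlite3

    # PRAGMA table_info returns rows of (cid, name, type, notnull, dflt_value, pk),
    # so c[1] is the column name.
    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE Image (RID TEXT, Filename TEXT)")
    db.execute("INSERT INTO Image VALUES ('1-abc', 'scan.png')")

    col_names = [c[1] for c in db.execute('PRAGMA table_info("Image")').fetchall()]
    select_cols = ",".join(f'"Image".{c}' for c in col_names)
    rows = [
        dict(zip(col_names, r))
        for r in db.execute(f'SELECT {select_cols} FROM "Image"').fetchall()
    ]
    print(rows)  # [{'RID': '1-abc', 'Filename': 'scan.png'}]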
deriva_ml/demo_catalog.py CHANGED
@@ -2,9 +2,7 @@ import atexit
 from importlib.metadata import version
 from importlib.resources import files
 import logging
-from random import random, randint
-import tempfile
-from tempfile import TemporaryDirectory
+from random import randint, random
 from typing import Optional
 import itertools
 
@@ -12,7 +10,6 @@ from deriva.config.acl_config import AclConfig
 from deriva.core import DerivaServer
 from deriva.core import ErmrestCatalog, get_credential
 from deriva.core.datapath import DataPathException
-from deriva.core.ermrest_model import Model
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError
 
@@ -35,48 +32,51 @@ TEST_DATASET_SIZE = 4
 def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
     model = deriva_ml.model
     for trial in range(3):
-        for t in [
-            v
-            for v in model.schemas[sname].tables.values()
-            if v.name not in {"Subject", "Image"}
-        ]:
+        for t in [v for v in model.schemas[sname].tables.values()]:
             try:
                 t.drop()
             except HTTPError:
                 pass
-
+    model.schemas[sname].drop()
     # Empty out remaining tables.
     pb = deriva_ml.pathBuilder
     retry = True
     while retry:
-        retry = False
-        for s in [sname, "deriva-ml"]:
-            for t in pb.schemas[s].tables.values():
-                for e in t.entities().fetch():
-                    try:
-                        t.filter(t.RID == e["RID"]).delete()
-                    except DataPathException:  # FK constraint.
-                        retry = True
-
+        for t in pb.schemas["deriva-ml"].tables.values():
+            for e in t.entities().fetch():
+                try:
+                    t.filter(t.RID == e["RID"]).delete()
+                except DataPathException:  # FK constraint.
+                    retry = True
     initialize_ml_schema(model, "deriva-ml")
+    create_domain_schema(deriva_ml, sname)
 
 
 def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
     # Delete any vocabularies and features.
-    reset_demo_catalog(deriva_ml, sname)
     domain_schema = deriva_ml.catalog.getPathBuilder().schemas[sname]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
-    with TemporaryDirectory() as tmpdir:
-        image_dir = deriva_ml.asset_dir("Image", prefix=tmpdir)
+    deriva_ml.add_term(
+        MLVocab.workflow_type,
+        "Demo Catalog Creation",
+        description="A workflow demonstrating how to create a demo catalog.",
+    )
+    execution = deriva_ml.create_execution(
+        ExecutionConfiguration(
+            workflow=deriva_ml.create_workflow(
+                name="Demo Catalog", workflow_type="Demo Catalog Creation"
+            )
+        )
+    )
+    with execution.execute() as e:
         for s in ss:
-            image_file = image_dir.create_file(
-                f"test_{s['RID']}.txt", {"Subject": s["RID"]}
+            image_file = e.asset_file_path(
+                "Image", f"test_{s['RID']}.txt", Subject=s["RID"]
             )
             with open(image_file, "w") as f:
                 f.write(f"Hello there {random()}\n")
-        deriva_ml.upload_assets(image_dir)
+    execution.upload_execution_outputs()
 
 
 def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
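The rewritten functions above share the new pattern introduced in this release: assets are created through an execution rather than via `asset_dir()`/`upload_assets()` on a temporary directory. Condensed from the new lines of this hunk (assumes a connected DerivaML instance and an existing Subject RID; the import path is an assumption):

    from deriva_ml import DerivaML, ExecutionConfiguration  # import path assumed

    # Names below (create_workflow, create_execution, asset_file_path,
    # upload_execution_outputs) all appear in the new code of this hunk.
    execution = deriva_ml.create_execution(
        ExecutionConfiguration(
            workflow=deriva_ml.create_workflow(
                name="Demo Catalog", workflow_type="Demo Catalog Creation"
            )
        )
    )
    with execution.execute() as e:
        image_file = e.asset_file_path("Image", "example.txt", Subject=subject_rid)
        with open(image_file, "w") as f:
            f.write("asset content\n")
    execution.upload_execution_outputs()  # register the new files with the catalog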
@@ -84,6 +84,13 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     ml_instance.add_dataset_element_type("Image")
 
     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
+    training_rid = ml_instance.add_term(
+        "Dataset_Type", "Training", description="A traing set"
+    )
+    testing_rid = ml_instance.add_term(
+        "Dataset_Type", "Testing", description="A testing set"
+    )
+
     table_path = (
         ml_instance.catalog.getPathBuilder()
         .schemas[ml_instance.domain_schema]
@@ -94,7 +101,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     dataset_rids = []
     for r in subject_rids[0:4]:
         d = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Testing"],
             description=f"Dataset {r}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -104,7 +111,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     nested_datasets = []
     for i in range(0, 4, 2):
         nested_dataset = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Training"],
             description=f"Nested Dataset {i}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -132,13 +139,11 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-
     ml_instance.create_vocabulary(
         "ImageQuality", "Controlled vocabulary for image quality"
     )
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-
     box_asset = ml_instance.create_asset(
         "BoundingBox", comment="A file that contains a cropped version of a image"
     )
@@ -150,7 +155,6 @@ def create_demo_features(ml_instance):
         metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
         optional=["Scale"],
     )
-
     ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
     ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])
 
@@ -158,78 +162,88 @@ def create_demo_features(ml_instance):
     ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
     SubjectWellnessFeature = ml_instance.feature_record_class("Subject", "Health")
 
+    # Get the workflow for this notebook
+
     ml_instance.add_term(
         MLVocab.workflow_type,
-        "API Workflow",
+        "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
     ml_instance.add_term(
-        MLVocab.execution_asset_type,
-        "API_Model",
-        description="Model for our API workflow",
+        MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow"
     )
-
-    api_workflow = ml_instance.create_workflow(
-        name="API Workflow",
-        workflow_type="API Workflow",
+    notebook_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Feature Notebook Workflow"
     )
 
-    api_execution = ml_instance.create_execution(
+    feature_execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=api_workflow, description="Our Sample Workflow instance"
+            workflow=notebook_workflow, description="Our Sample Workflow instance"
         )
     )
 
-    with tempfile.TemporaryDirectory() as temp_dir:
-        assetdir = ml_instance.asset_dir("BoundingBox", prefix=temp_dir)
-        for i in range(10):
-            with open(assetdir.path / f"box{i}.txt", "w") as fp:
-                fp.write(f"Hi there {i}")
-        bounding_box_assets = ml_instance.upload_assets(assetdir)
-    bounding_box_rids = [a.result["RID"] for a in bounding_box_assets.values()]
-
-    # Get the IDs of al of the things that we are going to want to attach features to.
     subject_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()
     ]
     image_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
     ]
-
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            Execution=api_execution.execution_rid,
+            Execution=feature_execution.execution_rid,
             SubjectHealth=["Well", "Sick"][randint(0, 1)],
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]
 
+    # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
+    bounding_box_files = []
+    for i in range(10):
+        bounding_box_file = feature_execution.asset_file_path(
+            "BoundingBox", f"box{i}.txt"
+        )
+        with open(bounding_box_file, "w") as fp:
+            fp.write(f"Hi there {i}")
+        bounding_box_files.append(bounding_box_file)
+
+    image_bounding_box_feature_list = [
+        ImageBoundingboxFeature(
+            Image=image_rid,
+            BoundingBox=asset_name,
+        )
+        for image_rid, asset_name in zip(
+            image_rids, itertools.cycle(bounding_box_files)
+        )
+    ]
+
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            Execution=api_execution.execution_rid,
             ImageQuality=["Good", "Bad"][randint(0, 1)],
         )
         for image_rid in image_rids
     ]
 
-    image_bounding_box_feature_list = [
-        ImageBoundingboxFeature(
-            Image=image_rid,
-            Execution=api_execution.execution_rid,
-            BoundingBox=asset_rid,
+    subject_feature_list = [
+        SubjectWellnessFeature(
+            Subject=subject_rid,
+            SubjectHealth=["Well", "Sick"][randint(0, 1)],
+            Scale=randint(1, 10),
        )
-        for image_rid, asset_rid in zip(image_rids, itertools.cycle(bounding_box_rids))
+        for subject_rid in subject_rids
     ]
 
-    ml_instance.add_features(subject_feature_list)
-    ml_instance.add_features(image_quality_feature_list)
-    ml_instance.add_features(image_bounding_box_feature_list)
+    with feature_execution.execute() as execution:
+        feature_execution.add_features(image_bounding_box_feature_list)
+        feature_execution.add_features(image_quality_feature_list)
+        feature_execution.add_features(subject_feature_list)
+
+    feature_execution.upload_execution_outputs()
 
 
-def create_domain_schema(model: Model, sname: str) -> None:
+def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
     :param model:
@@ -238,28 +252,19 @@ def create_domain_schema(model: Model, sname: str) -> None:
     """
 
     # Make sure that we have a ml schema
-    _ = model.schemas["deriva-ml"]
+    _ = ml_instance.model.schemas["deriva-ml"]
 
-    if model.schemas.get(sname):
+    if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
-        model.schemas[sname].drop()
+        ml_instance.model.schemas[sname].drop()
 
-    domain_schema = model.create_schema(
+    domain_schema = ml_instance.model.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-    image_table = domain_schema.create_table(
-        Table.define_asset(
-            sname=sname,
-            tname="Image",
-            hatrac_template="/hatrac/image_asset/{{MD5}}.{{Filename}}",
-            column_defs=[Column.define("Name", builtin_types.text)],
-        )
-    )
-    image_table.create_reference(subject_table)
+    ml_instance.create_asset("Image", referenced_tables=[subject_table])
 
 
 def destroy_demo_catalog(catalog):
@@ -284,13 +289,14 @@ def create_demo_catalog(
 
     try:
         create_ml_schema(model, project_name=project_name)
-        create_domain_schema(model, domain_schema)
         deriva_ml = DerivaML(
             hostname=hostname,
             catalog_id=test_catalog.catalog_id,
             project_name=project_name,
+            domain_schema=domain_schema,
             logging_level=logging.WARN,
         )
+        create_domain_schema(deriva_ml, domain_schema)
         working_dir = deriva_ml.working_dir
         dataset_table = deriva_ml.dataset_table
         dataset_table.annotations.update(
@@ -186,9 +186,9 @@ class MLVocab(StrEnum):
 
     dataset_type = "Dataset_Type"
     workflow_type = "Workflow_Type"
-    execution_asset_type = "Execution_Asset_Type"
-    execution_metadata_type = "Execution_Metadata_Type"
     file_type = "File_Type"
+    asset_type = "Asset_Type"
+    asset_role = "Asset_Role"
 
 
 class ExecMetadataVocab(StrEnum):
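Because MLVocab is a StrEnum, the renamed members still compare and format as their plain string values, so call sites such as `add_term(MLVocab.asset_type, ...)` pass the vocabulary table name unchanged. A minimal sketch of that behavior (standard-library `enum.StrEnum`, Python 3.11+; the class shape mirrors the diff):

    from enum import StrEnum

    class MLVocab(StrEnum):
        asset_type = "Asset_Type"
        asset_role = "Asset_Role"

    assert MLVocab.asset_type == "Asset_Type"       # members compare as plain strings
    assert f"{MLVocab.asset_role}" == "Asset_Role"  # and format as their values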