deriva-ml 1.13.3__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +408 -416
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +52 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -372
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.13.3.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
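The headline change is a repackaging: the flat 1.13.3 modules are split into `core`, `dataset`, `execution`, `model`, and `schema` subpackages, the bundled test files and notebook are dropped from the wheel, and a reference schema (`deriva-ml-reference.json`) plus a schema checker (`check_schema.py`) are added. For consumers, the visible effect is new import paths. Below is a sketch of the mapping, using the new-style imports that appear verbatim in the `demo_catalog.py` diff further down; the 1.13.3 paths are partly truncated in this view, so treat the old side as illustrative:

```python
# deriva-ml 1.13.3 (flat layout; illustrative, the old imports are truncated in this diff):
# from deriva_ml import ColumnDefinition, DatasetVersion, RID

# deriva-ml 1.14.26 (subpackage layout, as imported by demo_catalog.py below):
from deriva_ml import DerivaML, MLVocab
from deriva_ml.core.definitions import RID, BuiltinTypes, ColumnDefinition
from deriva_ml.dataset.aux_classes import DatasetVersion
from deriva_ml.execution.execution import Execution
from deriva_ml.execution.execution_configuration import ExecutionConfiguration
from deriva_ml.schema.annotations import catalog_annotation
```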
deriva_ml/demo_catalog.py
CHANGED
@@ -1,150 +1,188 @@
+from __future__ import annotations
+
 import atexit
-from importlib.resources import files
 import itertools
 import logging
-
-from
+import string
+from collections.abc import Iterator, Sequence
+from numbers import Integral
+from pathlib import Path
+from random import choice, randint, random
 from tempfile import TemporaryDirectory
 
-from deriva.core import DerivaServer, get_credential
 from deriva.core import ErmrestCatalog
-from deriva.core.
-from
-from requests import HTTPError
-
-
-from .
-from deriva_ml import
-
-
-
-
-    ColumnDefinition,
-    DatasetVersion,
-    RID,
+from deriva.core.ermrest_model import Column, Schema, Table, builtin_types
+from pydantic import BaseModel, ConfigDict
+from requests.exceptions import HTTPError
+
+from deriva_ml import DerivaML, MLVocab
+from deriva_ml.core.definitions import RID, BuiltinTypes, ColumnDefinition
+from deriva_ml.dataset.aux_classes import DatasetVersion
+from deriva_ml.execution.execution import Execution
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.schema import (
+    create_ml_catalog,
 )
+from deriva_ml.schema.annotations import catalog_annotation
 
-
-
-
-)
+try:
+    from icecream import ic
+
+    ic.configureOutput(includeContext=True)
+except ImportError:  # Graceful fallback if IceCream isn't installed.
+    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+
+TEST_DATASET_SIZE = 12
 
-
-
-
-def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
-    model = deriva_ml.model
-    for trial in range(3):
-        for t in [v for v in model.schemas[sname].tables.values()]:
-            try:
-                t.drop()
-            except HTTPError:
-                pass
-    model.schemas[sname].drop()
-    # Empty out remaining tables.
-    pb = deriva_ml.pathBuilder
-    retry = True
-    while retry:
-        for t in pb.schemas["deriva-ml"].tables.values():
-            for e in t.entities().fetch():
-                try:
-                    t.filter(t.RID == e["RID"]).delete()
-                except DataPathException:  # FK constraint.
-                    retry = True
-    initialize_ml_schema(model, "deriva-ml")
-    create_domain_schema(deriva_ml, sname)
-
-
-def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
+
+def populate_demo_catalog(ml_instance: DerivaML) -> None:
     # Delete any vocabularies and features.
-    domain_schema =
+    domain_schema = ml_instance.pathBuilder.schemas[ml_instance.domain_schema]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
+
+    ml_instance.add_term(
         MLVocab.workflow_type,
         "Demo Catalog Creation",
         description="A workflow demonstrating how to create a demo catalog.",
     )
-    execution =
+    execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=
-                name="Demo Catalog", workflow_type="Demo Catalog Creation"
-            )
+            workflow=ml_instance.create_workflow(name="Demo Catalog", workflow_type="Demo Catalog Creation")
         )
     )
     with execution.execute() as e:
         for s in ss:
-            image_file = e.asset_file_path(
-
-            )
-            with open(image_file, "w") as f:
+            image_file = e.asset_file_path("Image", f"test_{s['RID']}.txt", Subject=s["RID"])
+            with image_file.open("w") as f:
                 f.write(f"Hello there {random()}\n")
         execution.upload_execution_outputs()
 
 
-
-
-
+class DatasetDescription(BaseModel):
+    types: list[str]  # Types of the dataset.
+    description: str  # Description.
+    members: dict[
+        str, int | list[DatasetDescription]
+    ]  # Either a list of nested dataset, or then number of elements to add
+    member_rids: dict[str, list[RID]] = {}  # The rids of the members of the dataset.
+    version: DatasetVersion = DatasetVersion(1, 0, 0)  # The initial version.
+    rid: RID = None  # RID of dataset that was created.
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+def create_datasets(
+    client: Execution,
+    spec: DatasetDescription,
+    member_rids: dict[str, Iterator[RID]],
+) -> DatasetDescription:
+    """
+    Create a dataset per `spec`, then add child members (either by slicing
+    off pre-generated RIDs or by recursing on nested specs).
+    """
+    dataset_rid = client.create_dataset(
+        dataset_types=spec.types,
+        description=spec.description,
+        version=spec.version,
+    )
 
-
-
-
+    result_spec = DatasetDescription(
+        description=spec.description,
+        members={},
+        types=spec.types,
+        rid=dataset_rid,
+        version=spec.version,
     )
-
-
+    dataset_rids = {}
+    for member_type, value in spec.members.items():
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
+            nested_specs: list[DatasetDescription] = list(value)
+            rids: list[RID] = []
+            for child_spec in nested_specs:
+                child_ds = create_datasets(client, child_spec, member_rids)
+                result_spec.members.setdefault(member_type, []).append(child_ds)
+                rids.append(child_ds.rid)
+        elif isinstance(value, Integral):
+            count = int(value)
+            # take exactly `count` RIDs (or an empty list if count <= 0)
+            rids = list(itertools.islice(member_rids[member_type], count))
+            assert len(rids) == count, f"Expected {count} RIDs, got {len(rids)}"
+            result_spec.members[member_type] = count
+        else:
+            raise TypeError(
+                f"Expected spec.members['{member_type}'] to be either an int or a list, got {type(value).__name__!r}"
+            )
+
+        # attach and record
+        if rids:
+            dataset_rids[member_type] = rids
+            result_spec.member_rids.setdefault(member_type, []).extend(rids)
+    client.add_dataset_members(dataset_rid, dataset_rids, description="Added by create_datasets")
+
+    return result_spec
+
+
+def dataset_spec() -> DatasetDescription:
+    dataset = DatasetDescription(
+        description="A dataset",
+        members={"Subject": 2},
+        types=[],
+    )
+
+    training_dataset = DatasetDescription(
+        description="A dataset that is nested",
+        members={"Dataset": [dataset, dataset], "Image": 2},
+        types=["Testing"],
+    )
+
+    testing_dataset = DatasetDescription(
+        description="A dataset that is nested",
+        members={"Dataset": [dataset, dataset], "Image": 2},
+        types=["Testing"],
     )
 
-
-
-
-
+    double_nested_dataset = DatasetDescription(
+        description="A dataset that is double nested",
+        members={"Dataset": [training_dataset, testing_dataset]},
+        types=["Complete"],
     )
+    return double_nested_dataset
+
+
+def create_demo_datasets(ml_instance: DerivaML) -> DatasetDescription:
+    """Create datasets from a populated catalog."""
+    ml_instance.add_dataset_element_type("Subject")
+    ml_instance.add_dataset_element_type("Image")
+
+    _type_rid = ml_instance.add_term("Dataset_Type", "Complete", synonyms=["Whole"], description="A test")
+    _training_rid = ml_instance.add_term("Dataset_Type", "Training", synonyms=["Train"], description="A training set")
+    _testing_rid = ml_instance.add_term("Dataset_Type", "Testing", description="A testing set")
+
+    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Subject"]
     subject_rids = [i["RID"] for i in table_path.entities().fetch()]
+    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Image"]
+    image_rids = [i["RID"] for i in table_path.entities().fetch()]
 
     ml_instance.add_term(
         MLVocab.workflow_type,
         "Create Dataset Workflow",
         description="A Workflow that creates a new dataset.",
     )
-    dataset_workflow = ml_instance.create_workflow(
-        name="API Workflow", workflow_type="Create Dataset Workflow"
-    )
+    dataset_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Create Dataset Workflow")
 
     dataset_execution = ml_instance.create_execution(
         ExecutionConfiguration(workflow=dataset_workflow, description="Create Dataset")
     )
 
     with dataset_execution.execute() as exe:
-
-
-
-            dataset_types=[type_rid.name, "Testing"],
-            description=f"Dataset {r}",
-            version=DatasetVersion(1, 0, 0),
-        )
-        ml_instance.add_dataset_members(d, [r])
-        dataset_rids.append(d)
-
-        nested_datasets = []
-        for i in range(0, 4, 2):
-            nested_dataset = exe.create_dataset(
-                dataset_types=[type_rid.name, "Training"],
-                description=f"Nested Dataset {i}",
-                version=DatasetVersion(1, 0, 0),
-            )
-            exe.add_dataset_members(nested_dataset, dataset_rids[i : i + 2])
-            nested_datasets.append(nested_dataset)
-
-        double_nested_dataset = exe.create_dataset(
-            dataset_types=type_rid.name,
-            description="Double nested dataset",
-            version=DatasetVersion(1, 0, 0),
-        )
-        exe.add_dataset_members(double_nested_dataset, nested_datasets)
-        return double_nested_dataset, nested_datasets, dataset_rids
+        spec = dataset_spec()
+        dataset = create_datasets(exe, spec, {"Subject": iter(subject_rids), "Image": iter(image_rids)})
+    return dataset
 
 
-def create_demo_features(ml_instance):
+def create_demo_features(ml_instance: DerivaML) -> None:
     ml_instance.create_vocabulary("SubjectHealth", "A vocab")
     ml_instance.add_term(
         "SubjectHealth",
@@ -156,14 +194,10 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-    ml_instance.create_vocabulary(
-        "ImageQuality", "Controlled vocabulary for image quality"
-    )
+    ml_instance.create_vocabulary("ImageQuality", "Controlled vocabulary for image quality")
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-    box_asset = ml_instance.create_asset(
-        "BoundingBox", comment="A file that contains a cropped version of a image"
-    )
+    box_asset = ml_instance.create_asset("BoundingBox", comment="A file that contains a cropped version of a image")
 
     ml_instance.create_feature(
         "Subject",
@@ -186,30 +220,20 @@ def create_demo_features(ml_instance):
         "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
-    ml_instance.add_term(
-
-    )
-    notebook_workflow = ml_instance.create_workflow(
-        name="API Workflow", workflow_type="Feature Notebook Workflow"
-    )
+    ml_instance.add_term(MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow")
+    notebook_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Feature Notebook Workflow")
 
     feature_execution = ml_instance.create_execution(
-        ExecutionConfiguration(
-            workflow=notebook_workflow, description="Our Sample Workflow instance"
-        )
+        ExecutionConfiguration(workflow=notebook_workflow, description="Our Sample Workflow instance")
     )
 
-    subject_rids = [
-
-
-    image_rids = [
-        i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
-    ]
-    subject_feature_list = [
+    subject_rids = [i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()]
+    image_rids = [i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()]
+    _subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
             Execution=feature_execution.execution_rid,
-            SubjectHealth=["Well", "Sick"]
+            SubjectHealth=choice(["Well", "Sick"]),
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
@@ -218,10 +242,8 @@ def create_demo_features(ml_instance):
     # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
     bounding_box_files = []
     for i in range(10):
-        bounding_box_file = feature_execution.asset_file_path(
-
-        )
-        with open(bounding_box_file, "w") as fp:
+        bounding_box_file = feature_execution.asset_file_path("BoundingBox", f"box{i}.txt")
+        with bounding_box_file.open("w") as fp:
            fp.write(f"Hi there {i}")
         bounding_box_files.append(bounding_box_file)
 
@@ -230,15 +252,13 @@ def create_demo_features(ml_instance):
             Image=image_rid,
             BoundingBox=asset_name,
         )
-        for image_rid, asset_name in zip(
-            image_rids, itertools.cycle(bounding_box_files)
-        )
+        for image_rid, asset_name in zip(image_rids, itertools.cycle(bounding_box_files))
     ]
 
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            ImageQuality=["Good", "Bad"]
+            ImageQuality=choice(["Good", "Bad"]),
         )
         for image_rid in image_rids
     ]
@@ -246,99 +266,129 @@ def create_demo_features(ml_instance):
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            SubjectHealth=["Well", "Sick"]
+            SubjectHealth=choice(["Well", "Sick"]),
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]
 
     with feature_execution.execute() as execution:
-
-
-
+        execution.add_features(image_bounding_box_feature_list)
+        execution.add_features(image_quality_feature_list)
+        execution.add_features(subject_feature_list)
 
     feature_execution.upload_execution_outputs()
 
 
-def
+def create_demo_files(ml_instance: DerivaML):
+    """Create demo files for testing purposes.
+
+    Args:
+        ml_instance: The DerivaML instance to create files for.
+
+    Returns:
+        None. Creates files in the working directory.
+    """
+
+    def random_string(length: int) -> str:
+        """Generate a random string of specified length.
+
+        Args:
+            length: The length of the string to generate.
+
+        Returns:
+            A random string of the specified length.
+        """
+        return "".join(random.choice(string.ascii_letters) for _ in range(length))
+
+    test_dir = ml_instance.working_dir / "test_dir"
+    test_dir.mkdir(parents=True, exist_ok=True)
+    d1 = test_dir / "d1"
+    d1.mkdir(parents=True, exist_ok=True)
+    d2 = test_dir / "d2"
+    d2.mkdir(parents=True, exist_ok=True)
+
+    # Create some demo files
+    for d in [test_dir, d1, d2]:
+        for i in range(5):
+            fname = Path(d) / f"file{i}.{random.choice(['txt', 'jpeg'])}"
+            with fname.open("w") as f:
+                f.write(random_string(10))
+    ml_instance.add_term(MLVocab.workflow_type, "File Test Workflow", description="Test workflow")
+
+
+def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
-    :param model:
     :param sname:
     :return:
     """
+    model = catalog.getCatalogModel()
+    _ = model.schemas["deriva-ml"]
 
-
-
-
-
-
-
-
-
-
+    try:
+        model.schemas[sname].drop(cascade=True)
+    except KeyError:
+        pass
+    except HTTPError as e:
+        print(e)
+        if f"Schema {sname} does not exist" in str(e):
+            pass
+        else:
+            raise e
+
+    domain_schema = model.create_schema(Schema.define(sname, annotations={"name_style": {"underline_space": True}}))
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-
-
+    with TemporaryDirectory() as tmpdir:
+        ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
+        ml_instance.create_asset("Image", referenced_tables=[subject_table])
+    catalog_annotation(ml_instance.model)
 
 
 def destroy_demo_catalog(catalog):
+    """Destroy the demo catalog and clean up resources.
+
+    Args:
+        catalog: The ErmrestCatalog instance to destroy.
+
+    Returns:
+        None. Destroys the catalog.
+    """
     catalog.delete_ermrest_catalog(really=True)
 
 
 def create_demo_catalog(
     hostname,
-    domain_schema="
+    domain_schema="demo-schema",
     project_name="ml-test",
     populate=True,
     create_features=False,
     create_datasets=False,
     on_exit_delete=True,
+    logging_level=logging.INFO,
 ) -> ErmrestCatalog:
-
-
-    server = DerivaServer("https", hostname, credentials=credential)
-    test_catalog = server.create_ermrest_catalog()
-    model = test_catalog.getCatalogModel()
-    model.configure_baseline_catalog()
-    policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")
-    subprocess.run(
-        [
-            "deriva-acl-config",
-            "--host",
-            test_catalog.deriva_server.server,
-            "--config-file",
-            policy_file,
-            test_catalog.catalog_id,
-        ]
-    )
-
+    test_catalog = create_ml_catalog(hostname, project_name=project_name)
     if on_exit_delete:
         atexit.register(destroy_demo_catalog, test_catalog)
-
     try:
         with TemporaryDirectory() as tmpdir:
-
-
-                hostname
+            create_domain_schema(test_catalog, domain_schema)
+            ml_instance = DerivaML(
+                hostname,
                 catalog_id=test_catalog.catalog_id,
-                project_name=project_name,
                 domain_schema=domain_schema,
-                logging_level=logging.WARN,
                 working_dir=tmpdir,
-
+                logging_level=logging_level,
             )
-            create_domain_schema(deriva_ml, domain_schema)
 
             if populate or create_features or create_datasets:
-                populate_demo_catalog(
+                populate_demo_catalog(ml_instance)
             if create_features:
-                create_demo_features(
+                create_demo_features(ml_instance)
             if create_datasets:
-                create_demo_datasets(
+                create_demo_datasets(ml_instance)
 
     except Exception:
         # on failure, delete catalog and re-raise exception
@@ -352,8 +402,8 @@ class DemoML(DerivaML):
         self,
         hostname,
         catalog_id,
-        cache_dir:
-        working_dir:
+        cache_dir: str | None = None,
+        working_dir: str | None = None,
         use_minid=True,
     ):
         super().__init__(