PyPI - deriva-ml - Versions diffs - 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl - Mend

deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

deriva_ml/__init__.py +43 -1
deriva_ml/asset/__init__.py +17 -0
deriva_ml/asset/asset.py +357 -0
deriva_ml/asset/aux_classes.py +100 -0
deriva_ml/bump_version.py +254 -11
deriva_ml/catalog/__init__.py +21 -0
deriva_ml/catalog/clone.py +1199 -0
deriva_ml/catalog/localize.py +426 -0
deriva_ml/core/__init__.py +29 -0
deriva_ml/core/base.py +817 -1067
deriva_ml/core/config.py +169 -21
deriva_ml/core/constants.py +120 -19
deriva_ml/core/definitions.py +123 -13
deriva_ml/core/enums.py +47 -73
deriva_ml/core/ermrest.py +226 -193
deriva_ml/core/exceptions.py +297 -14
deriva_ml/core/filespec.py +99 -28
deriva_ml/core/logging_config.py +225 -0
deriva_ml/core/mixins/__init__.py +42 -0
deriva_ml/core/mixins/annotation.py +915 -0
deriva_ml/core/mixins/asset.py +384 -0
deriva_ml/core/mixins/dataset.py +237 -0
deriva_ml/core/mixins/execution.py +408 -0
deriva_ml/core/mixins/feature.py +365 -0
deriva_ml/core/mixins/file.py +263 -0
deriva_ml/core/mixins/path_builder.py +145 -0
deriva_ml/core/mixins/rid_resolution.py +204 -0
deriva_ml/core/mixins/vocabulary.py +400 -0
deriva_ml/core/mixins/workflow.py +322 -0
deriva_ml/core/validation.py +389 -0
deriva_ml/dataset/__init__.py +2 -1
deriva_ml/dataset/aux_classes.py +20 -4
deriva_ml/dataset/catalog_graph.py +575 -0
deriva_ml/dataset/dataset.py +1242 -1008
deriva_ml/dataset/dataset_bag.py +1311 -182
deriva_ml/dataset/history.py +27 -14
deriva_ml/dataset/upload.py +225 -38
deriva_ml/demo_catalog.py +126 -110
deriva_ml/execution/__init__.py +46 -2
deriva_ml/execution/base_config.py +639 -0
deriva_ml/execution/execution.py +543 -242
deriva_ml/execution/execution_configuration.py +26 -11
deriva_ml/execution/execution_record.py +592 -0
deriva_ml/execution/find_caller.py +298 -0
deriva_ml/execution/model_protocol.py +175 -0
deriva_ml/execution/multirun_config.py +153 -0
deriva_ml/execution/runner.py +595 -0
deriva_ml/execution/workflow.py +223 -34
deriva_ml/experiment/__init__.py +8 -0
deriva_ml/experiment/experiment.py +411 -0
deriva_ml/feature.py +6 -1
deriva_ml/install_kernel.py +143 -6
deriva_ml/interfaces.py +862 -0
deriva_ml/model/__init__.py +99 -0
deriva_ml/model/annotations.py +1278 -0
deriva_ml/model/catalog.py +286 -60
deriva_ml/model/database.py +144 -649
deriva_ml/model/deriva_ml_database.py +308 -0
deriva_ml/model/handles.py +14 -0
deriva_ml/run_model.py +319 -0
deriva_ml/run_notebook.py +507 -38
deriva_ml/schema/__init__.py +18 -2
deriva_ml/schema/annotations.py +62 -33
deriva_ml/schema/create_schema.py +169 -69
deriva_ml/schema/validation.py +601 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
deriva_ml-1.17.11.dist-info/RECORD +77 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
deriva_ml/protocols/dataset.py +0 -19
deriva_ml/test.py +0 -94
deriva_ml-1.17.10.dist-info/RECORD +0 -45
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0

deriva_ml/demo_catalog.py CHANGED Viewed

@@ -1,10 +1,17 @@
+# type: ignore[arg-type, call-arg]
+"""Demo catalog utilities for DerivaML testing and examples.
+This module creates demo catalogs with sample data for testing. It uses
+dynamically created Pydantic models for features, which cannot be statically
+typed - hence the type ignore above.
+"""
 from __future__ import annotations
 import atexit
 import itertools
 import logging
-import os
 import string
+import subprocess
 from collections.abc import Iterator, Sequence
 from datetime import datetime
 from numbers import Integral
@@ -13,24 +20,29 @@ from random import choice, randint, random
 from tempfile import TemporaryDirectory
 from deriva.core import BaseCLI, ErmrestCatalog
-from deriva.core.ermrest_model import Column, Schema, Table, builtin_types
+from deriva.core.ermrest_model import Schema, Table
+from deriva.core.typed import BuiltinType, ColumnDef, SchemaDef, TableDef
 from pydantic import BaseModel, ConfigDict
 from requests.exceptions import HTTPError
-from deriva_ml import DerivaML, MLVocab
+from deriva_ml import DerivaML, DerivaMLException, MLVocab
 from deriva_ml.core.definitions import RID, BuiltinTypes, ColumnDefinition
+from deriva_ml.dataset import Dataset
 from deriva_ml.dataset.aux_classes import DatasetVersion
-from deriva_ml.execution.execution import Execution, Workflow
-from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.execution import Execution, ExecutionConfiguration
 from deriva_ml.schema import (
     create_ml_catalog,
 )
-from deriva_ml.schema.annotations import catalog_annotation
 try:
+    from pprint import pformat
     from icecream import ic
-    ic.configureOutput(includeContext=True)
+    ic.configureOutput(
+        includeContext=True,
+        argToStringFunction=lambda x: pformat(x.model_dump() if hasattr(x, "model_dump") else x, width=80, depth=10),
+    )
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
@@ -38,39 +50,24 @@ except ImportError:  # Graceful fallback if IceCream isn't installed.
 TEST_DATASET_SIZE = 12
-def populate_demo_catalog(ml_instance: DerivaML) -> None:
+def populate_demo_catalog(execution: Execution) -> None:
     # Delete any vocabularies and features.
-    domain_schema = ml_instance.pathBuilder.schemas[ml_instance.domain_schema]
+    ml_instance = execution._ml_object
+    domain_schema = ml_instance.domain_path()
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
+    for s in ss:
+        image_file = execution.asset_file_path(
+            "Image",
+            f"test_{s['RID']}.txt",
+            Subject=s["RID"],
+            Acquisition_Time=datetime.now(),
+            Acquisition_Date=datetime.now().date(),
+        )
+        with image_file.open("w") as f:
+            f.write(f"Hello there {random()}\n")
-    ml_instance.add_term(
-        MLVocab.workflow_type,
-        "Demo Catalog Creation",
-        description="A workflow demonstrating how to create a demo catalog.",
-    )
-    workflow = Workflow(
-        name="Demo Catalog",
-        workflow_type="Demo Catalog Creation",
-        url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/src/deriva_ml/demo_catalog.py",
-        version="1.0.0",
-        checksum="27",
-        git_root=Path(),
-    )
-    execution = ml_instance.create_execution(ExecutionConfiguration(workflow=workflow))
-    with execution.execute() as e:
-        for s in ss:
-            image_file = e.asset_file_path(
-                "Image",
-                f"test_{s['RID']}.txt",
-                Subject=s["RID"],
-                Acquisition_Time=datetime.now(),
-                Acquisition_Date=datetime.now().date(),
-            )
-            with image_file.open("w") as f:
-                f.write(f"Hello there {random()}\n")
-        execution.upload_execution_outputs()
+    execution.upload_execution_outputs()
 class DatasetDescription(BaseModel):
@@ -81,7 +78,7 @@ class DatasetDescription(BaseModel):
     ]  # Either a list of nested dataset, or then number of elements to add
     member_rids: dict[str, list[RID]] = {}  # The rids of the members of the dataset.
     version: DatasetVersion = DatasetVersion(1, 0, 0)  # The initial version.
-    rid: RID = None  # RID of dataset that was created.
+    dataset: Dataset = None  # RID of dataset that was created.
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -95,7 +92,8 @@ def create_datasets(
     Create a dataset per `spec`, then add child members (either by slicing
     off pre-generated RIDs or by recursing on nested specs).
     """
-    dataset_rid = client.create_dataset(
+    # Create unpinned dataset.
+    dataset = client.create_dataset(
         dataset_types=spec.types,
         description=spec.description,
         version=spec.version,
@@ -105,9 +103,10 @@ def create_datasets(
         description=spec.description,
         members={},
         types=spec.types,
-        rid=dataset_rid,
+        dataset=dataset,
         version=spec.version,
     )
     dataset_rids = {}
     for member_type, value in spec.members.items():
         if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
@@ -116,7 +115,7 @@ def create_datasets(
             for child_spec in nested_specs:
                 child_ds = create_datasets(client, child_spec, member_rids)
                 result_spec.members.setdefault(member_type, []).append(child_ds)
-                rids.append(child_ds.rid)
+                rids.append(child_ds.dataset.dataset_rid)
         elif isinstance(value, Integral):
             count = int(value)
             # take exactly `count` RIDs (or an empty list if count <= 0)
@@ -132,7 +131,7 @@ def create_datasets(
         if rids:
             dataset_rids[member_type] = rids
             result_spec.member_rids.setdefault(member_type, []).extend(rids)
-    client.add_dataset_members(dataset_rid, dataset_rids, description="Added by create_datasets")
+    dataset.add_dataset_members(dataset_rids, description="Added by create_datasets")
     return result_spec
@@ -147,7 +146,7 @@ def dataset_spec() -> DatasetDescription:
     training_dataset = DatasetDescription(
         description="A dataset that is nested",
         members={"Dataset": [dataset, dataset], "Image": 2},
-        types=["Testing"],
+        types=["Training"],
     )
     testing_dataset = DatasetDescription(
@@ -164,39 +163,37 @@ def dataset_spec() -> DatasetDescription:
     return double_nested_dataset
-def create_demo_datasets(ml_instance: DerivaML) -> DatasetDescription:
+def create_demo_datasets(execution: Execution) -> DatasetDescription:
     """Create datasets from a populated catalog."""
+    ml_instance = execution._ml_object
     ml_instance.add_dataset_element_type("Subject")
     ml_instance.add_dataset_element_type("Image")
-    _type_rid = ml_instance.add_term("Dataset_Type", "Complete", synonyms=["Whole"], description="A test")
-    _training_rid = ml_instance.add_term("Dataset_Type", "Training", synonyms=["Train"], description="A training set")
-    _testing_rid = ml_instance.add_term("Dataset_Type", "Testing", description="A testing set")
+    _type_rid = ml_instance.add_term(
+        "Dataset_Type", "Complete", synonyms=["Whole", "complete", "whole"], description="A test"
+    )
+    _training_rid = ml_instance.add_term(
+        "Dataset_Type", "Training", synonyms=["Train", "train", "training"], description="A training set"
+    )
+    _testing_rid = ml_instance.add_term(
+        "Dataset_Type", "Testing", synonyms=["Test", "test", "testing"], description="A testing set"
+    )
-    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Subject"]
+    table_path = ml_instance.domain_path().tables["Subject"]
     subject_rids = [i["RID"] for i in table_path.entities().fetch()]
-    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Image"]
-    image_rids = [i["RID"] for i in table_path.entities().fetch()]
-    ml_instance.add_term(
-        MLVocab.workflow_type,
-        "Create Dataset Workflow",
-        description="A Workflow that creates a new dataset.",
-    )
-    dataset_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Create Dataset Workflow")
-    dataset_execution = ml_instance.create_execution(
-        ExecutionConfiguration(workflow=dataset_workflow, description="Create Dataset")
-    )
+    table_path = ml_instance.domain_path().tables["Image"]
+    image_rids = [i["RID"] for i in table_path.entities().fetch()]
-    with dataset_execution.execute() as exe:
-        spec = dataset_spec()
-        dataset = create_datasets(exe, spec, {"Subject": iter(subject_rids), "Image": iter(image_rids)})
+    spec = dataset_spec()
+    dataset = create_datasets(execution, spec, {"Subject": iter(subject_rids), "Image": iter(image_rids)})
     return dataset
-def create_demo_features(ml_instance: DerivaML) -> None:
-    ml_instance.create_vocabulary("SubjectHealth", "A vocab")
+def create_demo_features(execution: Execution) -> None:
+    ml_instance = execution._ml_object
+    # Use update_navbar=False for batch creation, then call apply_catalog_annotations() once at the end
+    ml_instance.create_vocabulary("SubjectHealth", "A vocab", update_navbar=False)
     ml_instance.add_term(
         "SubjectHealth",
         "Sick",
@@ -207,10 +204,12 @@ def create_demo_features(ml_instance: DerivaML) -> None:
         "Well",
         description="The subject self reports that they feel well",
     )
-    ml_instance.create_vocabulary("ImageQuality", "Controlled vocabulary for image quality")
+    ml_instance.create_vocabulary("ImageQuality", "Controlled vocabulary for image quality", update_navbar=False)
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-    box_asset = ml_instance.create_asset("BoundingBox", comment="A file that contains a cropped version of a image")
+    box_asset = ml_instance.create_asset(
+        "BoundingBox", comment="A file that contains a cropped version of a image", update_navbar=False
+    )
     ml_instance.create_feature(
         "Subject",
@@ -218,9 +217,13 @@ def create_demo_features(ml_instance: DerivaML) -> None:
         terms=["SubjectHealth"],
         metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
         optional=["Scale"],
+        update_navbar=False,
     )
-    ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
-    ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])
+    ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset], update_navbar=False)
+    ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"], update_navbar=False)
+    # Update navbar once after all tables are created
+    ml_instance.apply_catalog_annotations()
     ImageQualityFeature = ml_instance.feature_record_class("Image", "Quality")
     ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
@@ -228,24 +231,12 @@ def create_demo_features(ml_instance: DerivaML) -> None:
     # Get the workflow for this notebook
-    ml_instance.add_term(
-        MLVocab.workflow_type,
-        "Feature Notebook Workflow",
-        description="A Workflow that uses Deriva ML API",
-    )
-    ml_instance.add_term(MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow")
-    notebook_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Feature Notebook Workflow")
-    feature_execution = ml_instance.create_execution(
-        ExecutionConfiguration(workflow=notebook_workflow, description="Our Sample Workflow instance")
-    )
-    subject_rids = [i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()]
-    image_rids = [i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()]
+    subject_rids = [i["RID"] for i in ml_instance.domain_path().tables["Subject"].entities().fetch()]
+    image_rids = [i["RID"] for i in ml_instance.domain_path().tables["Image"].entities().fetch()]
     _subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            Execution=feature_execution.execution_rid,
+            Execution=execution.execution_rid,
             SubjectHealth=choice(["Well", "Sick"]),
             Scale=randint(1, 10),
         )
@@ -255,7 +246,7 @@ def create_demo_features(ml_instance: DerivaML) -> None:
     # Create a new set of images.  For fun, lets wrap this in an execution so we get status updates
     bounding_box_files = []
     for i in range(10):
-        bounding_box_file = feature_execution.asset_file_path("BoundingBox", f"box{i}.txt")
+        bounding_box_file = execution.asset_file_path("BoundingBox", f"box{i}.txt")
         with bounding_box_file.open("w") as fp:
             fp.write(f"Hi there {i}")
         bounding_box_files.append(bounding_box_file)
@@ -285,12 +276,9 @@ def create_demo_features(ml_instance: DerivaML) -> None:
         for subject_rid in subject_rids
     ]
-    with feature_execution.execute() as execution:
-        execution.add_features(image_bounding_box_feature_list)
-        execution.add_features(image_quality_feature_list)
-        execution.add_features(subject_feature_list)
-    feature_execution.upload_execution_outputs()
+    execution.add_features(image_bounding_box_feature_list)
+    execution.add_features(image_quality_feature_list)
+    execution.add_features(subject_feature_list)
 def create_demo_files(ml_instance: DerivaML):
@@ -350,21 +338,25 @@ def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
         else:
             raise e
-    domain_schema = model.create_schema(Schema.define(sname, annotations={"name_style": {"underline_space": True}}))
+    domain_schema = model.create_schema(
+        SchemaDef(name=sname, annotations={"name_style": {"underline_space": True}})
+    )
     subject_table = domain_schema.create_table(
-        Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
+        TableDef(name="Subject", columns=[ColumnDef("Name", BuiltinType.text)])
     )
     with TemporaryDirectory() as tmpdir:
         ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
+        # Use update_navbar=False since we call apply_catalog_annotations() explicitly at the end
         ml_instance.create_asset(
             "Image",
             column_defs=[
-                Column.define("Acquisition_Time", builtin_types.timestamp),
-                Column.define("Acquisition_Date", builtin_types.date),
+                ColumnDef("Acquisition_Time", BuiltinType.timestamp),
+                ColumnDef("Acquisition_Date", BuiltinType.date),
             ],
             referenced_tables=[subject_table],
+            update_navbar=False,
         )
-        catalog_annotation(ml_instance.model)
+        ml_instance.apply_catalog_annotations()
 def destroy_demo_catalog(catalog):
@@ -395,27 +387,51 @@ def create_demo_catalog(
     try:
         with TemporaryDirectory() as tmpdir:
-            os.chdir(tmpdir)  # Do this so we don't get confused if running from a GitHub repo.
+            try:
+                subprocess.run(
+                    "git clone https://github.com/informatics-isi-edu/deriva-ml.git",
+                    capture_output=True,
+                    text=True,
+                    shell=True,
+                    check=True,
+                    cwd=tmpdir,
+                )
+            except subprocess.CalledProcessError:
+                raise DerivaMLException("Cannot clone deriva-ml repo from GitHub.")
             create_domain_schema(test_catalog, domain_schema)
-            ml_instance = DerivaML(
-                hostname,
-                catalog_id=test_catalog.catalog_id,
-                domain_schema=domain_schema,
-                working_dir=tmpdir,
-                logging_level=logging_level,
-            )
             if populate or create_features or create_datasets:
-                populate_demo_catalog(ml_instance)
-                if create_features:
-                    create_demo_features(ml_instance)
-                if create_datasets:
-                    create_demo_datasets(ml_instance)
+                ml_instance = DerivaML(
+                    hostname,
+                    catalog_id=test_catalog.catalog_id,
+                    default_schema=domain_schema,
+                    working_dir=tmpdir,
+                    logging_level=logging_level,
+                )
+                ml_instance.add_term(
+                    MLVocab.workflow_type,
+                    "Demo Catalog Creation",
+                    description="A Workflow that creates a new catalog and populates it with demo data.",
+                )
+                populate_workflow = ml_instance.create_workflow(
+                    name="Demo Creation", workflow_type="Demo Catalog Creation"
+                )
+                execution = ml_instance.create_execution(
+                    workflow=populate_workflow, configuration=ExecutionConfiguration()
+                )
+                with execution.execute() as exe:
+                    populate_demo_catalog(exe)
+                    if create_features:
+                        create_demo_features(exe)
+                    if create_datasets:
+                        create_demo_datasets(exe)
+                execution.upload_execution_outputs()
-    except Exception:
+    except Exception as e:
         # on failure, delete catalog and re-raise exception
         test_catalog.delete_ermrest_catalog(really=True)
-        raise
+        raise e
     return test_catalog

deriva_ml/execution/__init__.py CHANGED Viewed

@@ -1,8 +1,30 @@
 from typing import TYPE_CHECKING
 # Safe imports - no circular dependencies
-from deriva_ml.execution.execution_configuration import AssetRIDConfig, ExecutionConfiguration
+from deriva_ml.execution.base_config import (
+    BaseConfig,
+    DerivaBaseConfig,
+    base_defaults,
+    get_notebook_configuration,
+    # New simplified API
+    notebook_config,
+    load_configs,
+    run_notebook,
+    # Config metadata helpers
+    DescribedList,
+    with_description,
+)
+from deriva_ml.execution.multirun_config import (
+    MultirunSpec,
+    multirun_config,
+    get_multirun_config,
+    list_multirun_configs,
+    get_all_multirun_configs,
+)
+from deriva_ml.execution.execution_configuration import AssetRID, ExecutionConfiguration
 from deriva_ml.execution.workflow import Workflow
+from deriva_ml.execution.runner import run_model, create_model_config, reset_multirun_state
+from deriva_ml.execution.model_protocol import DerivaMLModel
 if TYPE_CHECKING:
     from deriva_ml.execution.execution import Execution
@@ -22,5 +44,27 @@ __all__ = [
     "Execution",  # Lazy-loaded
     "ExecutionConfiguration",
     "Workflow",
-    "AssetRIDConfig",
+    "AssetRID",
+    "run_model",
+    "create_model_config",
+    "reset_multirun_state",
+    "DerivaMLModel",
+    # Base configuration
+    "BaseConfig",
+    "DerivaBaseConfig",
+    "base_defaults",
+    "get_notebook_configuration",
+    # Simplified API
+    "notebook_config",
+    "load_configs",
+    "run_notebook",
+    # Config metadata helpers
+    "DescribedList",
+    "with_description",
+    # Multirun configuration
+    "MultirunSpec",
+    "multirun_config",
+    "get_multirun_config",
+    "list_multirun_configs",
+    "get_all_multirun_configs",
 ]

deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl