deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,8 @@
 THis module defines the DataSet class with is used to manipulate n
 """
 
-from datetime import datetime
 from .deriva_definitions import RID
+
 from enum import Enum
 from pydantic import (
     BaseModel,
@@ -98,7 +98,7 @@ class DatasetHistory(BaseModel):
         version_rid (RID): The RID of the version record for the dataset in the Dataset_Version table.
         minid (str): The URL that represents the handle of the dataset bag. This will be None if a MINID has not
             been created yet.
-        timestamp (datetime): The timestamp of when the dataset was created.
+        snapshot (str): Catalog snapshot ID of when the version record was created.
     """
 
     dataset_version: DatasetVersion
@@ -107,7 +107,7 @@ class DatasetHistory(BaseModel):
     execution_rid: Optional[RID] = None
     description: str = ""
     minid: Optional[str] = None
-    timestamp: Optional[datetime] = None
+    snapshot: Optional[str] = None
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -128,12 +128,12 @@ class DatasetMinid(BaseModel):
     """
 
     dataset_version: DatasetVersion
-    metadata: dict[str, str | int]
-    minid: str = Field(alias="compact_uri")
+    metadata: dict[str, str | int] = {}
+    minid: str = Field(alias="compact_uri", default=None)
     bag_url: str = Field(alias="location")
-    identifier: str
-    landing_page: str
-    version_rid: RID = Field(alias="Dataset_RID")
+    identifier: Optional[str] = None
+    landing_page: Optional[str] = None
+    version_rid: RID = Field(alias="RID")
     checksum: str = Field(alias="checksums", default="")
 
     @computed_field
@@ -156,8 +156,8 @@ class DatasetMinid(BaseModel):
 
     @field_validator("bag_url", mode="before")
     @classmethod
-    def convert_location_to_str(cls, value: list[str]) -> str:
-        return value[0]
+    def convert_location_to_str(cls, value: list[str] | str) -> str:
+        return value[0] if isinstance(value, list) else value
 
     @field_validator("checksum", mode="before")
     @classmethod
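Taken together, the DatasetHistory and DatasetMinid changes above relax the models so they can be populated either from a full MINID service response or from a bare catalog row: most fields now carry defaults, and the bag_url validator accepts a plain string as well as the list form that the MINID service returns. A minimal sketch of that validator pattern (the BagRecord model and field names are illustrative, not part of deriva-ml):

    from pydantic import BaseModel, Field, field_validator

    class BagRecord(BaseModel):
        # Populated from the wire field "location".
        bag_url: str = Field(alias="location")

        @field_validator("bag_url", mode="before")
        @classmethod
        def first_if_list(cls, value: list[str] | str) -> str:
            # MINID responses wrap the location in a list; catalog rows do not.
            return value[0] if isinstance(value, list) else value

    BagRecord(location=["https://example.org/bag.zip"]).bag_url  # "https://example.org/bag.zip"
    BagRecord(location="https://example.org/bag.zip").bag_url    # same result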
deriva_ml/demo_catalog.py CHANGED
@@ -1,18 +1,19 @@
 import atexit
-from importlib.metadata import version
 from importlib.resources import files
+import itertools
 import logging
 from random import randint, random
 from typing import Optional
-import itertools
+from tempfile import TemporaryDirectory
 
-from deriva.config.acl_config import AclConfig
-from deriva.core import DerivaServer
-from deriva.core import ErmrestCatalog, get_credential
+from deriva.core import DerivaServer, get_credential
+from deriva.core import ErmrestCatalog
 from deriva.core.datapath import DataPathException
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError
+import subprocess
 
+from .schema_setup.annotations import catalog_annotation
 from deriva_ml import (
     DerivaML,
     ExecutionConfiguration,
@@ -23,8 +24,10 @@ from deriva_ml import (
     RID,
 )
 
-from deriva_ml.schema_setup.create_schema import initialize_ml_schema, create_ml_schema
-from deriva_ml.dataset import Dataset
+from deriva_ml.schema_setup.create_schema import (
+    initialize_ml_schema,
+    create_ml_schema,
+)
 
 TEST_DATASET_SIZE = 4
 
@@ -85,7 +88,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RI
 
     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
     training_rid = ml_instance.add_term(
-        "Dataset_Type", "Training", description="A traing set"
+        "Dataset_Type", "Training", description="A training set"
     )
     testing_rid = ml_instance.add_term(
         "Dataset_Type", "Testing", description="A testing set"
@@ -98,32 +101,46 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RI
     )
     subject_rids = [i["RID"] for i in table_path.entities().fetch()]
 
-    dataset_rids = []
-    for r in subject_rids[0:4]:
-        d = ml_instance.create_dataset(
-            type=[type_rid.name, "Testing"],
-            description=f"Dataset {r}",
-            version=DatasetVersion(1, 0, 0),
-        )
-        ml_instance.add_dataset_members(d, [r])
-        dataset_rids.append(d)
-
-    nested_datasets = []
-    for i in range(0, 4, 2):
-        nested_dataset = ml_instance.create_dataset(
-            type=[type_rid.name, "Training"],
-            description=f"Nested Dataset {i}",
-            version=DatasetVersion(1, 0, 0),
-        )
-        ml_instance.add_dataset_members(nested_dataset, dataset_rids[i : i + 2])
-        nested_datasets.append(nested_dataset)
+    ml_instance.add_term(
+        MLVocab.workflow_type,
+        "Create Dataset Workflow",
+        description="A Workflow that creates a new dataset.",
+    )
+    dataset_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Create Dataset Workflow"
+    )
 
-    double_nested_dataset = ml_instance.create_dataset(
-        type_rid.name,
-        description="Double nested dataset",
-        version=DatasetVersion(1, 0, 0),
+    dataset_execution = ml_instance.create_execution(
+        ExecutionConfiguration(workflow=dataset_workflow, description="Create Dataset")
     )
-    ml_instance.add_dataset_members(double_nested_dataset, nested_datasets)
+
+    with dataset_execution.execute() as exe:
+        dataset_rids = []
+        for r in subject_rids[0:4]:
+            d = exe.create_dataset(
+                dataset_types=[type_rid.name, "Testing"],
+                description=f"Dataset {r}",
+                version=DatasetVersion(1, 0, 0),
+            )
+            ml_instance.add_dataset_members(d, [r])
+            dataset_rids.append(d)
+
+        nested_datasets = []
+        for i in range(0, 4, 2):
+            nested_dataset = exe.create_dataset(
+                dataset_types=[type_rid.name, "Training"],
+                description=f"Nested Dataset {i}",
+                version=DatasetVersion(1, 0, 0),
+            )
+            exe.add_dataset_members(nested_dataset, dataset_rids[i : i + 2])
+            nested_datasets.append(nested_dataset)
+
+        double_nested_dataset = exe.create_dataset(
+            dataset_types=type_rid.name,
+            description="Double nested dataset",
+            version=DatasetVersion(1, 0, 0),
+        )
+        exe.add_dataset_members(double_nested_dataset, nested_datasets)
     return double_nested_dataset, nested_datasets, dataset_rids
 
 
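The rewritten create_demo_datasets builds its datasets inside an execution context, so each demo dataset is linked to a workflow and execution for provenance. A condensed sketch of that pattern, using only calls that appear in this diff (ml_instance is assumed to be an already-connected DerivaML object):

    from deriva_ml import DatasetVersion, ExecutionConfiguration, MLVocab

    # Register a workflow type and a workflow, then create datasets under an execution.
    ml_instance.add_term(
        MLVocab.workflow_type,
        "Create Dataset Workflow",
        description="A Workflow that creates a new dataset.",
    )
    workflow = ml_instance.create_workflow(
        name="API Workflow", workflow_type="Create Dataset Workflow"
    )
    execution = ml_instance.create_execution(
        ExecutionConfiguration(workflow=workflow, description="Create Dataset")
    )
    with execution.execute() as exe:
        rid = exe.create_dataset(
            dataset_types=["Training"],
            description="Example dataset",
            version=DatasetVersion(1, 0, 0),
        )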
@@ -251,14 +268,13 @@ def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     :return:
     """
 
-    # Make sure that we have a ml schema
     _ = ml_instance.model.schemas["deriva-ml"]
 
     if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
         ml_instance.model.schemas[sname].drop()
 
-    domain_schema = ml_instance.model.model.create_schema(
+    domain_schema = ml_instance.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
@@ -266,6 +282,8 @@ def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     )
     ml_instance.create_asset("Image", referenced_tables=[subject_table])
 
+    catalog_annotation(ml_instance.model)
+
 
 def destroy_demo_catalog(catalog):
     catalog.delete_ermrest_catalog(really=True)
@@ -280,43 +298,47 @@ def create_demo_catalog(
     create_datasets=False,
     on_exit_delete=True,
 ) -> ErmrestCatalog:
-    credentials = get_credential(hostname)
-    server = DerivaServer("https", hostname, credentials=credentials)
+    credential = get_credential(hostname)
+
+    server = DerivaServer("https", hostname, credentials=credential)
     test_catalog = server.create_ermrest_catalog()
+    model = test_catalog.getCatalogModel()
+    model.configure_baseline_catalog()
+    policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")
+    subprocess.run(
+        [
+            "deriva-acl-config",
+            "--host",
+            test_catalog.deriva_server.server,
+            "--config-file",
+            policy_file,
+            test_catalog.catalog_id,
+        ]
+    )
+
     if on_exit_delete:
         atexit.register(destroy_demo_catalog, test_catalog)
-    model = test_catalog.getCatalogModel()
 
     try:
-        create_ml_schema(model, project_name=project_name)
-        deriva_ml = DerivaML(
-            hostname=hostname,
-            catalog_id=test_catalog.catalog_id,
-            project_name=project_name,
-            domain_schema=domain_schema,
-            logging_level=logging.WARN,
-        )
-        create_domain_schema(deriva_ml, domain_schema)
-        working_dir = deriva_ml.working_dir
-        dataset_table = deriva_ml.dataset_table
-        dataset_table.annotations.update(
-            Dataset(
-                deriva_ml.model,
-                cache_dir=deriva_ml.cache_dir,
-                working_dir=deriva_ml.working_dir,
-            )._generate_dataset_annotations()
-        )
-        deriva_ml.model.apply()
-        policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")
-        AclConfig(
-            hostname, test_catalog.catalog_id, policy_file, credentials=credentials
-        )
-        if populate or create_features or create_datasets:
-            populate_demo_catalog(deriva_ml, domain_schema)
-            if create_features:
-                create_demo_features(deriva_ml)
-            if create_datasets:
-                create_demo_datasets(deriva_ml)
+        with TemporaryDirectory() as tmpdir:
+            create_ml_schema(test_catalog, project_name=project_name)
+            deriva_ml = DerivaML(
+                hostname=hostname,
+                catalog_id=test_catalog.catalog_id,
+                project_name=project_name,
+                domain_schema=domain_schema,
+                logging_level=logging.WARN,
+                working_dir=tmpdir,
+                credential=credential,
+            )
+            create_domain_schema(deriva_ml, domain_schema)
+
+            if populate or create_features or create_datasets:
+                populate_demo_catalog(deriva_ml, domain_schema)
+                if create_features:
+                    create_demo_features(deriva_ml)
+                if create_datasets:
+                    create_demo_datasets(deriva_ml)
 
     except Exception:
         # on failure, delete catalog and re-raise exception
@@ -332,6 +354,7 @@ class DemoML(DerivaML):
         catalog_id,
         cache_dir: Optional[str] = None,
         working_dir: Optional[str] = None,
+        use_minid=True,
     ):
         super().__init__(
             hostname=hostname,
@@ -339,5 +362,5 @@ class DemoML(DerivaML):
             project_name="ml-test",
             cache_dir=cache_dir,
             working_dir=working_dir,
-            model_version=version(__name__.split(".")[0]),
+            use_minid=use_minid,
         )
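DemoML no longer derives a model_version from the installed package; instead it forwards a use_minid flag to DerivaML. A hedged usage sketch (the hostname is a placeholder):

    # Spin up a populated demo catalog and connect to it without MINID resolution.
    catalog = create_demo_catalog("demo.example.org", populate=True)
    ml = DemoML("demo.example.org", catalog.catalog_id, use_minid=False)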
@@ -2,12 +2,16 @@
 Shared definitions that are used in different DerivaML modules.
 """
 
+from __future__ import annotations
+
 import warnings
 from datetime import date
 from enum import Enum
-from typing import Any, Iterable, Optional, Annotated
+from pathlib import Path
+from typing import Any, Iterable, Optional, Annotated, Generator
 
 import deriva.core.ermrest_model as em
+import deriva.core.utils.hash_utils as hash_utils
 from urllib.parse import urlparse
 from deriva.core.ermrest_model import builtin_types
 from pydantic import (
@@ -136,11 +140,14 @@ class FileSpec(BaseModel):
     @field_validator("url")
     @classmethod
     def validate_file_url(cls, v):
+        """Examine the provided URL. If it's a local path, convert it into a tag URL."""
         url_parts = urlparse(v)
         if url_parts.scheme == "tag":
+            # Already a tag URL, so just return it.
             return v
-        elif not url_parts.scheme:
-            return f"tag://{gethostname()},{date.today()}:file://{v}"
+        elif (not url_parts.scheme) or url_parts.scheme == "file":
+            # There is no scheme part tof the URL, or it is a file URL, so it is a local file path, so convert to a tag URL.
+            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
         else:
             raise ValidationError("url is not a file URL")
 
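With this change, validate_file_url treats both bare paths and file:// URLs as local files and rewrites them as tag URLs built from the hostname, today's date, and the path. A self-contained sketch of that normalization (output varies by machine and date):

    from datetime import date
    from socket import gethostname
    from urllib.parse import urlparse

    # Both spellings of a local file normalize to the same tag URL.
    for raw in ("/data/img_01.png", "file:///data/img_01.png"):
        parts = urlparse(raw)
        print(f"tag://{gethostname()},{date.today()}:file://{parts.path}")
    # e.g. tag://myhost,2025-06-01:file:///data/img_01.png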
@@ -153,6 +160,38 @@ class FileSpec(BaseModel):
             "Length": self.length,
         }
 
+    @staticmethod
+    def create_filespecs(
+        path: Path | str, description: str
+    ) -> Generator["FileSpec", None, None]:
+        """Given a file or directory, generate the sequence of corresponding FileSpecs sutable to create a File table
+
+        Arguments:
+            path: Path to the file or directory.
+            description: The description of the file(s)
+
+        Returns:
+            An iterable of FileSpecs for each file in the directory.
+        """
+        path = Path(path)
+
+        def list_all_files(p) -> list[Path]:
+            return (
+                (f for f in Path(p).rglob("*") if f.is_file()) if path.is_dir() else [p]
+            )
+
+        def create_spec(p: Path, description: str) -> FileSpec:
+            hashes = hash_utils.compute_file_hashes(p, hashes=["md5", "sha256"])
+            md5 = hashes["md5"][0]
+            return FileSpec(
+                length=path.stat().st_size,
+                md5=md5,
+                description=description,
+                url=p.as_posix(),
+            )
+
+        return (create_spec(file, description) for file in list_all_files(path))
+
 
 class VocabularyTerm(BaseModel):
     """An entry in a vocabulary table.
@@ -162,7 +201,7 @@ class VocabularyTerm(BaseModel):
         synonyms: List of alternative names for the term
         id: CURI identifier for the term
         uri: Unique URI for the term.
-        description: A description of the meaning of the term
+        description: A description of the term meaning
         rid: Resource identifier assigned to the term
 
     Args:
@@ -51,6 +51,7 @@ from .deriva_definitions import (
     FileSpec,
     TableDefinition,
 )
+from .schema_setup.annotations import asset_annotation
 
 try:
     from icecream import ic
@@ -82,9 +83,10 @@ class DerivaML(Dataset):
         project_name: Optional[str] = None,
         cache_dir: Optional[str] = None,
         working_dir: Optional[str] = None,
-        model_version: str = "1",
         ml_schema: str = ML_SCHEMA,
         logging_level=logging.INFO,
+        credential=None,
+        use_minid=True,
     ):
         """Create and initialize a DerivaML instance.
 
@@ -93,13 +95,14 @@ class DerivaML(Dataset):
 
         Args:
             hostname: Hostname of the Deriva server.
-            catalog_id: Catalog ID. Either and identifier, or a catalog name.
-            domain_schema: Schema name for domain specific tables and relationships.
+            catalog_id: Catalog ID. Either an identifier or a catalog name.
+            domain_schema: Schema name for domain-specific tables and relationships.
+            project_name: Project name. Defaults to name of domain schema.
            cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag.
             working_dir: Directory path for storing data used by or generated by any computations.
-            model_version: A string that indicates the version model. Typically passed in via
+            use_minid: Use the MINID serice when downloading dataset bags.
         """
-        self.credential = get_credential(hostname)
+        self.credential = credential or get_credential(hostname)
         server = DerivaServer(
             "https",
             hostname,
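DerivaML now accepts an injected credential (falling back to get_credential when none is given) and a use_minid flag that is threaded through to the Dataset base class. A usage sketch with placeholder host and catalog values:

    from deriva.core import get_credential
    from deriva_ml import DerivaML

    credential = get_credential("deriva.example.org")
    ml = DerivaML(
        hostname="deriva.example.org",
        catalog_id="42",
        credential=credential,  # new in 1.13.3; defaults to get_credential(hostname)
        use_minid=False,        # skip the MINID service when fetching dataset bags
    )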
@@ -119,21 +122,20 @@ class DerivaML(Dataset):
         ) / default_workdir
 
         self.working_dir.mkdir(parents=True, exist_ok=True)
-        self.cache_dir = (
-            Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
-        )
+        self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
 
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
         # Initialize dataset class.
-        super().__init__(self.model, self.cache_dir, self.working_dir)
+        super().__init__(
+            self.model, self.cache_dir, self.working_dir, use_minid=use_minid
+        )
         self._logger = logging.getLogger("deriva_ml")
         self._logger.setLevel(logging_level)
 
         self.host_name = hostname
         self.catalog_id = catalog_id
         self.ml_schema = ml_schema
-        self.version = model_version
         self.configuration = None
         self._execution: Optional[Execution] = None
         self.domain_schema = self.model.domain_schema
@@ -150,11 +152,6 @@ class DerivaML(Dataset):
         deriva_logger = logging.getLogger("deriva")
         deriva_logger.setLevel(logging_level)
 
-        if "dirty" in self.version:
-            logging.info(
-                f"Loading dirty model. Consider commiting and tagging: {self.version}"
-            )
-
     def __del__(self):
         try:
             if self._execution and self._execution.status != Status.completed:
@@ -438,6 +435,8 @@ class DerivaML(Dataset):
             )
         )
         atable.create_reference(self.model.name_to_table("Asset_Role"))
+
+        asset_annotation(asset_table)
         return asset_table
 
     # @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -820,6 +819,8 @@
     ) -> Iterable[RID]:
         """Add a new file to the File table in the catalog.
 
+        The input is an iterator of FileSpec objects which provide the MD5 checksum, length, and URL.
+
         Args:
             file_types: One or more file types. Must be a term from the File_Type controlled vocabulary.
             files: A sequence of file specifications that describe the files to add.
@@ -841,7 +842,6 @@
                 return True
             return False
 
-        # Create the entry for the new dataset_table and get its RID.
         file_types = [file_types] if isinstance(file_types, str) else file_types
         pb = self._model.catalog.getPathBuilder()
         for file_type in file_types:
@@ -868,18 +868,12 @@
 
         if execution_rid:
             # Get the name of the association table between file_table and execution.
-            exec_table = next(
-                self._model.schemas[self._ml_schema]
-                .tables["Execution"]
-                .find_associations()
-            ).name
-            pb.schemas[self._ml_schema].tables[exec_table].insert(
+            pb.schemas[self._ml_schema].File_Execution.insert(
                 [
                     {"File": file_rid, "Execution": execution_rid}
                     for file_rid in file_rids
                 ]
             )
-
         return file_rids
 
     def list_files(
@@ -890,9 +884,10 @@
         file_path = ml_path.File
         type_path = ml_path.File_File_Type
 
-        # Get a list of all the dataset_type values associated with this dataset_table.
-        path = file_path.link(type_path)
-        path = path.attributes(
+        path = file_path.link(
+            type_path, on=file_path.RID == type_path.File, join_type="left"
+        )
+        path = path.File.attributes(
             path.File.RID,
             path.File.URL,
             path.File.MD5,
@@ -902,9 +897,9 @@
         )
         file_map = {}
         for f in path.fetch():
-            file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(
-                f["File_Type"]
-            )
+            entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
+            if ft := f.get("File_Type"):  # assign-and-test in one go
+                entry["File_Types"].append(ft)
 
         # Now get rid of the File_Type key and return the result
         return [(f, f.pop("File_Type"))[0] for f in file_map.values()]
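Because the File/File_Type join is now a left outer join, untyped files come back with a File_Type of None; the rewritten grouping uses the walrus operator so those nulls never reach the File_Types list. A self-contained sketch of the same logic over literal rows:

    # Rows as a left join would return them; "File_Type": None marks an untyped file.
    rows = [
        {"RID": "1-abc", "URL": "u1", "File_Type": "CSV"},
        {"RID": "1-abc", "URL": "u1", "File_Type": "Table"},
        {"RID": "2-def", "URL": "u2", "File_Type": None},
    ]
    file_map = {}
    for f in rows:
        entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
        if ft := f.get("File_Type"):  # skips None and missing values
            entry["File_Types"].append(ft)
    assert file_map["1-abc"]["File_Types"] == ["CSV", "Table"]
    assert file_map["2-def"]["File_Types"] == []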
@@ -974,7 +969,7 @@
     ) -> Workflow:
         """Identify current executing program and return a workflow RID for it
 
-        Determine the notebook or script that is currently being executed. Assume that this is 
+        Determine the notebook or script that is currently being executed. Assume that this is
         being executed from a cloned GitHub repository. Determine the remote repository name for
         this object. Then either retrieve an existing workflow for this executable or create
         a new one.
983
978
  name: The name of the workflow.
984
979
  workflow_type: The type of the workflow.
985
980
  description: The description of the workflow.
981
+
982
+ Returns:
983
+ A workflow object.
986
984
  """
987
985
  # Make sure type is correct.
988
986
  self.lookup_term(MLVocab.workflow_type, workflow_type)
@@ -1001,6 +999,9 @@
         1. The datasets specified in the configuration are downloaded and placed in the cache-dir. If a version is
         not specified in the configuration, then a new minor version number is created for the dataset and downloaded.
 
+        2. If any execution assets are provided in the configuration, they are downloaded and placed in the working directory.
+
+
         Args:
             configuration: ExecutionConfiguration:
             dry_run: Do not create an execution record or upload results.
deriva_ml/deriva_model.py CHANGED
@@ -21,7 +21,7 @@ from .deriva_definitions import (
 
 from collections import Counter
 from pydantic import validate_call, ConfigDict
-from typing import Iterable, Optional
+from typing import Iterable, Optional, Any
 
 
 class DerivaModel:
  class DerivaModel:
@@ -61,7 +61,7 @@ class DerivaModel:
61
61
  self.schemas = self.model.schemas
62
62
 
63
63
  self.ml_schema = ml_schema
64
- builtin_schemas = ["public", self.ml_schema, "www"]
64
+ builtin_schemas = ["public", self.ml_schema, "www", "WWW"]
65
65
  try:
66
66
  self.domain_schema = (
67
67
  domain_schema
@@ -73,6 +73,11 @@ class DerivaModel:
             # No domain schema defined.
             self.domain_schema = domain_schema
 
+    @property
+    def chaise_config(self) -> dict[str, Any]:
+        """Return the chaise configuration."""
+        return self.model.chaise_config
+
     def __getattr__(self, name):
         # Called only if `name` is not found in Manager. Delegate attributes to model class.
         return getattr(self.model, name)
@@ -115,7 +120,12 @@
         return vocab_columns.issubset({c.name.upper() for c in table.columns})
 
     def is_association(
-        self, table_name: str | Table, unqualified: bool = True, pure: bool = True
+        self,
+        table_name: str | Table,
+        unqualified: bool = True,
+        pure: bool = True,
+        min_arity: int = 2,
+        max_arity: int = 2,
     ) -> bool | set | int:
         """Check the specified table to see if it is an association table.
 
@@ -130,7 +140,9 @@
 
         """
         table = self.name_to_table(table_name)
-        return table.is_association(unqualified=unqualified, pure=pure)
+        return table.is_association(
+            unqualified=unqualified, pure=pure, min_arity=min_arity, max_arity=max_arity
+        )
 
     def find_association(self, table1: Table | str, table2: Table | str) -> Table:
         """Given two tables, return an association table that connects the two.
@@ -302,7 +314,7 @@
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 
-        Walk a schema graph and return a list all the paths through the graph. 
+        Walk a schema graph and return a list all the paths through the graph.
 
         Args:
             path: Source path so far