deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
- deriva_ml-1.14.27.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File-related utility functions for DerivaML.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from datetime import date
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from socket import gethostname
|
|
11
|
+
from typing import Callable, Generator
|
|
12
|
+
from urllib.parse import urlparse
|
|
13
|
+
|
|
14
|
+
import deriva.core.utils.hash_utils as hash_utils
|
|
15
|
+
from pydantic import BaseModel, Field, conlist, field_validator, validate_call
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FileSpec(BaseModel):
    """An entry into the File table.

    Attributes:
        url: The URL of the file. Local paths are converted into tag URLs by the validator.
        description: The description of the file.
        md5: The MD5 hash of the file contents.
        length: The length of the file in bytes.
        file_types: A list of file types. Each file_type should be a defined term in the
            MLVocab.file_type vocabulary.
    """

    url: str = Field(alias="URL", validation_alias="url")
    md5: str = Field(alias="MD5", validation_alias="md5")
    length: int = Field(alias="Length", validation_alias="length")
    description: str | None = Field(default="", alias="Description", validation_alias="description")
    file_types: conlist(str) | None = []

    @field_validator("url")
    @classmethod
    def validate_file_url(cls, url: str) -> str:
        """Examine the provided URL. If it's a local path, convert it into a tag URL.

        Args:
            url: The URL to validate and potentially convert.

        Returns:
            The validated/converted URL.

        Raises:
            ValueError: If the URL is neither a tag URL, a file URL, nor a bare path.
        """
        url_parts = urlparse(url)
        if url_parts.scheme == "tag":
            # Already a tag URL, so just return it.
            return url
        elif (not url_parts.scheme) or url_parts.scheme == "file":
            # No scheme, or a file URL: treat as a local file path and convert to a
            # tag URL stamped with the current host name and date.
            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
        else:
            raise ValueError("url is not a file URL")

    @classmethod
    def create_filespecs(
        cls, path: Path | str, description: str, file_types: list[str] | Callable[[Path], list[str]] | None = None
    ) -> Generator[FileSpec, None, None]:
        """Given a file or directory, generate the sequence of corresponding FileSpecs suitable to create a File table.

        Args:
            path: Path to the file or directory.
            description: The description of the file(s).
            file_types: A list of file types, or a function that takes a file path and
                returns a list of file types.

        Returns:
            A generator of FileSpecs, one for each file under `path`.
        """
        path = Path(path)
        file_types = file_types or []
        # Normalize to a callable so static lists and per-file callbacks share one code path.
        file_types_fn = file_types if callable(file_types) else lambda _x: file_types

        def create_spec(file_path: Path) -> FileSpec:
            # Build the FileSpec for a single concrete file.
            hashes = hash_utils.compute_file_hashes(file_path, hashes=frozenset(["md5", "sha256"]))
            md5 = hashes["md5"][0]
            type_list = file_types_fn(file_path)
            return FileSpec(
                # BUG FIX: stat the individual file, not the (possibly directory) `path`
                # captured from the enclosing scope.
                length=file_path.stat().st_size,
                md5=md5,
                description=description,
                url=file_path.as_posix(),
                # Ensure the generic "File" type is always present exactly once.
                file_types=type_list if "File" in type_list else ["File"] + type_list,
            )

        files = [path] if path.is_file() else [f for f in path.rglob("*") if f.is_file()]
        return (create_spec(file) for file in files)

    @staticmethod
    def read_filespec(path: Path | str) -> Generator[FileSpec, None, None]:
        """Get FileSpecs from a JSON lines file.

        Args:
            path: Path to the .jsonl file (string or Path).

        Yields:
            A FileSpec object for each non-blank line in the file.
        """
        path = Path(path)
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines rather than failing JSON decoding.
                    continue
                yield FileSpec(**json.loads(line))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Workaround: pydantic's @validate_call cannot be applied directly inside the class
# body because the method's signature forward-references FileSpec itself, so the
# wrapping is deferred until after the class is fully defined.
_raw = FileSpec.create_filespecs.__func__
# Wrap the underlying plain function with validate_call, then re-bind it to the
# class as a classmethod so the public API is unchanged.
FileSpec.create_filespecs = classmethod(validate_call(_raw))
|
|
@@ -2,21 +2,22 @@
|
|
|
2
2
|
This module defines the DataSet class which is used to manipulate n
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from .deriva_definitions import RID
|
|
6
|
-
|
|
7
5
|
from enum import Enum
|
|
6
|
+
from typing import Any, Optional, SupportsInt
|
|
7
|
+
|
|
8
8
|
from pydantic import (
|
|
9
9
|
BaseModel,
|
|
10
10
|
ConfigDict,
|
|
11
|
-
field_validator,
|
|
12
11
|
Field,
|
|
13
12
|
computed_field,
|
|
14
|
-
|
|
13
|
+
conlist,
|
|
15
14
|
field_serializer,
|
|
15
|
+
field_validator,
|
|
16
|
+
model_validator,
|
|
16
17
|
)
|
|
17
|
-
|
|
18
18
|
from semver import Version
|
|
19
|
-
|
|
19
|
+
|
|
20
|
+
from deriva_ml.core.definitions import RID
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class VersionPart(Enum):
|
|
@@ -41,9 +42,7 @@ class DatasetVersion(Version):
|
|
|
41
42
|
replace(major, minor, patch): Replace the major and minor versions
|
|
42
43
|
"""
|
|
43
44
|
|
|
44
|
-
def __init__(
|
|
45
|
-
self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0
|
|
46
|
-
):
|
|
45
|
+
def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
|
|
47
46
|
"""Initialize a DatasetVersion object.
|
|
48
47
|
|
|
49
48
|
Args:
|
|
@@ -139,7 +138,8 @@ class DatasetMinid(BaseModel):
|
|
|
139
138
|
@computed_field
|
|
140
139
|
@property
|
|
141
140
|
def dataset_rid(self) -> str:
|
|
142
|
-
|
|
141
|
+
rid_parts = self.version_rid.split("@")
|
|
142
|
+
return rid_parts[0]
|
|
143
143
|
|
|
144
144
|
@computed_field
|
|
145
145
|
@property
|
|
@@ -177,13 +177,13 @@ class DatasetSpec(BaseModel):
|
|
|
177
177
|
|
|
178
178
|
Attributes:
|
|
179
179
|
rid (RID): A dataset_table RID
|
|
180
|
-
materialize (bool): If False
|
|
180
|
+
materialize (bool): If False do not materialize datasets, only download table data, no assets. Defaults to True
|
|
181
181
|
version (DatasetVersion): The version of the dataset. Should follow semantic versioning.
|
|
182
182
|
"""
|
|
183
183
|
|
|
184
184
|
rid: RID
|
|
185
185
|
materialize: bool = True
|
|
186
|
-
version: DatasetVersion
|
|
186
|
+
version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
|
|
187
187
|
|
|
188
188
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
189
189
|
|
|
@@ -192,6 +192,10 @@ class DatasetSpec(BaseModel):
|
|
|
192
192
|
def version_field_validator(cls, v: Any) -> Any:
|
|
193
193
|
if isinstance(v, dict):
|
|
194
194
|
return DatasetVersion(**v)
|
|
195
|
+
elif isinstance(v, str):
|
|
196
|
+
return DatasetVersion.parse(v)
|
|
197
|
+
elif (isinstance(v, list) or isinstance(v, tuple)) and len(v) == 3:
|
|
198
|
+
return DatasetVersion(int(v[0]), int(v[1]), int(v[2]))
|
|
195
199
|
else:
|
|
196
200
|
return v
|
|
197
201
|
|