deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/dataset/history.py
CHANGED

@@ -1,15 +1,25 @@
 import base64
 import struct
 from datetime import datetime
+from typing import Any

 from dateutil.parser import isoparse
 from deriva.core import urlquote
+from deriva.core.deriva_server import DerivaServer


 # -- ==============================================================================================
-def get_record_history(
+def get_record_history(
+    server: DerivaServer,
+    cid: str | int,
+    sname: str,
+    tname: str,
+    kvals: list[str],
+    kcols: list[str] | None = None,
+    snap: str | None = None,
+) -> dict[str, dict[str, Any]]:
     """Get the history of a record from the catalog.
-
+
     Args:
         server: The server instance.
         cid: The catalog ID.
@@ -18,13 +28,16 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=Non
         kvals: The key values to look up.
         kcols: The key columns. Defaults to ["RID"].
         snap: Optional snapshot ID.
-
+
     Returns:
         The history data for the record.
-
+
     Raises:
         ValueError: If more than one row is returned.
     """
+    if kcols is None:
+        kcols = ["RID"]
+
     parts = {
         "cid": urlquote(cid),
         "sname": urlquote(sname),
@@ -46,7 +59,7 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=Non
     path = "/ermrest/catalog/%(cid)s@%(snap)s/entity/%(sname)s:%(tname)s/%(filter)s"

     rows_found = []
-    snap2rows = {}
+    snap2rows: dict[str, dict[str, Any]] = {}
     while True:
         url = path % parts
         # sys.stderr.write("%s\n" % url)
@@ -67,12 +80,12 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=Non


 # -- --------------------------------------------------------------------------------------
-def datetime_epoch_us(dt):
+def datetime_epoch_us(dt: datetime) -> int:
     """Convert datetime to epoch microseconds.
-
+
     Args:
         dt: The datetime object to convert.
-
+
     Returns:
         The epoch time in microseconds.
     """
@@ -84,12 +97,12 @@ def datetime_epoch_us(dt):
     #


-def iso_to_snap(iso_datetime):
+def iso_to_snap(iso_datetime: str) -> int:
     """Convert ISO datetime string to snapshot format.
-
+
     Args:
         iso_datetime: The ISO datetime string.
-
+
     Returns:
         The snapshot timestamp.
     """
@@ -97,12 +110,12 @@ def iso_to_snap(iso_datetime):


 # -- --------------------------------------------------------------------------------------
-def urlb32_encode(i):
+def urlb32_encode(i: int) -> str:
     """Encode an integer to URL-safe base32.
-
+
     Args:
         i: The integer to encode.
-
+
     Returns:
         The URL-safe base32 encoded string.
     """
deriva_ml/dataset/upload.py
CHANGED

@@ -39,20 +39,32 @@ import json
 import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, Optional
+from typing import Any, Callable, Optional

 import regex as re
-
-
-
-
-
+
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
+_deriva_core = importlib.import_module("deriva.core")
+_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+_hatrac_store = importlib.import_module("deriva.core.hatrac_store")
+_hash_utils = importlib.import_module("deriva.core.utils.hash_utils")
+_mime_utils = importlib.import_module("deriva.core.utils.mime_utils")
+_deriva_upload = importlib.import_module("deriva.transfer.upload.deriva_upload")
+
+urlquote = _deriva_core.urlquote
+Table = _ermrest_model.Table
+HatracStore = _hatrac_store.HatracStore
+hash_utils = _hash_utils
+mime_utils = _mime_utils
+GenericUploader = _deriva_upload.GenericUploader
 from pydantic import ConfigDict, validate_call

 from deriva_ml.core.definitions import (
     RID,
     DerivaSystemColumns,
     FileUploadState,
+    UploadProgress,
     UploadState,
 )
 from deriva_ml.core.exceptions import DerivaMLException
@@ -89,7 +101,7 @@ def is_feature_dir(path: Path) -> Optional[re.Match]:
     return re.match(feature_table_dir_regex + "$", path.as_posix())


-def normalize_asset_dir(path: str) -> Optional[tuple[str, str]]:
+def normalize_asset_dir(path: str | Path) -> Optional[tuple[str, str]]:
     """Parse a path to an asset file and return the asset table name and file name.

     Args:
@@ -177,12 +189,16 @@ def table_path(prefix: Path | str, schema: str, table: str) -> Path:
     return path / f"{table}.csv"


-def asset_table_upload_spec(
+def asset_table_upload_spec(
+    model: DerivaModel, asset_table: str | Table, chunk_size: int | None = None
+):
     """Generate upload specification for an asset table.

     Args:
         model: The DerivaModel instance.
         asset_table: The asset table name or Table object.
+        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
+            large files will be uploaded in chunks of this size.

     Returns:
         A dictionary containing the upload specification for the asset table.
@@ -197,6 +213,11 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name

+    # Build hatrac_options with optional chunk_size
+    hatrac_options = {"versioned_urls": True}
+    if chunk_size is not None:
+        hatrac_options["chunk_size"] = chunk_size
+
     # Create upload specification
     spec = {
         # Upload assets into an asset table of an asset table.
@@ -211,7 +232,7 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
         "asset_type": "file",
         "target_table": [schema, asset_table.name],
         "checksum_types": ["sha256", "md5"],
-        "hatrac_options":
+        "hatrac_options": hatrac_options,
         "hatrac_templates": {
             "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
             "content-disposition": "filename*=UTF-8''{file_name}",
@@ -221,14 +242,27 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     return spec


-def bulk_upload_configuration(
+def bulk_upload_configuration(
+    model: DerivaModel, chunk_size: int | None = None
+) -> dict[str, Any]:
     """Return an upload specification for deriva-ml
-
-
+
+    Args:
+        model: Model from which to generate the upload configuration.
+        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
+            large files will be uploaded in chunks of this size.
     """
     asset_tables_with_metadata = [
-        asset_table_upload_spec(model=model, asset_table=t
+        asset_table_upload_spec(model=model, asset_table=t, chunk_size=chunk_size)
+        for t in model.find_assets()
+        if model.asset_metadata(t)
     ]
+
+    # Build hatrac_options with optional chunk_size for non-metadata assets
+    hatrac_options = {"versioned_urls": True}
+    if chunk_size is not None:
+        hatrac_options["chunk_size"] = chunk_size
+
     return {
         "asset_mappings": asset_tables_with_metadata
         + [
@@ -244,7 +278,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
             "target_table": ["{schema}", "{asset_table}"],
             "file_pattern": asset_path_regex + "/" + asset_file_regex,  # Sets schema, asset_table, name, ext
             "checksum_types": ["sha256", "md5"],
-            "hatrac_options":
+            "hatrac_options": hatrac_options,
             "hatrac_templates": {
                 "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
                 "content-disposition": "filename*=UTF-8''{file_name}",
@@ -273,14 +307,42 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
     }


+# Default timeout for large file uploads in seconds
+# The requests timeout tuple is (connect_timeout, read_timeout), but this doesn't
+# cover write operations. We also need to set socket.setdefaulttimeout() for writes.
+DEFAULT_UPLOAD_TIMEOUT = (6, 600)
+
+# Socket timeout for write operations (in seconds)
+# This is needed because requests timeout only covers connect and read, not write.
+# For large chunk uploads, the socket write can take significant time.
+DEFAULT_SOCKET_TIMEOUT = 600.0
+
+
 @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-def upload_directory(
+def upload_directory(
+    model: DerivaModel,
+    directory: Path | str,
+    progress_callback: Callable[[UploadProgress], None] | None = None,
+    max_retries: int = 3,
+    retry_delay: float = 5.0,
+    timeout: tuple[int, int] | None = None,
+    chunk_size: int | None = None,
+) -> dict[Any, FileUploadState] | None:
     """Upload assets from a directory. This routine assumes that the current upload specification includes a
     configuration for the specified directory. Every asset in the specified directory is uploaded

     Args:
         model: Model to upload assets to.
         directory: Directory containing the assets and tables to upload.
+        progress_callback: Optional callback function to receive upload progress updates.
+            Called with UploadProgress objects containing file information and progress.
+        max_retries: Maximum number of retry attempts for failed uploads (default: 3).
+        retry_delay: Initial delay in seconds between retries, doubles with each attempt (default: 5.0).
+        timeout: Tuple of (connect_timeout, read_timeout) in seconds. Default is (6, 600)
+            which allows up to 10 minutes for each chunk upload. Increase read_timeout for
+            very large files on slow connections.
+        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
+            large files will be uploaded in chunks of this size.

     Returns:
         Results of the upload operation.
@@ -288,37 +350,162 @@ def upload_directory(model: DerivaModel, directory: Path | str) -> dict[Any, Fil
     Raises:
         DerivaMLException: If there is an issue with uploading the assets.
     """
+    import logging
+    import time
+
+    from deriva.core import DEFAULT_SESSION_CONFIG
+
+    logger = logging.getLogger("deriva_ml")
+
     directory = Path(directory)
     if not directory.is_dir():
         raise DerivaMLException("Directory does not exist")

+    # Track upload progress across files
+    # status_callback is called twice per file: once before upload starts, once after it completes
+    upload_state = {"completed_files": 0, "total_files": 0, "status_calls": 0}
+
+    # Count total files to upload
+    for root, dirs, files in os.walk(directory):
+        upload_state["total_files"] += len(files)
+
+    # Create wrapper callbacks for GenericUploader if a progress callback was provided
+    def file_callback(**kwargs) -> bool:
+        """Callback for per-chunk progress updates from GenericUploader.
+
+        The deriva GenericUploader passes kwargs with: completed, total, file_path, host, job_info.
+        Note: This callback is only invoked for large files (> 25MB) that use chunked uploads.
+        Small files are uploaded in a single request and this callback won't be called.
+        """
+        if progress_callback is not None:
+            file_path = kwargs.get("file_path", "")
+            completed_chunks = kwargs.get("completed", 0)
+            total_chunks = kwargs.get("total", 0)
+
+            progress = UploadProgress(
+                file_path=file_path,
+                file_name=Path(file_path).name if file_path else "",
+                bytes_completed=completed_chunks,
+                bytes_total=total_chunks,
+                percent_complete=(completed_chunks / total_chunks * 100) if total_chunks > 0 else 0,
+                phase="uploading_chunks",
+                message=f"Uploading large file: chunk {completed_chunks} of {total_chunks}",
+            )
+            progress_callback(progress)
+        return True  # Continue upload
+
+    def status_callback() -> None:
+        """Callback for per-file status updates from GenericUploader.
+
+        GenericUploader calls this twice per file: once before upload starts (odd calls)
+        and once after upload completes (even calls). We use even calls to track completed files.
+        """
+        if progress_callback is not None:
+            upload_state["status_calls"] += 1
+
+            # Even calls indicate file completion (after upload)
+            if upload_state["status_calls"] % 2 == 0:
+                upload_state["completed_files"] += 1
+
+            # Report progress with current file count
+            current_file = (upload_state["status_calls"] + 1) // 2  # 1-indexed current file
+            progress = UploadProgress(
+                phase="uploading",
+                message=f"Uploading file {current_file} of {upload_state['total_files']}",
+                percent_complete=(upload_state["completed_files"] / upload_state["total_files"] * 100)
+                if upload_state["total_files"] > 0
+                else 0,
+            )
+            progress_callback(progress)
+
+    def do_upload(uploader) -> dict[str, dict]:
+        """Perform the upload and return raw results."""
+        uploader.getUpdatedConfig()
+        uploader.scanDirectory(directory, purge_state=True)
+        return uploader.uploadFiles(
+            file_callback=file_callback if progress_callback else None,
+            status_callback=status_callback if progress_callback else None,
+        )
+
+    # Use provided timeout or default
+    upload_timeout = timeout if timeout is not None else DEFAULT_UPLOAD_TIMEOUT
+
     # Now upload the files by creating an upload spec and then calling the uploader.
     with TemporaryDirectory() as temp_dir:
         spec_file = Path(temp_dir) / "config.json"
         with spec_file.open("w+") as cfile:
-            json.dump(bulk_upload_configuration(model), cfile)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            json.dump(bulk_upload_configuration(model, chunk_size=chunk_size), cfile)
+
+        # Create session config with longer timeout for large file uploads
+        session_config = DEFAULT_SESSION_CONFIG.copy()
+        session_config["timeout"] = upload_timeout
+        logger.debug(f"Upload session config timeout: {session_config['timeout']}")
+
+        all_results = {}
+        attempt = 0
+        current_delay = retry_delay
+
+        while attempt <= max_retries:
+            uploader = GenericUploader(
+                server={
+                    "host": model.hostname,
+                    "protocol": "https",
+                    "catalog_id": model.catalog.catalog_id,
+                    "session": session_config,
+                },
+                config_file=spec_file,
+            )
+            try:
+                raw_results = do_upload(uploader)
+
+                # Process results and check for failures
+                failed_files = []
+                for path, result in raw_results.items():
+                    state = UploadState(result["State"])
+                    if state == UploadState.failed or result["Result"] is None:
+                        failed_files.append((path, result["Status"]))
+                    else:
+                        # Store successful results
+                        all_results[path] = FileUploadState(
+                            state=state,
+                            status=result["Status"],
+                            result=result["Result"],
+                        )
+
+                if not failed_files:
+                    # All uploads successful
+                    break
+
+                attempt += 1
+                if attempt > max_retries:
+                    # Final attempt failed, raise error with details
+                    error_details = "; ".join([f"{path}: {msg}" for path, msg in failed_files])
+                    raise DerivaMLException(
+                        f"Failed to upload {len(failed_files)} file(s) after {max_retries} retries: {error_details}"
+                    )
+
+                # Log retry attempt and wait before retrying
+                logger.warning(
+                    f"Upload failed for {len(failed_files)} file(s), retrying in {current_delay:.1f}s "
+                    f"(attempt {attempt}/{max_retries}): {[p for p, _ in failed_files]}"
                 )
-
-
-
-
-
+                if progress_callback:
+                    progress_callback(UploadProgress(
+                        phase="retrying",
+                        message=f"Retrying {len(failed_files)} failed upload(s) in {current_delay:.1f}s (attempt {attempt}/{max_retries})",
+                        percent_complete=0,
+                    ))
+
+                time.sleep(current_delay)
+                current_delay *= 2  # Exponential backoff
+
+                # Reset upload state for retry
+                upload_state["status_calls"] = 0
+
+            finally:
+                uploader.cleanup()
+
+        return all_results


 @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -347,7 +534,7 @@ def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: A
         server=model.catalog.deriva_server.server,
         credentials=model.catalog.deriva_server.credentials,
     )
-    md5_hashes = hash_utils.compute_file_hashes(file, ["md5"])["md5"]
+    md5_hashes = hash_utils.compute_file_hashes(file, frozenset(["md5"]))["md5"]
     sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
     hatrac_path = f"{hatrac_path}{sanitized_filename}"
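
The reworked upload_directory() above adds progress reporting, retry with exponential backoff, and configurable timeouts and hatrac chunk sizes. A hedged usage sketch follows; the model, staging directory, and parameter values are illustrative and not part of this diff.

# Hypothetical example of driving the new upload_directory() options.
# `model` is assumed to be an existing deriva_ml DerivaModel instance; how it is
# constructed is outside the scope of this diff.
from pathlib import Path

from deriva_ml.core.definitions import UploadProgress
from deriva_ml.dataset.upload import upload_directory


def report_progress(p: UploadProgress) -> None:
    # phase is "uploading", "uploading_chunks", or "retrying", per the callbacks above
    print(f"[{p.phase}] {p.message} ({p.percent_complete:.0f}%)")


def upload_staging_dir(model, staging_dir: str | Path) -> None:
    results = upload_directory(
        model=model,
        directory=Path(staging_dir),
        progress_callback=report_progress,
        max_retries=3,                # up to 3 retry rounds
        retry_delay=5.0,              # 5s, then 10s, then 20s between rounds
        timeout=(6, 1200),            # allow 20 minutes per chunk read on slow links
        chunk_size=25 * 1024 * 1024,  # upload large files in 25 MB hatrac chunks
    )
    for path, state in (results or {}).items():
        print(path, state.state, state.status)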