PyPI - arkindex-base-worker - Versions diffs - 0.5.2a1__tar.gz → 0.5.2b1__tar.gz - Mend

arkindex-base-worker 0.5.2a1__tar.gz → 0.5.2b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.2a1
+Version: 0.5.2b1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -23,8 +23,9 @@ Requires-Dist: humanize==4.15.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==11.3.0
 Requires-Dist: python-gnupg==0.5.6
+Requires-Dist: python-magic==0.4.27
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.12
+Requires-Dist: teklia-toolbox==0.1.13
 Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
 Requires-Dist: pytest-mock==3.15.1; extra == "tests"

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_base_worker.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.2a1
+Version: 0.5.2b1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -23,8 +23,9 @@ Requires-Dist: humanize==4.15.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==11.3.0
 Requires-Dist: python-gnupg==0.5.6
+Requires-Dist: python-magic==0.4.27
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.12
+Requires-Dist: teklia-toolbox==0.1.13
 Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
 Requires-Dist: pytest-mock==3.15.1; extra == "tests"

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_base_worker.egg-info/requires.txt RENAMED Viewed

@@ -2,8 +2,9 @@ humanize==4.15.0
 peewee~=3.17
 Pillow==11.3.0
 python-gnupg==0.5.6
+python-magic==0.4.27
 shapely==2.0.6
-teklia-toolbox==0.1.12
+teklia-toolbox==0.1.13
 zstandard==0.25.0
 [tests]

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_worker/image.py RENAMED Viewed

@@ -38,7 +38,7 @@ if TYPE_CHECKING:
     from arkindex_worker.models import Element
 # See http://docs.python-requests.org/en/master/user/advanced/#timeouts
-DOWNLOAD_TIMEOUT = (30, 60)
+REQUEST_TIMEOUT = (30, 60)
 BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])
@@ -346,7 +346,7 @@ def _retried_request(url, *args, method=requests.get, **kwargs):
         url,
         *args,
         headers={"User-Agent": IIIF_USER_AGENT},
-        timeout=DOWNLOAD_TIMEOUT,
+        timeout=REQUEST_TIMEOUT,
         verify=should_verify_cert(url),
         **kwargs,
     )

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_worker/utils.py RENAMED Viewed

@@ -163,12 +163,12 @@ def zstd_compress(
 def create_tar_archive(
     path: Path, destination: Path | None = None
-) -> tuple[int | None, Path, str]:
+) -> tuple[int | None, Path]:
     """Create a tar archive using the content at specified location.
     :param path: Path to the file to archive
     :param destination: Optional path for the created TAR archive. A tempfile will be created if this is omitted.
-    :return: The file descriptor (if one was created) and path to the TAR archive, hash of its content.
+    :return: The file descriptor (if one was created) and path to the TAR archive.
     """
     # Parse destination and create a tmpfile if none was specified
     file_d, destination = (
@@ -204,26 +204,26 @@ def create_tar_archive(
         with file_path.open("rb") as file_data:
             for chunk in iter(lambda: file_data.read(CHUNK_SIZE), b""):
                 content_hasher.update(chunk)
-    return file_d, destination, content_hasher.hexdigest()
+    return file_d, destination
 def create_tar_zst_archive(
     source: Path, destination: Path | None = None
-) -> tuple[int | None, Path, str, str]:
+) -> tuple[int | None, Path, str]:
     """Helper to create a TAR+ZST archive from a source folder.
     :param source: Path to the folder whose content should be archived.
     :param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
-    :return: The file descriptor of the created tempfile (if one was created), path to the archive, its hash and the hash of the tar archive's content.
+    :return: The file descriptor of the created tempfile (if one was created), path to the archive and its hash.
     """
     # Create tar archive
-    tar_fd, tar_archive, tar_hash = create_tar_archive(source)
+    tar_fd, tar_archive = create_tar_archive(source)
     zst_fd, zst_archive, zst_hash = zstd_compress(tar_archive, destination)
     close_delete_file(tar_fd, tar_archive)
-    return zst_fd, zst_archive, zst_hash, tar_hash
+    return zst_fd, zst_archive, zst_hash
 def create_zip_archive(source: Path, destination: Path | None = None) -> Path:

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_worker/worker/__init__.py RENAMED Viewed

@@ -424,12 +424,13 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         failed = 0
         for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-                assert dataset_set.dataset.state == DatasetState.Complete.value, (
-                    "When processing a set, its dataset state should be Complete."
-                )
-                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
-                self.download_dataset_artifact(dataset_set.dataset)
+                if dataset_set.dataset.state == DatasetState.Complete.value:
+                    logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                    self.download_dataset_artifact(dataset_set.dataset)
+                else:
+                    logger.warning(
+                        f"The dataset {dataset_set.dataset} has its state set to `{dataset_set.dataset.state}`, its archive will not be downloaded"
+                    )
                 logger.info(f"Processing {dataset_set} ({i}/{count})")
                 self.process_set(dataset_set)
@@ -444,7 +445,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
                 logger.warning(message, exc_info=e if self.args.verbose else None)
-        # Cleanup the latest downloaded dataset artifact
+        # Cleanup the latest downloaded dataset artifact (if needed)
         self.cleanup_downloaded_artifact()
         message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_worker/worker/base.py RENAMED Viewed

@@ -15,7 +15,7 @@ import gnupg
 import yaml
 from arkindex import options_from_env
-from arkindex.exceptions import ClientError, ErrorResponse
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -261,6 +261,10 @@ class BaseWorker:
         logger.info(f"Loaded {worker_run['summary']} from API")
+        # The `RetrieveSecret` endpoint is only available in Arkindex EE.
+        # In CE, the values of `secret` fields should be used directly without calling `RetrieveSecret`.
+        can_retrieve_secret = "RetrieveSecret" in self.api_client.document.links
         def _process_config_item(item: dict) -> tuple[str, Any]:
             if not item["secret"]:
                 return (item["key"], item["value"])
@@ -270,16 +274,12 @@ class BaseWorker:
                 logger.info(f"Optional secret `{item['key']}` is not set")
                 return (item["key"], None)
-            # Load secret, only available in Arkindex EE
-            try:
-                secret = self.load_secret(Path(item["value"]))
-            except ClientError as e:
-                logger.error(
-                    f"Failed to retrieve the secret {item['value']}, probably an Arkindex Community Edition: {e}"
-                )
-                return (item["key"], None)
+            value = item["value"]
+            # Load secret when `RetrieveSecret` is available
+            if can_retrieve_secret:
+                value = self.load_secret(Path(item["value"]))
-            return (item["key"], secret)
+            return (item["key"], value)
         # Load model version configuration when available
         # Workers will use model version ID and details to download the model

arkindex_base_worker-0.5.2b1/arkindex_worker/worker/task.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""
+BaseWorker methods for tasks.
+"""
+import uuid
+from collections.abc import Iterator
+from http.client import REQUEST_TIMEOUT
+from pathlib import Path
+import magic
+import requests
+from arkindex.compat import DownloadedFile
+from arkindex_worker import logger
+from arkindex_worker.models import Artifact
+from teklia_toolbox.requests import should_verify_cert
+class TaskMixin:
+    def list_artifacts(self, task_id: uuid.UUID) -> Iterator[Artifact]:
+        """
+        List artifacts associated to a task.
+        :param task_id: Task ID to find artifacts from.
+        :returns: An iterator of ``Artifact`` objects built from the ``ListArtifacts`` API endpoint.
+        """
+        assert task_id and isinstance(task_id, uuid.UUID), (
+            "task_id shouldn't be null and should be an UUID"
+        )
+        results = self.api_client.request("ListArtifacts", id=task_id)
+        return map(Artifact, results)
+    def download_artifact(
+        self, task_id: uuid.UUID, artifact: Artifact
+    ) -> DownloadedFile:
+        """
+        Download an artifact content.
+        :param task_id: Task ID the Artifact is from.
+        :param artifact: Artifact to download content from.
+        :returns: A temporary file containing the ``Artifact`` downloaded from the ``DownloadArtifact`` API endpoint.
+        """
+        assert task_id and isinstance(task_id, uuid.UUID), (
+            "task_id shouldn't be null and should be an UUID"
+        )
+        assert artifact and isinstance(artifact, Artifact), (
+            "artifact shouldn't be null and should be an Artifact"
+        )
+        return self.api_client.request(
+            "DownloadArtifact", id=task_id, path=artifact.path
+        )
+    def upload_artifact(self, path: Path) -> None:
+        """
+        Upload a single file as an Artifact of the current task.
+        :param path: Path of the single file to upload as an Artifact.
+        """
+        assert path and isinstance(path, Path) and path.exists(), (
+            "path shouldn't be null, should be a Path and should exist"
+        )
+        if self.is_read_only:
+            logger.warning("Cannot upload artifact as this worker is in read-only mode")
+            return
+        # Get path relative to task's data directory
+        relpath = str(path.relative_to(self.work_dir))
+        # Get file size
+        size = path.stat().st_size
+        # Detect content type
+        try:
+            content_type = magic.from_file(path, mime=True)
+        except Exception as e:
+            logger.warning(f"Failed to get a mime type for {path}: {e}")
+            content_type = "application/octet-stream"
+        # Create artifact on API to get an S3 url
+        artifact = self.api_client.request(
+            "CreateArtifact",
+            id=self.task_id,
+            body={"path": relpath, "content_type": content_type, "size": size},
+        )
+        # Upload the file content to S3
+        s3_put_url = artifact["s3_put_url"]
+        with path.open("rb") as content:
+            resp = requests.put(
+                s3_put_url,
+                data=content,
+                headers={"Content-Type": content_type},
+                timeout=REQUEST_TIMEOUT,
+                verify=should_verify_cert(s3_put_url),
+            )
+            resp.raise_for_status()

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/arkindex_worker/worker/training.py RENAMED Viewed

@@ -3,16 +3,15 @@ BaseWorker methods for training.
 """
 import functools
+from collections.abc import Generator
 from contextlib import contextmanager
 from pathlib import Path
 from typing import NewType
 from uuid import UUID
-import requests
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.utils import close_delete_file, create_tar_zst_archive
+from teklia_toolbox.uploads import MultipartUpload
 DirPath = NewType("DirPath", Path)
 """Path to a directory"""
@@ -25,23 +24,21 @@ FileSize = NewType("FileSize", int)
 @contextmanager
-def create_archive(path: DirPath) -> tuple[Path, Hash, FileSize, Hash]:
+def create_archive(path: DirPath) -> Generator[tuple[Path, FileSize, Hash]]:
     """
     Create a tar archive from the files at the given location then compress it to a zst archive.
-    Yield its location, its hash, its size and its content's hash.
+    Yield its location, its size and its hash.
     :param path: Create a compressed tar archive from the files
-    :returns: The location of the created archive, its hash, its size and its content's hash
+    :returns: The location of the created archive, its size and its hash
     """
     assert path.is_dir(), "create_archive needs a directory"
-    zst_descriptor, zst_archive, archive_hash, content_hash = create_tar_zst_archive(
-        path
-    )
+    zst_descriptor, zst_archive, archive_hash = create_tar_zst_archive(path)
     # Get content hash, archive size and hash
-    yield zst_archive, content_hash, zst_archive.stat().st_size, archive_hash
+    yield zst_archive, zst_archive.stat().st_size, archive_hash
     # Remove the zst archive
     close_delete_file(zst_descriptor, zst_archive)
@@ -112,62 +109,48 @@ class TrainingMixin:
         """
         configuration = configuration or {}
-        if not self.model_version:
-            self.create_model_version(
-                model_id=model_id,
-                tag=tag,
-                description=description,
-                configuration=configuration,
-                parent=parent,
-            )
-        elif tag or description or configuration or parent:
-            assert self.model_version.get("model_id") == model_id, (
-                "Given `model_id` does not match the current model version"
-            )
-            # If any attribute field has been defined, PATCH the current model version
-            self.update_model_version(
-                tag=tag,
-                description=description,
-                configuration=configuration,
-                parent=parent,
-            )
         # Create the zst archive, get its hash and size
-        # Validate the model version
         with create_archive(path=model_path) as (
             path_to_archive,
-            hash,
             size,
-            archive_hash,
+            hash,
         ):
-            # Create a new model version with hash and size
-            self.upload_to_s3(archive_path=path_to_archive)
-            current_version_id = self.model_version["id"]
-            # Mark the model as valid
-            self.validate_model_version(
-                size=size,
-                hash=hash,
-                archive_hash=archive_hash,
-            )
-            if self.model_version["id"] != current_version_id and (
-                tag or description or configuration or parent
-            ):
-                logger.warning(
-                    "Updating the existing available model version with the given attributes."
+            # Update an existing model version with hash, size and any other defined attribute
+            if self.model_version:
+                assert self.model_version.get("model_id") == model_id, (
+                    "Given `model_id` does not match the current model version"
                 )
                 self.update_model_version(
+                    size=size,
+                    archive_hash=hash,
+                    tag=tag,
+                    description=description,
+                    configuration=configuration,
+                    parent=parent,
+                )
+            # Create a new model version with hash and size
+            else:
+                self.create_model_version(
+                    model_id=model_id,
+                    size=size,
+                    archive_hash=hash,
                     tag=tag,
                     description=description,
                     configuration=configuration,
                     parent=parent,
                 )
+            # Upload the archive in multiple parts (supports huge files)
+            self.upload_to_s3(path_to_archive)
     @skip_if_read_only
     def create_model_version(
         self,
         model_id: str,
+        size: FileSize,
+        archive_hash: Hash,
         tag: str | None = None,
         description: str | None = None,
         configuration: dict | None = None,
@@ -177,6 +160,8 @@ class TrainingMixin:
         Create a new version of the specified model with its base attributes.
         Once successfully created, the model version is accessible via `self.model_version`.
+        :param size: Size of uploaded archive
+        :param hash: MD5 hash of the uploaded archive
         :param tag: Tag of the model version
         :param description: Description of the model version
         :param configuration: Configuration of the model version
@@ -189,6 +174,8 @@ class TrainingMixin:
             "CreateModelVersion",
             id=model_id,
             body=build_clean_payload(
+                size=size,
+                archive_hash=archive_hash,
                 tag=tag,
                 description=description,
                 configuration=configuration,
@@ -197,12 +184,14 @@ class TrainingMixin:
         )
         logger.info(
-            f"Model version ({self.model_version['id']}) was successfully created"
+            f"Model version ({self.model_version['id']}) was successfully created."
         )
     @skip_if_read_only
     def update_model_version(
         self,
+        size: FileSize,
+        archive_hash: Hash,
         tag: str | None = None,
         description: str | None = None,
         configuration: dict | None = None,
@@ -211,6 +200,8 @@ class TrainingMixin:
         """
         Update the current model version with the given attributes.
+        :param size: Size of uploaded archive
+        :param hash: MD5 hash of the uploaded archive
         :param tag: Tag of the model version
         :param description: Description of the model version
         :param configuration: Configuration of the model version
@@ -221,6 +212,8 @@ class TrainingMixin:
             "UpdateModelVersion",
             id=self.model_version["id"],
             body=build_clean_payload(
+                size=size,
+                archive_hash=archive_hash,
                 tag=tag,
                 description=description,
                 configuration=configuration,
@@ -228,93 +221,34 @@ class TrainingMixin:
             ),
         )
         logger.info(
-            f"Model version ({self.model_version['id']}) was successfully updated"
+            f"Model version ({self.model_version['id']}) was successfully updated."
         )
     @skip_if_read_only
     def upload_to_s3(self, archive_path: Path) -> None:
         """
-        Upload the archive of the model's files to an Amazon s3 compatible storage
+        Upload the archive of the model's files to an Amazon s3 compatible storage in multiple parts
         """
         assert self.model_version, (
             "You must create the model version before uploading an archive."
         )
         assert self.model_version["state"] != "Available", (
-            "The model is already marked as available."
+            "The model version is already marked as available."
         )
-        s3_put_url = self.model_version.get("s3_put_url")
-        assert s3_put_url, (
-            "S3 PUT URL is not set, please ensure you have the right to validate a model version."
-        )
-        logger.info("Uploading to s3...")
-        # Upload the archive on s3
-        with archive_path.open("rb") as archive:
-            r = requests.put(
-                url=s3_put_url,
-                data=archive,
-                headers={"Content-Type": "application/zstd"},
-            )
-        r.raise_for_status()
-    @skip_if_read_only
-    def validate_model_version(
-        self,
-        hash: str,
-        size: int,
-        archive_hash: str,
-    ):
-        """
-        Sets the model version as `Available`, once its archive has been uploaded to S3.
-        :param hash: MD5 hash of the files contained in the archive
-        :param size: The size of the uploaded archive
-        :param archive_hash: MD5 hash of the uploaded archive
-        """
-        assert self.model_version, (
-            "You must create the model version and upload its archive before validating it."
+        multipart = MultipartUpload(
+            client=self.api_client,
+            file_path=archive_path,
+            object_type="model_version",
+            object_id=str(self.model_version["id"]),
         )
         try:
-            self.model_version = self.api_client.request(
-                "PartialUpdateModelVersion",
-                id=self.model_version["id"],
-                body={
-                    "state": "available",
-                    "size": size,
-                    "hash": hash,
-                    "archive_hash": archive_hash,
-                },
+            multipart.upload()
+            multipart.complete()
+        except Exception:
+            multipart.abort()
+            raise
+        else:
+            logger.info(
+                f"Model version ({self.model_version['id']}) archive was successfully uploaded and is now available."
             )
-        except ErrorResponse as e:
-            model_version = e.content
-            if not model_version or "id" not in model_version:
-                raise e
-            logger.warning(
-                f"An available model version exists with hash {hash}, using it instead of the pending version."
-            )
-            pending_version_id = self.model_version["id"]
-            logger.warning("Removing the pending model version.")
-            try:
-                self.api_client.request("DestroyModelVersion", id=pending_version_id)
-            except ErrorResponse as e:
-                msg = getattr(e, "content", str(e))
-                logger.error(
-                    f"An error occurred removing the pending version {pending_version_id}: {msg}."
-                )
-            logger.info("Retrieving the existing model version.")
-            existing_version_id = model_version["id"].pop()
-            try:
-                self.model_version = self.api_client.request(
-                    "RetrieveModelVersion", id=existing_version_id
-                )
-            except ErrorResponse as e:
-                logger.error(
-                    f"An error occurred retrieving the existing version {existing_version_id}: {e.status_code} - {e.content}."
-                )
-                raise
-        logger.info(f"Model version {self.model_version['id']} is now available.")

{arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2b1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "arkindex-base-worker"
-version = "0.5.2a1"
+version = "0.5.2b1"
 description = "Base Worker to easily build Arkindex ML workflows"
 license-files = ["LICENSE"]
 dependencies = [
@@ -12,8 +12,9 @@ dependencies = [
     "peewee~=3.17",
     "Pillow==11.3.0",
     "python-gnupg==0.5.6",
+    "python-magic==0.4.27",
     "shapely==2.0.6",
-    "teklia-toolbox==0.1.12",
+    "teklia-toolbox==0.1.13",
     "zstandard==0.25.0",
 ]
 authors = [

arkindex-base-worker 0.5.2a1__tar.gz → 0.5.2b1__tar.gz

arkindex-base-worker 0.5.2a1__tar.gz → 0.5.2b1__tar.gz