arkindex-base-worker 0.5.2a1__py3-none-any.whl → 0.5.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/METADATA +3 -2
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/RECORD +15 -15
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/WHEEL +1 -1
- arkindex_worker/image.py +2 -2
- arkindex_worker/utils.py +7 -7
- arkindex_worker/worker/__init__.py +8 -7
- arkindex_worker/worker/base.py +10 -10
- arkindex_worker/worker/task.py +53 -0
- arkindex_worker/worker/training.py +58 -124
- tests/test_dataset_worker.py +50 -63
- tests/test_elements_worker/test_task.py +112 -0
- tests/test_elements_worker/test_training.py +17 -136
- tests/test_modern_config.py +39 -0
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/licenses/LICENSE +0 -0
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/top_level.txt +0 -0
{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.2a1
+Version: 0.5.2b1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -23,8 +23,9 @@ Requires-Dist: humanize==4.15.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==11.3.0
 Requires-Dist: python-gnupg==0.5.6
+Requires-Dist: python-magic==0.4.27
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.
+Requires-Dist: teklia-toolbox==0.1.13
 Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
 Requires-Dist: pytest-mock==3.15.1; extra == "tests"

{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
-arkindex_base_worker-0.5.
+arkindex_base_worker-0.5.2b1.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
 arkindex_worker/__init__.py,sha256=Sdt5KXn8EgURb2MurYVrUWaHbH3iFA1XLRo0Lc5AJ44,250
 arkindex_worker/cache.py,sha256=XpEXMSnbhYCvrJquwA9XXqZo-ajMLpaCxKG5wH3Gp6Y,10959
-arkindex_worker/image.py,sha256=
+arkindex_worker/image.py,sha256=9KeZHWNIDkwNJZR0y-mbyD_pvKfrgdktMB32jZqSMYk,20927
 arkindex_worker/models.py,sha256=DgKvAB_2e1cPcuUavZkyTkV10jBK8y083oVklB9idSk,10855
-arkindex_worker/utils.py,sha256=
-arkindex_worker/worker/__init__.py,sha256=
-arkindex_worker/worker/base.py,sha256
+arkindex_worker/utils.py,sha256=kqOTVLBh-0krD2ukTkroiMZ2820wNYxeR8Cf1AyoqNA,10859
+arkindex_worker/worker/__init__.py,sha256=tM_ynAARmtuJw5YWb_jI0AD5KNXbWN1K-VDiixIp7O4,18009
+arkindex_worker/worker/base.py,sha256=2nQdPGh2qQOUNmvV2Mc1KZeqE8d4Fhy9tCo6Q2nNdNQ,22214
 arkindex_worker/worker/classification.py,sha256=qvykymkgd4nGywHCxL8obo4egstoGsmWNS4Ztc1qNWQ,11024
 arkindex_worker/worker/corpus.py,sha256=MeIMod7jkWyX0frtD0a37rhumnMV3p9ZOC1xwAoXrAA,2291
 arkindex_worker/worker/dataset.py,sha256=tVaPx43vaH-KTtx4w5V06e26ha8XPfiJTRzBXlu928Y,5273
@@ -14,8 +14,8 @@ arkindex_worker/worker/entity.py,sha256=Aj6EOfzHEm7qQV-Egm0YKLZgCrLS_3ggOKTY81M2
 arkindex_worker/worker/image.py,sha256=L6Ikuf0Z0RxJk7JarY5PggJGrYSHLaPK0vn0dy0CIaQ,623
 arkindex_worker/worker/metadata.py,sha256=keZdOdUthSH2hAw9iet5pN7rzWihTUYjZHRGTEjaltw,6843
 arkindex_worker/worker/process.py,sha256=9TEHpMcBax1wc6PrWMMrdXe2uNfqyVj7n_dAYZRBGnY,1854
-arkindex_worker/worker/task.py,sha256=
-arkindex_worker/worker/training.py,sha256=
+arkindex_worker/worker/task.py,sha256=HASQU5LYVtgvCnRCLFC6iH7h7v6q_usZNZ-r_Wkv9A8,3306
+arkindex_worker/worker/training.py,sha256=b1YGeUiOWob_DacS4fphGkErJGsx84YVgr5NnsukoEQ,8420
 arkindex_worker/worker/transcription.py,sha256=sw718R119tsLNY8inPMVeIilvFJo94fMbMtYgH0zTM8,21250
 examples/standalone/python/worker.py,sha256=Zr4s4pHvgexEjlkixLFYZp1UuwMLeoTxjyNG5_S2iYE,6672
 examples/tooled/python/worker.py,sha256=kIYlHLsO5UpwX4XtERRq4tf2qTsvqKK30C-w8t0yyhA,1821
@@ -24,11 +24,11 @@ tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
 tests/conftest.py,sha256=Tp7YFK17NATwF2yAcBwi0QFNyKSXtLS0VhZ-zZngsQI,24343
 tests/test_base_worker.py,sha256=lwS4X3atS2ktEKd1XdogmN3mbzq-tO206-k_0EDITlw,29302
 tests/test_cache.py,sha256=_wztzh94EwVrb8UvpFqgl2aa2_FLaCcJKaqunCYR5Dw,10435
-tests/test_dataset_worker.py,sha256=
+tests/test_dataset_worker.py,sha256=LmL3ERF1__PUPkTLiAFC0IYglZTv5WQYA42Vm-uhe2w,22023
 tests/test_element.py,sha256=hlj5VSF4plwC7uz9R4LGOOXZJQcHZiYCIDZT5V6EIB8,14334
 tests/test_image.py,sha256=yAM5mMfpQcIurT1KLHmu0AhSX2Qm3YvCu7afyZ3XUdU,28314
 tests/test_merge.py,sha256=REpZ13jkq_qm_4L5URQgFy5lxvPZtXxQEiWfYLMdmF0,7956
-tests/test_modern_config.py,sha256=
+tests/test_modern_config.py,sha256=ZbMHT5b5RG3ZPX4MoqI8zitRg2y5fV1C6ynfyRkq828,4008
 tests/test_utils.py,sha256=tgzNqyJMpddpeFWEjgsew_yDzmqnCA9HDaA5IpevAcM,5353
 tests/test_elements_worker/__init__.py,sha256=2t3NciCIOun_N-Wv63FWGsTm5W9N3mbwAWVuFORlMg8,308
 tests/test_elements_worker/test_classification.py,sha256=nya7veSPR_O9G41Enodp2-o6AifMBcaSTWJP2vXSSJ4,30133
@@ -44,8 +44,8 @@ tests/test_elements_worker/test_entity.py,sha256=SNAZEsVVLnqlliOmjkgv_cZhw0bAuJU
 tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
 tests/test_elements_worker/test_metadata.py,sha256=qtTDtlp3VnBkfck7PAguK2dEgTLlr1i1EVnmNTeNf3A,20515
 tests/test_elements_worker/test_process.py,sha256=y4RoVhPfyHzR795fw7-_FXElBcKo3fy4Ew_HI-kxJic,3088
-tests/test_elements_worker/test_task.py,sha256=
-tests/test_elements_worker/test_training.py,sha256=
+tests/test_elements_worker/test_task.py,sha256=oHwP1fbJftXFA2U4qA3Gb4vX-iJoV-sBvPHnfBBpRrc,8906
+tests/test_elements_worker/test_training.py,sha256=VY3YKYAm8IijAD6gWY0g06I27gXvMxa7SnAsCWm7G-8,4896
 tests/test_elements_worker/test_transcription_create.py,sha256=yznO9B_BVsOR0Z_VY5ZL8gJp0ZPCz_4sPUs5dXtixAg,29281
 tests/test_elements_worker/test_transcription_create_with_elements.py,sha256=tmcyglgssEqMnt1Mdy_u6X1m2wgLWTo_HdWst3GrK2k,33056
 tests/test_elements_worker/test_transcription_list.py,sha256=ikz7HYPCoQWTdTRCd382SB-y-T2BbigPLlIcx5Eow-I,15324
@@ -55,7 +55,7 @@ worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc
 worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
 worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
 worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
-arkindex_base_worker-0.5.
-arkindex_base_worker-0.5.
-arkindex_base_worker-0.5.
-arkindex_base_worker-0.5.
+arkindex_base_worker-0.5.2b1.dist-info/METADATA,sha256=3dxm9h6sl-bra7LqCF5_plfIrX9qW8zgzF4cnAiUcoQ,1885
+arkindex_base_worker-0.5.2b1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+arkindex_base_worker-0.5.2b1.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
+arkindex_base_worker-0.5.2b1.dist-info/RECORD,,

arkindex_worker/image.py
CHANGED
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
     from arkindex_worker.models import Element

 # See http://docs.python-requests.org/en/master/user/advanced/#timeouts
-…
+REQUEST_TIMEOUT = (30, 60)

 BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])

@@ -346,7 +346,7 @@ def _retried_request(url, *args, method=requests.get, **kwargs):
         url,
         *args,
         headers={"User-Agent": IIIF_USER_AGENT},
-        timeout=
+        timeout=REQUEST_TIMEOUT,
         verify=should_verify_cert(url),
         **kwargs,
     )

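Note (not part of the diff): the new `REQUEST_TIMEOUT` constant is a `(connect, read)` pair, which `requests` accepts directly as its `timeout` argument. A minimal sketch of that usage, with a hypothetical URL:

import requests

REQUEST_TIMEOUT = (30, 60)  # wait up to 30 s to connect, 60 s for the server to answer

resp = requests.get("https://iiif.example.com/iiif/2/page/info.json", timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
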
arkindex_worker/utils.py
CHANGED
@@ -163,12 +163,12 @@ def zstd_compress(

 def create_tar_archive(
     path: Path, destination: Path | None = None
-) -> tuple[int | None, Path
+) -> tuple[int | None, Path]:
     """Create a tar archive using the content at specified location.

     :param path: Path to the file to archive
     :param destination: Optional path for the created TAR archive. A tempfile will be created if this is omitted.
-    :return: The file descriptor (if one was created) and path to the TAR archive
+    :return: The file descriptor (if one was created) and path to the TAR archive.
     """
     # Parse destination and create a tmpfile if none was specified
     file_d, destination = (
@@ -204,26 +204,26 @@ def create_tar_archive(
         with file_path.open("rb") as file_data:
             for chunk in iter(lambda: file_data.read(CHUNK_SIZE), b""):
                 content_hasher.update(chunk)
-    return file_d, destination
+    return file_d, destination


 def create_tar_zst_archive(
     source: Path, destination: Path | None = None
-) -> tuple[int | None, Path, str
+) -> tuple[int | None, Path, str]:
     """Helper to create a TAR+ZST archive from a source folder.

     :param source: Path to the folder whose content should be archived.
     :param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
-    :return: The file descriptor of the created tempfile (if one was created), path to the archive
+    :return: The file descriptor of the created tempfile (if one was created), path to the archive and its hash.
     """
     # Create tar archive
-    tar_fd, tar_archive
+    tar_fd, tar_archive = create_tar_archive(source)

     zst_fd, zst_archive, zst_hash = zstd_compress(tar_archive, destination)

     close_delete_file(tar_fd, tar_archive)

-    return zst_fd, zst_archive, zst_hash
+    return zst_fd, zst_archive, zst_hash


 def create_zip_archive(source: Path, destination: Path | None = None) -> Path:

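Note (not part of the diff): a small sketch of how these helpers fit together after this change, with a hypothetical source folder. `create_tar_zst_archive` returns the tempfile descriptor, the archive path and the archive hash, and the caller cleans the tempfile up with `close_delete_file`:

from pathlib import Path

from arkindex_worker.utils import close_delete_file, create_tar_zst_archive

fd, archive_path, archive_hash = create_tar_zst_archive(Path("exported_model"))
try:
    print(f"Created {archive_path} ({archive_path.stat().st_size} bytes, hash {archive_hash})")
finally:
    # Remove the temporary archive once it has been used
    close_delete_file(fd, archive_path)
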
arkindex_worker/worker/__init__.py
CHANGED
@@ -424,12 +424,13 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         failed = 0
         for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-… (6 lines)
+                if dataset_set.dataset.state == DatasetState.Complete.value:
+                    logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                    self.download_dataset_artifact(dataset_set.dataset)
+                else:
+                    logger.warning(
+                        f"The dataset {dataset_set.dataset} has its state set to `{dataset_set.dataset.state}`, its archive will not be downloaded"
+                    )

                 logger.info(f"Processing {dataset_set} ({i}/{count})")
                 self.process_set(dataset_set)
@@ -444,7 +445,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):

             logger.warning(message, exc_info=e if self.args.verbose else None)

-            # Cleanup the latest downloaded dataset artifact
+            # Cleanup the latest downloaded dataset artifact (if needed)
             self.cleanup_downloaded_artifact()

         message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"

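Note (not part of the diff): a sketch of the state gate the run loop now applies; only Complete datasets have an archive worth downloading, every other state is processed without one. The import path for `DatasetState` is an assumption:

from arkindex_worker.worker.dataset import DatasetState  # assumed import path

def maybe_download(worker, dataset_set) -> None:
    # Mirrors the new run() behaviour: only Complete datasets have a published archive
    if dataset_set.dataset.state == DatasetState.Complete.value:
        worker.download_dataset_artifact(dataset_set.dataset)
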
arkindex_worker/worker/base.py
CHANGED
@@ -15,7 +15,7 @@ import gnupg
 import yaml

 from arkindex import options_from_env
-from arkindex.exceptions import
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -261,6 +261,10 @@ class BaseWorker:

         logger.info(f"Loaded {worker_run['summary']} from API")

+        # The `RetrieveSecret` endpoint is only available in Arkindex EE.
+        # In CE, the values of `secret` fields should be used directly without calling `RetrieveSecret`.
+        can_retrieve_secret = "RetrieveSecret" in self.api_client.document.links
+
         def _process_config_item(item: dict) -> tuple[str, Any]:
             if not item["secret"]:
                 return (item["key"], item["value"])
@@ -270,16 +274,12 @@ class BaseWorker:
                 logger.info(f"Optional secret `{item['key']}` is not set")
                 return (item["key"], None)

-… (4 lines)
-                logger.error(
-                    f"Failed to retrieve the secret {item['value']}, probably an Arkindex Community Edition: {e}"
-                )
-                return (item["key"], None)
+            value = item["value"]
+            # Load secret when `RetrieveSecret` is available
+            if can_retrieve_secret:
+                value = self.load_secret(Path(item["value"]))

-            return (item["key"],
+            return (item["key"], value)

         # Load model version configuration when available
         # Workers will use model version ID and details to download the model

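Note (not part of the diff): the effect of the change above, paraphrased outside the class for clarity. This is a sketch, not the package's actual code; the `load_secret` helper and the OpenAPI `links` lookup are taken from the diff:

from pathlib import Path

def resolve_config_item(item: dict, api_client, load_secret):
    # Plain configuration values pass through untouched
    if not item["secret"]:
        return item["key"], item["value"]
    # Arkindex EE exposes RetrieveSecret: the value names a secret to fetch
    if "RetrieveSecret" in api_client.document.links:
        return item["key"], load_secret(Path(item["value"]))
    # Arkindex CE: the configured value is used directly
    return item["key"], item["value"]
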
arkindex_worker/worker/task.py
CHANGED
@@ -4,9 +4,16 @@ BaseWorker methods for tasks.

 import uuid
 from collections.abc import Iterator
+from http.client import REQUEST_TIMEOUT
+from pathlib import Path
+
+import magic
+import requests

 from arkindex.compat import DownloadedFile
+from arkindex_worker import logger
 from arkindex_worker.models import Artifact
+from teklia_toolbox.requests import should_verify_cert


 class TaskMixin:
@@ -45,3 +52,49 @@ class TaskMixin:
         return self.api_client.request(
             "DownloadArtifact", id=task_id, path=artifact.path
         )
+
+    def upload_artifact(self, path: Path) -> None:
+        """
+        Upload a single file as an Artifact of the current task.
+
+        :param path: Path of the single file to upload as an Artifact.
+        """
+        assert path and isinstance(path, Path) and path.exists(), (
+            "path shouldn't be null, should be a Path and should exist"
+        )
+
+        if self.is_read_only:
+            logger.warning("Cannot upload artifact as this worker is in read-only mode")
+            return
+
+        # Get path relative to task's data directory
+        relpath = str(path.relative_to(self.work_dir))
+
+        # Get file size
+        size = path.stat().st_size
+
+        # Detect content type
+        try:
+            content_type = magic.from_file(path, mime=True)
+        except Exception as e:
+            logger.warning(f"Failed to get a mime type for {path}: {e}")
+            content_type = "application/octet-stream"
+
+        # Create artifact on API to get an S3 url
+        artifact = self.api_client.request(
+            "CreateArtifact",
+            id=self.task_id,
+            body={"path": relpath, "content_type": content_type, "size": size},
+        )
+
+        # Upload the file content to S3
+        s3_put_url = artifact["s3_put_url"]
+        with path.open("rb") as content:
+            resp = requests.put(
+                s3_put_url,
+                data=content,
+                headers={"Content-Type": content_type},
+                timeout=REQUEST_TIMEOUT,
+                verify=should_verify_cert(s3_put_url),
+            )
+            resp.raise_for_status()

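Note (not part of the diff): a usage sketch for the new `upload_artifact` helper. The worker subclass and file name are hypothetical; the path must live under the task's `work_dir`, since the artifact path is computed relative to it:

from pathlib import Path

from arkindex_worker.worker import DatasetWorker  # DatasetWorker mixes in TaskMixin, as shown above

class DemoWorker(DatasetWorker):
    def process_set(self, dataset_set):
        output = Path(self.work_dir) / f"{dataset_set.name}_stats.json"
        output.write_text('{"processed": true}')
        # Creates the artifact through the API, then PUTs the file content to S3
        self.upload_artifact(output)
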
arkindex_worker/worker/training.py
CHANGED
@@ -3,16 +3,15 @@ BaseWorker methods for training.
 """

 import functools
+from collections.abc import Generator
 from contextlib import contextmanager
 from pathlib import Path
 from typing import NewType
 from uuid import UUID

-import requests
-
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.utils import close_delete_file, create_tar_zst_archive
+from teklia_toolbox.uploads import MultipartUpload

 DirPath = NewType("DirPath", Path)
 """Path to a directory"""
@@ -25,23 +24,21 @@ FileSize = NewType("FileSize", int)


 @contextmanager
-def create_archive(path: DirPath) -> tuple[Path,
+def create_archive(path: DirPath) -> Generator[tuple[Path, FileSize, Hash]]:
     """
     Create a tar archive from the files at the given location then compress it to a zst archive.

-    Yield its location, its
+    Yield its location, its size and its hash.

     :param path: Create a compressed tar archive from the files
-    :returns: The location of the created archive, its
+    :returns: The location of the created archive, its size and its hash
     """
     assert path.is_dir(), "create_archive needs a directory"

-    zst_descriptor, zst_archive, archive_hash
-        path
-    )
+    zst_descriptor, zst_archive, archive_hash = create_tar_zst_archive(path)

     # Get content hash, archive size and hash
-    yield zst_archive,
+    yield zst_archive, zst_archive.stat().st_size, archive_hash

     # Remove the zst archive
     close_delete_file(zst_descriptor, zst_archive)
@@ -112,62 +109,48 @@ class TrainingMixin:
         """

         configuration = configuration or {}
-        if not self.model_version:
-            self.create_model_version(
-                model_id=model_id,
-                tag=tag,
-                description=description,
-                configuration=configuration,
-                parent=parent,
-            )
-
-        elif tag or description or configuration or parent:
-            assert self.model_version.get("model_id") == model_id, (
-                "Given `model_id` does not match the current model version"
-            )
-            # If any attribute field has been defined, PATCH the current model version
-            self.update_model_version(
-                tag=tag,
-                description=description,
-                configuration=configuration,
-                parent=parent,
-            )

         # Create the zst archive, get its hash and size
-        # Validate the model version
         with create_archive(path=model_path) as (
             path_to_archive,
-            hash,
             size,
-…
+            hash,
         ):
-… (4 lines)
-            # Mark the model as valid
-            self.validate_model_version(
-                size=size,
-                hash=hash,
-                archive_hash=archive_hash,
-            )
-            if self.model_version["id"] != current_version_id and (
-                tag or description or configuration or parent
-            ):
-                logger.warning(
-                    "Updating the existing available model version with the given attributes."
+            # Update an existing model version with hash, size and any other defined attribute
+            if self.model_version:
+                assert self.model_version.get("model_id") == model_id, (
+                    "Given `model_id` does not match the current model version"
                 )
                 self.update_model_version(
+                    size=size,
+                    archive_hash=hash,
+                    tag=tag,
+                    description=description,
+                    configuration=configuration,
+                    parent=parent,
+                )
+
+            # Create a new model version with hash and size
+            else:
+                self.create_model_version(
+                    model_id=model_id,
+                    size=size,
+                    archive_hash=hash,
                     tag=tag,
                     description=description,
                     configuration=configuration,
                     parent=parent,
                 )

+            # Upload the archive in multiple parts (supports huge files)
+            self.upload_to_s3(path_to_archive)
+
     @skip_if_read_only
     def create_model_version(
         self,
         model_id: str,
+        size: FileSize,
+        archive_hash: Hash,
         tag: str | None = None,
         description: str | None = None,
         configuration: dict | None = None,
@@ -177,6 +160,8 @@ class TrainingMixin:
         Create a new version of the specified model with its base attributes.
         Once successfully created, the model version is accessible via `self.model_version`.

+        :param size: Size of uploaded archive
+        :param hash: MD5 hash of the uploaded archive
         :param tag: Tag of the model version
         :param description: Description of the model version
         :param configuration: Configuration of the model version
@@ -189,6 +174,8 @@ class TrainingMixin:
             "CreateModelVersion",
             id=model_id,
             body=build_clean_payload(
+                size=size,
+                archive_hash=archive_hash,
                 tag=tag,
                 description=description,
                 configuration=configuration,
@@ -197,12 +184,14 @@ class TrainingMixin:
         )

         logger.info(
-            f"Model version ({self.model_version['id']}) was successfully created"
+            f"Model version ({self.model_version['id']}) was successfully created."
         )

     @skip_if_read_only
     def update_model_version(
         self,
+        size: FileSize,
+        archive_hash: Hash,
         tag: str | None = None,
         description: str | None = None,
         configuration: dict | None = None,
@@ -211,6 +200,8 @@ class TrainingMixin:
         """
         Update the current model version with the given attributes.

+        :param size: Size of uploaded archive
+        :param hash: MD5 hash of the uploaded archive
         :param tag: Tag of the model version
         :param description: Description of the model version
         :param configuration: Configuration of the model version
@@ -221,6 +212,8 @@ class TrainingMixin:
             "UpdateModelVersion",
             id=self.model_version["id"],
             body=build_clean_payload(
+                size=size,
+                archive_hash=archive_hash,
                 tag=tag,
                 description=description,
                 configuration=configuration,
@@ -228,93 +221,34 @@ class TrainingMixin:
             ),
         )
         logger.info(
-            f"Model version ({self.model_version['id']}) was successfully updated"
+            f"Model version ({self.model_version['id']}) was successfully updated."
         )

     @skip_if_read_only
     def upload_to_s3(self, archive_path: Path) -> None:
         """
-        Upload the archive of the model's files to an Amazon s3 compatible storage
+        Upload the archive of the model's files to an Amazon s3 compatible storage in multiple parts
         """
-
         assert self.model_version, (
             "You must create the model version before uploading an archive."
         )
         assert self.model_version["state"] != "Available", (
-            "The model is already marked as available."
+            "The model version is already marked as available."
         )

-… (5 lines)
-        logger.info("Uploading to s3...")
-        # Upload the archive on s3
-        with archive_path.open("rb") as archive:
-            r = requests.put(
-                url=s3_put_url,
-                data=archive,
-                headers={"Content-Type": "application/zstd"},
-            )
-            r.raise_for_status()
-
-    @skip_if_read_only
-    def validate_model_version(
-        self,
-        hash: str,
-        size: int,
-        archive_hash: str,
-    ):
-        """
-        Sets the model version as `Available`, once its archive has been uploaded to S3.
-
-        :param hash: MD5 hash of the files contained in the archive
-        :param size: The size of the uploaded archive
-        :param archive_hash: MD5 hash of the uploaded archive
-        """
-        assert self.model_version, (
-            "You must create the model version and upload its archive before validating it."
+        multipart = MultipartUpload(
+            client=self.api_client,
+            file_path=archive_path,
+            object_type="model_version",
+            object_id=str(self.model_version["id"]),
         )
         try:
-… (9 lines)
+            multipart.upload()
+            multipart.complete()
+        except Exception:
+            multipart.abort()
+            raise
+        else:
+            logger.info(
+                f"Model version ({self.model_version['id']}) archive was successfully uploaded and is now available."
             )
-        except ErrorResponse as e:
-            model_version = e.content
-            if not model_version or "id" not in model_version:
-                raise e
-
-            logger.warning(
-                f"An available model version exists with hash {hash}, using it instead of the pending version."
-            )
-            pending_version_id = self.model_version["id"]
-            logger.warning("Removing the pending model version.")
-            try:
-                self.api_client.request("DestroyModelVersion", id=pending_version_id)
-            except ErrorResponse as e:
-                msg = getattr(e, "content", str(e))
-                logger.error(
-                    f"An error occurred removing the pending version {pending_version_id}: {msg}."
-                )
-
-            logger.info("Retrieving the existing model version.")
-            existing_version_id = model_version["id"].pop()
-            try:
-                self.model_version = self.api_client.request(
-                    "RetrieveModelVersion", id=existing_version_id
-                )
-            except ErrorResponse as e:
-                logger.error(
-                    f"An error occurred retrieving the existing version {existing_version_id}: {e.status_code} - {e.content}."
-                )
-                raise
-
-            logger.info(f"Model version {self.model_version['id']} is now available.")

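Note (not part of the diff): a usage sketch of the reworked training flow. The worker class name and model UUID are hypothetical, and the name of the enclosing method (`publish_model_version`) is an assumption based on this package's public API:

from pathlib import Path

from arkindex_worker.worker.base import BaseWorker
from arkindex_worker.worker.training import TrainingMixin

class ModelPublisher(BaseWorker, TrainingMixin):
    pass

worker = ModelPublisher()
worker.configure()
# Archives the directory, creates or updates the model version with the archive's
# size and hash, then uploads the archive through MultipartUpload
worker.publish_model_version(
    model_path=Path("trained_model"),
    model_id="11111111-1111-1111-1111-111111111111",
    tag="0.1.0",
    description="Demo model version",
)
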
tests/test_dataset_worker.py
CHANGED
@@ -435,34 +435,6 @@ def test_run_no_sets(mocker, caplog, mock_dataset_worker):
     ]


-def test_run_initial_dataset_state_error(
-    mocker, responses, caplog, mock_dataset_worker, default_dataset
-):
-    default_dataset.state = DatasetState.Building.value
-    mocker.patch(
-        "arkindex_worker.worker.DatasetWorker.list_sets",
-        return_value=[Set(name="train", dataset=default_dataset)],
-    )
-
-    with pytest.raises(SystemExit):
-        mock_dataset_worker.run()
-
-    assert len(responses.calls) == len(BASE_API_CALLS) * 2
-    assert [
-        (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2
-
-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
-        (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
-        (logging.INFO, "Modern configuration is not available"),
-        (
-            logging.WARNING,
-            "Failed running worker on Set (train) from Dataset (dataset_id): AssertionError('When processing a set, its dataset state should be Complete.')",
-        ),
-        (logging.ERROR, "Ran on 1 set: 0 completed, 1 failed"),
-    ]
-
-
 def test_run_download_dataset_artifact_api_error(
     mocker,
     tmp_path,
@@ -570,16 +542,18 @@ def test_run_no_downloaded_dataset_artifact_error(
     ]


+@pytest.mark.parametrize("dataset_state", DatasetState)
 def test_run(
     mocker,
     tmp_path,
     responses,
     caplog,
+    dataset_state,
     mock_dataset_worker,
     default_dataset,
     default_artifact,
 ):
-    default_dataset.state =
+    default_dataset.state = dataset_state.value
     mocker.patch(
         "arkindex_worker.worker.DatasetWorker.list_sets",
         return_value=[Set(name="train", dataset=default_dataset)],
@@ -590,55 +564,68 @@ def test_run(
     )
     mock_process = mocker.patch("arkindex_worker.worker.DatasetWorker.process_set")

-… (19 lines)
+    if dataset_state == DatasetState.Complete:
+        archive_path = (
+            FIXTURES_DIR
+            / "extract_parent_archives"
+            / "first_parent"
+            / "arkindex_data.tar.zst"
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            status=200,
+            json=[default_artifact],
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            status=200,
+            body=archive_path.read_bytes(),
+            content_type="application/zstd",
+        )

     mock_dataset_worker.run()

     assert mock_process.call_count == 1

-…
+    # We only download the dataset archive when it is Complete
+    extra_calls = []
+    if dataset_state == DatasetState.Complete:
+        extra_calls = [
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            ),
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            ),
+        ]
+
+    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + len(extra_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2 +
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        ),
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        ),
-    ]
+    ] == BASE_API_CALLS * 2 + extra_calls

-…
+    logs = [
         (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
         (logging.INFO, "Modern configuration is not available"),
         (
-            logging.
-            "
+            logging.WARNING,
+            f"The dataset Dataset (dataset_id) has its state set to `{dataset_state.value}`, its archive will not be downloaded",
         ),
-        (logging.INFO, "Downloading artifact for Dataset (dataset_id)"),
         (logging.INFO, "Processing Set (train) from Dataset (dataset_id) (1/1)"),
         (logging.INFO, "Ran on 1 set: 1 completed, 0 failed"),
     ]
+    if dataset_state == DatasetState.Complete:
+        logs[2] = (
+            logging.INFO,
+            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+        )
+        logs.insert(3, (logging.INFO, "Downloading artifact for Dataset (dataset_id)"))
+
+    assert [(level, message) for _, level, message in caplog.record_tuples] == logs


 def test_run_read_only(

tests/test_elements_worker/test_task.py
CHANGED
@@ -1,6 +1,9 @@
+import tempfile
 import uuid
+from pathlib import Path

 import pytest
+from requests import HTTPError

 from arkindex.exceptions import ErrorResponse
 from arkindex_worker.models import Artifact
@@ -196,3 +199,112 @@ def test_download_artifact(
     ] == BASE_API_CALLS + [
         ("GET", f"http://testserver/api/v1/task/{TASK_ID}/artifact/dataset_id.tar.zst"),
     ]
+
+
+@pytest.mark.parametrize(
+    ("payload", "error"),
+    [
+        # Path
+        (
+            {"path": None},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": "not path type"},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": Path("i_do_no_exist.oops")},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+    ],
+)
+def test_upload_artifact_wrong_param_path(mock_dataset_worker, payload, error):
+    with pytest.raises(AssertionError, match=error):
+        mock_dataset_worker.upload_artifact(**payload)
+
+
+@pytest.fixture
+def tmp_file(mock_dataset_worker):
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".txt", dir=mock_dataset_worker.work_dir
+    ) as file:
+        file.write("Some content...")
+        file.seek(0)
+
+        yield Path(file.name)
+
+
+def test_upload_artifact_api_error(responses, mock_dataset_worker, tmp_file):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        status=418,
+    )
+
+    with pytest.raises(ErrorResponse):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/task/my_task/artifacts/")]
+
+
+def test_upload_artifact_s3_upload_error(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/oops.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/oops.txt", status=500)
+
+    with pytest.raises(HTTPError):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/oops.txt"),
+    ]
+
+
+def test_upload_artifact(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/test.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/test.txt")
+
+    mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/test.txt"),
+    ]

tests/test_elements_worker/test_training.py
CHANGED
@@ -27,16 +27,16 @@ def default_model_version():
     return {
         "id": "model_version_id",
         "model_id": "model_id",
-        "state": "created",
         "parent": "42" * 16,
-        "tag": "A simple tag",
         "description": "A description",
+        "tag": "A simple tag",
+        "state": "created",
+        "size": 42,
+        "archive_hash": "123456789",
         "configuration": {"test": "value"},
-        "
+        "s3_etag": None,
         "s3_put_url": "http://upload.archive",
-        "
-        "archive_hash": None,
-        "size": None,
+        "s3_url": None,
         "created": "2000-01-01T00:00:00Z",
     }

@@ -46,14 +46,11 @@ def test_create_archive(model_file_dir):

     with create_archive(path=model_file_dir) as (
         zst_archive_path,
-        hash,
         size,
-…
+        hash,
     ):
         assert zst_archive_path.exists(), "The archive was not created"
-        assert hash ==
-            "Hash was not properly computed"
-        )
+        assert len(hash) == 32
         assert 300 < size < 700

         assert not zst_archive_path.exists(), "Auto removal failed"
@@ -64,37 +61,16 @@ def test_create_archive_with_subfolder(model_file_dir_with_subfolder):

     with create_archive(path=model_file_dir_with_subfolder) as (
         zst_archive_path,
-        hash,
         size,
-…
+        hash,
     ):
         assert zst_archive_path.exists(), "The archive was not created"
-        assert hash ==
-            "Hash was not properly computed"
-        )
+        assert len(hash) == 32
         assert 300 < size < 1500

         assert not zst_archive_path.exists(), "Auto removal failed"


-def test_handle_s3_uploading_errors(responses, mock_training_worker, model_file_dir):
-    s3_endpoint_url = "http://s3.localhost.com"
-    responses.add_passthru(s3_endpoint_url)
-    responses.add(responses.PUT, s3_endpoint_url, status=400)
-
-    mock_training_worker.model_version = {
-        "state": "Created",
-        "s3_put_url": s3_endpoint_url,
-    }
-
-    file_path = model_file_dir / "model_file.pth"
-    with pytest.raises(
-        Exception,
-        match="400 Client Error: Bad Request for url: http://s3.localhost.com/",
-    ):
-        mock_training_worker.upload_to_s3(file_path)
-
-
 @pytest.mark.parametrize(
     "method",
     [
@@ -102,7 +78,6 @@ def test_handle_s3_uploading_errors(responses, mock_training_worker, model_file_
         "create_model_version",
         "update_model_version",
         "upload_to_s3",
-        "validate_model_version",
     ],
 )
 def test_training_mixin_read_only(mock_training_worker, method, caplog):
@@ -127,12 +102,16 @@ def test_create_model_version_already_created(mock_training_worker):
     with pytest.raises(
         AssertionError, match="A model version has already been created."
     ):
-        mock_training_worker.create_model_version(
+        mock_training_worker.create_model_version(
+            model_id="model_id", size=42, archive_hash="123456789"
+        )


 @pytest.mark.parametrize("set_tag", [True, False])
 def test_create_model_version(mock_training_worker, default_model_version, set_tag):
     args = {
+        "size": 42,
+        "archive_hash": "123456789",
         "parent": "42" * 16,
         "tag": "A simple tag",
         "description": "A description",
@@ -154,12 +133,12 @@ def test_create_model_version(mock_training_worker, default_model_version, set_t

 def test_update_model_version_not_created(mock_training_worker):
     with pytest.raises(AssertionError, match="No model version has been created yet."):
-        mock_training_worker.update_model_version()
+        mock_training_worker.update_model_version(size=42, archive_hash="123456789")


 def test_update_model_version(mock_training_worker, default_model_version):
     mock_training_worker.model_version = default_model_version
-    args = {"tag": "A new tag"}
+    args = {"size": 42, "archive_hash": "123456789", "tag": "A new tag"}
     new_model_version = {**default_model_version, "tag": "A new tag"}
     mock_training_worker.api_client.add_response(
         "UpdateModelVersion",
@@ -169,101 +148,3 @@ def test_update_model_version(mock_training_worker, default_model_version):
     )
     mock_training_worker.update_model_version(**args)
     assert mock_training_worker.model_version == new_model_version
-
-
-def test_validate_model_version_not_created(mock_training_worker):
-    with pytest.raises(
-        AssertionError,
-        match="You must create the model version and upload its archive before validating it.",
-    ):
-        mock_training_worker.validate_model_version(hash="a", size=1, archive_hash="b")
-
-
-@pytest.mark.parametrize("deletion_failed", [True, False])
-def test_validate_model_version_hash_conflict(
-    mock_training_worker,
-    default_model_version,
-    caplog,
-    deletion_failed,
-):
-    mock_training_worker.model_version = {"id": "another_id"}
-    args = {
-        "hash": "hash",
-        "archive_hash": "archive_hash",
-        "size": 30,
-    }
-    mock_training_worker.api_client.add_error_response(
-        "PartialUpdateModelVersion",
-        id="another_id",
-        status_code=409,
-        body={"state": "available", **args},
-        content={"id": ["model_version_id"]},
-    )
-    if deletion_failed:
-        mock_training_worker.api_client.add_error_response(
-            "DestroyModelVersion",
-            id="another_id",
-            status_code=403,
-            content="Not admin",
-        )
-    else:
-        mock_training_worker.api_client.add_response(
-            "DestroyModelVersion",
-            id="another_id",
-            response="No content",
-        )
-    mock_training_worker.api_client.add_response(
-        "RetrieveModelVersion",
-        id="model_version_id",
-        response=default_model_version,
-    )
-
-    mock_training_worker.validate_model_version(**args)
-    assert mock_training_worker.model_version == default_model_version
-    error_msg = []
-    if deletion_failed:
-        error_msg = [
-            (
-                logging.ERROR,
-                "An error occurred removing the pending version another_id: Not admin.",
-            )
-        ]
-    assert [
-        (level, message)
-        for module, level, message in caplog.record_tuples
-        if module == "arkindex_worker"
-    ] == [
-        (
-            logging.WARNING,
-            "An available model version exists with hash hash, using it instead of the pending version.",
-        ),
-        (logging.WARNING, "Removing the pending model version."),
-        *error_msg,
-        (logging.INFO, "Retrieving the existing model version."),
-        (logging.INFO, "Model version model_version_id is now available."),
-    ]
-
-
-def test_validate_model_version(mock_training_worker, default_model_version, caplog):
-    mock_training_worker.model_version = {"id": "model_version_id"}
-    args = {
-        "hash": "hash",
-        "archive_hash": "archive_hash",
-        "size": 30,
-    }
-    mock_training_worker.api_client.add_response(
-        "PartialUpdateModelVersion",
-        id="model_version_id",
-        body={"state": "available", **args},
-        response=default_model_version,
-    )
-
-    mock_training_worker.validate_model_version(**args)
-    assert mock_training_worker.model_version == default_model_version
-    assert [
-        (level, message)
-        for module, level, message in caplog.record_tuples
-        if module == "arkindex_worker"
-    ] == [
-        (logging.INFO, "Model version model_version_id is now available."),
-    ]

tests/test_modern_config.py
CHANGED
@@ -79,3 +79,42 @@ def test_with_secrets(mock_base_worker_modern_conf, responses):
     assert mock_base_worker_modern_conf.secrets == {
         "a_secret": "My super duper secret value"
     }
+
+
+def test_with_secrets_ce(mock_base_worker_modern_conf, responses, monkeypatch):
+    # Provide the full configuration directly from the worker run
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/workers/runs/56785678-5678-5678-5678-567856785678/configuration/",
+        status=200,
+        json={
+            "configuration": [
+                {"key": "some_key", "value": "test", "secret": False},
+                {
+                    "key": "a_secret",
+                    "value": "471b9e64-29af-48dc-8bda-1a64a2da0c12",
+                    "secret": True,
+                },
+            ]
+        },
+    )
+
+    # Remove the RetrieveSecret endpoint to simulate Arkindex CE
+    monkeypatch.delitem(
+        mock_base_worker_modern_conf.api_client.document.links, "RetrieveSecret"
+    )
+
+    mock_base_worker_modern_conf.configure()
+
+    assert mock_base_worker_modern_conf.config == {
+        "a_secret": "471b9e64-29af-48dc-8bda-1a64a2da0c12",
+        "some_key": "test",
+    }
+    assert (
+        mock_base_worker_modern_conf.user_configuration
+        == mock_base_worker_modern_conf.config
+    )
+    assert mock_base_worker_modern_conf.secrets == {
+        # The value is used directly instead of treated as a secret name
+        "a_secret": "471b9e64-29af-48dc-8bda-1a64a2da0c12",
+    }

{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/licenses/LICENSE
RENAMED
File without changes

{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2b1.dist-info}/top_level.txt
RENAMED
File without changes