arkindex-base-worker 0.5.1rc3-py3-none-any.whl → 0.5.2a2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{arkindex_base_worker-0.5.1rc3.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/METADATA RENAMED
@@ -1,31 +1,9 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.1rc3
+Version: 0.5.2a2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
-License: MIT License
-
-Copyright (c) 2023 Teklia
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
 Project-URL: Homepage, https://workers.arkindex.org
 Project-URL: Documentation, https://workers.arkindex.org
 Project-URL: Repository, https://gitlab.teklia.com/workers/base-worker
@@ -41,12 +19,13 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: humanize==4.14.0
+Requires-Dist: humanize==4.15.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==11.3.0
-Requires-Dist: python-gnupg==0.5.5
+Requires-Dist: python-gnupg==0.5.6
+Requires-Dist: python-magic==0.4.27
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.11
+Requires-Dist: teklia-toolbox==0.1.12
 Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
 Requires-Dist: pytest-mock==3.15.1; extra == "tests"
{arkindex_base_worker-0.5.1rc3.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/RECORD RENAMED
@@ -1,10 +1,10 @@
-arkindex_base_worker-0.5.1rc3.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.5.2a2.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
 arkindex_worker/__init__.py,sha256=Sdt5KXn8EgURb2MurYVrUWaHbH3iFA1XLRo0Lc5AJ44,250
 arkindex_worker/cache.py,sha256=XpEXMSnbhYCvrJquwA9XXqZo-ajMLpaCxKG5wH3Gp6Y,10959
-arkindex_worker/image.py,sha256=sGE8to5iykXv25bpkftOEWzlh5NzBZSKy4lSRoHYHPU,20929
-arkindex_worker/models.py,sha256=7GnKqpWPOSxyR_eKlDNVBe_r3TcE4ofK-1GzaonJEdM,10132
-arkindex_worker/utils.py,sha256=yq_LmRlqfWOzB09Aiz2XYx4xPZnoEXR3As48h2HxOVc,10974
-arkindex_worker/worker/__init__.py,sha256=SzD0s1_m6gMV02EUF-NeciqZdVPA4dpXI84tSj-g494,17869
+arkindex_worker/image.py,sha256=9KeZHWNIDkwNJZR0y-mbyD_pvKfrgdktMB32jZqSMYk,20927
+arkindex_worker/models.py,sha256=DgKvAB_2e1cPcuUavZkyTkV10jBK8y083oVklB9idSk,10855
+arkindex_worker/utils.py,sha256=Eqg5pGAuOmuwMT3EhKTQDMek7wHC1KzZL7XXqYVVfHY,10977
+arkindex_worker/worker/__init__.py,sha256=tM_ynAARmtuJw5YWb_jI0AD5KNXbWN1K-VDiixIp7O4,18009
 arkindex_worker/worker/base.py,sha256=-R_aLMJHbR6X1uM-U0zExsF_KLy5Wl3WJ_YMGO9We0I,22153
 arkindex_worker/worker/classification.py,sha256=qvykymkgd4nGywHCxL8obo4egstoGsmWNS4Ztc1qNWQ,11024
 arkindex_worker/worker/corpus.py,sha256=MeIMod7jkWyX0frtD0a37rhumnMV3p9ZOC1xwAoXrAA,2291
@@ -14,7 +14,7 @@ arkindex_worker/worker/entity.py,sha256=Aj6EOfzHEm7qQV-Egm0YKLZgCrLS_3ggOKTY81M2
 arkindex_worker/worker/image.py,sha256=L6Ikuf0Z0RxJk7JarY5PggJGrYSHLaPK0vn0dy0CIaQ,623
 arkindex_worker/worker/metadata.py,sha256=keZdOdUthSH2hAw9iet5pN7rzWihTUYjZHRGTEjaltw,6843
 arkindex_worker/worker/process.py,sha256=9TEHpMcBax1wc6PrWMMrdXe2uNfqyVj7n_dAYZRBGnY,1854
-arkindex_worker/worker/task.py,sha256=nYfMSFm_d-4t8y4PO4HjFBnLsZf7IsDjkS7-A2Pgnac,1525
+arkindex_worker/worker/task.py,sha256=HASQU5LYVtgvCnRCLFC6iH7h7v6q_usZNZ-r_Wkv9A8,3306
 arkindex_worker/worker/training.py,sha256=tyQOHcwv--_wdYz6CgLEe1YM7kwwwKN30LvGTsnWd78,10923
 arkindex_worker/worker/transcription.py,sha256=sw718R119tsLNY8inPMVeIilvFJo94fMbMtYgH0zTM8,21250
 examples/standalone/python/worker.py,sha256=Zr4s4pHvgexEjlkixLFYZp1UuwMLeoTxjyNG5_S2iYE,6672
@@ -24,7 +24,7 @@ tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
 tests/conftest.py,sha256=Tp7YFK17NATwF2yAcBwi0QFNyKSXtLS0VhZ-zZngsQI,24343
 tests/test_base_worker.py,sha256=lwS4X3atS2ktEKd1XdogmN3mbzq-tO206-k_0EDITlw,29302
 tests/test_cache.py,sha256=_wztzh94EwVrb8UvpFqgl2aa2_FLaCcJKaqunCYR5Dw,10435
-tests/test_dataset_worker.py,sha256=iDJM2C4PfQNH0r4_QqSWoPt8BcM0geUUdODtWY0Z9PA,22412
+tests/test_dataset_worker.py,sha256=LmL3ERF1__PUPkTLiAFC0IYglZTv5WQYA42Vm-uhe2w,22023
 tests/test_element.py,sha256=hlj5VSF4plwC7uz9R4LGOOXZJQcHZiYCIDZT5V6EIB8,14334
 tests/test_image.py,sha256=yAM5mMfpQcIurT1KLHmu0AhSX2Qm3YvCu7afyZ3XUdU,28314
 tests/test_merge.py,sha256=REpZ13jkq_qm_4L5URQgFy5lxvPZtXxQEiWfYLMdmF0,7956
@@ -44,7 +44,7 @@ tests/test_elements_worker/test_entity.py,sha256=SNAZEsVVLnqlliOmjkgv_cZhw0bAuJU
 tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
 tests/test_elements_worker/test_metadata.py,sha256=qtTDtlp3VnBkfck7PAguK2dEgTLlr1i1EVnmNTeNf3A,20515
 tests/test_elements_worker/test_process.py,sha256=y4RoVhPfyHzR795fw7-_FXElBcKo3fy4Ew_HI-kxJic,3088
-tests/test_elements_worker/test_task.py,sha256=wTUWqN9UhfKmJn3IcFY75EW4I1ulRhisflmY1kmP47s,5574
+tests/test_elements_worker/test_task.py,sha256=oHwP1fbJftXFA2U4qA3Gb4vX-iJoV-sBvPHnfBBpRrc,8906
 tests/test_elements_worker/test_training.py,sha256=qgK7BLucddRzc8ePbQtY75x17QvGDEq5XCwgyyvmAJE,8717
 tests/test_elements_worker/test_transcription_create.py,sha256=yznO9B_BVsOR0Z_VY5ZL8gJp0ZPCz_4sPUs5dXtixAg,29281
 tests/test_elements_worker/test_transcription_create_with_elements.py,sha256=tmcyglgssEqMnt1Mdy_u6X1m2wgLWTo_HdWst3GrK2k,33056
@@ -55,7 +55,7 @@ worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc
 worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
 worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
 worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
-arkindex_base_worker-0.5.1rc3.dist-info/METADATA,sha256=EhM_vIe59B-G10-l3mQUdEWXDIe4HmoSp8vbstcJ2Cs,3091
-arkindex_base_worker-0.5.1rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-arkindex_base_worker-0.5.1rc3.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
-arkindex_base_worker-0.5.1rc3.dist-info/RECORD,,
+arkindex_base_worker-0.5.2a2.dist-info/METADATA,sha256=LyPpeyvKIadAuqir1cymTwxoWm3XovhF-JmzQ1LW0MI,1885
+arkindex_base_worker-0.5.2a2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+arkindex_base_worker-0.5.2a2.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
+arkindex_base_worker-0.5.2a2.dist-info/RECORD,,
{arkindex_base_worker-0.5.1rc3.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
arkindex_worker/image.py CHANGED
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
     from arkindex_worker.models import Element

 # See http://docs.python-requests.org/en/master/user/advanced/#timeouts
-DOWNLOAD_TIMEOUT = (30, 60)
+REQUEST_TIMEOUT = (30, 60)

 BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])

@@ -346,7 +346,7 @@ def _retried_request(url, *args, method=requests.get, **kwargs):
         url,
         *args,
         headers={"User-Agent": IIIF_USER_AGENT},
-        timeout=DOWNLOAD_TIMEOUT,
+        timeout=REQUEST_TIMEOUT,
         verify=should_verify_cert(url),
         **kwargs,
     )
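The only change in this file is the rename of the module-level timeout constant; the value keeps the `(connect, read)` tuple form that `requests` accepts, per the documentation link preserved in the hunk. A minimal sketch of what that tuple means (the URL is a placeholder):

```python
import requests

# (connect timeout, read timeout) in seconds: give up quickly if the IIIF
# server is unreachable, but allow a slow response body up to a minute.
REQUEST_TIMEOUT = (30, 60)

resp = requests.get(
    "https://iiif.example.com/image/full/max/0/default.jpg",  # placeholder URL
    timeout=REQUEST_TIMEOUT,
)
resp.raise_for_status()
```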
arkindex_worker/models.py CHANGED
@@ -9,6 +9,8 @@ from contextlib import contextmanager
 from PIL import Image
 from requests import HTTPError

+IMAGE_EXTENSION = "jpg"
+

 class MagicDict(dict):
     """
@@ -62,28 +64,38 @@ class Element(MagicDict):
     Describes an Arkindex element.
     """

-    def resize_zone_url(self, size: str = "full") -> str:
+    def resize_zone_url(
+        self, size: str = "full", extension: str = IMAGE_EXTENSION
+    ) -> str:
         """
         Compute the URL of the image corresponding to the size
         :param size: Requested size
+        :param extension: IIIF extension to download the image
         :return: The URL corresponding to the size
         """
+        # Removing the `jpg` default extension at the end of the URL to use the provided one
+        url = self.zone.url[:-3] + extension
+
         if size == "full":
-            return self.zone.url
+            return url
         else:
-            parts = self.zone.url.split("/")
+            parts = url.split("/")
             parts[-3] = size
             return "/".join(parts)

-    def image_url(self, size: str = "full") -> str | None:
+    def image_url(
+        self, size: str = "full", extension: str = IMAGE_EXTENSION
+    ) -> str | None:
         """
         Build a URL to access the image.
         When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
         :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
+        :param extension: IIIF extension to download the image.
         :returns: A URL to the image, or None if the element does not have an image.
         """
         if not self.get("zone"):
             return
+
         url = self.zone.image.get("s3_url")
         if url:
             return url
@@ -95,7 +107,7 @@
         url = self.zone.image.url
         if not url.endswith("/"):
             url += "/"
-        return f"{url}full/{size}/0/default.jpg"
+        return f"{url}full/{size}/0/default.{extension}"

     @property
     def polygon(self) -> list[float]:
@@ -131,6 +143,7 @@
         max_width: int | None = None,
         max_height: int | None = None,
         use_full_image: bool | None = False,
+        extension: str | None = IMAGE_EXTENSION,
         **kwargs,
     ) -> Image.Image:
         """
@@ -163,6 +176,7 @@
         :param max_height: The maximum height of the image.
         :param use_full_image: Ignore the ``zone.polygon`` and always
             retrieve the image without cropping.
+        :param extension: The extension to download the image.
         :param *args: Positional arguments passed to [arkindex_worker.image.open_image][].
         :param **kwargs: Keyword arguments passed to [arkindex_worker.image.open_image][].
         :raises ValueError: When the element does not have an image.
@@ -200,7 +214,11 @@
         else:
             resize = f"{max_width or ''},{max_height or ''}"

-        url = self.image_url(resize) if use_full_image else self.resize_zone_url(resize)
+        url = (
+            self.image_url(resize, extension)
+            if use_full_image
+            else self.resize_zone_url(resize, extension)
+        )

         try:
             return open_image(
@@ -224,7 +242,7 @@

     @contextmanager
     def open_image_tempfile(
-        self, format: str | None = "jpeg", *args, **kwargs
+        self, extension: str | None = IMAGE_EXTENSION, *args, **kwargs
     ) -> Generator[tempfile.NamedTemporaryFile, None, None]:
         """
         Get the element's image as a temporary file stored on the disk.
@@ -243,8 +261,12 @@
         :param **kwargs: Keyword arguments passed to [arkindex_worker.image.open_image][].

         """
-        with tempfile.NamedTemporaryFile() as f:
-            self.open_image(*args, **kwargs).save(f, format=format)
+        PIL_format = "jpeg" if extension == IMAGE_EXTENSION else extension
+
+        with tempfile.NamedTemporaryFile(suffix=f".{extension}") as f:
+            self.open_image(*args, extension=extension, **kwargs).save(
+                f, format=PIL_format
+            )
         yield f

     def __str__(self):
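Taken together, these hunks thread a new `extension` argument (defaulting to the new `IMAGE_EXTENSION = "jpg"` constant) through `resize_zone_url`, `image_url`, `open_image` and `open_image_tempfile`. Note that `resize_zone_url` swaps the extension by slicing the last three characters off the zone URL, so it relies on the default `.jpg` suffix being present. A hedged usage sketch (the zone payload is illustrative; real `Element` instances are built from Arkindex API responses):

```python
from arkindex_worker.models import Element

# Illustrative zone data, for demonstration only.
element = Element({
    "zone": {
        "url": "https://iiif.example.com/img/0,0,800,600/full/0/default.jpg",
        "image": {"url": "https://iiif.example.com/img"},
    }
})

element.image_url("800,")                         # ...0/default.jpg, as before
element.image_url("800,", extension="png")        # new: ...0/default.png
element.resize_zone_url("800,", extension="png")  # zone crop URL with a PNG extension
```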
arkindex_worker/utils.py CHANGED
@@ -243,7 +243,7 @@ def create_zip_archive(source: Path, destination: Path | None = None) -> Path:
     logger.debug(f"Compressing file to {destination}")

     with zipfile.ZipFile(
-        destination, mode="w", compression=zipfile.ZIP_BZIP2
+        destination, mode="w", compression=zipfile.ZIP_DEFLATED
     ) as archive:
         for p in source.rglob("*"):
             relpath = p.relative_to(source)
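`create_zip_archive` switches from bzip2-compressed entries to the standard deflate method: `ZIP_DEFLATED` is the codec that essentially every unzip implementation reads natively, whereas `ZIP_BZIP2` entries require explicit bzip2 support in the consumer. A minimal sketch of the new behaviour, mirroring the loop above (the directory name is a placeholder):

```python
import zipfile
from pathlib import Path

source = Path("my_data")          # placeholder input directory
destination = Path("my_data.zip")

with zipfile.ZipFile(
    destination, mode="w", compression=zipfile.ZIP_DEFLATED
) as archive:
    for p in source.rglob("*"):
        # Store entries relative to the source root, as create_zip_archive does
        archive.write(p, p.relative_to(source))
```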
arkindex_worker/worker/__init__.py CHANGED
@@ -424,12 +424,13 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         failed = 0
         for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-                assert dataset_set.dataset.state == DatasetState.Complete.value, (
-                    "When processing a set, its dataset state should be Complete."
-                )
-
-                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
-                self.download_dataset_artifact(dataset_set.dataset)
+                if dataset_set.dataset.state == DatasetState.Complete.value:
+                    logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                    self.download_dataset_artifact(dataset_set.dataset)
+                else:
+                    logger.warning(
+                        f"The dataset {dataset_set.dataset} has its state set to `{dataset_set.dataset.state}`, its archive will not be downloaded"
+                    )

                 logger.info(f"Processing {dataset_set} ({i}/{count})")
                 self.process_set(dataset_set)
@@ -444,7 +445,7 @@
                 logger.warning(message, exc_info=e if self.args.verbose else None)

-            # Cleanup the latest downloaded dataset artifact
+            # Cleanup the latest downloaded dataset artifact (if needed)
             self.cleanup_downloaded_artifact()

         message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"
arkindex_worker/worker/task.py CHANGED
@@ -4,9 +4,16 @@ BaseWorker methods for tasks.

 import uuid
 from collections.abc import Iterator
+from http.client import REQUEST_TIMEOUT
+from pathlib import Path
+
+import magic
+import requests

 from arkindex.compat import DownloadedFile
+from arkindex_worker import logger
 from arkindex_worker.models import Artifact
+from teklia_toolbox.requests import should_verify_cert


 class TaskMixin:
@@ -45,3 +52,49 @@ class TaskMixin:
         return self.api_client.request(
             "DownloadArtifact", id=task_id, path=artifact.path
         )
+
+    def upload_artifact(self, path: Path) -> None:
+        """
+        Upload a single file as an Artifact of the current task.
+
+        :param path: Path of the single file to upload as an Artifact.
+        """
+        assert path and isinstance(path, Path) and path.exists(), (
+            "path shouldn't be null, should be a Path and should exist"
+        )
+
+        if self.is_read_only:
+            logger.warning("Cannot upload artifact as this worker is in read-only mode")
+            return
+
+        # Get path relative to task's data directory
+        relpath = str(path.relative_to(self.work_dir))
+
+        # Get file size
+        size = path.stat().st_size
+
+        # Detect content type
+        try:
+            content_type = magic.from_file(path, mime=True)
+        except Exception as e:
+            logger.warning(f"Failed to get a mime type for {path}: {e}")
+            content_type = "application/octet-stream"
+
+        # Create artifact on API to get an S3 url
+        artifact = self.api_client.request(
+            "CreateArtifact",
+            id=self.task_id,
+            body={"path": relpath, "content_type": content_type, "size": size},
+        )
+
+        # Upload the file content to S3
+        s3_put_url = artifact["s3_put_url"]
+        with path.open("rb") as content:
+            resp = requests.put(
+                s3_put_url,
+                data=content,
+                headers={"Content-Type": content_type},
+                timeout=REQUEST_TIMEOUT,
+                verify=should_verify_cert(s3_put_url),
+            )
+            resp.raise_for_status()
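One quirk worth flagging: `REQUEST_TIMEOUT` is imported here from `http.client`, where it is the integer HTTP status code 408, not the `(30, 60)` tuple of the same name in `arkindex_worker.image`, so the S3 PUT effectively runs with a 408-second timeout. For callers, a hedged usage sketch of the new method (the worker subclass and file name are illustrative); `upload_artifact` expects its file to live under the task's `work_dir`, since the artifact path is resolved relative to it:

```python
from pathlib import Path

from arkindex_worker.worker import DatasetWorker  # mixes in TaskMixin, per the hunk above


class DemoWorker(DatasetWorker):
    """Illustrative worker that publishes one output file as an artifact."""

    def process_set(self, dataset_set):
        # The file must live under self.work_dir: upload_artifact asserts the
        # path exists and resolves it relative to the task's data directory.
        output = Path(self.work_dir) / "predictions.json"  # hypothetical name
        output.write_text('{"status": "ok"}')

        # Creates the artifact through the CreateArtifact endpoint, detects the
        # mime type with python-magic, then PUTs the bytes to the returned S3 URL.
        self.upload_artifact(output)
```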
tests/test_dataset_worker.py CHANGED
@@ -435,34 +435,6 @@ def test_run_no_sets(mocker, caplog, mock_dataset_worker):
     ]


-def test_run_initial_dataset_state_error(
-    mocker, responses, caplog, mock_dataset_worker, default_dataset
-):
-    default_dataset.state = DatasetState.Building.value
-    mocker.patch(
-        "arkindex_worker.worker.DatasetWorker.list_sets",
-        return_value=[Set(name="train", dataset=default_dataset)],
-    )
-
-    with pytest.raises(SystemExit):
-        mock_dataset_worker.run()
-
-    assert len(responses.calls) == len(BASE_API_CALLS) * 2
-    assert [
-        (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2
-
-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
-        (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
-        (logging.INFO, "Modern configuration is not available"),
-        (
-            logging.WARNING,
-            "Failed running worker on Set (train) from Dataset (dataset_id): AssertionError('When processing a set, its dataset state should be Complete.')",
-        ),
-        (logging.ERROR, "Ran on 1 set: 0 completed, 1 failed"),
-    ]
-
-
 def test_run_download_dataset_artifact_api_error(
     mocker,
     tmp_path,
@@ -570,16 +542,18 @@ def test_run_no_downloaded_dataset_artifact_error(
     ]


+@pytest.mark.parametrize("dataset_state", DatasetState)
 def test_run(
     mocker,
     tmp_path,
     responses,
     caplog,
+    dataset_state,
     mock_dataset_worker,
     default_dataset,
     default_artifact,
 ):
-    default_dataset.state = DatasetState.Complete.value
+    default_dataset.state = dataset_state.value
     mocker.patch(
         "arkindex_worker.worker.DatasetWorker.list_sets",
         return_value=[Set(name="train", dataset=default_dataset)],
@@ -590,55 +564,68 @@
     )
     mock_process = mocker.patch("arkindex_worker.worker.DatasetWorker.process_set")

-    archive_path = (
-        FIXTURES_DIR
-        / "extract_parent_archives"
-        / "first_parent"
-        / "arkindex_data.tar.zst"
-    )
-    responses.add(
-        responses.GET,
-        f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        status=200,
-        json=[default_artifact],
-    )
-    responses.add(
-        responses.GET,
-        f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        status=200,
-        body=archive_path.read_bytes(),
-        content_type="application/zstd",
-    )
+    if dataset_state == DatasetState.Complete:
+        archive_path = (
+            FIXTURES_DIR
+            / "extract_parent_archives"
+            / "first_parent"
+            / "arkindex_data.tar.zst"
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            status=200,
+            json=[default_artifact],
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            status=200,
+            body=archive_path.read_bytes(),
+            content_type="application/zstd",
+        )

     mock_dataset_worker.run()

     assert mock_process.call_count == 1

-    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + 2
+    # We only download the dataset archive when it is Complete
+    extra_calls = []
+    if dataset_state == DatasetState.Complete:
+        extra_calls = [
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            ),
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            ),
+        ]
+
+    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + len(extra_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2 + [
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        ),
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        ),
-    ]
+    ] == BASE_API_CALLS * 2 + extra_calls

-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
+    logs = [
         (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
         (logging.INFO, "Modern configuration is not available"),
         (
-            logging.INFO,
-            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+            logging.WARNING,
+            f"The dataset Dataset (dataset_id) has its state set to `{dataset_state.value}`, its archive will not be downloaded",
         ),
-        (logging.INFO, "Downloading artifact for Dataset (dataset_id)"),
         (logging.INFO, "Processing Set (train) from Dataset (dataset_id) (1/1)"),
         (logging.INFO, "Ran on 1 set: 1 completed, 0 failed"),
     ]
+    if dataset_state == DatasetState.Complete:
+        logs[2] = (
+            logging.INFO,
+            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+        )
+        logs.insert(3, (logging.INFO, "Downloading artifact for Dataset (dataset_id)"))
+
+    assert [(level, message) for _, level, message in caplog.record_tuples] == logs


 def test_run_read_only(
tests/test_elements_worker/test_task.py CHANGED
@@ -1,6 +1,9 @@
+import tempfile
 import uuid
+from pathlib import Path

 import pytest
+from requests import HTTPError

 from arkindex.exceptions import ErrorResponse
 from arkindex_worker.models import Artifact
@@ -196,3 +199,112 @@ def test_download_artifact(
     ] == BASE_API_CALLS + [
         ("GET", f"http://testserver/api/v1/task/{TASK_ID}/artifact/dataset_id.tar.zst"),
     ]
+
+
+@pytest.mark.parametrize(
+    ("payload", "error"),
+    [
+        # Path
+        (
+            {"path": None},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": "not path type"},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": Path("i_do_no_exist.oops")},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+    ],
+)
+def test_upload_artifact_wrong_param_path(mock_dataset_worker, payload, error):
+    with pytest.raises(AssertionError, match=error):
+        mock_dataset_worker.upload_artifact(**payload)
+
+
+@pytest.fixture
+def tmp_file(mock_dataset_worker):
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".txt", dir=mock_dataset_worker.work_dir
+    ) as file:
+        file.write("Some content...")
+        file.seek(0)
+
+        yield Path(file.name)
+
+
+def test_upload_artifact_api_error(responses, mock_dataset_worker, tmp_file):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        status=418,
+    )
+
+    with pytest.raises(ErrorResponse):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/task/my_task/artifacts/")]
+
+
+def test_upload_artifact_s3_upload_error(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/oops.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/oops.txt", status=500)
+
+    with pytest.raises(HTTPError):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/oops.txt"),
+    ]
+
+
+def test_upload_artifact(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/test.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/test.txt")
+
+    mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/test.txt"),
+    ]