arkindex-base-worker 0.5.2a1__py3-none-any.whl → 0.5.2a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/METADATA +2 -1
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/RECORD +10 -10
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/WHEEL +1 -1
- arkindex_worker/image.py +2 -2
- arkindex_worker/worker/__init__.py +8 -7
- arkindex_worker/worker/task.py +53 -0
- tests/test_dataset_worker.py +50 -63
- tests/test_elements_worker/test_task.py +112 -0
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/licenses/LICENSE +0 -0
- {arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/top_level.txt +0 -0
{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.2a1
+Version: 0.5.2a2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>

@@ -23,6 +23,7 @@ Requires-Dist: humanize==4.15.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==11.3.0
 Requires-Dist: python-gnupg==0.5.6
+Requires-Dist: python-magic==0.4.27
 Requires-Dist: shapely==2.0.6
 Requires-Dist: teklia-toolbox==0.1.12
 Requires-Dist: zstandard==0.25.0
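
The only dependency change is the new python-magic requirement, which the task.py changes below use to detect an artifact's MIME type from its content. A minimal sketch of that call, assuming libmagic is installed on the system and using an illustrative file name:

    import magic

    # Inspect the file *content* (not the extension); mime=True returns a
    # MIME string such as "image/png" instead of a human-readable description.
    content_type = magic.from_file("page_scan.png", mime=True)
    print(content_type)  # e.g. "image/png"
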
{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/RECORD
RENAMED

@@ -1,10 +1,10 @@
-arkindex_base_worker-0.5.2a1.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.5.2a2.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
 arkindex_worker/__init__.py,sha256=Sdt5KXn8EgURb2MurYVrUWaHbH3iFA1XLRo0Lc5AJ44,250
 arkindex_worker/cache.py,sha256=XpEXMSnbhYCvrJquwA9XXqZo-ajMLpaCxKG5wH3Gp6Y,10959
-arkindex_worker/image.py,sha256=…
+arkindex_worker/image.py,sha256=9KeZHWNIDkwNJZR0y-mbyD_pvKfrgdktMB32jZqSMYk,20927
 arkindex_worker/models.py,sha256=DgKvAB_2e1cPcuUavZkyTkV10jBK8y083oVklB9idSk,10855
 arkindex_worker/utils.py,sha256=Eqg5pGAuOmuwMT3EhKTQDMek7wHC1KzZL7XXqYVVfHY,10977
-arkindex_worker/worker/__init__.py,sha256=…
+arkindex_worker/worker/__init__.py,sha256=tM_ynAARmtuJw5YWb_jI0AD5KNXbWN1K-VDiixIp7O4,18009
 arkindex_worker/worker/base.py,sha256=-R_aLMJHbR6X1uM-U0zExsF_KLy5Wl3WJ_YMGO9We0I,22153
 arkindex_worker/worker/classification.py,sha256=qvykymkgd4nGywHCxL8obo4egstoGsmWNS4Ztc1qNWQ,11024
 arkindex_worker/worker/corpus.py,sha256=MeIMod7jkWyX0frtD0a37rhumnMV3p9ZOC1xwAoXrAA,2291

@@ -14,7 +14,7 @@ arkindex_worker/worker/entity.py,sha256=Aj6EOfzHEm7qQV-Egm0YKLZgCrLS_3ggOKTY81M2…
 arkindex_worker/worker/image.py,sha256=L6Ikuf0Z0RxJk7JarY5PggJGrYSHLaPK0vn0dy0CIaQ,623
 arkindex_worker/worker/metadata.py,sha256=keZdOdUthSH2hAw9iet5pN7rzWihTUYjZHRGTEjaltw,6843
 arkindex_worker/worker/process.py,sha256=9TEHpMcBax1wc6PrWMMrdXe2uNfqyVj7n_dAYZRBGnY,1854
-arkindex_worker/worker/task.py,sha256=…
+arkindex_worker/worker/task.py,sha256=HASQU5LYVtgvCnRCLFC6iH7h7v6q_usZNZ-r_Wkv9A8,3306
 arkindex_worker/worker/training.py,sha256=tyQOHcwv--_wdYz6CgLEe1YM7kwwwKN30LvGTsnWd78,10923
 arkindex_worker/worker/transcription.py,sha256=sw718R119tsLNY8inPMVeIilvFJo94fMbMtYgH0zTM8,21250
 examples/standalone/python/worker.py,sha256=Zr4s4pHvgexEjlkixLFYZp1UuwMLeoTxjyNG5_S2iYE,6672

@@ -24,7 +24,7 @@ tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
 tests/conftest.py,sha256=Tp7YFK17NATwF2yAcBwi0QFNyKSXtLS0VhZ-zZngsQI,24343
 tests/test_base_worker.py,sha256=lwS4X3atS2ktEKd1XdogmN3mbzq-tO206-k_0EDITlw,29302
 tests/test_cache.py,sha256=_wztzh94EwVrb8UvpFqgl2aa2_FLaCcJKaqunCYR5Dw,10435
-tests/test_dataset_worker.py,sha256=…
+tests/test_dataset_worker.py,sha256=LmL3ERF1__PUPkTLiAFC0IYglZTv5WQYA42Vm-uhe2w,22023
 tests/test_element.py,sha256=hlj5VSF4plwC7uz9R4LGOOXZJQcHZiYCIDZT5V6EIB8,14334
 tests/test_image.py,sha256=yAM5mMfpQcIurT1KLHmu0AhSX2Qm3YvCu7afyZ3XUdU,28314
 tests/test_merge.py,sha256=REpZ13jkq_qm_4L5URQgFy5lxvPZtXxQEiWfYLMdmF0,7956

@@ -44,7 +44,7 @@ tests/test_elements_worker/test_entity.py,sha256=SNAZEsVVLnqlliOmjkgv_cZhw0bAuJU…
 tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
 tests/test_elements_worker/test_metadata.py,sha256=qtTDtlp3VnBkfck7PAguK2dEgTLlr1i1EVnmNTeNf3A,20515
 tests/test_elements_worker/test_process.py,sha256=y4RoVhPfyHzR795fw7-_FXElBcKo3fy4Ew_HI-kxJic,3088
-tests/test_elements_worker/test_task.py,sha256=…
+tests/test_elements_worker/test_task.py,sha256=oHwP1fbJftXFA2U4qA3Gb4vX-iJoV-sBvPHnfBBpRrc,8906
 tests/test_elements_worker/test_training.py,sha256=qgK7BLucddRzc8ePbQtY75x17QvGDEq5XCwgyyvmAJE,8717
 tests/test_elements_worker/test_transcription_create.py,sha256=yznO9B_BVsOR0Z_VY5ZL8gJp0ZPCz_4sPUs5dXtixAg,29281
 tests/test_elements_worker/test_transcription_create_with_elements.py,sha256=tmcyglgssEqMnt1Mdy_u6X1m2wgLWTo_HdWst3GrK2k,33056

@@ -55,7 +55,7 @@ worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc…
 worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
 worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
 worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
-arkindex_base_worker-0.5.2a1.dist-info/METADATA,sha256=…
-arkindex_base_worker-0.5.2a1.dist-info/WHEEL,sha256=…
-arkindex_base_worker-0.5.2a1.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
-arkindex_base_worker-0.5.2a1.dist-info/RECORD,,
+arkindex_base_worker-0.5.2a2.dist-info/METADATA,sha256=LyPpeyvKIadAuqir1cymTwxoWm3XovhF-JmzQ1LW0MI,1885
+arkindex_base_worker-0.5.2a2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+arkindex_base_worker-0.5.2a2.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
+arkindex_base_worker-0.5.2a2.dist-info/RECORD,,
arkindex_worker/image.py
CHANGED

@@ -38,7 +38,7 @@ if TYPE_CHECKING:
     from arkindex_worker.models import Element
 
 # See http://docs.python-requests.org/en/master/user/advanced/#timeouts
-…
+REQUEST_TIMEOUT = (30, 60)
 
 BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])
 

@@ -346,7 +346,7 @@ def _retried_request(url, *args, method=requests.get, **kwargs):
         url,
         *args,
         headers={"User-Agent": IIIF_USER_AGENT},
-        timeout=…,
+        timeout=REQUEST_TIMEOUT,
         verify=should_verify_cert(url),
         **kwargs,
     )
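
requests interprets a 2-tuple timeout as separate connect and read timeouts (the convention documented at the URL in the comment above), so the renamed REQUEST_TIMEOUT constant allows 30 seconds to establish the connection and 60 seconds between bytes of the response. A standalone sketch with an illustrative URL:

    import requests

    REQUEST_TIMEOUT = (30, 60)  # (connect timeout, read timeout), in seconds

    # Raises requests.ConnectTimeout if the TCP handshake exceeds 30s, and
    # requests.ReadTimeout if the server stalls longer than 60s mid-response.
    response = requests.get("https://iiif.example.com/image.jpg", timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
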
arkindex_worker/worker/__init__.py
CHANGED

@@ -424,12 +424,13 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         failed = 0
         for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-                assert dataset_set.dataset.state == DatasetState.Complete.value, (
-                    "When processing a set, its dataset state should be Complete."
-                )
-
-                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
-                self.download_dataset_artifact(dataset_set.dataset)
+                if dataset_set.dataset.state == DatasetState.Complete.value:
+                    logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                    self.download_dataset_artifact(dataset_set.dataset)
+                else:
+                    logger.warning(
+                        f"The dataset {dataset_set.dataset} has its state set to `{dataset_set.dataset.state}`, its archive will not be downloaded"
+                    )
 
                 logger.info(f"Processing {dataset_set} ({i}/{count})")
                 self.process_set(dataset_set)

@@ -444,7 +445,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
 
             logger.warning(message, exc_info=e if self.args.verbose else None)
 
-        # Cleanup the latest downloaded dataset artifact
+        # Cleanup the latest downloaded dataset artifact (if needed)
         self.cleanup_downloaded_artifact()
 
         message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"
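
The state check goes through .value because the API serializes a dataset's state as a plain string, and an Enum member never compares equal to its raw value. A standalone sketch, with member names assumed for illustration:

    from enum import Enum

    class DatasetState(Enum):
        Open = "open"
        Building = "building"
        Complete = "complete"
        Error = "error"

    state = "complete"                           # what the API returns
    assert state == DatasetState.Complete.value  # compare raw values: True
    assert state != DatasetState.Complete        # a member != its raw string
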
arkindex_worker/worker/task.py
CHANGED

@@ -4,9 +4,16 @@ BaseWorker methods for tasks.
 
 import uuid
 from collections.abc import Iterator
+from http.client import REQUEST_TIMEOUT
+from pathlib import Path
+
+import magic
+import requests
 
 from arkindex.compat import DownloadedFile
+from arkindex_worker import logger
 from arkindex_worker.models import Artifact
+from teklia_toolbox.requests import should_verify_cert
 
 
 class TaskMixin:

@@ -45,3 +52,49 @@ class TaskMixin:
         return self.api_client.request(
             "DownloadArtifact", id=task_id, path=artifact.path
         )
+
+    def upload_artifact(self, path: Path) -> None:
+        """
+        Upload a single file as an Artifact of the current task.
+
+        :param path: Path of the single file to upload as an Artifact.
+        """
+        assert path and isinstance(path, Path) and path.exists(), (
+            "path shouldn't be null, should be a Path and should exist"
+        )
+
+        if self.is_read_only:
+            logger.warning("Cannot upload artifact as this worker is in read-only mode")
+            return
+
+        # Get path relative to task's data directory
+        relpath = str(path.relative_to(self.work_dir))
+
+        # Get file size
+        size = path.stat().st_size
+
+        # Detect content type
+        try:
+            content_type = magic.from_file(path, mime=True)
+        except Exception as e:
+            logger.warning(f"Failed to get a mime type for {path}: {e}")
+            content_type = "application/octet-stream"
+
+        # Create artifact on API to get an S3 url
+        artifact = self.api_client.request(
+            "CreateArtifact",
+            id=self.task_id,
+            body={"path": relpath, "content_type": content_type, "size": size},
+        )
+
+        # Upload the file content to S3
+        s3_put_url = artifact["s3_put_url"]
+        with path.open("rb") as content:
+            resp = requests.put(
+                s3_put_url,
+                data=content,
+                headers={"Content-Type": content_type},
+                timeout=REQUEST_TIMEOUT,
+                verify=should_verify_cert(s3_put_url),
+            )
+            resp.raise_for_status()
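
For worker authors, a minimal usage sketch of the new upload_artifact() method inside a hypothetical DatasetWorker subclass (the file name and content are illustrative). The file must live under self.work_dir, since the method derives the artifact path with path.relative_to(self.work_dir):

    from pathlib import Path

    from arkindex_worker.worker import DatasetWorker


    class DemoWorker(DatasetWorker):
        def process_set(self, dataset_set):
            # ... run inference on the set, then publish the results
            # as an artifact of the current task.
            output = Path(self.work_dir) / "predictions.json"
            output.write_text('{"predictions": []}')
            self.upload_artifact(output)
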
tests/test_dataset_worker.py
CHANGED

@@ -435,34 +435,6 @@ def test_run_no_sets(mocker, caplog, mock_dataset_worker):
     ]
 
 
-def test_run_initial_dataset_state_error(
-    mocker, responses, caplog, mock_dataset_worker, default_dataset
-):
-    default_dataset.state = DatasetState.Building.value
-    mocker.patch(
-        "arkindex_worker.worker.DatasetWorker.list_sets",
-        return_value=[Set(name="train", dataset=default_dataset)],
-    )
-
-    with pytest.raises(SystemExit):
-        mock_dataset_worker.run()
-
-    assert len(responses.calls) == len(BASE_API_CALLS) * 2
-    assert [
-        (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2
-
-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
-        (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
-        (logging.INFO, "Modern configuration is not available"),
-        (
-            logging.WARNING,
-            "Failed running worker on Set (train) from Dataset (dataset_id): AssertionError('When processing a set, its dataset state should be Complete.')",
-        ),
-        (logging.ERROR, "Ran on 1 set: 0 completed, 1 failed"),
-    ]
-
-
 def test_run_download_dataset_artifact_api_error(
     mocker,
     tmp_path,

@@ -570,16 +542,18 @@ def test_run_no_downloaded_dataset_artifact_error(
     ]
 
 
+@pytest.mark.parametrize("dataset_state", DatasetState)
 def test_run(
     mocker,
     tmp_path,
     responses,
     caplog,
+    dataset_state,
     mock_dataset_worker,
     default_dataset,
     default_artifact,
 ):
-    default_dataset.state = DatasetState.Complete.value
+    default_dataset.state = dataset_state.value
     mocker.patch(
         "arkindex_worker.worker.DatasetWorker.list_sets",
         return_value=[Set(name="train", dataset=default_dataset)],

@@ -590,55 +564,68 @@ def test_run(
     )
     mock_process = mocker.patch("arkindex_worker.worker.DatasetWorker.process_set")
 
-    archive_path = (
-        FIXTURES_DIR
-        / "extract_parent_archives"
-        / "first_parent"
-        / "arkindex_data.tar.zst"
-    )
-    responses.add(
-        responses.GET,
-        f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        status=200,
-        json=[default_artifact],
-    )
-    responses.add(
-        responses.GET,
-        f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        status=200,
-        body=archive_path.read_bytes(),
-        content_type="application/zstd",
-    )
+    if dataset_state == DatasetState.Complete:
+        archive_path = (
+            FIXTURES_DIR
+            / "extract_parent_archives"
+            / "first_parent"
+            / "arkindex_data.tar.zst"
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            status=200,
+            json=[default_artifact],
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            status=200,
+            body=archive_path.read_bytes(),
+            content_type="application/zstd",
+        )
 
     mock_dataset_worker.run()
 
     assert mock_process.call_count == 1
 
-    …
+    # We only download the dataset archive when it is Complete
+    extra_calls = []
+    if dataset_state == DatasetState.Complete:
+        extra_calls = [
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            ),
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            ),
+        ]
+
+    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + len(extra_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2 + [
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        ),
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        ),
-    ]
+    ] == BASE_API_CALLS * 2 + extra_calls
 
-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
+    logs = [
         (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
         (logging.INFO, "Modern configuration is not available"),
         (
-            logging.INFO,
-            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+            logging.WARNING,
+            f"The dataset Dataset (dataset_id) has its state set to `{dataset_state.value}`, its archive will not be downloaded",
         ),
-        (logging.INFO, "Downloading artifact for Dataset (dataset_id)"),
         (logging.INFO, "Processing Set (train) from Dataset (dataset_id) (1/1)"),
         (logging.INFO, "Ran on 1 set: 1 completed, 0 failed"),
     ]
+    if dataset_state == DatasetState.Complete:
+        logs[2] = (
+            logging.INFO,
+            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+        )
+        logs.insert(3, (logging.INFO, "Downloading artifact for Dataset (dataset_id)"))
+
+    assert [(level, message) for _, level, message in caplog.record_tuples] == logs
 
 
 def test_run_read_only(
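
The new @pytest.mark.parametrize("dataset_state", DatasetState) decorator works because parametrize accepts any iterable and iterating an Enum class yields its members, so test_run now executes once per dataset state. A self-contained sketch of the pattern:

    import enum

    import pytest

    class Color(enum.Enum):
        RED = "red"
        BLUE = "blue"

    # Generates one test per member, e.g. test_lowercase[Color.RED].
    @pytest.mark.parametrize("color", Color)
    def test_lowercase(color):
        assert color.value == color.name.lower()
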
tests/test_elements_worker/test_task.py
CHANGED

@@ -1,6 +1,9 @@
+import tempfile
 import uuid
+from pathlib import Path
 
 import pytest
+from requests import HTTPError
 
 from arkindex.exceptions import ErrorResponse
 from arkindex_worker.models import Artifact

@@ -196,3 +199,112 @@ def test_download_artifact(
     ] == BASE_API_CALLS + [
         ("GET", f"http://testserver/api/v1/task/{TASK_ID}/artifact/dataset_id.tar.zst"),
     ]
+
+
+@pytest.mark.parametrize(
+    ("payload", "error"),
+    [
+        # Path
+        (
+            {"path": None},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": "not path type"},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": Path("i_do_no_exist.oops")},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+    ],
+)
+def test_upload_artifact_wrong_param_path(mock_dataset_worker, payload, error):
+    with pytest.raises(AssertionError, match=error):
+        mock_dataset_worker.upload_artifact(**payload)
+
+
+@pytest.fixture
+def tmp_file(mock_dataset_worker):
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".txt", dir=mock_dataset_worker.work_dir
+    ) as file:
+        file.write("Some content...")
+        file.seek(0)
+
+        yield Path(file.name)
+
+
+def test_upload_artifact_api_error(responses, mock_dataset_worker, tmp_file):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        status=418,
+    )
+
+    with pytest.raises(ErrorResponse):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/task/my_task/artifacts/")]
+
+
+def test_upload_artifact_s3_upload_error(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/oops.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/oops.txt", status=500)
+
+    with pytest.raises(HTTPError):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/oops.txt"),
+    ]
+
+
+def test_upload_artifact(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/test.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/test.txt")
+
+    mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/test.txt"),
+    ]
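
The S3 failure test above relies on two behaviors: the responses library intercepts the outgoing PUT and returns the stubbed 500, and Response.raise_for_status() (called at the end of upload_artifact) raises HTTPError for any 4xx/5xx status. A standalone sketch:

    import requests
    import responses

    @responses.activate
    def demo():
        responses.add(responses.PUT, "http://example.com/oops.txt", status=500)
        resp = requests.put("http://example.com/oops.txt", data=b"payload", timeout=5)
        try:
            resp.raise_for_status()
        except requests.HTTPError as exc:
            print(f"upload failed: {exc}")  # 500 Server Error for url: ...

    demo()
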
{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/licenses/LICENSE
RENAMED

File without changes

{arkindex_base_worker-0.5.2a1.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/top_level.txt
RENAMED

File without changes