PyPI - arkindex-base-worker - Versions diffs - 0.5.0b3__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

arkindex-base-worker 0.5.0b3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/METADATA +7 -8
{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/RECORD +27 -27
arkindex_worker/cache.py +8 -22
arkindex_worker/image.py +5 -1
arkindex_worker/models.py +5 -0
arkindex_worker/utils.py +27 -0
arkindex_worker/worker/__init__.py +62 -6
arkindex_worker/worker/base.py +53 -1
arkindex_worker/worker/element.py +20 -0
arkindex_worker/worker/entity.py +17 -126
arkindex_worker/worker/metadata.py +3 -14
tests/conftest.py +113 -12
tests/test_base_worker.py +99 -125
tests/test_cache.py +2 -3
tests/test_dataset_worker.py +5 -0
tests/test_element.py +52 -12
tests/test_elements_worker/__init__.py +4 -0
tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
tests/test_elements_worker/test_metadata.py +0 -47
tests/test_elements_worker/test_worker.py +106 -0
tests/test_image.py +19 -3
tests/test_merge.py +0 -7
tests/test_modern_config.py +81 -0
tests/test_utils.py +42 -0
tests/test_elements_worker/test_entity_list_and_check.py +0 -293
{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/WHEEL +0 -0
{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/licenses/LICENSE +0 -0
{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/top_level.txt +0 -0

{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.0b3
+Version: 0.5.1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: humanize==4.12.3
+Requires-Dist: humanize==4.14.0
 Requires-Dist: peewee~=3.17
-Requires-Dist: Pillow==11.2.1
-Requires-Dist: python-gnupg==0.5.4
+Requires-Dist: Pillow==11.3.0
+Requires-Dist: python-gnupg==0.5.5
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.9
-Requires-Dist: zstandard==0.23.0
+Requires-Dist: teklia-toolbox==0.1.11
+Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
-Requires-Dist: pytest==8.3.5; extra == "tests"
-Requires-Dist: pytest-mock==3.14.0; extra == "tests"
+Requires-Dist: pytest-mock==3.15.1; extra == "tests"
 Requires-Dist: pytest-responses==0.5.1; extra == "tests"
 Dynamic: license-file

{arkindex_base_worker-0.5.0b3.dist-info → arkindex_base_worker-0.5.1.dist-info}/RECORD RENAMED Viewed

@@ -1,18 +1,18 @@
-arkindex_base_worker-0.5.0b3.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.5.1.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
 arkindex_worker/__init__.py,sha256=Sdt5KXn8EgURb2MurYVrUWaHbH3iFA1XLRo0Lc5AJ44,250
-arkindex_worker/cache.py,sha256=NpCsYFnqBmyBrACqeV7c3P6j6YrTtyi-HgtewwxUpxc,11221
-arkindex_worker/image.py,sha256=GvIpW7LNSalVw3Obt9nySDWnW7-NbC0__SWREEQqVCk,20696
-arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
-arkindex_worker/utils.py,sha256=MbbJT8oh8DMHHR-vidFeXdUH0TSXGWm7ZDGWzrRXoEY,9933
-arkindex_worker/worker/__init__.py,sha256=jipbseOaEggAacd0sU3Xe0q2de8FHIMbmpZKoHXrlAw,15867
-arkindex_worker/worker/base.py,sha256=03WLu7R8vca54LI00g-S0EqIbFiNaIsgZjN6zmfHisw,20126
+arkindex_worker/cache.py,sha256=XpEXMSnbhYCvrJquwA9XXqZo-ajMLpaCxKG5wH3Gp6Y,10959
+arkindex_worker/image.py,sha256=sGE8to5iykXv25bpkftOEWzlh5NzBZSKy4lSRoHYHPU,20929
+arkindex_worker/models.py,sha256=7GnKqpWPOSxyR_eKlDNVBe_r3TcE4ofK-1GzaonJEdM,10132
+arkindex_worker/utils.py,sha256=yq_LmRlqfWOzB09Aiz2XYx4xPZnoEXR3As48h2HxOVc,10974
+arkindex_worker/worker/__init__.py,sha256=SzD0s1_m6gMV02EUF-NeciqZdVPA4dpXI84tSj-g494,17869
+arkindex_worker/worker/base.py,sha256=-R_aLMJHbR6X1uM-U0zExsF_KLy5Wl3WJ_YMGO9We0I,22153
 arkindex_worker/worker/classification.py,sha256=qvykymkgd4nGywHCxL8obo4egstoGsmWNS4Ztc1qNWQ,11024
 arkindex_worker/worker/corpus.py,sha256=MeIMod7jkWyX0frtD0a37rhumnMV3p9ZOC1xwAoXrAA,2291
 arkindex_worker/worker/dataset.py,sha256=tVaPx43vaH-KTtx4w5V06e26ha8XPfiJTRzBXlu928Y,5273
-arkindex_worker/worker/element.py,sha256=982Dnk73v8wykCh3gweVi3q-bnvaY1LwkDFoNIoJ3KY,46579
-arkindex_worker/worker/entity.py,sha256=v1OhpvWNKASMIo6xF2cQ2nsEO2pHhqJjFjJ4J3jNfgQ,16257
+arkindex_worker/worker/element.py,sha256=sLfnf09AfJ5tSCKQ7cAkl7WsGhjsfq14swsT30MDnYk,47385
+arkindex_worker/worker/entity.py,sha256=Aj6EOfzHEm7qQV-Egm0YKLZgCrLS_3ggOKTY81M2JbI,12323
 arkindex_worker/worker/image.py,sha256=L6Ikuf0Z0RxJk7JarY5PggJGrYSHLaPK0vn0dy0CIaQ,623
-arkindex_worker/worker/metadata.py,sha256=mb9hVU-nRw3drCN-0AvtZ0nPY-4tD-ye9_mVy6icbk4,7309
+arkindex_worker/worker/metadata.py,sha256=keZdOdUthSH2hAw9iet5pN7rzWihTUYjZHRGTEjaltw,6843
 arkindex_worker/worker/process.py,sha256=9TEHpMcBax1wc6PrWMMrdXe2uNfqyVj7n_dAYZRBGnY,1854
 arkindex_worker/worker/task.py,sha256=nYfMSFm_d-4t8y4PO4HjFBnLsZf7IsDjkS7-A2Pgnac,1525
 arkindex_worker/worker/training.py,sha256=tyQOHcwv--_wdYz6CgLEe1YM7kwwwKN30LvGTsnWd78,10923
@@ -21,15 +21,16 @@ examples/standalone/python/worker.py,sha256=Zr4s4pHvgexEjlkixLFYZp1UuwMLeoTxjyNG
 examples/tooled/python/worker.py,sha256=kIYlHLsO5UpwX4XtERRq4tf2qTsvqKK30C-w8t0yyhA,1821
 hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
 tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
-tests/conftest.py,sha256=kR9zYRHri2BPvzQbbhnvylHba2xvw0w8v1qaLwdGkK0,20993
-tests/test_base_worker.py,sha256=dA00oxauTSCwnFX3ZFBl-RI71HN6GmK48FBBW_oYN-k,30627
-tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
-tests/test_dataset_worker.py,sha256=z8ydliUlwW2j-irgLAotJMacgJXkVvF5TgsWLyCn1Jo,22087
-tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
-tests/test_image.py,sha256=NEIp5evr6QoTWgJ-_fze19IEFm_hG6YEcuW1kxnxS_I,28013
-tests/test_merge.py,sha256=TuOeUS0UCz66DPOQFFhc4NQBxIjZL9f5czi4XnvGrr4,8270
-tests/test_utils.py,sha256=nYL1s2ViZoLoMiNpLGDaWwxf8dJ1D8aT522AO-PVaEQ,3607
-tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
+tests/conftest.py,sha256=Tp7YFK17NATwF2yAcBwi0QFNyKSXtLS0VhZ-zZngsQI,24343
+tests/test_base_worker.py,sha256=lwS4X3atS2ktEKd1XdogmN3mbzq-tO206-k_0EDITlw,29302
+tests/test_cache.py,sha256=_wztzh94EwVrb8UvpFqgl2aa2_FLaCcJKaqunCYR5Dw,10435
+tests/test_dataset_worker.py,sha256=iDJM2C4PfQNH0r4_QqSWoPt8BcM0geUUdODtWY0Z9PA,22412
+tests/test_element.py,sha256=hlj5VSF4plwC7uz9R4LGOOXZJQcHZiYCIDZT5V6EIB8,14334
+tests/test_image.py,sha256=yAM5mMfpQcIurT1KLHmu0AhSX2Qm3YvCu7afyZ3XUdU,28314
+tests/test_merge.py,sha256=REpZ13jkq_qm_4L5URQgFy5lxvPZtXxQEiWfYLMdmF0,7956
+tests/test_modern_config.py,sha256=Bm-a4LYQXgLZWQX7AmVyfJW0LNoLy1wj2d2GjzDkcBk,2683
+tests/test_utils.py,sha256=tgzNqyJMpddpeFWEjgsew_yDzmqnCA9HDaA5IpevAcM,5353
+tests/test_elements_worker/__init__.py,sha256=2t3NciCIOun_N-Wv63FWGsTm5W9N3mbwAWVuFORlMg8,308
 tests/test_elements_worker/test_classification.py,sha256=nya7veSPR_O9G41Enodp2-o6AifMBcaSTWJP2vXSSJ4,30133
 tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
 tests/test_elements_worker/test_corpus.py,sha256=kscJyM8k1njYJJFGuvliVzn89lWh41mEyDCCawnp3W8,5483
@@ -39,23 +40,22 @@ tests/test_elements_worker/test_element_create_multiple.py,sha256=arYFGmxc0517ZU
 tests/test_elements_worker/test_element_create_single.py,sha256=Fa9zm12J2rQ3VrUe3yIlHAc7Vty_eQYb_YGnNPQB3IE,16697
 tests/test_elements_worker/test_element_list_children.py,sha256=2zH4h663w3EduqpzQr-7bf9zIDzO1x2WxdUYYHsIHkI,31358
 tests/test_elements_worker/test_element_list_parents.py,sha256=TXeGW-a3W-7GmB2QrhJH9mMnvxuybeAwQ4tL3iIxwXo,16734
-tests/test_elements_worker/test_entity_create.py,sha256=9Tjr9KA2yo44VFV283q_cs6XbbVguUMDNfCj-DILSJg,29353
-tests/test_elements_worker/test_entity_list_and_check.py,sha256=zAfwa49D8lHZdB7dqQu14R0P0SQu40qNalW7RjOPYic,9456
+tests/test_elements_worker/test_entity.py,sha256=SNAZEsVVLnqlliOmjkgv_cZhw0bAuJUY70_z57PpEE0,29624
 tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
-tests/test_elements_worker/test_metadata.py,sha256=Xfggy-vxw5DZ3hFKx3sB7OYb2d1tu1RiNK8fvKJIaBs,22294
+tests/test_elements_worker/test_metadata.py,sha256=qtTDtlp3VnBkfck7PAguK2dEgTLlr1i1EVnmNTeNf3A,20515
 tests/test_elements_worker/test_process.py,sha256=y4RoVhPfyHzR795fw7-_FXElBcKo3fy4Ew_HI-kxJic,3088
 tests/test_elements_worker/test_task.py,sha256=wTUWqN9UhfKmJn3IcFY75EW4I1ulRhisflmY1kmP47s,5574
 tests/test_elements_worker/test_training.py,sha256=qgK7BLucddRzc8ePbQtY75x17QvGDEq5XCwgyyvmAJE,8717
 tests/test_elements_worker/test_transcription_create.py,sha256=yznO9B_BVsOR0Z_VY5ZL8gJp0ZPCz_4sPUs5dXtixAg,29281
 tests/test_elements_worker/test_transcription_create_with_elements.py,sha256=tmcyglgssEqMnt1Mdy_u6X1m2wgLWTo_HdWst3GrK2k,33056
 tests/test_elements_worker/test_transcription_list.py,sha256=ikz7HYPCoQWTdTRCd382SB-y-T2BbigPLlIcx5Eow-I,15324
-tests/test_elements_worker/test_worker.py,sha256=HDw_UQdiMUzlBd4-jRvC-B3pNrZmmpps4sfZ9a87JVY,25378
+tests/test_elements_worker/test_worker.py,sha256=ypAQS_DJj9qGlRJCs9g5qUXe7IgqaKXWDcxqwlhAqSg,28598
 worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
 worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
 worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
 worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
-arkindex_base_worker-0.5.0b3.dist-info/METADATA,sha256=N8XHi4oVPpVnhutNdUWIV_lySfkUkTymABz8wpNhR4k,3136
-arkindex_base_worker-0.5.0b3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-arkindex_base_worker-0.5.0b3.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
-arkindex_base_worker-0.5.0b3.dist-info/RECORD,,
+arkindex_base_worker-0.5.1.dist-info/METADATA,sha256=U24JorNb4RsZFkjF25jbfg0VNacYMpwrceaQCCYSQQ0,3088
+arkindex_base_worker-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+arkindex_base_worker-0.5.1.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
+arkindex_base_worker-0.5.1.dist-info/RECORD,,

arkindex_worker/cache.py CHANGED Viewed

@@ -73,6 +73,7 @@ class CachedImage(Model):
     width = IntegerField()
     height = IntegerField()
     url = TextField()
+    version = IntegerField(default=2)
     class Meta:
         database = db
@@ -157,6 +158,10 @@ class CachedElement(Model):
             else:
                 resize = f"{max_width or ''},{max_height or ''}"
+        # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
+        if self.image.version == 3 and resize == "full":
+            resize = "max"
         url = self.image.url
         if not url.endswith("/"):
             url += "/"
@@ -206,23 +211,6 @@ class CachedClassification(Model):
         table_name = "classifications"
-class CachedEntity(Model):
-    """
-    Cache entity table
-    """
-    id = UUIDField(primary_key=True)
-    type = CharField(max_length=50)
-    name = TextField()
-    validated = BooleanField(default=False)
-    metas = JSONField(null=True)
-    worker_run_id = UUIDField(null=True)
-    class Meta:
-        database = db
-        table_name = "entities"
 class CachedTranscriptionEntity(Model):
     """
     Cache transcription entity table
@@ -231,14 +219,14 @@ class CachedTranscriptionEntity(Model):
     transcription = ForeignKeyField(
         CachedTranscription, backref="transcription_entities"
     )
-    entity = ForeignKeyField(CachedEntity, backref="transcription_entities")
+    type = CharField(max_length=50)
     offset = IntegerField(constraints=[Check("offset >= 0")])
     length = IntegerField(constraints=[Check("length > 0")])
     worker_run_id = UUIDField(null=True)
     confidence = FloatField(null=True)
     class Meta:
-        primary_key = CompositeKey("transcription", "entity")
+        primary_key = CompositeKey("transcription", "type")
         database = db
         table_name = "transcription_entities"
@@ -272,12 +260,11 @@ MODELS = [
     CachedElement,
     CachedTranscription,
     CachedClassification,
-    CachedEntity,
     CachedTranscriptionEntity,
     CachedDataset,
     CachedDatasetElement,
 ]
-SQL_VERSION = 3
+SQL_VERSION = 5
 def init_cache_db(path: Path):
@@ -365,7 +352,6 @@ def merge_parents_cache(paths: list, current_database: Path):
             f"REPLACE INTO elements SELECT * FROM source_{idx}.elements;",
             f"REPLACE INTO transcriptions SELECT * FROM source_{idx}.transcriptions;",
             f"REPLACE INTO classifications SELECT * FROM source_{idx}.classifications;",
-            f"REPLACE INTO entities SELECT * FROM source_{idx}.entities;",
             f"REPLACE INTO transcription_entities SELECT * FROM source_{idx}.transcription_entities;",
             f"REPLACE INTO datasets SELECT * FROM source_{idx}.datasets;",
             f"REPLACE INTO dataset_elements SELECT * FROM source_{idx}.dataset_elements;",

arkindex_worker/image.py CHANGED Viewed

@@ -366,6 +366,10 @@ def download_tiles(url: str) -> Image:
     logger.debug("Downloading image information")
     info = _retried_request(url + "info.json").json()
+    # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
+    # With IIIF 3, the image's ID will be at `id`, while IIIF 2 will use `@id``
+    resize = "max" if "id" in info else "full"
     image_width, image_height = info.get("width"), info.get("height")
     assert image_width and image_height, "Missing image dimensions in info.json"
     assert info.get("tiles"), (
@@ -391,7 +395,7 @@ def download_tiles(url: str) -> Image:
             logger.debug(f"Downloading tile {tile_x},{tile_y}")
             resp = _retried_request(
-                f"{url}{region_x},{region_y},{region_width},{region_height}/full/0/default.jpg"
+                f"{url}{region_x},{region_y},{region_width},{region_height}/{resize}/0/default.jpg"
             )
             tile_img = Image.open(BytesIO(resp.content))

arkindex_worker/models.py CHANGED Viewed

@@ -87,6 +87,11 @@ class Element(MagicDict):
         url = self.zone.image.get("s3_url")
         if url:
             return url
+        # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
+        if self.zone.image.server.get("version", 2) == 3 and size == "full":
+            size = "max"
         url = self.zone.image.url
         if not url.endswith("/"):
             url += "/"

arkindex_worker/utils.py CHANGED Viewed

@@ -4,6 +4,7 @@ import logging
 import os
 import tarfile
 import tempfile
+import zipfile
 from collections.abc import Callable, Generator
 from itertools import islice
 from pathlib import Path
@@ -225,6 +226,32 @@ def create_tar_zst_archive(
     return zst_fd, zst_archive, zst_hash, tar_hash
+def create_zip_archive(source: Path, destination: Path | None = None) -> Path:
+    """Helper to create a ZIP archive from a source folder.
+    :param source: Path to the folder whose content should be archived.
+    :param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
+    :return: The file descriptor of the created tempfile (if one was created), path to the archive.
+    """
+    # Parse destination and create a tmpfile if none was specified
+    file_d, destination = (
+        tempfile.mkstemp(prefix="teklia-", suffix=".zip")
+        if destination is None
+        else (None, destination)
+    )
+    destination = Path(destination)
+    logger.debug(f"Compressing file to {destination}")
+    with zipfile.ZipFile(
+        destination, mode="w", compression=zipfile.ZIP_BZIP2
+    ) as archive:
+        for p in source.rglob("*"):
+            relpath = p.relative_to(source)
+            archive.write(p, arcname=relpath)
+        return archive, destination
 DEFAULT_BATCH_SIZE = 50
 """Batch size used for bulk publication to Arkindex"""

arkindex_worker/worker/__init__.py CHANGED Viewed

@@ -32,6 +32,41 @@ from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
+class WorkerActivityIterator:
+    def __init__(self, api_client):
+        # Use same api client as main class
+        self.api_client = api_client
+        logger.info(
+            "Using StartWorkerActivity instead of reading init_elements JSON file"
+        )
+    def __bool__(self):
+        # Needed to bypass `not elements` check
+        return True
+    def __iter__(self):
+        return self
+    def __next__(self):
+        """
+        Provide a new element ID from a worker activity upon each iteration
+        """
+        try:
+            data = self.api_client.request("StartWorkerActivity")
+        except ErrorResponse as e:
+            # Arkindex will provide a 404 or 400 when there are no worker activities left or the task has completed
+            if e.status_code in (400, 404):
+                raise StopIteration from e
+            logger.warning(
+                f"Failed to start a new worker activity of element due to an API error: {e.content}"
+            )
+            raise e
+        return data["id"]
 class ElementsWorker(
     ElementMixin,
     DatasetMixin,
@@ -60,7 +95,9 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
-    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
+    def get_elements(
+        self,
+    ) -> Iterable[CachedElement] | list[str] | list[Element] | WorkerActivityIterator:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -109,6 +146,9 @@ class ElementsWorker(
         elif self.process_mode == ProcessMode.Export:
             # For export mode processes, use list_process_elements and return element IDs
             return {item["id"] for item in self.list_process_elements()}
+        elif self.consume_worker_activities:
+            # Consume worker activitives one by one
+            return WorkerActivityIterator(self.api_client)
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert not invalid_element_ids, (
@@ -135,6 +175,15 @@ class ElementsWorker(
         )
         return self.process_information.get("activity_state") == "ready"
+    @property
+    def unknown_nb_elements(self) -> bool:
+        """
+        Whether or not the worker knows the total number of elements to process
+         - when running with init_elements, we have a known list
+         - when running with StartWorkerActivity, we have a queue of unknown size
+        """
+        return self.consume_worker_activities
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
@@ -157,7 +206,8 @@ class ElementsWorker(
             )
         # Process every element
-        count = len(elements)
+        # We cannot know the number of elements when consuming a list of worker activities
+        count = None if self.unknown_nb_elements else len(elements)
         failed = 0
         for i, item in enumerate(elements, start=1):
             element = None
@@ -171,10 +221,16 @@ class ElementsWorker(
                         **self.api_client.request("RetrieveElement", id=item)
                     )
-                logger.info(f"Processing {element} ({i}/{count})")
+                if self.unknown_nb_elements:
+                    logger.info(f"Processing {element} (n°{i})")
+                else:
+                    logger.info(f"Processing {element} ({i}/{count})")
                 # Process the element and report its progress if activities are enabled
-                if self.update_activity(element.id, ActivityState.Started):
+                # We do not update the worker activity to "Started" state when consuming them
+                if self.consume_worker_activities or self.update_activity(
+                    element.id, ActivityState.Started
+                ):
                     self.process_element(element)
                     self.update_activity(element.id, ActivityState.Processed)
                 else:
@@ -207,10 +263,10 @@ class ElementsWorker(
                     with contextlib.suppress(Exception):
                         self.update_activity(element.id, ActivityState.Error)
-        message = f"Ran on {count} {pluralize('element', count)}: {count - failed} completed, {failed} failed"
+        message = f"Ran on {i} {pluralize('element', i)}: {i - failed} completed, {failed} failed"
         if failed:
             logger.error(message)
-            if failed >= count:  # Everything failed!
+            if failed >= i:  # Everything failed!
                 sys.exit(1)
         else:
             logger.info(message)

arkindex_worker/worker/base.py CHANGED Viewed

@@ -9,12 +9,13 @@ import os
 import shutil
 from pathlib import Path
 from tempfile import mkdtemp
+from typing import Any
 import gnupg
 import yaml
 from arkindex import options_from_env
-from arkindex.exceptions import ErrorResponse
+from arkindex.exceptions import ClientError, ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -260,7 +261,28 @@ class BaseWorker:
         logger.info(f"Loaded {worker_run['summary']} from API")
+        def _process_config_item(item: dict) -> tuple[str, Any]:
+            if not item["secret"]:
+                return (item["key"], item["value"])
+            # The secret may not be picked by the user
+            if item["value"] is None:
+                logger.info(f"Optional secret `{item['key']}` is not set")
+                return (item["key"], None)
+            # Load secret, only available in Arkindex EE
+            try:
+                secret = self.load_secret(Path(item["value"]))
+            except ClientError as e:
+                logger.error(
+                    f"Failed to retrieve the secret {item['value']}, probably an Arkindex Community Edition: {e}"
+                )
+                return (item["key"], None)
+            return (item["key"], secret)
         # Load model version configuration when available
+        # Workers will use model version ID and details to download the model
         model_version = worker_run.get("model_version")
         if model_version:
             logger.info("Loaded model version configuration from WorkerRun")
@@ -272,6 +294,36 @@ class BaseWorker:
             # Set model details as worker attribute
             self.model_details = model_version["model"]
+        # Load worker run information
+        try:
+            config = self.api_client.request(
+                "RetrieveWorkerRunConfiguration", id=self.worker_run_id
+            )
+            # Provide the same configuration through all previous attributes
+            self.config = self.user_configuration = dict(
+                map(_process_config_item, config["configuration"])
+            )
+            # Provide secret values through the previous attribute
+            self.secrets = {
+                item["key"]: self.config[item["key"]]
+                for item in config["configuration"]
+                if item["secret"]
+            }
+            logger.info("Using modern configuration")
+            # Reset the model configuration to make sure workers rely on the single new source
+            self.model_configuration = {}
+            return  # Stop here once we have modern configuration
+        except ErrorResponse as e:
+            if e.status_code != 400:
+                raise
+            logger.info("Modern configuration is not available")
+        # Use old-style configuration with local merge
         # Retrieve initial configuration from API
         self.config = worker_version["configuration"].get("configuration", {})
         if "user_configuration" in worker_version["configuration"]:

arkindex_worker/worker/element.py CHANGED Viewed

@@ -38,6 +38,15 @@ class ElementMixin:
             type=open,
             default=os.environ.get("TASK_ELEMENTS"),
         )
+        self.parser.add_argument(
+            "--no-elements-list",
+            help=(
+                "Consume worker activities from Arkindex API instead of using a static elements list"
+            ),
+            dest="consume_worker_activities",
+            action="store_true",
+            default=os.environ.get("SKIP_TASK_ELEMENTS") is not None,
+        )
         self.parser.add_argument(
             "--element",
             type=str,
@@ -46,6 +55,17 @@ class ElementMixin:
         )
         super().add_arguments()
+    @property
+    def consume_worker_activities(self) -> bool:
+        """
+        Helper to detect if the worker rely on an elements.json or consume directly worker activities
+        Uses the process information when available, fallback to CLI args
+        """
+        if self.process_information is not None:
+            return self.process_information.get("skip_elements_json") is True
+        return self.args.consume_worker_activities
     def list_corpus_types(self):
         """
         Loads available element types in corpus.

arkindex-base-worker 0.5.0b3__py3-none-any.whl → 0.5.1__py3-none-any.whl

arkindex-base-worker 0.5.0b3__py3-none-any.whl → 0.5.1__py3-none-any.whl