PyPI - arkindex-base-worker - Versions diffs - 0.5.0rc1__tar.gz → 0.5.1__tar.gz - Mend

arkindex-base-worker 0.5.0rc1tar.gz → 0.5.1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.0rc1
+Version: 0.5.1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: humanize==4.12.3
+Requires-Dist: humanize==4.14.0
 Requires-Dist: peewee~=3.17
-Requires-Dist: Pillow==11.2.1
-Requires-Dist: python-gnupg==0.5.4
+Requires-Dist: Pillow==11.3.0
+Requires-Dist: python-gnupg==0.5.5
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.9
-Requires-Dist: zstandard==0.23.0
+Requires-Dist: teklia-toolbox==0.1.11
+Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
-Requires-Dist: pytest==8.3.5; extra == "tests"
-Requires-Dist: pytest-mock==3.14.0; extra == "tests"
+Requires-Dist: pytest-mock==3.15.1; extra == "tests"
 Requires-Dist: pytest-responses==0.5.1; extra == "tests"
 Dynamic: license-file

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.0rc1
+Version: 0.5.1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: humanize==4.12.3
+Requires-Dist: humanize==4.14.0
 Requires-Dist: peewee~=3.17
-Requires-Dist: Pillow==11.2.1
-Requires-Dist: python-gnupg==0.5.4
+Requires-Dist: Pillow==11.3.0
+Requires-Dist: python-gnupg==0.5.5
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.9
-Requires-Dist: zstandard==0.23.0
+Requires-Dist: teklia-toolbox==0.1.11
+Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
-Requires-Dist: pytest==8.3.5; extra == "tests"
-Requires-Dist: pytest-mock==3.14.0; extra == "tests"
+Requires-Dist: pytest-mock==3.15.1; extra == "tests"
 Requires-Dist: pytest-responses==0.5.1; extra == "tests"
 Dynamic: license-file

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/SOURCES.txt RENAMED Viewed

@@ -35,6 +35,7 @@ tests/test_dataset_worker.py
 tests/test_element.py
 tests/test_image.py
 tests/test_merge.py
+tests/test_modern_config.py
 tests/test_utils.py
 tests/test_elements_worker/__init__.py
 tests/test_elements_worker/test_classification.py

arkindex_base_worker-0.5.1/arkindex_base_worker.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,11 @@
+humanize==4.14.0
+peewee~=3.17
+Pillow==11.3.0
+python-gnupg==0.5.5
+shapely==2.0.6
+teklia-toolbox==0.1.11
+zstandard==0.25.0
+[tests]
+pytest-mock==3.15.1
+pytest-responses==0.5.1

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/cache.py RENAMED Viewed

@@ -73,6 +73,7 @@ class CachedImage(Model):
     width = IntegerField()
     height = IntegerField()
     url = TextField()
+    version = IntegerField(default=2)
     class Meta:
         database = db
@@ -157,6 +158,10 @@ class CachedElement(Model):
             else:
                 resize = f"{max_width or ''},{max_height or ''}"
+        # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
+        if self.image.version == 3 and resize == "full":
+            resize = "max"
         url = self.image.url
         if not url.endswith("/"):
             url += "/"
@@ -259,7 +264,7 @@ MODELS = [
     CachedDataset,
     CachedDatasetElement,
 ]
-SQL_VERSION = 4
+SQL_VERSION = 5
 def init_cache_db(path: Path):

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/image.py RENAMED Viewed

@@ -366,6 +366,10 @@ def download_tiles(url: str) -> Image:
     logger.debug("Downloading image information")
     info = _retried_request(url + "info.json").json()
+    # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
+    # With IIIF 3, the image's ID will be at `id`, while IIIF 2 will use `@id`
+    resize = "max" if "id" in info else "full"
     image_width, image_height = info.get("width"), info.get("height")
     assert image_width and image_height, "Missing image dimensions in info.json"
     assert info.get("tiles"), (
@@ -391,7 +395,7 @@ def download_tiles(url: str) -> Image:
             logger.debug(f"Downloading tile {tile_x},{tile_y}")
             resp = _retried_request(
-                f"{url}{region_x},{region_y},{region_width},{region_height}/full/0/default.jpg"
+                f"{url}{region_x},{region_y},{region_width},{region_height}/{resize}/0/default.jpg"
             )
             tile_img = Image.open(BytesIO(resp.content))

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/models.py RENAMED Viewed

@@ -87,6 +87,11 @@ class Element(MagicDict):
         url = self.zone.image.get("s3_url")
         if url:
             return url
+        # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
+        if self.zone.image.server.get("version", 2) == 3 and size == "full":
+            size = "max"
         url = self.zone.image.url
         if not url.endswith("/"):
             url += "/"

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/utils.py RENAMED Viewed

@@ -4,6 +4,7 @@ import logging
 import os
 import tarfile
 import tempfile
+import zipfile
 from collections.abc import Callable, Generator
 from itertools import islice
 from pathlib import Path
@@ -225,6 +226,32 @@ def create_tar_zst_archive(
     return zst_fd, zst_archive, zst_hash, tar_hash
+def create_zip_archive(
+    source: Path, destination: Path | None = None
+) -> tuple[int | None, Path]:
+    """Helper to create a ZIP archive from a source folder.
+    :param source: Path to the folder whose content should be archived.
+    :param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
+    :return: The file descriptor of the created tempfile (``None`` when a destination was given), path to the archive.
+    """
+    # Parse destination and create a tmpfile if none was specified
+    file_d, destination = (
+        tempfile.mkstemp(prefix="teklia-", suffix=".zip")
+        if destination is None
+        else (None, destination)
+    )
+    destination = Path(destination)
+    logger.debug(f"Compressing file to {destination}")
+    with zipfile.ZipFile(
+        destination, mode="w", compression=zipfile.ZIP_BZIP2
+    ) as archive:
+        # Store every entry relative to the source folder
+        for p in source.rglob("*"):
+            archive.write(p, arcname=p.relative_to(source))
+    # Hand back the tempfile descriptor (not the already-closed ZipFile) so the caller can close it
+    return file_d, destination
 DEFAULT_BATCH_SIZE = 50
 """Batch size used for bulk publication to Arkindex"""

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/__init__.py RENAMED Viewed

@@ -32,6 +32,41 @@ from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
+class WorkerActivityIterator:
+    """
+    Iterator yielding one ID per iteration by calling StartWorkerActivity,
+    instead of walking a static list of elements.
+    """
+    def __init__(self, api_client):
+        logger.info(
+            "Using StartWorkerActivity instead of reading init_elements JSON file"
+        )
+        # Share the API client of the object that created this iterator
+        self.api_client = api_client
+    def __iter__(self):
+        return self
+    def __bool__(self):
+        # Always truthy, so that a `not elements` emptiness check does not reject it
+        return True
+    def __next__(self):
+        """
+        Start a new worker activity and return the `id` field of the API response
+        """
+        try:
+            activity = self.api_client.request("StartWorkerActivity")
+        except ErrorResponse as e:
+            # Arkindex will provide a 404 or 400 when there are no worker activities left or the task has completed
+            if e.status_code not in (400, 404):
+                logger.warning(
+                    f"Failed to start a new worker activity of element due to an API error: {e.content}"
+                )
+                raise e
+            raise StopIteration from e
+        else:
+            return activity["id"]
 class ElementsWorker(
     ElementMixin,
     DatasetMixin,
@@ -60,7 +95,9 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
-    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
+    def get_elements(
+        self,
+    ) -> Iterable[CachedElement] | list[str] | list[Element] | WorkerActivityIterator:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -109,6 +146,9 @@ class ElementsWorker(
         elif self.process_mode == ProcessMode.Export:
             # For export mode processes, use list_process_elements and return element IDs
             return {item["id"] for item in self.list_process_elements()}
+        elif self.consume_worker_activities:
+            # Consume worker activities one by one
+            return WorkerActivityIterator(self.api_client)
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert not invalid_element_ids, (
@@ -135,6 +175,15 @@ class ElementsWorker(
         )
         return self.process_information.get("activity_state") == "ready"
+    @property
+    def unknown_nb_elements(self) -> bool:
+        """
+        Whether the total number of elements to process is unknown to the worker
+         - when running with init_elements, we have a known list (False)
+         - when running with StartWorkerActivity, we have a queue of unknown size (True)
+        """
+        # Mirrors `consume_worker_activities`: consuming activities is the only unknown-size mode here
+        return self.consume_worker_activities
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
@@ -157,7 +206,8 @@ class ElementsWorker(
             )
         # Process every element
-        count = len(elements)
+        # We cannot know the number of elements when consuming a list of worker activities
+        count = None if self.unknown_nb_elements else len(elements)
         failed = 0
         for i, item in enumerate(elements, start=1):
             element = None
@@ -171,10 +221,16 @@ class ElementsWorker(
                         **self.api_client.request("RetrieveElement", id=item)
                     )
-                logger.info(f"Processing {element} ({i}/{count})")
+                if self.unknown_nb_elements:
+                    logger.info(f"Processing {element} (n°{i})")
+                else:
+                    logger.info(f"Processing {element} ({i}/{count})")
                 # Process the element and report its progress if activities are enabled
-                if self.update_activity(element.id, ActivityState.Started):
+                # We do not update the worker activity to "Started" state when consuming them
+                if self.consume_worker_activities or self.update_activity(
+                    element.id, ActivityState.Started
+                ):
                     self.process_element(element)
                     self.update_activity(element.id, ActivityState.Processed)
                 else:
@@ -207,10 +263,10 @@ class ElementsWorker(
                     with contextlib.suppress(Exception):
                         self.update_activity(element.id, ActivityState.Error)
-        message = f"Ran on {count} {pluralize('element', count)}: {count - failed} completed, {failed} failed"
+        message = f"Ran on {i} {pluralize('element', i)}: {i - failed} completed, {failed} failed"
         if failed:
             logger.error(message)
-            if failed >= count:  # Everything failed!
+            if failed >= i:  # Everything failed!
                 sys.exit(1)
         else:
             logger.info(message)

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/base.py RENAMED Viewed

@@ -9,12 +9,13 @@ import os
 import shutil
 from pathlib import Path
 from tempfile import mkdtemp
+from typing import Any
 import gnupg
 import yaml
 from arkindex import options_from_env
-from arkindex.exceptions import ErrorResponse
+from arkindex.exceptions import ClientError, ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -260,7 +261,28 @@ class BaseWorker:
         logger.info(f"Loaded {worker_run['summary']} from API")
+        def _process_config_item(item: dict) -> tuple[str, Any]:
+            """
+            Turn one configuration item into a `(key, value)` pair,
+            resolving secret items through `self.load_secret`.
+            """
+            # Plain values are forwarded untouched
+            if not item["secret"]:
+                return item["key"], item["value"]
+            if item["value"] is None:
+                # The secret may not be picked by the user
+                logger.info(f"Optional secret `{item['key']}` is not set")
+                return item["key"], None
+            # Load secret, only available in Arkindex EE
+            try:
+                secret = self.load_secret(Path(item["value"]))
+            except ClientError as e:
+                logger.error(
+                    f"Failed to retrieve the secret {item['value']}, probably an Arkindex Community Edition: {e}"
+                )
+                # Fall back to an empty value when the secret cannot be retrieved
+                secret = None
+            return item["key"], secret
         # Load model version configuration when available
+        # Workers will use model version ID and details to download the model
         model_version = worker_run.get("model_version")
         if model_version:
             logger.info("Loaded model version configuration from WorkerRun")
@@ -272,6 +294,36 @@ class BaseWorker:
             # Set model details as worker attribute
             self.model_details = model_version["model"]
+        # Load worker run information
+        try:
+            config = self.api_client.request(
+                "RetrieveWorkerRunConfiguration", id=self.worker_run_id
+            )
+            # Provide the same configuration through all previous attributes
+            self.config = self.user_configuration = dict(
+                map(_process_config_item, config["configuration"])
+            )
+            # Provide secret values through the previous attribute
+            self.secrets = {
+                item["key"]: self.config[item["key"]]
+                for item in config["configuration"]
+                if item["secret"]
+            }
+            logger.info("Using modern configuration")
+            # Reset the model configuration to make sure workers rely on the single new source
+            self.model_configuration = {}
+            return  # Stop here once we have modern configuration
+        except ErrorResponse as e:
+            if e.status_code != 400:
+                raise
+            logger.info("Modern configuration is not available")
+        # Use old-style configuration with local merge
         # Retrieve initial configuration from API
         self.config = worker_version["configuration"].get("configuration", {})
         if "user_configuration" in worker_version["configuration"]:

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/element.py RENAMED Viewed

@@ -38,6 +38,15 @@ class ElementMixin:
             type=open,
             default=os.environ.get("TASK_ELEMENTS"),
         )
+        self.parser.add_argument(
+            "--no-elements-list",
+            help=(
+                "Consume worker activities from Arkindex API instead of using a static elements list"
+            ),
+            dest="consume_worker_activities",
+            action="store_true",
+            default=os.environ.get("SKIP_TASK_ELEMENTS") is not None,
+        )
         self.parser.add_argument(
             "--element",
             type=str,
@@ -46,6 +55,17 @@ class ElementMixin:
         )
         super().add_arguments()
+    @property
+    def consume_worker_activities(self) -> bool:
+        """
+        Tell whether the worker consumes worker activities directly instead of relying on an elements.json
+        The process information takes precedence when available, CLI args are the fallback
+        """
+        process = self.process_information
+        if process is None:
+            # No process information: trust the CLI flag
+            return self.args.consume_worker_activities
+        return process.get("skip_elements_json") is True
     def list_corpus_types(self):
         """
         Loads available element types in corpus.

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/metadata.py RENAMED Viewed

@@ -20,10 +20,10 @@ class MetaType(Enum):
     A regular string with no special interpretation.
     """
-    HTML = "html"
+    Markdown = "markdown"
     """
-    A metadata with a string value that should be interpreted as HTML content.
-    The allowed HTML tags are restricted for security reasons.
+    A metadata with a string value that should be interpreted as Markdown content.
+    HTML is allowed, but the allowed HTML tags are restricted for security reasons.
     """
     Date = "date"

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/pyproject.toml RENAMED Viewed

@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "arkindex-base-worker"
-version = "0.5.0rc1"
+version = "0.5.1"
 description = "Base Worker to easily build Arkindex ML workflows"
 license = { file = "LICENSE" }
 dependencies = [
-    "humanize==4.12.3",
+    "humanize==4.14.0",
     "peewee~=3.17",
-    "Pillow==11.2.1",
-    "python-gnupg==0.5.4",
+    "Pillow==11.3.0",
+    "python-gnupg==0.5.5",
     "shapely==2.0.6",
-    "teklia-toolbox==0.1.9",
-    "zstandard==0.23.0",
+    "teklia-toolbox==0.1.11",
+    "zstandard==0.25.0",
 ]
 authors = [
     { name = "Teklia", email = "contact@teklia.com" },
@@ -44,8 +44,7 @@ Authors = "https://teklia.com"
 [project.optional-dependencies]
 tests = [
-    "pytest==8.3.5",
-    "pytest-mock==3.14.0",
+    "pytest-mock==3.15.1",
     "pytest-responses==0.5.1",
 ]

{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/conftest.py RENAMED Viewed

@@ -103,12 +103,6 @@ def _mock_worker_run_api(responses):
     payload = {
         "id": "56785678-5678-5678-5678-567856785678",
         "parents": [],
-        "worker": {
-            "id": "deadbeef-1234-5678-1234-worker",
-            "name": "Fake worker",
-            "slug": "fake_worker",
-            "type": "classifier",
-        },
         "worker_version": {
             "id": "12341234-1234-1234-1234-123412341234",
             "configuration": {
@@ -153,6 +147,7 @@ def _mock_worker_run_api(responses):
             "train_folder_id": None,
             "validation_folder_id": None,
             "test_folder_id": None,
+            "skip_elements_json": False,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -165,6 +160,13 @@ def _mock_worker_run_api(responses):
         content_type="application/json",
     )
+    # By default, stick to classic configuration
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/workers/runs/56785678-5678-5678-5678-567856785678/configuration/",
+        status=400,
+    )
 @pytest.fixture
 def _mock_worker_run_no_revision_api(responses):
@@ -172,12 +174,6 @@ def _mock_worker_run_no_revision_api(responses):
     payload = {
         "id": "56785678-5678-5678-5678-567856785678",
         "parents": [],
-        "worker": {
-            "id": "deadbeef-1234-5678-1234-worker",
-            "name": "Fake worker",
-            "slug": "fake_worker",
-            "type": "classifier",
-        },
         "worker_version": {
             "id": "12341234-1234-1234-1234-123412341234",
             "configuration": {
@@ -233,6 +229,56 @@ def _mock_worker_run_no_revision_api(responses):
     )
+@pytest.fixture
+def mock_base_worker_modern_conf(mocker, responses):
+    """
+    Provide a base worker to test modern configuration with.
+    Only the worker run retrieval endpoint is mocked here: the modern configuration
+    itself is not provided by the fixture.
+    """
+    worker = BaseWorker()
+    # Patch sys.argv before parsing the CLI arguments
+    mocker.patch.object(sys, "argv")
+    worker.args = worker.parser.parse_args()
+    # Worker run payload returned by the mocked API
+    payload = {
+        "id": "56785678-5678-5678-5678-567856785678",
+        "parents": [],
+        "worker_version": {
+            "id": "12341234-1234-1234-1234-123412341234",
+            "worker": {
+                "id": "deadbeef-1234-5678-1234-worker",
+                "name": "Fake worker",
+                "slug": "fake_worker",
+                "type": "classifier",
+            },
+            "revision": {"hash": "deadbeef1234"},
+            # NOTE(review): the "not showing up" values are presumably old-style settings
+            # expected to be ignored with modern configuration; confirm against the tests
+            "configuration": {
+                "configuration": {"extra_key1": "not showing up"},
+                "user_configuration": {"extra_key2": "not showing up"},
+            },
+        },
+        "configuration": {
+            "id": "af0daaf4-983e-4703-a7ed-a10f146d6684",
+            "name": "my-userconfig",
+            "configuration": {
+                "extra_key3": "not showing up",
+            },
+        },
+        "model_version": None,
+        "process": {
+            "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
+            "corpus": CORPUS_ID,
+        },
+        "summary": "Worker Fake worker @ 123412",
+    }
+    # Register the payload on the worker run retrieval URL
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
+        status=200,
+        json=payload,
+    )
+    return worker
 @pytest.fixture
 def _mock_activity_calls(responses):
     """
@@ -282,6 +328,61 @@ def mock_elements_worker_with_list(monkeypatch, responses, mock_elements_worker)
     return mock_elements_worker
+@pytest.fixture
+def mock_elements_worker_consume_wa(monkeypatch, responses, mock_elements_worker):
+    """
+    Mock a worker instance to use StartWorkerActivity to consume worker activities
+    instead of reading a JSON file.
+    Builds on `mock_elements_worker` and returns that same instance after reconfiguring it.
+    """
+    # Enable consume worker activities through the process configuration
+    # (replaces the response already registered on the worker run URL)
+    responses.replace(
+        responses.GET,
+        "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
+        status=200,
+        json={
+            "id": "56785678-5678-5678-5678-567856785678",
+            "parents": [],
+            "worker_version": {
+                "id": "12341234-1234-1234-1234-123412341234",
+                "configuration": {
+                    "docker": {"image": "python:3"},
+                    "configuration": {"someKey": "someValue"},
+                    "secrets": [],
+                },
+                "worker": {
+                    "id": "deadbeef-1234-5678-1234-worker",
+                    "name": "Fake worker",
+                    "slug": "fake_worker",
+                    "type": "classifier",
+                },
+            },
+            "configuration": None,
+            "model_version": None,
+            "process": {
+                "name": None,
+                "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
+                "state": "running",
+                "mode": "workers",
+                "corpus": CORPUS_ID,
+                "use_cache": False,
+                "activity_state": "ready",
+                "model_id": None,
+                "train_folder_id": None,
+                "validation_folder_id": None,
+                "test_folder_id": None,
+                # This flag is what switches the worker to consuming worker activities
+                "skip_elements_json": True,
+            },
+            "summary": "Worker Fake worker @ 123412",
+        },
+    )
+    # Call configure again to use updated process infos
+    mock_elements_worker.configure()
+    return mock_elements_worker
 @pytest.fixture
 def mock_cache_db(tmp_path):
     cache_path = tmp_path / "db.sqlite"

arkindex-base-worker 0.5.0rc1__tar.gz → 0.5.1__tar.gz

arkindex-base-worker 0.5.0rc1tar.gz → 0.5.1tar.gz