PyPI - arkindex-base-worker - Versions diffs - 0.4.0b3__tar.gz → 0.4.0rc2__tar.gz - Mend

arkindex-base-worker 0.4.0b3__tar.gz → 0.4.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0b3
+Version: 0.4.0rc2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: humanize==4.10.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==10.4.0
 Requires-Dist: python-gnupg==0.5.2
@@ -48,8 +49,8 @@ Requires-Dist: teklia-toolbox==0.1.5
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.4.2; extra == "docs"
-Requires-Dist: mkdocs-material==9.5.31; extra == "docs"
-Requires-Dist: mkdocstrings-python==1.10.7; extra == "docs"
+Requires-Dist: mkdocs-material==9.5.33; extra == "docs"
+Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
 Provides-Extra: tests
 Requires-Dist: pytest==8.3.2; extra == "tests"
 Requires-Dist: pytest-mock==3.14.0; extra == "tests"

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0b3
+Version: 0.4.0rc2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: humanize==4.10.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==10.4.0
 Requires-Dist: python-gnupg==0.5.2
@@ -48,8 +49,8 @@ Requires-Dist: teklia-toolbox==0.1.5
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.4.2; extra == "docs"
-Requires-Dist: mkdocs-material==9.5.31; extra == "docs"
-Requires-Dist: mkdocstrings-python==1.10.7; extra == "docs"
+Requires-Dist: mkdocs-material==9.5.33; extra == "docs"
+Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
 Provides-Extra: tests
 Requires-Dist: pytest==8.3.2; extra == "tests"
 Requires-Dist: pytest-mock==3.14.0; extra == "tests"

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/SOURCES.txt RENAMED Viewed

@@ -20,6 +20,7 @@ arkindex_worker/worker/element.py
 arkindex_worker/worker/entity.py
 arkindex_worker/worker/image.py
 arkindex_worker/worker/metadata.py
+arkindex_worker/worker/process.py
 arkindex_worker/worker/task.py
 arkindex_worker/worker/training.py
 arkindex_worker/worker/transcription.py

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/requires.txt RENAMED Viewed

@@ -1,3 +1,4 @@
+humanize==4.10.0
 peewee~=3.17
 Pillow==10.4.0
 python-gnupg==0.5.2
@@ -7,8 +8,8 @@ zstandard==0.22.0
 [docs]
 black==24.4.2
-mkdocs-material==9.5.31
-mkdocstrings-python==1.10.7
+mkdocs-material==9.5.33
+mkdocstrings-python==1.10.8
 [tests]
 pytest==8.3.2

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/image.py RENAMED Viewed

@@ -2,13 +2,18 @@
 Helper methods to download and open IIIF images, and manage polygons.
 """
+import functools
+import os
 import re
+import tempfile
 from collections import namedtuple
+from collections.abc import Generator, Iterator
 from io import BytesIO
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING
+import humanize
 import requests
 from PIL import Image
 from shapely.affinity import rotate, scale, translate
@@ -40,8 +45,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
 IIIF_FULL = "full"
 # Maximum size available
 IIIF_MAX = "max"
+# Ratio to resize image
+IMAGE_RATIO = [1, 0.9, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50, 0.40, 0.30]
+def update_pillow_image_size_limit(func):
+    """
+    Update Pillow Image size limit
+    """
+    @functools.wraps(func)
+    def wrapper(
+        *args,
+        max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
+        **kwargs,
+    ):
+        """
+        Wrapper to update Pillow Image size limit and restore it at the end of the function.
+        :param *args: Positional arguments passed to the function.
+        :param max_image_pixels: Pillow Image size limit to use.
+        :param **kwargs: Keyword arguments passed to the function.
+        """
+        MAX_IMAGE_PIXELS = Image.MAX_IMAGE_PIXELS
+        # Override Pillow Image size limit
+        if max_image_pixels is not None:
+            max_image_pixels = int(max_image_pixels)
+            # Override Pillow limit for detecting decompression bombs, disabled if set to 0
+            if max_image_pixels == 0:
+                logger.warning(
+                    "Pillow Image size limit is completely disabled, make sure you trust the image source."
+                )
+                Image.MAX_IMAGE_PIXELS = None
+            else:
+                Image.MAX_IMAGE_PIXELS = max_image_pixels
+        try:
+            results = func(*args, **kwargs)
+        except:
+            # Restore initial Pillow Image size limit
+            Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+            raise
+        # Restore initial Pillow Image size limit
+        Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+        return results
+    return wrapper
+@update_pillow_image_size_limit
 def open_image(
     path: str,
     mode: str | None = "RGB",
@@ -149,6 +203,70 @@ def upload_image(image: Image, url: str) -> requests.Response:
     return resp
+def resized_images(
+    *args,
+    element: "Element",
+    max_pixels: int | None = None,
+    max_bytes: int | None = None,
+    **kwargs,
+) -> Iterator[Generator[tempfile.NamedTemporaryFile, None, None]]:
+    """
+    Build resized images according to the pixel and byte limits.
+    :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
+    :param element: Element whose image needs to be resized.
+    :param max_pixels: Maximum pixel size of the resized images.
+    :param max_bytes: Maximum byte size of the resized images.
+    :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
+    :returns: An iterator of the temporary file of the resized image.
+    """
+    _, _, element_width, element_height = polygon_bounding_box(element.polygon)
+    logger.info(f"This element's image sizes are ({element_width} x {element_height}).")
+    if max_pixels and max(element_width, element_height) > max_pixels:
+        logger.warning(
+            f"Maximum image input size supported is ({max_pixels} x {max_pixels})."
+        )
+        logger.warning("The image will be resized.")
+    element_pixel, param = (
+        (element_width, "max_width")
+        if element_width > element_height
+        else (element_height, "max_height")
+    )
+    for resized_pixel in sorted(
+        set(
+            min(round(ratio * element_pixel), max_pixels or element_pixel)
+            for ratio in IMAGE_RATIO
+        ),
+        reverse=True,
+    ):
+        with element.open_image_tempfile(
+            *args, **{**kwargs, param: resized_pixel}
+        ) as image:
+            pillow_image = Image.open(image)
+            if (
+                pillow_image.width != element_width
+                or pillow_image.height != element_height
+            ):
+                logger.warning(
+                    f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
+                )
+            # The image is still too large
+            image_size = Path(image.name).stat().st_size
+            if max_bytes and image_size > max_bytes:
+                logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
+                logger.warning(
+                    f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
+                )
+                logger.warning("The image will be resized.")
+                continue
+            yield image
 def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
     """
     Compute the rectangle bounding box of a polygon.

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/__init__.py RENAMED Viewed

@@ -4,12 +4,10 @@ Base classes to implement Arkindex workers.
 import contextlib
 import json
-import os
 import sys
 import uuid
-from argparse import ArgumentTypeError
-from collections.abc import Iterable, Iterator
-from enum import Enum
+from collections.abc import Iterable
+from itertools import chain
 from pathlib import Path
 from apistar.exceptions import ErrorResponse
@@ -21,47 +19,27 @@ from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
-from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
+from arkindex_worker.worker.dataset import (
+    DatasetMixin,
+    DatasetState,
+    MissingDatasetArchive,
+)
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
+from arkindex_worker.worker.process import ActivityState, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin
-class ActivityState(Enum):
-    """
-    Processing state of an element.
-    """
-    Queued = "queued"
-    """
-    The element has not yet been processed by a worker.
-    """
-    Started = "started"
-    """
-    The element is being processed by a worker.
-    """
-    Processed = "processed"
-    """
-    The element has been successfully processed by a worker.
-    """
-    Error = "error"
-    """
-    An error occurred while processing this element.
-    """
 class ElementsWorker(
+    ElementMixin,
+    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
     CorpusMixin,
-    ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
@@ -96,22 +74,7 @@ class ElementsWorker(
         self._worker_version_cache = {}
-    def add_arguments(self):
-        """Define specific ``argparse`` arguments for this worker"""
-        self.parser.add_argument(
-            "--elements-list",
-            help="JSON elements list to use",
-            type=open,
-            default=os.environ.get("TASK_ELEMENTS"),
-        )
-        self.parser.add_argument(
-            "--element",
-            type=str,
-            nargs="+",
-            help="One or more Arkindex element ID",
-        )
-    def list_elements(self) -> Iterable[CachedElement] | list[str]:
+    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -143,15 +106,20 @@ class ElementsWorker(
         )
         if self.use_cache and cache_query.exists():
             return cache_query
-        # Process elements from JSON file
         elif self.args.elements_list:
+            # Process elements from JSON file
             data = json.load(self.args.elements_list)
             assert isinstance(data, list), "Elements list must be a list"
             assert len(data), "No elements in elements list"
             out += list(filter(None, [element.get("id") for element in data]))
-        # Add any extra element from CLI
         elif self.args.element:
+            # Add any extra element from CLI
             out += self.args.element
+        elif self.process_mode == ProcessMode.Dataset or self.args.set:
+            # Elements from datasets
+            return list(
+                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
+            )
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert (
@@ -166,40 +134,18 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only:
+        if self.is_read_only or self.process_mode == ProcessMode.Dataset:
+            # Worker activities are also disabled when running an ElementsWorker in a Dataset process.
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-            super().configure_cache()
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
+        [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """
@@ -207,7 +153,7 @@ class ElementsWorker(
         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.list_elements()
+        elements = self.get_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)
@@ -223,8 +169,8 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if self.use_cache:
-                    # Just use the result of list_elements as the element
+                if isinstance(item, CachedElement | Element):
+                    # Just use the result of get_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
@@ -339,29 +285,7 @@ class ElementsWorker(
         return True
-def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
-    values = value.split(":")
-    if len(values) != 2:
-        raise ArgumentTypeError(
-            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
-        )
-    dataset_id, set_name = values
-    try:
-        dataset_id = uuid.UUID(dataset_id)
-        return (dataset_id, set_name)
-    except (TypeError, ValueError) as e:
-        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
-class MissingDatasetArchive(Exception):
-    """
-    Exception raised when the compressed archive associated to
-    a dataset isn't found in its task artifacts.
-    """
-class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
+class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.
@@ -384,42 +308,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
-    def add_arguments(self):
-        """Define specific ``argparse`` arguments for this worker"""
-        self.parser.add_argument(
-            "--set",
-            type=check_dataset_set,
-            nargs="+",
-            help="""
-                One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
-                (e.g.: "12341234-1234-1234-1234-123412341234:train")
-            """,
-            default=[],
-        )
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-            super().configure_cache()
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any
@@ -467,30 +355,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         :param set: The set to process.
         """
-    def list_sets(self) -> Iterator[Set]:
-        """
-        List the sets to be processed, either from the CLI arguments or using the
-        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
-        :returns: An iterator of ``Set`` objects.
-        """
-        if not self.is_read_only:
-            yield from self.list_process_sets()
-        datasets: dict[uuid.UUID, Dataset] = {}
-        for dataset_id, set_name in self.args.set:
-            # Retrieving dataset information is not already cached
-            if dataset_id not in datasets:
-                datasets[dataset_id] = Dataset(
-                    **self.api_client.request("RetrieveDataset", id=dataset_id)
-                )
-            yield Set(name=set_name, dataset=datasets[dataset_id])
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
+        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/base.py RENAMED Viewed

@@ -24,6 +24,7 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
+from arkindex_worker.worker.process import ProcessMode
 from teklia_toolbox.requests import get_arkindex_client
@@ -156,6 +157,13 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
+    @property
+    def process_mode(self) -> ProcessMode | None:
+        """Mode of the process being run. Returns None when read-only."""
+        if self.is_read_only:
+            return
+        return ProcessMode(self.process_information["mode"])
     @property
     def is_read_only(self) -> bool:
         """
@@ -219,7 +227,7 @@ class BaseWorker:
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
-    def configure(self):
+    def configure_worker_run(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.
@@ -320,6 +328,29 @@ class BaseWorker:
         else:
             logger.debug("Cache is disabled")
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+        if self.is_read_only:
+            self.configure_for_developers()
+        else:
+            self.configure_worker_run()
+            self.configure_cache()
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
     def load_secret(self, name: Path):
         """
         Load a Ponos secret by name.

{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/dataset.py RENAMED Viewed

@@ -2,6 +2,8 @@
 BaseWorker methods for datasets.
 """
+import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterator
 from enum import Enum
@@ -36,7 +38,55 @@ class DatasetState(Enum):
     """
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    """The `--set` argument should have the following format:
+    <dataset_id>:<set_name>
+    Args:
+        value (str): Provided argument.
+    Raises:
+        ArgumentTypeError: When the value is invalid.
+    Returns:
+        tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
+    """
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
 class DatasetMixin:
+    def add_arguments(self) -> None:
+        """Define specific ``argparse`` arguments for the worker using this mixin"""
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+                One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+                (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+        super().add_arguments()
     def list_process_sets(self) -> Iterator[Set]:
         """
         List dataset sets associated to the worker's process. This helper is not available in developer mode.
@@ -73,6 +123,26 @@ class DatasetMixin:
         return map(lambda result: Element(**result["element"]), results)
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.api_client.request("RetrieveDataset", id=dataset_id)
+                )
+            yield Set(name=set_name, dataset=datasets[dataset_id])
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
         """

arkindex-base-worker 0.4.0b3__tar.gz → 0.4.0rc2__tar.gz

arkindex-base-worker 0.4.0b3__tar.gz → 0.4.0rc2__tar.gz