arkindex-base-worker 0.4.0b1__tar.gz → 0.4.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/PKG-INFO +1 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/PKG-INFO +1 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/image.py +2 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/utils.py +76 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/__init__.py +3 -2
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/classification.py +31 -15
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/element.py +24 -10
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/entity.py +25 -11
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/metadata.py +18 -8
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/transcription.py +38 -17
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/pyproject.toml +1 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_classifications.py +107 -60
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_elements.py +185 -49
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_entities.py +102 -33
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_metadata.py +223 -98
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_transcriptions.py +293 -143
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_utils.py +28 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/README.md +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/requires.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/cache.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/models.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/base.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/corpus.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/dataset.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/image.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/task.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/training.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/version.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/setup.cfg +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_base_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_cache.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_dataset_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_element.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_training.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_image.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_merge.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/worker_demo/worker.py +0 -0
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/image.py
RENAMED

@@ -21,6 +21,7 @@ from tenacity import (
 )

 from arkindex_worker import logger
+from arkindex_worker.utils import pluralize
 from teklia_toolbox.requests import should_verify_cert

 # Avoid circular imports error when type checking
@@ -164,7 +165,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
 def _retry_log(retry_state, *args, **kwargs):
     logger.warning(
         f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
-        f"retrying in {retry_state.idle_for} seconds"
+        f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
     )
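To see where the new `pluralize` call lands at runtime: `_retry_log` is a tenacity callback, and a retried helper can hook it in through `before_sleep`. The wiring below (the `fetch` function and the stop/wait policies) is an illustrative assumption; only the warning format comes from the diff:

```python
import logging

import requests
from tenacity import retry, stop_after_attempt, wait_exponential

from arkindex_worker.utils import pluralize  # helper added in this release

logger = logging.getLogger(__name__)


def _retry_log(retry_state, *args, **kwargs):
    # tenacity hands us a RetryCallState; idle_for is the upcoming sleep in
    # seconds, so the message reads "retrying in 1 second" or "retrying in 2 seconds"
    logger.warning(
        f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
        f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
    )


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(),
    before_sleep=_retry_log,  # assumed wiring; the diff only shows the log format
    reraise=True,
)
def fetch(url: str) -> bytes:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
```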
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/utils.py
RENAMED

@@ -1,14 +1,36 @@
 import hashlib
+import inspect
 import logging
 import os
 import tarfile
 import tempfile
+from collections.abc import Callable, Generator
+from itertools import islice
 from pathlib import Path
+from typing import Any

 import zstandard as zstd

 logger = logging.getLogger(__name__)

+
+def pluralize(singular: str, count: int) -> str:
+    """Pluralize a noun, if necessary, using simplified rules of English pluralization and a list of exceptions.
+
+    :param str singular: A singular noun describing an object
+    :param int count: The object count, to determine whether to pluralize or not
+    :return str: The noun in its singular or plural form
+    """
+    if count == 1:
+        return singular
+
+    some_exceptions = {"entity": "entities", "metadata": "metadata", "class": "classes"}
+    if singular in some_exceptions:
+        return some_exceptions[singular]
+
+    return singular + "s"
+
+
 MANUAL_SOURCE = "manual"
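The helper's behaviour is easiest to see with a few concrete values; a quick doctest-style check against the function above:

```python
from arkindex_worker.utils import pluralize

assert pluralize("element", 1) == "element"
assert pluralize("element", 3) == "elements"
# Exceptions bypass the naive "+s" rule:
assert pluralize("entity", 2) == "entities"
assert pluralize("class", 5) == "classes"
assert pluralize("metadata", 4) == "metadata"  # invariant noun
```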
@@ -196,3 +218,57 @@ def create_tar_zst_archive(
     close_delete_file(tar_fd, tar_archive)

     return zst_fd, zst_archive, zst_hash, tar_hash
+
+
+DEFAULT_BATCH_SIZE = 50
+"""Batch size used for bulk publication to Arkindex"""
+
+
+def batch_publication(func: Callable) -> Callable:
+    """
+    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.
+
+    :param func: The function to wrap with the ``batch_size`` check
+    :return: The function passing the ``batch_size`` check
+    """
+    signature = inspect.signature(func)
+
+    def wrapper(self, *args, **kwargs):
+        bound_func = signature.bind(self, *args, **kwargs)
+        bound_func.apply_defaults()
+        batch_size = bound_func.arguments.get("batch_size")
+        assert (
+            batch_size and isinstance(batch_size, int) and batch_size > 0
+        ), "batch_size shouldn't be null and should be a strictly positive integer"
+
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+def make_batches(
+    objects: list, singular_name: str, batch_size: int
+) -> Generator[list[Any]]:
+    """Split an object list in successive batches of maximum size ``batch_size``.
+
+    :param objects: The object list to divide in batches of ``batch_size`` size
+    :param singular_name: The singular form of the noun associated with the object list
+    :param batch_size: The maximum size of each batch to split the object list
+    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
+    """
+    count = len(objects)
+    logger.info(
+        f"Creating batches of size {batch_size} to process {count} {pluralize(singular_name, count)}"
+    )
+
+    index = 1
+    iterator = iter(objects)
+    while batch := list(islice(iterator, batch_size)):
+        count = len(batch)
+        logger.info(
+            f"Processing batch {index} containing {count} {pluralize(singular_name, count)}..."
+        )
+
+        yield batch
+
+        index += 1
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/__init__.py
RENAMED

@@ -17,6 +17,7 @@ from apistar.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
+from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin

@@ -267,7 +268,7 @@ class ElementsWorker(
                     with contextlib.suppress(Exception):
                         self.update_activity(element.id, ActivityState.Error)

-        message = f'Ran on {count} elements: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!

@@ -529,7 +530,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
             # Cleanup the latest downloaded dataset artifact
             self.cleanup_downloaded_artifact()

-        message = f'Ran on {count} sets: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/classification.py
RENAMED

@@ -8,6 +8,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)


 class ClassificationMixin:

@@ -21,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
+            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
         )

     def get_ml_class_id(self, ml_class: str) -> str:

@@ -167,10 +173,12 @@ class ClassificationMixin:

         return created

+    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.

@@ -185,6 +193,8 @@ class ClassificationMixin:
             high_confidence (bool)
                 Optional. Whether or not the classification is of high confidence.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """

@@ -220,20 +230,26 @@ class ClassificationMixin:
             )
             return

-        created_cls = self.api_client.request(
-            "CreateClassifications",
-            body={
-                "parent": str(element.id),
-                "worker_run_id": self.worker_run_id,
-                "classifications": [
-                    {
-                        **classification,
-                        "ml_class": self.get_ml_class_id(classification["ml_class"]),
-                    }
-                    for classification in classifications
-                ],
-            },
-        )["classifications"]
+        created_cls = [
+            created_cl
+            for batch in make_batches(classifications, "classification", batch_size)
+            for created_cl in self.api_client.request(
+                "CreateClassifications",
+                body={
+                    "parent": str(element.id),
+                    "worker_run_id": self.worker_run_id,
+                    "classifications": [
+                        {
+                            **classification,
+                            "ml_class": self.get_ml_class_id(
+                                classification["ml_class"]
+                            ),
+                        }
+                        for classification in batch
+                    ],
+                },
+            )["classifications"]
+        ]

         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
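In practice this means a worker can hand `create_classifications` an arbitrarily long list and pick its own batch size; a hedged usage sketch (the `worker` and `element` objects and the 120-item payload are hypothetical):

```python
# Hypothetical usage inside an ElementsWorker.process_element(); with 120
# classifications and batch_size=50, CreateClassifications is called 3 times
# (50 + 50 + 20) and the per-batch responses are flattened into one list.
classifications = [
    {"ml_class": "handwritten", "confidence": 0.9, "high_confidence": True}
    for _ in range(120)
]
created = worker.create_classifications(
    element,
    classifications,
    batch_size=50,  # must be a strictly positive int, enforced by @batch_publication
)
assert len(created) == 120
```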
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/element.py
RENAMED

@@ -12,6 +12,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)


 class ElementType(NamedTuple):

@@ -43,7 +49,7 @@ class ElementMixin:
         }
         count = len(self.corpus_types)
         logger.info(
-            f'Loaded {count} element types in corpus ({self.corpus_id}).'
+            f'Loaded {count} element {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )

     @unsupported_cache

@@ -94,7 +100,7 @@ class ElementMixin:
             )
         else:
             raise MissingTypeError(
-                f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
+                f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
             )

         return True

@@ -176,10 +182,12 @@ class ElementMixin:

         return sub_element["id"] if slim_output else sub_element

+    @batch_publication
     def create_elements(
         self,
         parent: Element | CachedElement,
         elements: list[dict[str, str | list[list[int | float]] | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create child elements on the given element in a single API request.

@@ -200,6 +208,8 @@ class ElementMixin:
             confidence (float or None)
                 Optional confidence score, between 0.0 and 1.0.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
         """
         if isinstance(parent, Element):

@@ -258,14 +268,18 @@ class ElementMixin:
             logger.warning("Cannot create elements as this worker is in read-only mode")
             return

-        created_ids = self.api_client.request(
-            "CreateElements",
-            id=parent.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "elements": elements,
-            },
-        )
+        created_ids = [
+            created_id
+            for batch in make_batches(elements, "element", batch_size)
+            for created_id in self.api_client.request(
+                "CreateElements",
+                id=parent.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "elements": batch,
+                },
+            )
+        ]

         if self.use_cache:
             # Create the image as needed and handle both an Element and a CachedElement
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/entity.py
RENAMED

@@ -15,6 +15,12 @@ from arkindex_worker.cache import (
     unsupported_cache,
 )
 from arkindex_worker.models import Element, Transcription
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)


 class Entity(TypedDict):

@@ -213,10 +219,12 @@ class EntityMixin:
         return transcription_ent

     @unsupported_cache
+    @batch_publication
     def create_transcription_entities(
         self,
         transcription: Transcription,
         entities: list[Entity],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple entities attached to a transcription in a single API request.

@@ -239,6 +247,8 @@ class EntityMixin:
             confidence (float or None)
                 Optional confidence score, between 0.0 and 1.0.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having a two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
         """
         assert transcription and isinstance(

@@ -290,16 +300,20 @@ class EntityMixin:
             )
             return

-        created_entities = self.api_client.request(
-            "CreateTranscriptionEntities",
-            id=transcription.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "entities": entities,
-            },
-        )
+        created_entities = [
+            created_entity
+            for batch in make_batches(entities, "entities", batch_size)
+            for created_entity in self.api_client.request(
+                "CreateTranscriptionEntities",
+                id=transcription.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "entities": batch,
+                },
+            )["entities"]
+        ]

-        return created_entities["entities"]
+        return created_entities

     def list_transcription_entities(
         self,

@@ -383,7 +397,7 @@ class EntityMixin:
         }
         count = len(self.entities)
         logger.info(
-            f'Loaded {count} entities in corpus ({self.corpus_id})'
+            f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
         )

     def list_corpus_entity_types(self):

@@ -398,5 +412,5 @@ class EntityMixin:
         }
         count = len(self.entity_types)
         logger.info(
-            f'Loaded {count} entity types in corpus ({self.corpus_id}).'
+            f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/metadata.py
RENAMED

@@ -7,6 +7,7 @@ from enum import Enum
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches


 class MetaType(Enum):

@@ -108,10 +109,12 @@ class MetaDataMixin:
         return metadata["id"]

     @unsupported_cache
+    @batch_publication
     def create_metadata_bulk(
         self,
         element: Element | CachedElement,
         metadata_list: list[dict[str, MetaType | str | int | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.

@@ -123,6 +126,9 @@ class MetaDataMixin:
         - name: str
         - value: str | int | float
         - entity_id: str | None
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
+        :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
         """
         assert element and isinstance(
             element, Element | CachedElement

@@ -168,14 +174,18 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return

-        created_metadata_list = self.api_client.request(
-            "CreateMetaDataBulk",
-            id=element.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "metadata_list": metas,
-            },
-        )["metadata_list"]
+        created_metadata_list = [
+            created_metadata
+            for batch in make_batches(metas, "metadata", batch_size)
+            for created_metadata in self.api_client.request(
+                "CreateMetaDataBulk",
+                id=element.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "metadata_list": batch,
+                },
+            )["metadata_list"]
+        ]

         return created_metadata_list
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/transcription.py
RENAMED

@@ -11,6 +11,7 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches


 class TextOrientation(Enum):

@@ -109,9 +110,11 @@ class TranscriptionMixin:

         return created

+    @batch_publication
     def create_transcriptions(
         self,
         transcriptions: list[dict[str, str | float | TextOrientation | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float]]:
         """
         Create multiple transcriptions at once on existing elements through the API,

@@ -128,6 +131,8 @@ class TranscriptionMixin:
             orientation (TextOrientation)
                 Optional. Orientation of the transcription's text.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """

@@ -171,13 +176,19 @@ class TranscriptionMixin:
             )
             return

-        created_trs = self.api_client.request(
-            "CreateTranscriptions",
-            body={
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-            },
-        )["transcriptions"]
+        created_trs = [
+            created_tr
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for created_tr in self.api_client.request(
+                "CreateTranscriptions",
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                },
+            )["transcriptions"]
+        ]

         if self.use_cache:
             # Store transcriptions in local cache

@@ -201,11 +212,13 @@ class TranscriptionMixin:

         return created_trs

+    @batch_publication
     def create_element_transcriptions(
         self,
         element: Element | CachedElement,
         sub_element_type: str,
         transcriptions: list[dict[str, str | float]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> dict[str, str | bool]:
         """
         Create multiple elements and transcriptions at once on a single parent element through the API.

@@ -225,6 +238,8 @@ class TranscriptionMixin:
             element_confidence (float)
                 Optional. Confidence score of the element between 0 and 1.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
         assert element and isinstance(

@@ -291,16 +306,22 @@ class TranscriptionMixin:
             )
             return

-        annotations = self.api_client.request(
-            "CreateElementTranscriptions",
-            id=element.id,
-            body={
-                "element_type": sub_element_type,
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-                "return_elements": True,
-            },
-        )
+        annotations = [
+            annotation
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for annotation in self.api_client.request(
+                "CreateElementTranscriptions",
+                id=element.id,
+                body={
+                    "element_type": sub_element_type,
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                    "return_elements": True,
+                },
+            )
+        ]

         for annotation in annotations:
             if annotation["created"]:
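As a back-of-the-envelope check on the batching arithmetic all of these methods now share (plain Python, hypothetical numbers):

```python
import math

# With the default DEFAULT_BATCH_SIZE of 50, publishing 293 transcriptions
# now issues ceil(293 / 50) = 6 API calls instead of one oversized request.
total, batch_size = 293, 50
calls = math.ceil(total / batch_size)
assert calls == 6
# Batch sizes: five full batches of 50, then a final batch of 43
sizes = [min(batch_size, total - i * batch_size) for i in range(calls)]
assert sizes == [50, 50, 50, 50, 50, 43]
assert sum(sizes) == total
```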