arkindex-base-worker 0.4.0a2__py3-none-any.whl → 0.4.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0a2.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/METADATA +7 -7
- arkindex_base_worker-0.4.0b2.dist-info/RECORD +51 -0
- {arkindex_base_worker-0.4.0a2.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/WHEEL +1 -1
- arkindex_worker/image.py +2 -1
- arkindex_worker/utils.py +76 -0
- arkindex_worker/worker/__init__.py +24 -14
- arkindex_worker/worker/base.py +3 -9
- arkindex_worker/worker/classification.py +33 -17
- arkindex_worker/worker/corpus.py +3 -1
- arkindex_worker/worker/dataset.py +1 -1
- arkindex_worker/worker/element.py +45 -16
- arkindex_worker/worker/entity.py +30 -17
- arkindex_worker/worker/metadata.py +19 -9
- arkindex_worker/worker/task.py +4 -2
- arkindex_worker/worker/training.py +5 -5
- arkindex_worker/worker/transcription.py +39 -18
- arkindex_worker/worker/version.py +3 -1
- tests/test_base_worker.py +1 -1
- tests/test_elements_worker/test_classifications.py +107 -60
- tests/test_elements_worker/test_elements.py +213 -70
- tests/test_elements_worker/test_entities.py +102 -33
- tests/test_elements_worker/test_metadata.py +223 -98
- tests/test_elements_worker/test_transcriptions.py +293 -143
- tests/test_merge.py +1 -1
- tests/test_utils.py +28 -0
- arkindex_base_worker-0.4.0a2.dist-info/RECORD +0 -51
- {arkindex_base_worker-0.4.0a2.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0a2.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/top_level.txt +0 -0
{arkindex_base_worker-0.4.0a2.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0a2
+Version: 0.4.0b2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,17 +41,17 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: peewee ~=3.17
-Requires-Dist: Pillow ==10.
+Requires-Dist: Pillow ==10.4.0
 Requires-Dist: python-gnupg ==0.5.2
-Requires-Dist: shapely ==2.0.
+Requires-Dist: shapely ==2.0.5
 Requires-Dist: teklia-toolbox ==0.1.5
 Requires-Dist: zstandard ==0.22.0
 Provides-Extra: docs
-Requires-Dist: black ==24.4.
-Requires-Dist: mkdocs-material ==9.5.
-Requires-Dist: mkdocstrings-python ==1.
+Requires-Dist: black ==24.4.2 ; extra == 'docs'
+Requires-Dist: mkdocs-material ==9.5.31 ; extra == 'docs'
+Requires-Dist: mkdocstrings-python ==1.10.7 ; extra == 'docs'
 Provides-Extra: tests
-Requires-Dist: pytest ==8.
+Requires-Dist: pytest ==8.3.2 ; extra == 'tests'
 Requires-Dist: pytest-mock ==3.14.0 ; extra == 'tests'
 Requires-Dist: pytest-responses ==0.5.1 ; extra == 'tests'
 
arkindex_base_worker-0.4.0b2.dist-info/RECORD
ADDED

@@ -0,0 +1,51 @@
+arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
+arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
+arkindex_worker/image.py,sha256=8Y0PYMbTEsFUv8lCNLBu7UaDy6um5YfHCefyXL2jpnE,14347
+arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
+arkindex_worker/utils.py,sha256=zrtMChXx_HGu4UkqXZBKPg3ys0UBFmQaizoX1riM3D4,9824
+arkindex_worker/worker/__init__.py,sha256=w1VlDzERabXIp625kkHnojyu5ctCM11WLw4ARh1ja3k,19818
+arkindex_worker/worker/base.py,sha256=JStHpwSP3bis9LLvV2C2n6GTWtLUVIDA9JPgPJEt17o,18717
+arkindex_worker/worker/classification.py,sha256=ECm1cnQPOj_9m-CoO0e182ElSySAUOoyddHrORbShhc,10951
+arkindex_worker/worker/corpus.py,sha256=s9bCxOszJMwRq1WWAmKjWq888mjDfbaJ18Wo7h-rNOw,1827
+arkindex_worker/worker/dataset.py,sha256=UXElhhARca9m7Himp-yxD5dAqWbdxDKWOUJUGgeCZXI,2934
+arkindex_worker/worker/element.py,sha256=5knEFHc0LDRRHI8IbJbiiQsOAoW7qYPf9lcVXsFlUEQ,34798
+arkindex_worker/worker/entity.py,sha256=qGjQvOVXfP84rER0Dkui6q-rb9nTWerHVG0Z5voB8pU,15229
+arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
+arkindex_worker/worker/metadata.py,sha256=VRajtd2kaBvar9GercX4knvR6l1WFYjoCdJWU9ccKgk,7291
+arkindex_worker/worker/task.py,sha256=1O9zrWXxe3na3TOcoHX5Pxn1875v7EU08BSsCPnb62g,1519
+arkindex_worker/worker/training.py,sha256=qnBFEk11JOWWPLTbjF-lZ9iFBdTPpQzZAzQ9a03J1j4,10874
+arkindex_worker/worker/transcription.py,sha256=8ho-8zmF9LgP86oS59ZZLv5I7tfnZ1yNO2A3pY_9GQ8,21353
+arkindex_worker/worker/version.py,sha256=JIT7OI3Mo7RPkNrjOB9hfqrsG-FYygz_zi4l8PbkuAo,1960
+hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
+tests/__init__.py,sha256=6aeTMHf4q_dKY4jIZWg1KT70VKaLvVlzCxh-Uu_cWiQ,241
+tests/conftest.py,sha256=-ZQTV4rg7TgW84-5Ioqndqv8byNILfDOpyUt8wecEiI,21967
+tests/test_base_worker.py,sha256=LdFV0LFdNU2IOyEKlX59MB1kuyxHCuhy4Tm7eE_iPiU,24281
+tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
+tests/test_dataset_worker.py,sha256=d9HG36qnO5HXu9vQ0UTBvdTSRR21FVq1FNoXM-vZbPk,22105
+tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
+tests/test_image.py,sha256=Fs9vKYgQ7mEFylbzI4YIO_JyOLeAcs-WxUXpzewxCd8,16188
+tests/test_merge.py,sha256=FMdpsm_ncHNmIvOrJ1vcwlyn8o9-SPcpFTcbAsXwK-w,8320
+tests/test_utils.py,sha256=zbJC24NyTc3slz3Ed3gJDswjRChjkR5oHEgDoQMOBiE,2588
+tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
+tests/test_elements_worker/test_classifications.py,sha256=fXZ8cSzIWwZ6LHsY7tKsy9-Pp9fKyKUStIXS4ViBcek,27779
+tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
+tests/test_elements_worker/test_corpus.py,sha256=c_LUHvkJIYgk_wXF06VQPNOoWfiZ06XpjOXrJ7MRiBc,4479
+tests/test_elements_worker/test_dataset.py,sha256=lSXqubhg1EEq2Y2goE8Y2RYaqIpM9Iejq6fGNW2BczU,11411
+tests/test_elements_worker/test_elements.py,sha256=dBhjQ8XZNIE7bjx5AaGaclPLZr1Ur_-tQ-ebS3S_Zn0,89142
+tests/test_elements_worker/test_entities.py,sha256=oav2dtvWWavQe1l3Drbxw1Ta2ocUJEVxJfDQ_r6-rYQ,36181
+tests/test_elements_worker/test_image.py,sha256=_E3UGdDOwTo1MW5KMS81PrdeSPBPWinWYoQPNy2F9Ro,2077
+tests/test_elements_worker/test_metadata.py,sha256=cm2NNaXxBYmYMkPexSPVTAqb2skDTB4mliwQCLz8Y98,22293
+tests/test_elements_worker/test_task.py,sha256=7Sr3fbjdgWUXJUhJEiC9CwnbhQIQX3rCInmHMIrmA38,5573
+tests/test_elements_worker/test_training.py,sha256=Qxi9EzGr_uKcn2Fh5aE6jNrq1K8QKLiOiSew4upASPs,8721
+tests/test_elements_worker/test_transcriptions.py,sha256=FNY6E26iTKqe7LP9LO72By4oV4g9hBIZYTU9BAc_w7I,77060
+tests/test_elements_worker/test_worker.py,sha256=AwdP8uSXNQ_SJavXxJV2s3_J3OiCafShVjMV1dgt4xo,17162
+worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
+worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
+worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
+worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
+arkindex_base_worker-0.4.0b2.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.4.0b2.dist-info/METADATA,sha256=wvefQTllKMq-jkbjsG1TMyuvF06h-XjBLgc79_j8MTU,3270
+arkindex_base_worker-0.4.0b2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+arkindex_base_worker-0.4.0b2.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
+arkindex_base_worker-0.4.0b2.dist-info/RECORD,,
arkindex_worker/image.py
CHANGED

@@ -21,6 +21,7 @@ from tenacity import (
 )
 
 from arkindex_worker import logger
+from arkindex_worker.utils import pluralize
 from teklia_toolbox.requests import should_verify_cert
 
 # Avoid circular imports error when type checking
@@ -164,7 +165,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
 def _retry_log(retry_state, *args, **kwargs):
     logger.warning(
         f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
-        f
+        f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
     )
 
 
arkindex_worker/utils.py
CHANGED

@@ -1,14 +1,36 @@
 import hashlib
+import inspect
 import logging
 import os
 import tarfile
 import tempfile
+from collections.abc import Callable, Generator
+from itertools import islice
 from pathlib import Path
+from typing import Any
 
 import zstandard as zstd
 
 logger = logging.getLogger(__name__)
 
+
+def pluralize(singular: str, count: int) -> str:
+    """Pluralize a noun, if necessary, using simplified rules of English pluralization and a list of exceptions.
+
+    :param str singular: A singular noun describing an object
+    :param int count: The object count, to determine whether to pluralize or not
+    :return str: The noun in its singular or plural form
+    """
+    if count == 1:
+        return singular
+
+    some_exceptions = {"entity": "entities", "metadata": "metadata", "class": "classes"}
+    if singular in some_exceptions:
+        return some_exceptions[singular]
+
+    return singular + "s"
+
+
 MANUAL_SOURCE = "manual"
 
 
@@ -196,3 +218,57 @@ def create_tar_zst_archive(
     close_delete_file(tar_fd, tar_archive)
 
     return zst_fd, zst_archive, zst_hash, tar_hash
+
+
+DEFAULT_BATCH_SIZE = 50
+"""Batch size used for bulk publication to Arkindex"""
+
+
+def batch_publication(func: Callable) -> Callable:
+    """
+    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.
+
+    :param func: The function to wrap with the ``batch_size`` check
+    :return: The function passing the ``batch_size`` check
+    """
+    signature = inspect.signature(func)
+
+    def wrapper(self, *args, **kwargs):
+        bound_func = signature.bind(self, *args, **kwargs)
+        bound_func.apply_defaults()
+        batch_size = bound_func.arguments.get("batch_size")
+        assert (
+            batch_size and isinstance(batch_size, int) and batch_size > 0
+        ), "batch_size shouldn't be null and should be a strictly positive integer"
+
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+def make_batches(
+    objects: list, singular_name: str, batch_size: int
+) -> Generator[list[Any]]:
+    """Split an object list in successive batches of maximum size ``batch_size``.
+
+    :param objects: The object list to divide in batches of ``batch_size`` size
+    :param singular_name: The singular form of the noun associated with the object list
+    :param batch_size: The maximum size of each batch to split the object list
+    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
+    """
+    count = len(objects)
+    logger.info(
+        f"Creating batches of size {batch_size} to process {count} {pluralize(singular_name, count)}"
+    )
+
+    index = 1
+    iterator = iter(objects)
+    while batch := list(islice(iterator, batch_size)):
+        count = len(batch)
+        logger.info(
+            f"Processing batch {index} containing {count} {pluralize(singular_name, count)}..."
+        )
+
+        yield batch
+
+        index += 1
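
Taken together, the new utilities compose as in this quick sketch. Everything here follows from the hunk above except the `Publisher` class, which is a hypothetical consumer used only for illustration:

    from arkindex_worker.utils import (
        DEFAULT_BATCH_SIZE,
        batch_publication,
        make_batches,
        pluralize,
    )

    # Simplified English pluralization, with a small table of exceptions
    assert pluralize("element", 1) == "element"
    assert pluralize("element", 3) == "elements"
    assert pluralize("class", 2) == "classes"
    assert pluralize("metadata", 4) == "metadata"

    # make_batches yields successive slices of at most batch_size items
    assert list(make_batches(list(range(5)), "element", 2)) == [[0, 1], [2, 3], [4]]

    # batch_publication validates the batch_size argument of a method
    class Publisher:  # hypothetical, not part of the package
        @batch_publication
        def publish(self, items: list, batch_size: int = DEFAULT_BATCH_SIZE):
            return list(make_batches(items, "item", batch_size))

    Publisher().publish([1, 2, 3])                # OK, default batch size of 50
    Publisher().publish([1, 2, 3], batch_size=0)  # raises AssertionError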
arkindex_worker/worker/__init__.py
CHANGED

@@ -17,6 +17,7 @@ from apistar.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
+from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
@@ -83,7 +84,20 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
 
-
+        self.classes = {}
+
+        self.entity_types = {}
+        """Known and available entity types in processed corpus
+        """
+
+        self.corpus_types = {}
+        """Known and available element types in processed corpus
+        """
+
+        self._worker_version_cache = {}
+
+    def add_arguments(self):
+        """Define specific ``argparse`` arguments for this worker"""
         self.parser.add_argument(
             "--elements-list",
             help="JSON elements list to use",
@@ -97,14 +111,6 @@ class ElementsWorker(
             help="One or more Arkindex element ID",
         )
 
-        self.classes = {}
-
-        self.entity_types = {}
-        """Known and available entity types in processed corpus
-        """
-
-        self._worker_version_cache = {}
-
     def list_elements(self) -> Iterable[CachedElement] | list[str]:
         """
         List the elements to be processed, either from the CLI arguments or
@@ -222,7 +228,9 @@ class ElementsWorker(
                 element = item
             else:
                 # Load element using the Arkindex API
-                element = Element(**self.request("RetrieveElement", id=item))
+                element = Element(
+                    **self.api_client.request("RetrieveElement", id=item)
+                )
 
             logger.info(f"Processing {element} ({i}/{count})")
 
@@ -260,7 +268,7 @@ class ElementsWorker(
             with contextlib.suppress(Exception):
                 self.update_activity(element.id, ActivityState.Error)
 
-        message = f'Ran on {count}
+        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
@@ -301,7 +309,7 @@ class ElementsWorker(
         assert isinstance(state, ActivityState), "state should be an ActivityState"
 
         try:
-            self.request(
+            self.api_client.request(
                 "UpdateWorkerActivity",
                 id=self.worker_run_id,
                 body={
@@ -376,6 +384,8 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
 
+    def add_arguments(self):
+        """Define specific ``argparse`` arguments for this worker"""
         self.parser.add_argument(
             "--set",
             type=check_dataset_set,
@@ -472,7 +482,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Retrieving dataset information is not already cached
         if dataset_id not in datasets:
             datasets[dataset_id] = Dataset(
-                **self.request("RetrieveDataset", id=dataset_id)
+                **self.api_client.request("RetrieveDataset", id=dataset_id)
             )
 
         yield Set(name=set_name, dataset=datasets[dataset_id])
@@ -520,7 +530,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Cleanup the latest downloaded dataset artifact
         self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count}
+        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
arkindex_worker/worker/base.py
CHANGED

@@ -231,7 +231,7 @@ class BaseWorker:
         logger.debug("Debug output enabled")
 
         # Load worker run information
-        worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)
+        worker_run = self.api_client.request("RetrieveWorkerRun", id=self.worker_run_id)
 
         # Load process information
         self.process_information = worker_run["process"]
@@ -290,7 +290,7 @@ class BaseWorker:
         if self.support_cache and self.args.database is not None:
             self.use_cache = True
         elif self.support_cache and self.task_id:
-            task = self.request("RetrieveTask", id=self.task_id)
+            task = self.api_client.request("RetrieveTask", id=self.task_id)
             self.task_parents = task["parents"]
             paths = self.find_parents_file_paths(Path("db.sqlite"))
             self.use_cache = len(paths) > 0
@@ -331,7 +331,7 @@ class BaseWorker:
 
         # Load from the backend
         try:
-            resp = self.request("RetrieveSecret", name=str(name))
+            resp = self.api_client.request("RetrieveSecret", name=str(name))
             secret = resp["content"]
             logging.info(f"Loaded API secret {name}")
         except ErrorResponse as e:
@@ -471,12 +471,6 @@ class BaseWorker:
         # Clean up
         shutil.rmtree(base_extracted_path)
 
-    def request(self, *args, **kwargs):
-        """
-        Wrapper around the ``ArkindexClient.request`` method.
-        """
-        return self.api_client.request(*args, **kwargs)
-
     def add_arguments(self):
         """Override this method to add ``argparse`` arguments to this worker"""
 
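
Removing the ``request`` wrapper means any downstream worker calling ``self.request(...)`` must switch to the underlying client; endpoint names and arguments are unchanged:

    # Before (0.4.0a2), via the BaseWorker.request wrapper:
    corpus = self.request("RetrieveCorpus", id=self.corpus_id)

    # After (0.4.0b2), directly on the ArkindexClient:
    corpus = self.api_client.request("RetrieveCorpus", id=self.corpus_id)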
arkindex_worker/worker/classification.py
CHANGED

@@ -8,6 +8,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class ClassificationMixin:
@@ -21,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f
+            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -39,7 +45,7 @@ class ClassificationMixin:
         if ml_class_id is None:
             logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
             try:
-                response = self.request(
+                response = self.api_client.request(
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]
@@ -119,7 +125,7 @@ class ClassificationMixin:
             )
             return
         try:
-            created = self.request(
+            created = self.api_client.request(
                 "CreateClassification",
                 body={
                     "element": str(element.id),
@@ -167,10 +173,12 @@ class ClassificationMixin:
 
         return created
 
+    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.
@@ -185,6 +193,8 @@ class ClassificationMixin:
             high_confidence (bool)
                 Optional. Whether or not the classification is of high confidence.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """
@@ -220,20 +230,26 @@ class ClassificationMixin:
             )
             return
 
-        created_cls =
-
-
-
-            "
-
-
-
-
-
-
-
-
+        created_cls = [
+            created_cl
+            for batch in make_batches(classifications, "classification", batch_size)
+            for created_cl in self.api_client.request(
+                "CreateClassifications",
+                body={
+                    "parent": str(element.id),
+                    "worker_run_id": self.worker_run_id,
+                    "classifications": [
+                        {
+                            **classification,
+                            "ml_class": self.get_ml_class_id(
+                                classification["ml_class"]
+                            ),
+                        }
+                        for classification in batch
+                    ],
+                },
+            )["classifications"]
+        ]
 
         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
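
Calling the batched ``create_classifications``, sketched from the signature above; the worker, class names and scores are placeholders:

    from arkindex_worker.models import Element
    from arkindex_worker.worker import ElementsWorker

    class DemoClassifier(ElementsWorker):  # hypothetical worker
        def process_element(self, element: Element):
            self.create_classifications(
                element,
                classifications=[
                    {"ml_class": "paragraph", "confidence": 0.87, "high_confidence": True},
                    {"ml_class": "signature", "confidence": 0.35, "high_confidence": False},
                ],
                batch_size=25,  # publish in chunks of 25 instead of the default 50
            )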
arkindex_worker/worker/corpus.py
CHANGED

@@ -63,7 +63,9 @@ class CorpusMixin:
         # Download latest export
         export_id: str = exports[0]["id"]
         logger.info(f"Downloading export ({export_id})...")
-        export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
+        export: _TemporaryFileWrapper = self.api_client.request(
+            "DownloadExport", id=export_id
+        )
         logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
 
         return export
arkindex_worker/worker/dataset.py
CHANGED

@@ -93,7 +93,7 @@ class DatasetMixin:
             logger.warning("Cannot update dataset as this worker is in read-only mode")
             return
 
-        updated_dataset = self.request(
+        updated_dataset = self.api_client.request(
             "PartialUpdateDataset",
             id=dataset.id,
             body={"state": state.value},
arkindex_worker/worker/element.py
CHANGED

@@ -12,6 +12,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class ElementType(NamedTuple):
@@ -31,6 +37,21 @@ class MissingTypeError(Exception):
 
 
 class ElementMixin:
+    def list_corpus_types(self):
+        """
+        Loads available element types in corpus.
+        """
+        self.corpus_types = {
+            element_type["slug"]: element_type
+            for element_type in self.api_client.request(
+                "RetrieveCorpus", id=self.corpus_id
+            )["types"]
+        }
+        count = len(self.corpus_types)
+        logger.info(
+            f'Loaded {count} element {pluralize("type", count)} in corpus ({self.corpus_id}).'
+        )
+
     @unsupported_cache
     def create_required_types(self, element_types: list[ElementType]):
         """Creates given element types in the corpus.
@@ -38,7 +59,7 @@ class ElementMixin:
         :param element_types: The missing element types to create.
         """
         for element_type in element_types:
-            self.request(
+            self.api_client.request(
                 "CreateElementType",
                 body={
                     "slug": element_type.slug,
@@ -66,10 +87,10 @@ class ElementMixin:
             isinstance(slug, str) for slug in type_slugs
         ), "Element type slugs must be strings."
 
-
-
-        missing_slugs = set(type_slugs) - available_slugs
+        if not self.corpus_types:
+            self.list_corpus_types()
 
+        missing_slugs = set(type_slugs) - set(self.corpus_types)
         if missing_slugs:
             if create_missing:
                 self.create_required_types(
@@ -79,7 +100,7 @@ class ElementMixin:
                 )
             else:
                 raise MissingTypeError(
-                    f'Element type(
+                    f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
                 )
 
         return True
@@ -145,7 +166,7 @@ class ElementMixin:
             logger.warning("Cannot create element as this worker is in read-only mode")
             return
 
-        sub_element = self.request(
+        sub_element = self.api_client.request(
             "CreateElement",
             body={
                 "type": type,
@@ -161,10 +182,12 @@ class ElementMixin:
 
         return sub_element["id"] if slim_output else sub_element
 
+    @batch_publication
     def create_elements(
         self,
         parent: Element | CachedElement,
         elements: list[dict[str, str | list[list[int | float]] | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
    ) -> list[dict[str, str]]:
         """
         Create child elements on the given element in a single API request.
@@ -185,6 +208,8 @@ class ElementMixin:
             confidence (float or None)
                 Optional confidence score, between 0.0 and 1.0.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
         """
         if isinstance(parent, Element):
@@ -243,14 +268,18 @@ class ElementMixin:
             logger.warning("Cannot create elements as this worker is in read-only mode")
             return
 
-        created_ids =
-
-
-
-            "
-
-
-
+        created_ids = [
+            created_id
+            for batch in make_batches(elements, "element", batch_size)
+            for created_id in self.api_client.request(
+                "CreateElements",
+                id=parent.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "elements": batch,
+                },
+            )
+        ]
 
         if self.use_cache:
             # Create the image as needed and handle both an Element and a CachedElement
@@ -311,7 +340,7 @@ class ElementMixin:
             logger.warning("Cannot link elements as this worker is in read-only mode")
             return
 
-        return self.request(
+        return self.api_client.request(
             "CreateElementParent",
             parent=parent.id,
             child=child.id,
@@ -383,7 +412,7 @@ class ElementMixin:
             logger.warning("Cannot update element as this worker is in read-only mode")
             return
 
-        updated_element = self.request(
+        updated_element = self.api_client.request(
             "PartialUpdateElement",
             id=element.id,
             body=kwargs,
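
The batched ``create_elements`` follows the same pattern; a minimal sketch, where the element name, type slug and polygon are illustrative only:

    from arkindex_worker.models import Element
    from arkindex_worker.utils import DEFAULT_BATCH_SIZE
    from arkindex_worker.worker import ElementsWorker

    class DemoSegmenter(ElementsWorker):  # hypothetical worker
        def process_element(self, element: Element):
            created = self.create_elements(
                parent=element,
                elements=[
                    {
                        "name": "1",
                        "type": "text_line",  # assumed to exist in the corpus
                        "polygon": [[0, 0], [0, 100], [400, 100], [400, 0]],
                    },
                ],
                batch_size=DEFAULT_BATCH_SIZE,  # 50; lower it if payloads grow too large
            )
            # `created` is a list of {"id": ...} dicts, per the docstring above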