arkindex-base-worker 0.4.0a2__py3-none-any.whl → 0.4.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,12 @@ from arkindex_worker.cache import (
     unsupported_cache,
 )
 from arkindex_worker.models import Element, Transcription
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class Entity(TypedDict):
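
Note: the `arkindex_worker.utils` helpers imported above are not shown in this diff. Judging only from their call sites in the hunks below, they plausibly behave like the following sketch; the function bodies, the logging, and the `DEFAULT_BATCH_SIZE` value are assumptions, not the package's actual implementation.

from functools import wraps
from logging import getLogger

logger = getLogger(__name__)

# Assumed default batch size; the real value lives in arkindex_worker.utils.
DEFAULT_BATCH_SIZE = 50


def pluralize(word: str, count: int) -> str:
    # Naive pluralization, enough for "entity" -> "entities" and "type" -> "types"
    if count > 1:
        return word[:-1] + "ies" if word.endswith("y") else word + "s"
    return word


def make_batches(objects: list, singular_name: str, batch_size: int):
    # Yield consecutive slices of at most batch_size items; the singular name
    # is presumably only used for progress logging
    for start in range(0, len(objects), batch_size):
        batch = objects[start : start + batch_size]
        logger.info(f"Processing {len(batch)} {pluralize(singular_name, len(batch))}")
        yield batch


def batch_publication(func):
    # Validate the batch_size keyword before running the decorated publication helper
    @wraps(func)
    def wrapper(*args, batch_size: int = DEFAULT_BATCH_SIZE, **kwargs):
        assert (
            isinstance(batch_size, int) and batch_size > 0
        ), "batch_size should be a strictly positive integer"
        return func(*args, batch_size=batch_size, **kwargs)

    return wrapper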
@@ -48,6 +54,7 @@ class EntityMixin:
         if not self.entity_types:
             # Load entity_types of corpus
             self.list_corpus_entity_types()
+
         for entity_type in entity_types:
             # Do nothing if type already exists
             if entity_type in self.entity_types:
@@ -60,7 +67,7 @@ class EntityMixin:
             )
 
             # Create type if non-existent
-            self.entity_types[entity_type] = self.request(
+            self.entity_types[entity_type] = self.api_client.request(
                 "CreateEntityType",
                 body={
                     "name": entity_type,
@@ -106,7 +113,7 @@ class EntityMixin:
         entity_type_id = self.entity_types.get(type)
         assert entity_type_id, f"Entity type `{type}` not found in the corpus."
 
-        entity = self.request(
+        entity = self.api_client.request(
             "CreateEntity",
             body={
                 "name": name,
@@ -188,7 +195,7 @@ class EntityMixin:
         if confidence is not None:
             body["confidence"] = confidence
 
-        transcription_ent = self.request(
+        transcription_ent = self.api_client.request(
             "CreateTranscriptionEntity",
             id=transcription.id,
             body=body,
@@ -212,10 +219,12 @@ class EntityMixin:
         return transcription_ent
 
     @unsupported_cache
+    @batch_publication
     def create_transcription_entities(
         self,
         transcription: Transcription,
         entities: list[Entity],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple entities attached to a transcription in a single API request.
@@ -238,6 +247,8 @@ class EntityMixin:
             confidence (float or None)
                 Optional confidence score, between 0.0 and 1.0.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
         """
         assert transcription and isinstance(
@@ -289,16 +300,20 @@ class EntityMixin:
             )
             return
 
-        created_ids = self.request(
-            "CreateTranscriptionEntities",
-            id=transcription.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "entities": entities,
-            },
-        )
+        created_entities = [
+            created_entity
+            for batch in make_batches(entities, "entities", batch_size)
+            for created_entity in self.api_client.request(
+                "CreateTranscriptionEntities",
+                id=transcription.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "entities": batch,
+                },
+            )["entities"]
+        ]
 
-        return created_ids["entities"]
+        return created_entities
 
     def list_transcription_entities(
         self,
@@ -382,12 +397,10 @@ class EntityMixin:
         }
         count = len(self.entities)
         logger.info(
-            f'Loaded {count} entit{"ies" if count > 1 else "y"} in corpus ({self.corpus_id})'
+            f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
         )
 
-    def list_corpus_entity_types(
-        self,
-    ):
+    def list_corpus_entity_types(self):
         """
         Loads available entity types in corpus.
         """
@@ -399,5 +412,5 @@ class EntityMixin:
         }
         count = len(self.entity_types)
         logger.info(
-            f'Loaded {count} entity type{"s"[:count>1]} in corpus ({self.corpus_id}).'
+            f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )
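
Taken together, these hunks let a worker cap the payload of each `CreateTranscriptionEntities` request instead of sending all entities at once. A hypothetical call, where `worker`, `transcription` and `entities` are assumed to be set up elsewhere and the 100-item limit is purely illustrative:

# Sends ceil(len(entities) / 100) API requests and returns the
# flattened list of created objects, same shape as before the change.
created = worker.create_transcription_entities(
    transcription=transcription,
    entities=entities,
    batch_size=100,
)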
@@ -7,6 +7,7 @@ from enum import Enum
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 
 
 class MetaType(Enum):
@@ -93,7 +94,7 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return
 
-        metadata = self.request(
+        metadata = self.api_client.request(
             "CreateMetaData",
             id=element.id,
             body={
@@ -108,10 +109,12 @@ class MetaDataMixin:
         return metadata["id"]
 
     @unsupported_cache
+    @batch_publication
     def create_metadata_bulk(
         self,
         element: Element | CachedElement,
         metadata_list: list[dict[str, MetaType | str | int | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.
@@ -123,6 +126,9 @@ class MetaDataMixin:
         - name: str
         - value: str | int | float
         - entity_id: str | None
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
+        :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
         """
         assert element and isinstance(
             element, Element | CachedElement
@@ -168,14 +174,18 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return
 
-        created_metadata_list = self.request(
-            "CreateMetaDataBulk",
-            id=element.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "metadata_list": metas,
-            },
-        )["metadata_list"]
+        created_metadata_list = [
+            created_metadata
+            for batch in make_batches(metas, "metadata", batch_size)
+            for created_metadata in self.api_client.request(
+                "CreateMetaDataBulk",
+                id=element.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "metadata_list": batch,
+                },
+            )["metadata_list"]
+        ]
 
         return created_metadata_list
 
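
The rewrite keeps the return shape intact: the per-batch `metadata_list` responses are flattened back into a single list, so callers only notice the extra HTTP requests. A hypothetical call (the element, metadata names and values are illustrative):

# Both metadata fit in one batch, since 2 <= batch_size
worker.create_metadata_bulk(
    element=element,
    metadata_list=[
        {"type": MetaType.Text, "name": "language", "value": "fra"},
        {"type": MetaType.Numeric, "name": "folio", "value": 3},
    ],
    batch_size=50,
)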
@@ -22,7 +22,7 @@ class TaskMixin:
             task_id, uuid.UUID
         ), "task_id shouldn't be null and should be an UUID"
 
-        results = self.request("ListArtifacts", id=task_id)
+        results = self.api_client.request("ListArtifacts", id=task_id)
 
         return map(Artifact, results)
 
@@ -43,4 +43,6 @@ class TaskMixin:
             artifact, Artifact
         ), "artifact shouldn't be null and should be an Artifact"
 
-        return self.request("DownloadArtifact", id=task_id, path=artifact.path)
+        return self.api_client.request(
+            "DownloadArtifact", id=task_id, path=artifact.path
+        )
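
Both hunks above follow the release's other recurring change: every `self.request(...)` call site is rewritten to go through the worker's `api_client` attribute. Nothing in this diff shows whether the old helper is kept around; if it were, a backward-compatibility shim could look like this purely hypothetical sketch:

import warnings

class BaseWorkerShim:
    # Hypothetical: forward legacy self.request() calls to the API client
    def request(self, *args, **kwargs):
        warnings.warn(
            "request() is deprecated, use api_client.request() instead",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.api_client.request(*args, **kwargs)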
@@ -185,7 +185,7 @@ class TrainingMixin:
         assert not self.model_version, "A model version has already been created."
 
         configuration = configuration or {}
-        self.model_version = self.request(
+        self.model_version = self.api_client.request(
             "CreateModelVersion",
             id=model_id,
             body=build_clean_payload(
@@ -217,7 +217,7 @@ class TrainingMixin:
         :param parent: ID of the parent model version
         """
         assert self.model_version, "No model version has been created yet."
-        self.model_version = self.request(
+        self.model_version = self.api_client.request(
             "UpdateModelVersion",
             id=self.model_version["id"],
             body=build_clean_payload(
@@ -273,7 +273,7 @@ class TrainingMixin:
         """
         assert self.model_version, "You must create the model version and upload its archive before validating it."
         try:
-            self.model_version = self.request(
+            self.model_version = self.api_client.request(
                 "PartialUpdateModelVersion",
                 id=self.model_version["id"],
                 body={
@@ -294,7 +294,7 @@ class TrainingMixin:
             pending_version_id = self.model_version["id"]
             logger.warning("Removing the pending model version.")
             try:
-                self.request("DestroyModelVersion", id=pending_version_id)
+                self.api_client.request("DestroyModelVersion", id=pending_version_id)
             except ErrorResponse as e:
                 msg = getattr(e, "content", str(e))
                 logger.error(
@@ -304,7 +304,7 @@ class TrainingMixin:
             logger.info("Retrieving the existing model version.")
             existing_version_id = model_version["id"].pop()
             try:
-                self.model_version = self.request(
+                self.model_version = self.api_client.request(
                     "RetrieveModelVersion", id=existing_version_id
                 )
             except ErrorResponse as e:
@@ -11,6 +11,7 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 
 
 class TextOrientation(Enum):
@@ -77,7 +78,7 @@ class TranscriptionMixin:
             )
             return
 
-        created = self.request(
+        created = self.api_client.request(
             "CreateTranscription",
             id=element.id,
             body={
@@ -109,9 +110,11 @@ class TranscriptionMixin:
 
         return created
 
+    @batch_publication
     def create_transcriptions(
         self,
         transcriptions: list[dict[str, str | float | TextOrientation | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float]]:
         """
         Create multiple transcriptions at once on existing elements through the API,
@@ -128,6 +131,8 @@ class TranscriptionMixin:
             orientation (TextOrientation)
                 Optional. Orientation of the transcription's text.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """
 
@@ -171,13 +176,19 @@ class TranscriptionMixin:
             )
             return
 
-        created_trs = self.request(
-            "CreateTranscriptions",
-            body={
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-            },
-        )["transcriptions"]
+        created_trs = [
+            created_tr
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for created_tr in self.api_client.request(
+                "CreateTranscriptions",
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                },
+            )["transcriptions"]
+        ]
 
         if self.use_cache:
             # Store transcriptions in local cache
@@ -201,11 +212,13 @@ class TranscriptionMixin:
 
         return created_trs
 
+    @batch_publication
     def create_element_transcriptions(
         self,
         element: Element | CachedElement,
         sub_element_type: str,
         transcriptions: list[dict[str, str | float]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> dict[str, str | bool]:
         """
         Create multiple elements and transcriptions at once on a single parent element through the API.
@@ -225,6 +238,8 @@ class TranscriptionMixin:
             element_confidence (float)
                 Optional. Confidence score of the element between 0 and 1.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
         assert element and isinstance(
@@ -291,16 +306,22 @@ class TranscriptionMixin:
             )
             return
 
-        annotations = self.request(
-            "CreateElementTranscriptions",
-            id=element.id,
-            body={
-                "element_type": sub_element_type,
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-                "return_elements": True,
-            },
-        )
+        annotations = [
+            annotation
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for annotation in self.api_client.request(
+                "CreateElementTranscriptions",
+                id=element.id,
+                body={
+                    "element_type": sub_element_type,
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                    "return_elements": True,
+                },
+            )
+        ]
 
         for annotation in annotations:
             if annotation["created"]:
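
As with the other bulk endpoints, each batch becomes a separate HTTP request, so the request count grows as ceil(n / batch_size). A quick sketch of that arithmetic, using the assumed default of 50 from the utils sketch above:

import math

DEFAULT_BATCH_SIZE = 50  # assumed value

for n in (10, 50, 51, 120):
    calls = math.ceil(n / DEFAULT_BATCH_SIZE)
    print(f"{n} transcriptions -> {calls} CreateElementTranscriptions call(s)")
# 10 -> 1 call, 50 -> 1 call, 51 -> 2 calls, 120 -> 3 calls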
@@ -34,7 +34,9 @@ class WorkerVersionMixin:
         if worker_version_id in self._worker_version_cache:
             return self._worker_version_cache[worker_version_id]
 
-        worker_version = self.request("RetrieveWorkerVersion", id=worker_version_id)
+        worker_version = self.api_client.request(
+            "RetrieveWorkerVersion", id=worker_version_id
+        )
         self._worker_version_cache[worker_version_id] = worker_version
 
         return worker_version
tests/test_base_worker.py CHANGED
@@ -658,7 +658,7 @@ def test_find_extras_directory_not_found(monkeypatch, extras_path, exists, error
 def test_find_parents_file_paths(responses, mock_base_worker_with_cache, tmp_path):
     responses.add(
         responses.GET,
-        "http://testserver/api/v1/task/my_task/from-agent/",
+        "http://testserver/api/v1/task/my_task/",
         status=200,
         json={"parents": ["first", "second", "third"]},
     )
@@ -7,6 +7,7 @@ from apistar.exceptions import ErrorResponse
 
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE
 from tests import CORPUS_ID
 
 from . import BASE_API_CALLS
@@ -692,7 +693,8 @@ def test_create_classifications_create_ml_class(mock_elements_worker, responses)
 }
 
 
-def test_create_classifications(responses, mock_elements_worker):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_classifications(batch_size, responses, mock_elements_worker):
     mock_elements_worker.classes = {"portrait": "0000", "landscape": "1111"}
     elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
     responses.add(
@@ -716,62 +718,98 @@ def test_create_classifications(responses, mock_elements_worker)
                 "high_confidence": False,
             },
         ],
+        batch_size=batch_size,
     )
 
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/classification/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
 
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
+    second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
+    empty_payload = {
         "parent": str(elt.id),
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "classifications": [
-            {
-                "confidence": 0.75,
-                "high_confidence": False,
-                "ml_class": "0000",
-            },
-            {
-                "confidence": 0.25,
-                "high_confidence": False,
-                "ml_class": "1111",
-            },
-        ],
+        "classifications": [],
     }
 
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "classifications": [first_cl]})
+        bodies.append({**empty_payload, "classifications": [second_cl]})
+
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
+
 
-def test_create_classifications_with_cache(responses, mock_elements_worker_with_cache):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_classifications_with_cache(
+    batch_size, responses, mock_elements_worker_with_cache
+):
     mock_elements_worker_with_cache.classes = {"portrait": "0000", "landscape": "1111"}
     elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
 
-    responses.add(
-        responses.POST,
-        "http://testserver/api/v1/classification/bulk/",
-        status=200,
-        json={
-            "parent": str(elt.id),
-            "worker_run_id": "56785678-5678-5678-5678-567856785678",
-            "classifications": [
-                {
-                    "id": "00000000-0000-0000-0000-000000000000",
-                    "ml_class": "0000",
-                    "confidence": 0.75,
-                    "high_confidence": False,
-                    "state": "pending",
-                },
-                {
-                    "id": "11111111-1111-1111-1111-111111111111",
-                    "ml_class": "1111",
-                    "confidence": 0.25,
-                    "high_confidence": False,
-                    "state": "pending",
+    if batch_size > 1:
+        responses.add(
+            responses.POST,
+            "http://testserver/api/v1/classification/bulk/",
+            status=200,
+            json={
+                "parent": str(elt.id),
+                "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                "classifications": [
+                    {
+                        "id": "00000000-0000-0000-0000-000000000000",
+                        "ml_class": "0000",
+                        "confidence": 0.75,
+                        "high_confidence": False,
+                        "state": "pending",
+                    },
+                    {
+                        "id": "11111111-1111-1111-1111-111111111111",
+                        "ml_class": "1111",
+                        "confidence": 0.25,
+                        "high_confidence": False,
+                        "state": "pending",
+                    },
+                ],
+            },
+        )
+    else:
+        for cl_id, cl_class, cl_conf in [
+            ("00000000-0000-0000-0000-000000000000", "0000", 0.75),
+            ("11111111-1111-1111-1111-111111111111", "1111", 0.25),
+        ]:
+            responses.add(
+                responses.POST,
+                "http://testserver/api/v1/classification/bulk/",
+                status=200,
+                json={
+                    "parent": str(elt.id),
+                    "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                    "classifications": [
+                        {
+                            "id": cl_id,
+                            "ml_class": cl_class,
+                            "confidence": cl_conf,
+                            "high_confidence": False,
+                            "state": "pending",
+                        },
+                    ],
                 },
-            ],
-        },
-    )
+            )
 
     mock_elements_worker_with_cache.create_classifications(
         element=elt,
@@ -787,32 +825,41 @@ def test_create_classifications_with_cache(responses, mock_elements_worker_with_
                 "high_confidence": False,
             },
         ],
+        batch_size=batch_size,
     )
 
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/classification/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
 
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
+    second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
+    empty_payload = {
         "parent": str(elt.id),
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "classifications": [
-            {
-                "confidence": 0.75,
-                "high_confidence": False,
-                "ml_class": "0000",
-            },
-            {
-                "confidence": 0.25,
-                "high_confidence": False,
-                "ml_class": "1111",
-            },
-        ],
+        "classifications": [],
     }
 
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "classifications": [first_cl]})
+        bodies.append({**empty_payload, "classifications": [second_cl]})
+
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
+
     # Check that created classifications were properly stored in SQLite cache
     assert list(CachedClassification.select()) == [
         CachedClassification(