arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their respective public registries. It is provided for informational purposes only.
@@ -11,6 +11,7 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 
 
 class TextOrientation(Enum):
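
The import above pulls in the batching helpers that the rest of this diff relies on. Their implementation is not part of this diff; a minimal sketch of what they plausibly look like, inferred only from how they are called below (`make_batches(payload, "transcription", batch_size)` slicing a payload, and `@batch_publication` guarding a `batch_size` keyword), could be:

```python
# Hypothetical sketch only: the real helpers live in arkindex_worker.utils
# and are not shown in this diff.
from collections.abc import Iterator
from functools import wraps

DEFAULT_BATCH_SIZE = 50  # assumed value, not confirmed by this diff


def batch_publication(func):
    """Validate the ``batch_size`` keyword before running a publication method."""

    @wraps(func)
    def wrapper(*args, batch_size: int = DEFAULT_BATCH_SIZE, **kwargs):
        # Reject nonsensical batch sizes up front instead of failing mid-publication
        assert isinstance(batch_size, int) and batch_size > 0, "batch_size must be a positive integer"
        return func(*args, batch_size=batch_size, **kwargs)

    return wrapper


def make_batches(items: list, label: str, batch_size: int) -> Iterator[list]:
    """Yield successive ``batch_size``-sized slices of ``items``.

    The ``label`` argument presumably names the published objects in log messages.
    """
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]
```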
@@ -109,9 +110,11 @@ class TranscriptionMixin:
 
         return created
 
+    @batch_publication
     def create_transcriptions(
         self,
         transcriptions: list[dict[str, str | float | TextOrientation | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float]]:
         """
         Create multiple transcriptions at once on existing elements through the API,
@@ -128,6 +131,8 @@ class TranscriptionMixin:
             orientation (TextOrientation)
                 Optional. Orientation of the transcription's text.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """
 
@@ -171,13 +176,19 @@ class TranscriptionMixin:
             )
             return
 
-        created_trs = self.api_client.request(
-            "CreateTranscriptions",
-            body={
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-            },
-        )["transcriptions"]
+        created_trs = [
+            created_tr
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for created_tr in self.api_client.request(
+                "CreateTranscriptions",
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                },
+            )["transcriptions"]
+        ]
 
         if self.use_cache:
             # Store transcriptions in local cache
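
The nested comprehension introduced above fans `transcriptions_payload` out into batches, issues one `CreateTranscriptions` call per batch, and flattens the per-batch responses back into a single list, preserving input order. A self-contained demonstration of that batch-and-flatten pattern, using plain slicing in place of `make_batches` and a stub in place of the API client (both stand-ins, not the library's code):

```python
# Stub standing in for self.api_client.request("CreateTranscriptions", ...);
# it simply echoes the batch it receives, like the endpoint returns created rows.
def fake_request(batch: list[dict]) -> dict:
    return {"transcriptions": batch}


payload = [{"text": f"line {i}"} for i in range(5)]
batch_size = 2

created = [
    created_tr
    # Plain slicing stands in for make_batches(payload, "transcription", batch_size)
    for start in range(0, len(payload), batch_size)
    for created_tr in fake_request(payload[start : start + batch_size])["transcriptions"]
]

# Three calls went out (batches of 2, 2 and 1) and input order is preserved
assert created == payload
```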
@@ -201,11 +212,13 @@ class TranscriptionMixin:
 
         return created_trs
 
+    @batch_publication
     def create_element_transcriptions(
         self,
         element: Element | CachedElement,
         sub_element_type: str,
         transcriptions: list[dict[str, str | float]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> dict[str, str | bool]:
         """
         Create multiple elements and transcriptions at once on a single parent element through the API.
@@ -225,6 +238,8 @@ class TranscriptionMixin:
             element_confidence (float)
                 Optional. Confidence score of the element between 0 and 1.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
         assert element and isinstance(
@@ -291,16 +306,22 @@ class TranscriptionMixin:
             )
             return
 
-        annotations = self.api_client.request(
-            "CreateElementTranscriptions",
-            id=element.id,
-            body={
-                "element_type": sub_element_type,
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-                "return_elements": True,
-            },
-        )
+        annotations = [
+            annotation
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for annotation in self.api_client.request(
+                "CreateElementTranscriptions",
+                id=element.id,
+                body={
+                    "element_type": sub_element_type,
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                    "return_elements": True,
+                },
+            )
+        ]
 
         for annotation in annotations:
             if annotation["created"]:
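
`create_element_transcriptions` gets the same treatment: the single bulk call becomes one call per batch, with the flattened `annotations` list keeping its original shape for the cache-handling loop that follows. A hypothetical usage sketch (the `worker` instance, `page` element, and line data are placeholders, not taken from this diff):

```python
# Hypothetical: `worker` is a configured ElementsWorker using TranscriptionMixin,
# `page` an Element it is processing; the polygons and texts below are made up.
detected_lines = [
    ([[0, 0], [0, 10], [100, 10], [100, 0]], "first line"),
    ([[0, 12], [0, 22], [100, 22], [100, 12]], "second line"),
    ([[0, 24], [0, 34], [100, 34], [100, 24]], "third line"),
]
annotations = worker.create_element_transcriptions(
    element=page,
    sub_element_type="text_line",
    transcriptions=[
        {"polygon": polygon, "text": text, "confidence": 0.9}
        for polygon, text in detected_lines
    ],
    batch_size=2,  # the three transcriptions go out in two API calls
)
```

The remaining hunks apply to a second file, a classifications test module whose path is not shown in this diff; they parametrize the existing tests over `batch_size` to cover both the single-call default and the one-item-per-call extreme.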
@@ -7,6 +7,7 @@ from apistar.exceptions import ErrorResponse
 
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE
 from tests import CORPUS_ID
 
 from . import BASE_API_CALLS
@@ -692,7 +693,8 @@ def test_create_classifications_create_ml_class(mock_elements_worker, responses)
     }
 
 
-def test_create_classifications(responses, mock_elements_worker):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_classifications(batch_size, responses, mock_elements_worker):
     mock_elements_worker.classes = {"portrait": "0000", "landscape": "1111"}
     elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
     responses.add(
@@ -716,62 +718,98 @@ def test_create_classifications(responses, mock_elements_worker):
                 "high_confidence": False,
             },
         ],
+        batch_size=batch_size,
     )
 
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/classification/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
 
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
+    second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
+    empty_payload = {
         "parent": str(elt.id),
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "classifications": [
-            {
-                "confidence": 0.75,
-                "high_confidence": False,
-                "ml_class": "0000",
-            },
-            {
-                "confidence": 0.25,
-                "high_confidence": False,
-                "ml_class": "1111",
-            },
-        ],
+        "classifications": [],
     }
 
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "classifications": [first_cl]})
+        bodies.append({**empty_payload, "classifications": [second_cl]})
+
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
+
 
-def test_create_classifications_with_cache(responses, mock_elements_worker_with_cache):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_classifications_with_cache(
+    batch_size, responses, mock_elements_worker_with_cache
+):
     mock_elements_worker_with_cache.classes = {"portrait": "0000", "landscape": "1111"}
     elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
 
-    responses.add(
-        responses.POST,
-        "http://testserver/api/v1/classification/bulk/",
-        status=200,
-        json={
-            "parent": str(elt.id),
-            "worker_run_id": "56785678-5678-5678-5678-567856785678",
-            "classifications": [
-                {
-                    "id": "00000000-0000-0000-0000-000000000000",
-                    "ml_class": "0000",
-                    "confidence": 0.75,
-                    "high_confidence": False,
-                    "state": "pending",
-                },
-                {
-                    "id": "11111111-1111-1111-1111-111111111111",
-                    "ml_class": "1111",
-                    "confidence": 0.25,
-                    "high_confidence": False,
-                    "state": "pending",
+    if batch_size > 1:
+        responses.add(
+            responses.POST,
+            "http://testserver/api/v1/classification/bulk/",
+            status=200,
+            json={
+                "parent": str(elt.id),
+                "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                "classifications": [
+                    {
+                        "id": "00000000-0000-0000-0000-000000000000",
+                        "ml_class": "0000",
+                        "confidence": 0.75,
+                        "high_confidence": False,
+                        "state": "pending",
+                    },
+                    {
+                        "id": "11111111-1111-1111-1111-111111111111",
+                        "ml_class": "1111",
+                        "confidence": 0.25,
+                        "high_confidence": False,
+                        "state": "pending",
+                    },
+                ],
+            },
+        )
+    else:
+        for cl_id, cl_class, cl_conf in [
+            ("00000000-0000-0000-0000-000000000000", "0000", 0.75),
+            ("11111111-1111-1111-1111-111111111111", "1111", 0.25),
+        ]:
+            responses.add(
+                responses.POST,
+                "http://testserver/api/v1/classification/bulk/",
+                status=200,
+                json={
+                    "parent": str(elt.id),
+                    "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                    "classifications": [
+                        {
+                            "id": cl_id,
+                            "ml_class": cl_class,
+                            "confidence": cl_conf,
+                            "high_confidence": False,
+                            "state": "pending",
+                        },
+                    ],
                 },
-            ],
-        },
-    )
+            )
 
     mock_elements_worker_with_cache.create_classifications(
         element=elt,
@@ -787,32 +825,41 @@ def test_create_classifications_with_cache(responses, mock_elements_worker_with_cache):
                 "high_confidence": False,
             },
         ],
+        batch_size=batch_size,
     )
 
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/classification/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
 
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
+    second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
+    empty_payload = {
         "parent": str(elt.id),
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "classifications": [
-            {
-                "confidence": 0.75,
-                "high_confidence": False,
-                "ml_class": "0000",
-            },
-            {
-                "confidence": 0.25,
-                "high_confidence": False,
-                "ml_class": "1111",
-            },
-        ],
+        "classifications": [],
     }
 
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "classifications": [first_cl]})
+        bodies.append({**empty_payload, "classifications": [second_cl]})
+
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
+
     # Check that created classifications were properly stored in SQLite cache
     assert list(CachedClassification.select()) == [
         CachedClassification(