PyPI - arkindex-base-worker - Versions diffs - 0.4.0b1__py3-none-any.whl → 0.4.0b3__py3-none-any.whl - Mend

arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/METADATA +1 -1
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/RECORD +19 -19
arkindex_worker/image.py +2 -1
arkindex_worker/utils.py +81 -0
arkindex_worker/worker/__init__.py +3 -2
arkindex_worker/worker/classification.py +31 -15
arkindex_worker/worker/element.py +71 -10
arkindex_worker/worker/entity.py +25 -11
arkindex_worker/worker/metadata.py +18 -8
arkindex_worker/worker/transcription.py +38 -17
tests/test_elements_worker/test_classifications.py +107 -60
tests/test_elements_worker/test_elements.py +318 -49
tests/test_elements_worker/test_entities.py +102 -33
tests/test_elements_worker/test_metadata.py +223 -98
tests/test_elements_worker/test_transcriptions.py +293 -143
tests/test_utils.py +28 -0
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/LICENSE +0 -0
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/WHEEL +0 -0
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/top_level.txt +0 -0

tests/test_elements_worker/test_transcriptions.py CHANGED Viewed

@@ -8,6 +8,7 @@ from playhouse.shortcuts import model_to_dict
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE
 from arkindex_worker.worker.transcription import TextOrientation
 from . import BASE_API_CALLS
@@ -639,9 +640,10 @@ def test_create_transcriptions_api_error(responses, mock_elements_worker):
     ] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/transcription/bulk/")]
-def test_create_transcriptions(responses, mock_elements_worker_with_cache):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_transcriptions(batch_size, responses, mock_elements_worker_with_cache):
     CachedElement.create(id="11111111-1111-1111-1111-111111111111", type="thing")
-    trans = [
+    transcriptions = [
         {
             "element_id": "11111111-1111-1111-1111-111111111111",
             "text": "The",
@@ -654,60 +656,110 @@ def test_create_transcriptions(responses, mock_elements_worker_with_cache):
         },
     ]
-    responses.add(
-        responses.POST,
-        "http://testserver/api/v1/transcription/bulk/",
-        status=200,
-        json={
-            "worker_run_id": "56785678-5678-5678-5678-567856785678",
-            "transcriptions": [
-                {
-                    "id": "00000000-0000-0000-0000-000000000000",
-                    "element_id": "11111111-1111-1111-1111-111111111111",
-                    "text": "The",
-                    "orientation": "horizontal-lr",
-                    "confidence": 0.75,
-                },
-                {
-                    "id": "11111111-1111-1111-1111-111111111111",
-                    "element_id": "11111111-1111-1111-1111-111111111111",
-                    "text": "word",
-                    "orientation": "horizontal-lr",
-                    "confidence": 0.42,
-                },
+    if batch_size > 1:
+        responses.add(
+            responses.POST,
+            "http://testserver/api/v1/transcription/bulk/",
+            status=200,
+            json={
+                "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                "transcriptions": [
+                    {
+                        "id": "00000000-0000-0000-0000-000000000000",
+                        "element_id": "11111111-1111-1111-1111-111111111111",
+                        "text": "The",
+                        "orientation": "horizontal-lr",
+                        "confidence": 0.75,
+                    },
+                    {
+                        "id": "11111111-1111-1111-1111-111111111111",
+                        "element_id": "11111111-1111-1111-1111-111111111111",
+                        "text": "word",
+                        "orientation": "horizontal-lr",
+                        "confidence": 0.42,
+                    },
+                ],
+            },
+        )
+    else:
+        for tr, tr_id in zip(
+            transcriptions,
+            [
+                "00000000-0000-0000-0000-000000000000",
+                "11111111-1111-1111-1111-111111111111",
             ],
-        },
-    )
+            strict=False,
+        ):
+            responses.add(
+                responses.POST,
+                "http://testserver/api/v1/transcription/bulk/",
+                status=200,
+                json={
+                    "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                    "transcriptions": [
+                        {
+                            "id": tr_id,
+                            "element_id": tr["element_id"],
+                            "text": tr["text"],
+                            "orientation": "horizontal-lr",
+                            "confidence": tr["confidence"],
+                        }
+                    ],
+                },
+            )
     mock_elements_worker_with_cache.create_transcriptions(
-        transcriptions=trans,
+        transcriptions=transcriptions,
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [
+        (
+            "POST",
+            "http://testserver/api/v1/transcription/bulk/",
+        )
+    ]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(
+            (
+                "POST",
+                "http://testserver/api/v1/transcription/bulk/",
+            )
+        )
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/transcription/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_tr = {
+        **transcriptions[0],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    second_tr = {
+        **transcriptions[1],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    empty_payload = {
+        "transcriptions": [],
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "transcriptions": [
-            {
-                "element_id": "11111111-1111-1111-1111-111111111111",
-                "text": "The",
-                "confidence": 0.75,
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-            },
-            {
-                "element_id": "11111111-1111-1111-1111-111111111111",
-                "text": "word",
-                "confidence": 0.42,
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-            },
-        ],
     }
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "transcriptions": [first_tr, second_tr]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "transcriptions": [first_tr]})
+        bodies.append({**empty_payload, "transcriptions": [second_tr]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
     # Check that created transcriptions were properly stored in SQLite cache
     assert list(CachedTranscription.select()) == [
         CachedTranscription(
@@ -1281,70 +1333,119 @@ def test_create_element_transcriptions_api_error(responses, mock_elements_worker
     ]
-def test_create_element_transcriptions(responses, mock_elements_worker):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 2])
+def test_create_element_transcriptions(batch_size, responses, mock_elements_worker):
     elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
-    responses.add(
-        responses.POST,
-        f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
-        status=200,
-        json=[
-            {
-                "id": "56785678-5678-5678-5678-567856785678",
-                "element_id": "11111111-1111-1111-1111-111111111111",
-                "created": True,
-            },
-            {
-                "id": "67896789-6789-6789-6789-678967896789",
-                "element_id": "22222222-2222-2222-2222-222222222222",
-                "created": False,
-            },
-            {
-                "id": "78907890-7890-7890-7890-789078907890",
-                "element_id": "11111111-1111-1111-1111-111111111111",
-                "created": True,
-            },
-        ],
-    )
+    if batch_size > 2:
+        responses.add(
+            responses.POST,
+            f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+            status=200,
+            json=[
+                {
+                    "id": "56785678-5678-5678-5678-567856785678",
+                    "element_id": "11111111-1111-1111-1111-111111111111",
+                    "created": True,
+                },
+                {
+                    "id": "67896789-6789-6789-6789-678967896789",
+                    "element_id": "22222222-2222-2222-2222-222222222222",
+                    "created": False,
+                },
+                {
+                    "id": "78907890-7890-7890-7890-789078907890",
+                    "element_id": "11111111-1111-1111-1111-111111111111",
+                    "created": True,
+                },
+            ],
+        )
+    else:
+        for transcriptions in [
+            [
+                ("56785678-5678-5678-5678-567856785678", True),
+                ("67896789-6789-6789-6789-678967896789", False),
+            ],
+            [("78907890-7890-7890-7890-789078907890", True)],
+        ]:
+            responses.add(
+                responses.POST,
+                f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+                status=200,
+                json=[
+                    {
+                        "id": tr_id,
+                        "element_id": "11111111-1111-1111-1111-111111111111"
+                        if created
+                        else "22222222-2222-2222-2222-222222222222",
+                        "created": created,
+                    }
+                    for tr_id, created in transcriptions
+                ],
+            )
     annotations = mock_elements_worker.create_element_transcriptions(
         element=elt,
         sub_element_type="page",
         transcriptions=TRANSCRIPTIONS_SAMPLE,
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [
+        (
+            "POST",
+            f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+        )
+    ]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(
+            (
+                "POST",
+                f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+            )
+        )
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_tr = {
+        **TRANSCRIPTIONS_SAMPLE[0],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    second_tr = {
+        **TRANSCRIPTIONS_SAMPLE[1],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    third_tr = {
+        **TRANSCRIPTIONS_SAMPLE[2],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    empty_payload = {
         "element_type": "page",
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "transcriptions": [
-            {
-                "polygon": [[100, 150], [700, 150], [700, 200], [100, 200]],
-                "confidence": 0.5,
-                "text": "The",
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-            },
-            {
-                "polygon": [[0, 0], [2000, 0], [2000, 3000], [0, 3000]],
-                "confidence": 0.75,
-                "text": "first",
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-                "element_confidence": 0.75,
-            },
-            {
-                "polygon": [[1000, 300], [1200, 300], [1200, 500], [1000, 500]],
-                "confidence": 0.9,
-                "text": "line",
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-            },
-        ],
+        "transcriptions": [],
         "return_elements": True,
     }
+    bodies = []
+    first_call_idx = None
+    if batch_size > 2:
+        first_call_idx = -1
+        bodies.append(
+            {**empty_payload, "transcriptions": [first_tr, second_tr, third_tr]}
+        )
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "transcriptions": [first_tr, second_tr]})
+        bodies.append({**empty_payload, "transcriptions": [third_tr]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
     assert annotations == [
         {
             "id": "56785678-5678-5678-5678-567856785678",
@@ -1364,73 +1465,122 @@ def test_create_element_transcriptions(responses, mock_elements_worker):
     ]
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 2])
 def test_create_element_transcriptions_with_cache(
-    responses, mock_elements_worker_with_cache
+    batch_size, responses, mock_elements_worker_with_cache
 ):
     elt = CachedElement(id="12341234-1234-1234-1234-123412341234", type="thing")
-    responses.add(
-        responses.POST,
-        f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
-        status=200,
-        json=[
-            {
-                "id": "56785678-5678-5678-5678-567856785678",
-                "element_id": "11111111-1111-1111-1111-111111111111",
-                "created": True,
-            },
-            {
-                "id": "67896789-6789-6789-6789-678967896789",
-                "element_id": "22222222-2222-2222-2222-222222222222",
-                "created": False,
-            },
-            {
-                "id": "78907890-7890-7890-7890-789078907890",
-                "element_id": "11111111-1111-1111-1111-111111111111",
-                "created": True,
-            },
-        ],
-    )
+    if batch_size > 2:
+        responses.add(
+            responses.POST,
+            f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+            status=200,
+            json=[
+                {
+                    "id": "56785678-5678-5678-5678-567856785678",
+                    "element_id": "11111111-1111-1111-1111-111111111111",
+                    "created": True,
+                },
+                {
+                    "id": "67896789-6789-6789-6789-678967896789",
+                    "element_id": "22222222-2222-2222-2222-222222222222",
+                    "created": False,
+                },
+                {
+                    "id": "78907890-7890-7890-7890-789078907890",
+                    "element_id": "11111111-1111-1111-1111-111111111111",
+                    "created": True,
+                },
+            ],
+        )
+    else:
+        for transcriptions in [
+            [
+                ("56785678-5678-5678-5678-567856785678", True),
+                ("67896789-6789-6789-6789-678967896789", False),
+            ],
+            [("78907890-7890-7890-7890-789078907890", True)],
+        ]:
+            responses.add(
+                responses.POST,
+                f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+                status=200,
+                json=[
+                    {
+                        "id": tr_id,
+                        "element_id": "11111111-1111-1111-1111-111111111111"
+                        if created
+                        else "22222222-2222-2222-2222-222222222222",
+                        "created": created,
+                    }
+                    for tr_id, created in transcriptions
+                ],
+            )
     annotations = mock_elements_worker_with_cache.create_element_transcriptions(
         element=elt,
         sub_element_type="page",
         transcriptions=TRANSCRIPTIONS_SAMPLE,
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [
+        (
+            "POST",
+            f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+        )
+    ]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(
+            (
+                "POST",
+                f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/",
+            )
+        )
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", f"http://testserver/api/v1/element/{elt.id}/transcriptions/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_tr = {
+        **TRANSCRIPTIONS_SAMPLE[0],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    second_tr = {
+        **TRANSCRIPTIONS_SAMPLE[1],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+        "element_confidence": 0.75,
+    }
+    third_tr = {
+        **TRANSCRIPTIONS_SAMPLE[2],
+        "orientation": TextOrientation.HorizontalLeftToRight.value,
+    }
+    empty_payload = {
         "element_type": "page",
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "transcriptions": [
-            {
-                "polygon": [[100, 150], [700, 150], [700, 200], [100, 200]],
-                "confidence": 0.5,
-                "text": "The",
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-            },
-            {
-                "polygon": [[0, 0], [2000, 0], [2000, 3000], [0, 3000]],
-                "confidence": 0.75,
-                "text": "first",
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-                "element_confidence": 0.75,
-            },
-            {
-                "polygon": [[1000, 300], [1200, 300], [1200, 500], [1000, 500]],
-                "confidence": 0.9,
-                "text": "line",
-                "orientation": TextOrientation.HorizontalLeftToRight.value,
-            },
-        ],
+        "transcriptions": [],
         "return_elements": True,
     }
+    bodies = []
+    first_call_idx = None
+    if batch_size > 2:
+        first_call_idx = -1
+        bodies.append(
+            {**empty_payload, "transcriptions": [first_tr, second_tr, third_tr]}
+        )
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "transcriptions": [first_tr, second_tr]})
+        bodies.append({**empty_payload, "transcriptions": [third_tr]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
     assert annotations == [
         {
             "id": "56785678-5678-5678-5678-567856785678",

tests/test_utils.py CHANGED Viewed

@@ -3,6 +3,8 @@ from pathlib import Path
 import pytest
 from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
     close_delete_file,
     extract_tar_zst_archive,
     parse_source_id,
@@ -55,3 +57,29 @@ def test_close_delete_file(tmp_path):
     close_delete_file(archive_fd, archive_path)
     assert not archive_path.exists()
+class TestMixin:
+    @batch_publication
+    def custom_publication_in_batches(self, batch_size: int = DEFAULT_BATCH_SIZE):
+        return batch_size
+def test_batch_publication_decorator_no_parameter():
+    assert TestMixin().custom_publication_in_batches() == DEFAULT_BATCH_SIZE
+@pytest.mark.parametrize("wrong_batch_size", [None, "not an int", 0])
+def test_batch_publication_decorator_wrong_parameter(wrong_batch_size):
+    with pytest.raises(
+        AssertionError,
+        match="batch_size shouldn't be null and should be a strictly positive integer",
+    ):
+        TestMixin().custom_publication_in_batches(batch_size=wrong_batch_size)
+@pytest.mark.parametrize("batch_size", [1, 10, DEFAULT_BATCH_SIZE])
+def test_batch_publication_decorator_right_parameter(batch_size):
+    assert (
+        TestMixin().custom_publication_in_batches(batch_size=batch_size) == batch_size
+    )

{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/LICENSE RENAMED Viewed

File without changes

{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/WHEEL RENAMED Viewed

File without changes

{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b3.dist-info}/top_level.txt RENAMED Viewed

File without changes

arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b3__py3-none-any.whl

arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b3__py3-none-any.whl