PyPI - arkindex-base-worker - Versions diffs - 0.4.0b1__py3-none-any.whl → 0.4.0b2__py3-none-any.whl - Mend

arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/METADATA +1 -1
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/RECORD +19 -19
arkindex_worker/image.py +2 -1
arkindex_worker/utils.py +76 -0
arkindex_worker/worker/__init__.py +3 -2
arkindex_worker/worker/classification.py +31 -15
arkindex_worker/worker/element.py +24 -10
arkindex_worker/worker/entity.py +25 -11
arkindex_worker/worker/metadata.py +18 -8
arkindex_worker/worker/transcription.py +38 -17
tests/test_elements_worker/test_classifications.py +107 -60
tests/test_elements_worker/test_elements.py +185 -49
tests/test_elements_worker/test_entities.py +102 -33
tests/test_elements_worker/test_metadata.py +223 -98
tests/test_elements_worker/test_transcriptions.py +293 -143
tests/test_utils.py +28 -0
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/LICENSE +0 -0
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/WHEEL +0 -0
{arkindex_base_worker-0.4.0b1.dist-info → arkindex_base_worker-0.4.0b2.dist-info}/top_level.txt +0 -0

tests/test_elements_worker/test_classifications.py CHANGED Viewed

@@ -7,6 +7,7 @@ from apistar.exceptions import ErrorResponse
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE
 from tests import CORPUS_ID
 from . import BASE_API_CALLS
@@ -692,7 +693,8 @@ def test_create_classifications_create_ml_class(mock_elements_worker, responses)
     }
-def test_create_classifications(responses, mock_elements_worker):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_classifications(batch_size, responses, mock_elements_worker):
     mock_elements_worker.classes = {"portrait": "0000", "landscape": "1111"}
     elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
     responses.add(
@@ -716,62 +718,98 @@ def test_create_classifications(responses, mock_elements_worker):
                 "high_confidence": False,
             },
         ],
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/classification/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
+    second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
+    empty_payload = {
         "parent": str(elt.id),
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "classifications": [
-            {
-                "confidence": 0.75,
-                "high_confidence": False,
-                "ml_class": "0000",
-            },
-            {
-                "confidence": 0.25,
-                "high_confidence": False,
-                "ml_class": "1111",
-            },
-        ],
+        "classifications": [],
     }
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "classifications": [first_cl]})
+        bodies.append({**empty_payload, "classifications": [second_cl]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
-def test_create_classifications_with_cache(responses, mock_elements_worker_with_cache):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_classifications_with_cache(
+    batch_size, responses, mock_elements_worker_with_cache
+):
     mock_elements_worker_with_cache.classes = {"portrait": "0000", "landscape": "1111"}
     elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
-    responses.add(
-        responses.POST,
-        "http://testserver/api/v1/classification/bulk/",
-        status=200,
-        json={
-            "parent": str(elt.id),
-            "worker_run_id": "56785678-5678-5678-5678-567856785678",
-            "classifications": [
-                {
-                    "id": "00000000-0000-0000-0000-000000000000",
-                    "ml_class": "0000",
-                    "confidence": 0.75,
-                    "high_confidence": False,
-                    "state": "pending",
-                },
-                {
-                    "id": "11111111-1111-1111-1111-111111111111",
-                    "ml_class": "1111",
-                    "confidence": 0.25,
-                    "high_confidence": False,
-                    "state": "pending",
+    if batch_size > 1:
+        responses.add(
+            responses.POST,
+            "http://testserver/api/v1/classification/bulk/",
+            status=200,
+            json={
+                "parent": str(elt.id),
+                "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                "classifications": [
+                    {
+                        "id": "00000000-0000-0000-0000-000000000000",
+                        "ml_class": "0000",
+                        "confidence": 0.75,
+                        "high_confidence": False,
+                        "state": "pending",
+                    },
+                    {
+                        "id": "11111111-1111-1111-1111-111111111111",
+                        "ml_class": "1111",
+                        "confidence": 0.25,
+                        "high_confidence": False,
+                        "state": "pending",
+                    },
+                ],
+            },
+        )
+    else:
+        for cl_id, cl_class, cl_conf in [
+            ("00000000-0000-0000-0000-000000000000", "0000", 0.75),
+            ("11111111-1111-1111-1111-111111111111", "1111", 0.25),
+        ]:
+            responses.add(
+                responses.POST,
+                "http://testserver/api/v1/classification/bulk/",
+                status=200,
+                json={
+                    "parent": str(elt.id),
+                    "worker_run_id": "56785678-5678-5678-5678-567856785678",
+                    "classifications": [
+                        {
+                            "id": cl_id,
+                            "ml_class": cl_class,
+                            "confidence": cl_conf,
+                            "high_confidence": False,
+                            "state": "pending",
+                        },
+                    ],
                 },
-            ],
-        },
-    )
+            )
     mock_elements_worker_with_cache.create_classifications(
         element=elt,
@@ -787,32 +825,41 @@ def test_create_classifications_with_cache(responses, mock_elements_worker_with_
                 "high_confidence": False,
             },
         ],
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
-        ("POST", "http://testserver/api/v1/classification/bulk/"),
-    ]
+    ] == BASE_API_CALLS + bulk_api_calls
-    assert json.loads(responses.calls[-1].request.body) == {
+    first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
+    second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
+    empty_payload = {
         "parent": str(elt.id),
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
-        "classifications": [
-            {
-                "confidence": 0.75,
-                "high_confidence": False,
-                "ml_class": "0000",
-            },
-            {
-                "confidence": 0.25,
-                "high_confidence": False,
-                "ml_class": "1111",
-            },
-        ],
+        "classifications": [],
     }
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "classifications": [first_cl]})
+        bodies.append({**empty_payload, "classifications": [second_cl]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
     # Check that created classifications were properly stored in SQLite cache
     assert list(CachedClassification.select()) == [
         CachedClassification(

tests/test_elements_worker/test_elements.py CHANGED Viewed

@@ -15,6 +15,7 @@ from arkindex_worker.cache import (
     init_cache_db,
 )
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE
 from arkindex_worker.worker import ElementsWorker
 from arkindex_worker.worker.element import MissingTypeError
 from tests import CORPUS_ID
@@ -62,7 +63,7 @@ def test_check_required_types(mock_elements_worker):
     with pytest.raises(
         MissingTypeError,
         match=re.escape(
-            "Element type(s) act, text_line were not found in corpus (11111111-1111-1111-1111-111111111111)."
+            "Element types act, text_line were not found in corpus (11111111-1111-1111-1111-111111111111)."
         ),
     ):
         assert mock_elements_worker.check_required_types("page", "text_line", "act")
@@ -1010,7 +1011,10 @@ def test_create_elements_api_error(responses, mock_elements_worker):
     ]
-def test_create_elements_cached_element(responses, mock_elements_worker_with_cache):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_elements_cached_element(
+    batch_size, responses, mock_elements_worker_with_cache
+):
     image = CachedImage.create(
         id=UUID("c0fec0fe-c0fe-c0fe-c0fe-c0fec0fec0fe"),
         width=42,
@@ -1023,12 +1027,28 @@ def test_create_elements_cached_element(responses, mock_elements_worker_with_cac
         image_id=image.id,
         polygon="[[0, 0], [0, 1000], [1000, 1000], [1000, 0], [0, 0]]",
     )
-    responses.add(
-        responses.POST,
-        "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
-        status=200,
-        json=[{"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"}],
-    )
+    if batch_size > 1:
+        responses.add(
+            responses.POST,
+            "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
+            status=200,
+            json=[
+                {"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"},
+                {"id": "5468c358-b9c4-499d-8b92-d6349c58e88d"},
+            ],
+        )
+    else:
+        for elt_id in [
+            "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+            "5468c358-b9c4-499d-8b92-d6349c58e88d",
+        ]:
+            responses.add(
+                responses.POST,
+                "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
+                status=200,
+                json=[{"id": elt_id}],
+            )
     created_ids = mock_elements_worker_with_cache.create_elements(
         parent=elt,
@@ -1037,30 +1057,69 @@ def test_create_elements_cached_element(responses, mock_elements_worker_with_cac
                 "name": "0",
                 "type": "something",
                 "polygon": [[1, 1], [2, 2], [2, 1], [1, 2]],
-            }
+            },
+            {
+                "name": "1",
+                "type": "something",
+                "polygon": [[4, 4], [5, 5], [5, 4], [4, 5]],
+            },
         ],
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
-    assert [
-        (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
+    bulk_api_calls = [
         (
             "POST",
             "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
-        ),
+        )
     ]
-    assert json.loads(responses.calls[-1].request.body) == {
-        "elements": [
-            {
-                "name": "0",
-                "type": "something",
-                "polygon": [[1, 1], [2, 2], [2, 1], [1, 2]],
-            }
-        ],
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(
+            (
+                "POST",
+                "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
+            )
+        )
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + bulk_api_calls
+    first_elt = {
+        "name": "0",
+        "type": "something",
+        "polygon": [[1, 1], [2, 2], [2, 1], [1, 2]],
+    }
+    second_elt = {
+        "name": "1",
+        "type": "something",
+        "polygon": [[4, 4], [5, 5], [5, 4], [4, 5]],
+    }
+    empty_payload = {
+        "elements": [],
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
     }
-    assert created_ids == [{"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"}]
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "elements": [first_elt, second_elt]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "elements": [first_elt]})
+        bodies.append({**empty_payload, "elements": [second_elt]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
+    assert created_ids == [
+        {"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"},
+        {"id": "5468c358-b9c4-499d-8b92-d6349c58e88d"},
+    ]
     # Check that created elements were properly stored in SQLite cache
     assert list(CachedElement.select().order_by(CachedElement.id)) == [
@@ -1072,11 +1131,24 @@ def test_create_elements_cached_element(responses, mock_elements_worker_with_cac
             image_id="c0fec0fe-c0fe-c0fe-c0fe-c0fec0fec0fe",
             polygon=[[1, 1], [2, 2], [2, 1], [1, 2]],
             worker_run_id=UUID("56785678-5678-5678-5678-567856785678"),
+            confidence=None,
+        ),
+        CachedElement(
+            id=UUID("5468c358-b9c4-499d-8b92-d6349c58e88d"),
+            parent_id=elt.id,
+            type="something",
+            image_id="c0fec0fe-c0fe-c0fe-c0fe-c0fec0fec0fe",
+            polygon=[[4, 4], [5, 5], [5, 4], [4, 5]],
+            worker_run_id=UUID("56785678-5678-5678-5678-567856785678"),
+            confidence=None,
         ),
     ]
-def test_create_elements(responses, mock_elements_worker_with_cache, tmp_path):
+@pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
+def test_create_elements(
+    batch_size, responses, mock_elements_worker_with_cache, tmp_path
+):
     elt = Element(
         {
             "id": "12341234-1234-1234-1234-123412341234",
@@ -1090,12 +1162,28 @@ def test_create_elements(responses, mock_elements_worker_with_cache, tmp_path):
             },
         }
     )
-    responses.add(
-        responses.POST,
-        "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
-        status=200,
-        json=[{"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"}],
-    )
+    if batch_size > 1:
+        responses.add(
+            responses.POST,
+            "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
+            status=200,
+            json=[
+                {"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"},
+                {"id": "5468c358-b9c4-499d-8b92-d6349c58e88d"},
+            ],
+        )
+    else:
+        for elt_id in [
+            "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+            "5468c358-b9c4-499d-8b92-d6349c58e88d",
+        ]:
+            responses.add(
+                responses.POST,
+                "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
+                status=200,
+                json=[{"id": elt_id}],
+            )
     created_ids = mock_elements_worker_with_cache.create_elements(
         parent=elt,
@@ -1104,30 +1192,69 @@ def test_create_elements(responses, mock_elements_worker_with_cache, tmp_path):
                 "name": "0",
                 "type": "something",
                 "polygon": [[1, 1], [2, 2], [2, 1], [1, 2]],
-            }
+            },
+            {
+                "name": "1",
+                "type": "something",
+                "polygon": [[4, 4], [5, 5], [5, 4], [4, 5]],
+            },
         ],
+        batch_size=batch_size,
     )
-    assert len(responses.calls) == len(BASE_API_CALLS) + 1
-    assert [
-        (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS + [
+    bulk_api_calls = [
         (
             "POST",
             "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
-        ),
+        )
     ]
-    assert json.loads(responses.calls[-1].request.body) == {
-        "elements": [
-            {
-                "name": "0",
-                "type": "something",
-                "polygon": [[1, 1], [2, 2], [2, 1], [1, 2]],
-            }
-        ],
+    if batch_size != DEFAULT_BATCH_SIZE:
+        bulk_api_calls.append(
+            (
+                "POST",
+                "http://testserver/api/v1/element/12341234-1234-1234-1234-123412341234/children/bulk/",
+            )
+        )
+    assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + bulk_api_calls
+    first_elt = {
+        "name": "0",
+        "type": "something",
+        "polygon": [[1, 1], [2, 2], [2, 1], [1, 2]],
+    }
+    second_elt = {
+        "name": "1",
+        "type": "something",
+        "polygon": [[4, 4], [5, 5], [5, 4], [4, 5]],
+    }
+    empty_payload = {
+        "elements": [],
         "worker_run_id": "56785678-5678-5678-5678-567856785678",
     }
-    assert created_ids == [{"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"}]
+    bodies = []
+    first_call_idx = None
+    if batch_size > 1:
+        first_call_idx = -1
+        bodies.append({**empty_payload, "elements": [first_elt, second_elt]})
+    else:
+        first_call_idx = -2
+        bodies.append({**empty_payload, "elements": [first_elt]})
+        bodies.append({**empty_payload, "elements": [second_elt]})
+    assert [
+        json.loads(bulk_call.request.body)
+        for bulk_call in responses.calls[first_call_idx:]
+    ] == bodies
+    assert created_ids == [
+        {"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"},
+        {"id": "5468c358-b9c4-499d-8b92-d6349c58e88d"},
+    ]
     # Check that created elements were properly stored in SQLite cache
     assert (tmp_path / "db.sqlite").is_file()
@@ -1141,7 +1268,16 @@ def test_create_elements(responses, mock_elements_worker_with_cache, tmp_path):
             polygon=[[1, 1], [2, 2], [2, 1], [1, 2]],
             worker_run_id=UUID("56785678-5678-5678-5678-567856785678"),
             confidence=None,
-        )
+        ),
+        CachedElement(
+            id=UUID("5468c358-b9c4-499d-8b92-d6349c58e88d"),
+            parent_id=UUID("12341234-1234-1234-1234-123412341234"),
+            type="something",
+            image_id="c0fec0fe-c0fe-c0fe-c0fe-c0fec0fec0fe",
+            polygon=[[4, 4], [5, 5], [5, 4], [4, 5]],
+            worker_run_id=UUID("56785678-5678-5678-5678-567856785678"),
+            confidence=None,
+        ),
     ]
@@ -1268,9 +1404,9 @@ def test_create_elements_integrity_error(
         {"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"},
     ]
-    assert len(caplog.records) == 1
-    assert caplog.records[0].levelname == "WARNING"
-    assert caplog.records[0].message.startswith(
+    assert len(caplog.records) == 3
+    assert caplog.records[-1].levelname == "WARNING"
+    assert caplog.records[-1].message.startswith(
         "Couldn't save created elements in local cache:"
     )

arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b2__py3-none-any.whl

arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b2__py3-none-any.whl