PyPI - arkindex-base-worker - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl - Mend

arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

{arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/METADATA +13 -15
arkindex_base_worker-0.4.0a2.dist-info/RECORD +51 -0
{arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/WHEEL +1 -1
arkindex_worker/cache.py +1 -1
arkindex_worker/image.py +1 -120
arkindex_worker/utils.py +0 -82
arkindex_worker/worker/__init__.py +161 -46
arkindex_worker/worker/base.py +11 -36
arkindex_worker/worker/classification.py +18 -34
arkindex_worker/worker/corpus.py +4 -21
arkindex_worker/worker/dataset.py +1 -71
arkindex_worker/worker/element.py +91 -352
arkindex_worker/worker/entity.py +11 -11
arkindex_worker/worker/metadata.py +9 -19
arkindex_worker/worker/task.py +4 -5
arkindex_worker/worker/training.py +6 -6
arkindex_worker/worker/transcription.py +68 -89
arkindex_worker/worker/version.py +1 -3
tests/__init__.py +1 -1
tests/conftest.py +45 -33
tests/test_base_worker.py +3 -204
tests/test_dataset_worker.py +4 -7
tests/test_elements_worker/{test_classification.py → test_classifications.py} +61 -194
tests/test_elements_worker/test_corpus.py +1 -32
tests/test_elements_worker/test_dataset.py +1 -1
tests/test_elements_worker/test_elements.py +2734 -0
tests/test_elements_worker/{test_entity_create.py → test_entities.py} +160 -26
tests/test_elements_worker/test_image.py +1 -2
tests/test_elements_worker/test_metadata.py +99 -224
tests/test_elements_worker/test_task.py +1 -1
tests/test_elements_worker/test_training.py +2 -2
tests/test_elements_worker/test_transcriptions.py +2102 -0
tests/test_elements_worker/test_worker.py +280 -563
tests/test_image.py +204 -429
tests/test_merge.py +2 -1
tests/test_utils.py +3 -66
arkindex_base_worker-0.4.0.dist-info/RECORD +0 -61
arkindex_worker/worker/process.py +0 -92
tests/test_elements_worker/test_element.py +0 -427
tests/test_elements_worker/test_element_create_multiple.py +0 -715
tests/test_elements_worker/test_element_create_single.py +0 -528
tests/test_elements_worker/test_element_list_children.py +0 -969
tests/test_elements_worker/test_element_list_parents.py +0 -530
tests/test_elements_worker/test_entity_list_and_check.py +0 -160
tests/test_elements_worker/test_process.py +0 -89
tests/test_elements_worker/test_transcription_create.py +0 -873
tests/test_elements_worker/test_transcription_create_with_elements.py +0 -951
tests/test_elements_worker/test_transcription_list.py +0 -450
tests/test_elements_worker/test_version.py +0 -60
{arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/LICENSE +0 -0
{arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/top_level.txt +0 -0

arkindex_worker/worker/entity.py CHANGED Viewed

@@ -15,7 +15,6 @@ from arkindex_worker.cache import (
     unsupported_cache,
 )
 from arkindex_worker.models import Element, Transcription
-from arkindex_worker.utils import pluralize
 class Entity(TypedDict):
@@ -49,7 +48,6 @@ class EntityMixin:
         if not self.entity_types:
             # Load entity_types of corpus
             self.list_corpus_entity_types()
         for entity_type in entity_types:
             # Do nothing if type already exists
             if entity_type in self.entity_types:
@@ -62,7 +60,7 @@ class EntityMixin:
                 )
             # Create type if non-existent
-            self.entity_types[entity_type] = self.api_client.request(
+            self.entity_types[entity_type] = self.request(
                 "CreateEntityType",
                 body={
                     "name": entity_type,
@@ -108,7 +106,7 @@ class EntityMixin:
         entity_type_id = self.entity_types.get(type)
         assert entity_type_id, f"Entity type `{type}` not found in the corpus."
-        entity = self.api_client.request(
+        entity = self.request(
             "CreateEntity",
             body={
                 "name": name,
@@ -190,7 +188,7 @@ class EntityMixin:
         if confidence is not None:
             body["confidence"] = confidence
-        transcription_ent = self.api_client.request(
+        transcription_ent = self.request(
             "CreateTranscriptionEntity",
             id=transcription.id,
             body=body,
@@ -291,16 +289,16 @@ class EntityMixin:
             )
             return
-        created_entities = self.api_client.request(
+        created_ids = self.request(
             "CreateTranscriptionEntities",
             id=transcription.id,
             body={
                 "worker_run_id": self.worker_run_id,
                 "entities": entities,
             },
-        )["entities"]
+        )
-        return created_entities
+        return created_ids["entities"]
     def list_transcription_entities(
         self,
@@ -384,10 +382,12 @@ class EntityMixin:
         }
         count = len(self.entities)
         logger.info(
-            f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
+            f'Loaded {count} entit{"ies" if count > 1 else "y"} in corpus ({self.corpus_id})'
         )
-    def list_corpus_entity_types(self):
+    def list_corpus_entity_types(
+        self,
+    ):
         """
         Loads available entity types in corpus.
         """
@@ -399,5 +399,5 @@ class EntityMixin:
         }
         count = len(self.entity_types)
         logger.info(
-            f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
+            f'Loaded {count} entity type{"s"[:count>1]} in corpus ({self.corpus_id}).'
         )

arkindex_worker/worker/metadata.py CHANGED Viewed

@@ -7,7 +7,6 @@ from enum import Enum
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, unsupported_cache
 from arkindex_worker.models import Element
-from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 class MetaType(Enum):
@@ -94,7 +93,7 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return
-        metadata = self.api_client.request(
+        metadata = self.request(
             "CreateMetaData",
             id=element.id,
             body={
@@ -109,12 +108,10 @@ class MetaDataMixin:
         return metadata["id"]
     @unsupported_cache
-    @batch_publication
     def create_metadata_bulk(
         self,
         element: Element | CachedElement,
         metadata_list: list[dict[str, MetaType | str | int | float | None]],
-        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.
@@ -126,9 +123,6 @@ class MetaDataMixin:
             - name: str
             - value: str | int | float
             - entity_id: str | None
-        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
-        :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
         """
         assert element and isinstance(
             element, Element | CachedElement
@@ -174,18 +168,14 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return
-        created_metadata_list = [
-            created_metadata
-            for batch in make_batches(metas, "metadata", batch_size)
-            for created_metadata in self.api_client.request(
-                "CreateMetaDataBulk",
-                id=element.id,
-                body={
-                    "worker_run_id": self.worker_run_id,
-                    "metadata_list": batch,
-                },
-            )["metadata_list"]
-        ]
+        created_metadata_list = self.request(
+            "CreateMetaDataBulk",
+            id=element.id,
+            body={
+                "worker_run_id": self.worker_run_id,
+                "metadata_list": metas,
+            },
+        )["metadata_list"]
         return created_metadata_list

arkindex_worker/worker/task.py CHANGED Viewed

@@ -5,7 +5,8 @@ BaseWorker methods for tasks.
 import uuid
 from collections.abc import Iterator
-from arkindex.compat import DownloadedFile
+from apistar.compat import DownloadedFile
 from arkindex_worker.models import Artifact
@@ -21,7 +22,7 @@ class TaskMixin:
             task_id, uuid.UUID
         ), "task_id shouldn't be null and should be an UUID"
-        results = self.api_client.request("ListArtifacts", id=task_id)
+        results = self.request("ListArtifacts", id=task_id)
         return map(Artifact, results)
@@ -42,6 +43,4 @@ class TaskMixin:
             artifact, Artifact
         ), "artifact shouldn't be null and should be an Artifact"
-        return self.api_client.request(
-            "DownloadArtifact", id=task_id, path=artifact.path
-        )
+        return self.request("DownloadArtifact", id=task_id, path=artifact.path)

arkindex_worker/worker/training.py CHANGED Viewed

@@ -9,8 +9,8 @@ from typing import NewType
 from uuid import UUID
 import requests
+from apistar.exceptions import ErrorResponse
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.utils import close_delete_file, create_tar_zst_archive
@@ -185,7 +185,7 @@ class TrainingMixin:
         assert not self.model_version, "A model version has already been created."
         configuration = configuration or {}
-        self.model_version = self.api_client.request(
+        self.model_version = self.request(
             "CreateModelVersion",
             id=model_id,
             body=build_clean_payload(
@@ -217,7 +217,7 @@ class TrainingMixin:
         :param parent: ID of the parent model version
         """
         assert self.model_version, "No model version has been created yet."
-        self.model_version = self.api_client.request(
+        self.model_version = self.request(
             "UpdateModelVersion",
             id=self.model_version["id"],
             body=build_clean_payload(
@@ -273,7 +273,7 @@ class TrainingMixin:
         """
         assert self.model_version, "You must create the model version and upload its archive before validating it."
         try:
-            self.model_version = self.api_client.request(
+            self.model_version = self.request(
                 "PartialUpdateModelVersion",
                 id=self.model_version["id"],
                 body={
@@ -294,7 +294,7 @@ class TrainingMixin:
             pending_version_id = self.model_version["id"]
             logger.warning("Removing the pending model version.")
             try:
-                self.api_client.request("DestroyModelVersion", id=pending_version_id)
+                self.request("DestroyModelVersion", id=pending_version_id)
             except ErrorResponse as e:
                 msg = getattr(e, "content", str(e))
                 logger.error(
@@ -304,7 +304,7 @@ class TrainingMixin:
             logger.info("Retrieving the existing model version.")
             existing_version_id = model_version["id"].pop()
             try:
-                self.model_version = self.api_client.request(
+                self.model_version = self.request(
                     "RetrieveModelVersion", id=existing_version_id
                 )
             except ErrorResponse as e:

arkindex_worker/worker/transcription.py CHANGED Viewed

@@ -11,7 +11,6 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
-from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 class TextOrientation(Enum):
@@ -78,7 +77,7 @@ class TranscriptionMixin:
             )
             return
-        created = self.api_client.request(
+        created = self.request(
             "CreateTranscription",
             id=element.id,
             body={
@@ -110,11 +109,9 @@ class TranscriptionMixin:
         return created
-    @batch_publication
     def create_transcriptions(
         self,
         transcriptions: list[dict[str, str | float | TextOrientation | None]],
-        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float]]:
         """
         Create multiple transcriptions at once on existing elements through the API,
@@ -131,8 +128,6 @@ class TranscriptionMixin:
             orientation (TextOrientation)
                 Optional. Orientation of the transcription's text.
-        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """
@@ -176,19 +171,13 @@ class TranscriptionMixin:
             )
             return
-        created_trs = [
-            created_tr
-            for batch in make_batches(
-                transcriptions_payload, "transcription", batch_size
-            )
-            for created_tr in self.api_client.request(
-                "CreateTranscriptions",
-                body={
-                    "worker_run_id": self.worker_run_id,
-                    "transcriptions": batch,
-                },
-            )["transcriptions"]
-        ]
+        created_trs = self.request(
+            "CreateTranscriptions",
+            body={
+                "worker_run_id": self.worker_run_id,
+                "transcriptions": transcriptions_payload,
+            },
+        )["transcriptions"]
         if self.use_cache:
             # Store transcriptions in local cache
@@ -212,13 +201,11 @@ class TranscriptionMixin:
         return created_trs
-    @batch_publication
     def create_element_transcriptions(
         self,
         element: Element | CachedElement,
         sub_element_type: str,
         transcriptions: list[dict[str, str | float]],
-        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> dict[str, str | bool]:
         """
         Create multiple elements and transcriptions at once on a single parent element through the API.
@@ -238,8 +225,6 @@ class TranscriptionMixin:
             element_confidence (float)
                 Optional. Confidence score of the element between 0 and 1.
-        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
         assert element and isinstance(
@@ -306,22 +291,16 @@ class TranscriptionMixin:
             )
             return
-        annotations = [
-            annotation
-            for batch in make_batches(
-                transcriptions_payload, "transcription", batch_size
-            )
-            for annotation in self.api_client.request(
-                "CreateElementTranscriptions",
-                id=element.id,
-                body={
-                    "element_type": sub_element_type,
-                    "worker_run_id": self.worker_run_id,
-                    "transcriptions": batch,
-                    "return_elements": True,
-                },
-            )
-        ]
+        annotations = self.request(
+            "CreateElementTranscriptions",
+            id=element.id,
+            body={
+                "element_type": sub_element_type,
+                "worker_run_id": self.worker_run_id,
+                "transcriptions": transcriptions_payload,
+                "return_elements": True,
+            },
+        )
         for annotation in annotations:
             if annotation["created"]:
@@ -441,60 +420,60 @@ class TranscriptionMixin:
                 ), "if of type bool, worker_run can only be set to False"
             query_params["worker_run"] = worker_run
-        if not self.use_cache:
-            return self.api_client.paginate(
-                "ListTranscriptions", id=element.id, **query_params
-            )
-        if not recursive:
-            # In this case we don't have to return anything, it's easier to use an
-            # impossible condition (False) rather than filtering by type for nothing
-            if element_type and element_type != element.type:
-                return CachedTranscription.select().where(False)
-            transcriptions = CachedTranscription.select().where(
-                CachedTranscription.element_id == element.id
-            )
-        else:
-            base_case = (
-                CachedElement.select()
-                .where(CachedElement.id == element.id)
-                .cte("base", recursive=True)
-            )
-            recursive = CachedElement.select().join(
-                base_case, on=(CachedElement.parent_id == base_case.c.id)
-            )
-            cte = base_case.union_all(recursive)
-            transcriptions = (
-                CachedTranscription.select()
-                .join(cte, on=(CachedTranscription.element_id == cte.c.id))
-                .with_cte(cte)
-            )
-            if element_type:
-                transcriptions = transcriptions.where(cte.c.type == element_type)
-        if worker_version is not None:
-            # If worker_version=False, filter by manual worker_version e.g. None
-            worker_version_id = worker_version or None
-            if worker_version_id:
-                transcriptions = transcriptions.where(
-                    CachedTranscription.worker_version_id == worker_version_id
+        if self.use_cache:
+            if not recursive:
+                # In this case we don't have to return anything, it's easier to use an
+                # impossible condition (False) rather than filtering by type for nothing
+                if element_type and element_type != element.type:
+                    return CachedTranscription.select().where(False)
+                transcriptions = CachedTranscription.select().where(
+                    CachedTranscription.element_id == element.id
                 )
             else:
-                transcriptions = transcriptions.where(
-                    CachedTranscription.worker_version_id.is_null()
+                base_case = (
+                    CachedElement.select()
+                    .where(CachedElement.id == element.id)
+                    .cte("base", recursive=True)
                 )
-        if worker_run is not None:
-            # If worker_run=False, filter by manual worker_run e.g. None
-            worker_run_id = worker_run or None
-            if worker_run_id:
-                transcriptions = transcriptions.where(
-                    CachedTranscription.worker_run_id == worker_run_id
+                recursive = CachedElement.select().join(
+                    base_case, on=(CachedElement.parent_id == base_case.c.id)
                 )
-            else:
-                transcriptions = transcriptions.where(
-                    CachedTranscription.worker_run_id.is_null()
+                cte = base_case.union_all(recursive)
+                transcriptions = (
+                    CachedTranscription.select()
+                    .join(cte, on=(CachedTranscription.element_id == cte.c.id))
+                    .with_cte(cte)
                 )
+                if element_type:
+                    transcriptions = transcriptions.where(cte.c.type == element_type)
+            if worker_version is not None:
+                # If worker_version=False, filter by manual worker_version e.g. None
+                worker_version_id = worker_version or None
+                if worker_version_id:
+                    transcriptions = transcriptions.where(
+                        CachedTranscription.worker_version_id == worker_version_id
+                    )
+                else:
+                    transcriptions = transcriptions.where(
+                        CachedTranscription.worker_version_id.is_null()
+                    )
+            if worker_run is not None:
+                # If worker_run=False, filter by manual worker_run e.g. None
+                worker_run_id = worker_run or None
+                if worker_run_id:
+                    transcriptions = transcriptions.where(
+                        CachedTranscription.worker_run_id == worker_run_id
+                    )
+                else:
+                    transcriptions = transcriptions.where(
+                        CachedTranscription.worker_run_id.is_null()
+                    )
+        else:
+            transcriptions = self.api_client.paginate(
+                "ListTranscriptions", id=element.id, **query_params
+            )
         return transcriptions

arkindex_worker/worker/version.py CHANGED Viewed

@@ -34,9 +34,7 @@ class WorkerVersionMixin:
         if worker_version_id in self._worker_version_cache:
             return self._worker_version_cache[worker_version_id]
-        worker_version = self.api_client.request(
-            "RetrieveWorkerVersion", id=worker_version_id
-        )
+        worker_version = self.request("RetrieveWorkerVersion", id=worker_version_id)
         self._worker_version_cache[worker_version_id] = worker_version
         return worker_version

tests/__init__.py CHANGED Viewed

@@ -5,4 +5,4 @@ FIXTURES_DIR = BASE_DIR / "data"
 SAMPLES_DIR = BASE_DIR / "samples"
 CORPUS_ID = "11111111-1111-1111-1111-111111111111"
-PROCESS_ID = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff"
+PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"

arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl

arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl