arkindex-base-worker 0.3.7rc4-py3-none-any.whl → 0.5.0a1-py3-none-any.whl
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/METADATA +18 -19
- arkindex_base_worker-0.5.0a1.dist-info/RECORD +61 -0
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/top_level.txt +2 -0
- arkindex_worker/cache.py +1 -1
- arkindex_worker/image.py +167 -2
- arkindex_worker/models.py +18 -0
- arkindex_worker/utils.py +98 -4
- arkindex_worker/worker/__init__.py +117 -218
- arkindex_worker/worker/base.py +39 -46
- arkindex_worker/worker/classification.py +45 -29
- arkindex_worker/worker/corpus.py +86 -0
- arkindex_worker/worker/dataset.py +89 -26
- arkindex_worker/worker/element.py +352 -91
- arkindex_worker/worker/entity.py +13 -11
- arkindex_worker/worker/image.py +21 -0
- arkindex_worker/worker/metadata.py +26 -16
- arkindex_worker/worker/process.py +92 -0
- arkindex_worker/worker/task.py +5 -4
- arkindex_worker/worker/training.py +25 -10
- arkindex_worker/worker/transcription.py +89 -68
- arkindex_worker/worker/version.py +3 -1
- hooks/pre_gen_project.py +3 -0
- tests/__init__.py +8 -0
- tests/conftest.py +47 -58
- tests/test_base_worker.py +212 -12
- tests/test_dataset_worker.py +294 -437
- tests/test_elements_worker/{test_classifications.py → test_classification.py} +313 -200
- tests/test_elements_worker/test_cli.py +3 -11
- tests/test_elements_worker/test_corpus.py +168 -0
- tests/test_elements_worker/test_dataset.py +106 -157
- tests/test_elements_worker/test_element.py +427 -0
- tests/test_elements_worker/test_element_create_multiple.py +715 -0
- tests/test_elements_worker/test_element_create_single.py +528 -0
- tests/test_elements_worker/test_element_list_children.py +969 -0
- tests/test_elements_worker/test_element_list_parents.py +530 -0
- tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
- tests/test_elements_worker/test_entity_list_and_check.py +160 -0
- tests/test_elements_worker/test_image.py +66 -0
- tests/test_elements_worker/test_metadata.py +252 -161
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_task.py +8 -18
- tests/test_elements_worker/test_training.py +17 -8
- tests/test_elements_worker/test_transcription_create.py +873 -0
- tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
- tests/test_elements_worker/test_transcription_list.py +450 -0
- tests/test_elements_worker/test_version.py +60 -0
- tests/test_elements_worker/test_worker.py +578 -293
- tests/test_image.py +542 -209
- tests/test_merge.py +1 -2
- tests/test_utils.py +89 -4
- worker-demo/tests/__init__.py +0 -0
- worker-demo/tests/conftest.py +32 -0
- worker-demo/tests/test_worker.py +12 -0
- worker-demo/worker_demo/__init__.py +6 -0
- worker-demo/worker_demo/worker.py +19 -0
- arkindex_base_worker-0.3.7rc4.dist-info/RECORD +0 -41
- tests/test_elements_worker/test_elements.py +0 -2713
- tests/test_elements_worker/test_transcriptions.py +0 -2119
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/LICENSE +0 -0
arkindex_worker/worker/classification.py

@@ -2,14 +2,18 @@
 ElementsWorker methods for classifications and ML classes.
 """
 
-from uuid import UUID
-
-from apistar.exceptions import ErrorResponse
 from peewee import IntegrityError
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class ClassificationMixin:
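The helpers newly imported from `arkindex_worker.utils` (a module that grew by +98 -4 in this release) are not shown in this diff. Judging from their call sites in the hunks below, they plausibly behave like this sketch; the signatures and the `DEFAULT_BATCH_SIZE` value are inferred, not authoritative:

    # Inferred behavior only: the real implementations live in
    # arkindex_worker/utils.py and may differ in details.
    from collections.abc import Iterator

    DEFAULT_BATCH_SIZE = 50  # assumption; the actual default is not visible here

    def pluralize(word: str, count: int) -> str:
        """Append an "s" to `word` when `count` is not 1."""
        return word if count == 1 else f"{word}s"

    def make_batches(items: list, label: str, batch_size: int) -> Iterator[list]:
        """Split `items` into chunks of at most `batch_size` elements; `label`
        names the items (e.g. "classification") for logging purposes."""
        for start in range(0, len(items), batch_size):
            yield items[start : start + batch_size]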
@@ -23,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f
+            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -41,7 +45,7 @@ class ClassificationMixin:
         if ml_class_id is None:
             logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
             try:
-                response = self.request(
+                response = self.api_client.request(
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]
@@ -121,7 +125,7 @@ class ClassificationMixin:
             )
             return
         try:
-            created = self.request(
+            created = self.api_client.request(
                 "CreateClassification",
                 body={
                     "element": str(element.id),
@@ -169,19 +173,27 @@ class ClassificationMixin:
 
         return created
 
+    @batch_publication
     def create_classifications(
         self,
        element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.
 
         :param element: The element to create classifications on.
-        :param classifications:
-
-
-
+        :param classifications: A list of dicts representing a classification each, with the following keys:
+
+            ml_class (str)
+                Required. Name of the MLClass to use.
+            confidence (float)
+                Required. Confidence score for the classification. Must be between 0 and 1.
+            high_confidence (bool)
+                Optional. Whether or not the classification is of high confidence.
+
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
 
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
@@ -194,18 +206,10 @@ class ClassificationMixin:
         ), "classifications shouldn't be null and should be of type list"
 
         for index, classification in enumerate(classifications):
-
+            ml_class = classification.get("ml_class")
             assert (
-
-            ), f"Classification at index {index} in classifications:
-
-            # Make sure it's a valid UUID
-            try:
-                UUID(ml_class_id)
-            except ValueError as e:
-                raise ValueError(
-                    f"Classification at index {index} in classifications: ml_class_id is not a valid uuid."
-                ) from e
+                ml_class and isinstance(ml_class, str)
+            ), f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
 
             confidence = classification.get("confidence")
             assert (
@@ -226,14 +230,26 @@ class ClassificationMixin:
             )
             return
 
-        created_cls =
-
-
-
-        "
-
-
-
+        created_cls = [
+            created_cl
+            for batch in make_batches(classifications, "classification", batch_size)
+            for created_cl in self.api_client.request(
+                "CreateClassifications",
+                body={
+                    "parent": str(element.id),
+                    "worker_run_id": self.worker_run_id,
+                    "classifications": [
+                        {
+                            **classification,
+                            "ml_class": self.get_ml_class_id(
+                                classification["ml_class"]
+                            ),
+                        }
+                        for classification in batch
+                    ],
+                },
+            )["classifications"]
+        ]
 
         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
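Downstream callers change accordingly: `create_classifications` now takes the MLClass *name* (resolved to an ID internally through `get_ml_class_id`) instead of a pre-validated `ml_class_id` UUID, and accepts a `batch_size`. A minimal usage sketch, where the `worker` instance and the class names are illustrative:

    # 0.5.0a1 calling convention: "ml_class" is the class name, and the
    # publication is split into batches of at most `batch_size` items.
    created = worker.create_classifications(
        element,
        classifications=[
            {"ml_class": "handwritten", "confidence": 0.91, "high_confidence": True},
            {"ml_class": "typed", "confidence": 0.09},
        ],
        batch_size=100,  # optional; defaults to DEFAULT_BATCH_SIZE
    )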
arkindex_worker/worker/corpus.py (new file)

@@ -0,0 +1,86 @@
+"""
+BaseWorker methods for corpora.
+"""
+
+from enum import Enum
+from operator import itemgetter
+from tempfile import _TemporaryFileWrapper
+from uuid import UUID
+
+from arkindex_worker import logger
+
+
+class CorpusExportState(Enum):
+    """
+    State of a corpus export.
+    """
+
+    Created = "created"
+    """
+    The corpus export is created, awaiting its processing.
+    """
+
+    Running = "running"
+    """
+    The corpus export is being built.
+    """
+
+    Failed = "failed"
+    """
+    The corpus export failed.
+    """
+
+    Done = "done"
+    """
+    The corpus export ended in success.
+    """
+
+
+class CorpusMixin:
+    def download_export(self, export_id: str) -> _TemporaryFileWrapper:
+        """
+        Download an export.
+
+        :param export_id: UUID of the export to download
+        :returns: The downloaded export stored in a temporary file.
+        """
+        try:
+            UUID(export_id)
+        except ValueError as e:
+            raise ValueError("export_id is not a valid uuid.") from e
+
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.api_client.request(
+            "DownloadExport", id=export_id
+        )
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
+        return export
+
+    def download_latest_export(self) -> _TemporaryFileWrapper:
+        """
+        Download the latest export in `done` state of the current corpus.
+
+        :returns: The downloaded export stored in a temporary file.
+        """
+        # List all exports on the corpus
+        exports = self.api_client.paginate("ListExports", id=self.corpus_id)
+
+        # Find the latest that is in "done" state
+        exports: list[dict] = sorted(
+            list(
+                filter(
+                    lambda export: export["state"] == CorpusExportState.Done.value,
+                    exports,
+                )
+            ),
+            key=itemgetter("updated"),
+            reverse=True,
+        )
+        assert (
+            len(exports) > 0
+        ), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
+
+        # Download latest export
+        export_id: str = exports[0]["id"]
+
+        return self.download_export(export_id)
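The new `CorpusMixin` gives workers direct access to corpus exports. A short usage sketch; it assumes the mixin is part of the running worker class and that the downloaded export is the usual Arkindex SQLite dump, neither of which is stated in this diff:

    import sqlite3

    # Fetch the most recent export of the worker's corpus that reached the
    # "done" state; the helper returns a tempfile wrapper whose .name holds
    # the on-disk path.
    export = worker.download_latest_export()

    # Assumption: Arkindex corpus exports are SQLite databases.
    db = sqlite3.connect(export.name)

    # A specific export can also be fetched by UUID; an invalid UUID raises
    # ValueError before any API call is made.
    export = worker.download_export("12341234-1234-1234-1234-123412341234")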
arkindex_worker/worker/dataset.py

@@ -2,12 +2,14 @@
 BaseWorker methods for datasets.
 """
 
+import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterator
 from enum import Enum
 
 from arkindex_worker import logger
 from arkindex_worker.cache import unsupported_cache
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
 
 
 class DatasetState(Enum):
@@ -36,49 +38,110 @@ class DatasetState(Enum):
     """
 
 
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    """The `--set` argument should have the following format:
+    <dataset_id>:<set_name>
+
+    Args:
+        value (str): Provided argument.
+
+    Raises:
+        ArgumentTypeError: When the value is invalid.
+
+    Returns:
+        tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
+    """
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class DatasetMixin:
-    def
+    def add_arguments(self) -> None:
+        """Define specific ``argparse`` arguments for the worker using this mixin"""
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+        super().add_arguments()
+
+    def list_process_sets(self) -> Iterator[Set]:
         """
-        List
+        List dataset sets associated to the worker's process. This helper is not available in developer mode.
 
-        :returns: An iterator of ``
+        :returns: An iterator of ``Set`` objects built from the ``ListProcessSets`` API endpoint.
         """
         assert not self.is_read_only, "This helper is not available in read-only mode."
 
         results = self.api_client.paginate(
-            "
+            "ListProcessSets", id=self.process_information["id"]
         )
 
         return map(
-            lambda result:
+            lambda result: Set(
+                name=result["set_name"], dataset=Dataset(**result["dataset"])
+            ),
             results,
         )
 
-    def
+    def list_set_elements(self, dataset_set: Set) -> Iterator[Element]:
         """
-        List elements in a dataset.
+        List elements in a dataset set.
 
-        :param
-        :returns: An iterator of
+        :param dataset_set: Set to find elements in.
+        :returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
         """
-        assert
-
-        ), "
+        assert dataset_set and isinstance(
+            dataset_set, Set
+        ), "dataset_set shouldn't be null and should be a Set"
+
+        results = self.api_client.paginate(
+            "ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
+        )
 
-
-
-
-
-
-
-
-
+        return map(lambda result: Element(**result["element"]), results)
+
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.api_client.request("RetrieveDataset", id=dataset_id)
                 )
-            )
 
-
-            lambda result: (result["set"], Element(**result["element"])), results
-        )
+            yield Set(name=set_name, dataset=datasets[dataset_id])
 
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
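Together, these helpers let a dataset worker run the same way locally and on Arkindex: in developer (read-only) mode the sets come from the new `--set` argument, otherwise from the process via `ListProcessSets`. A usage sketch with an illustrative `worker` instance:

    # Developer-mode invocation; nargs="+" accepts several values after one flag:
    #   worker.py --set 12341234-1234-1234-1234-123412341234:train \
    #                   12341234-1234-1234-1234-123412341234:validation
    #
    # check_dataset_set("12341234-1234-1234-1234-123412341234:train")
    # returns (UUID("12341234-1234-1234-1234-123412341234"), "train")

    for dataset_set in worker.list_sets():
        logger.info(f"Processing set {dataset_set.name} of dataset {dataset_set.dataset.id}")
        for element in worker.list_set_elements(dataset_set):
            ...  # run task-specific processing on each element of the set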
@@ -100,7 +163,7 @@ class DatasetMixin:
             logger.warning("Cannot update dataset as this worker is in read-only mode")
             return
 
-        updated_dataset = self.request(
+        updated_dataset = self.api_client.request(
             "PartialUpdateDataset",
             id=dataset.id,
             body={"state": state.value},