arkindex-base-worker 0.3.7rc9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- {arkindex_base_worker-0.3.7rc9.dist-info → arkindex_base_worker-0.4.0.dist-info}/METADATA +16 -20
- arkindex_base_worker-0.4.0.dist-info/RECORD +61 -0
- {arkindex_base_worker-0.3.7rc9.dist-info → arkindex_base_worker-0.4.0.dist-info}/WHEEL +1 -1
- arkindex_worker/cache.py +1 -1
- arkindex_worker/image.py +120 -1
- arkindex_worker/models.py +6 -0
- arkindex_worker/utils.py +85 -4
- arkindex_worker/worker/__init__.py +68 -162
- arkindex_worker/worker/base.py +39 -34
- arkindex_worker/worker/classification.py +34 -18
- arkindex_worker/worker/corpus.py +86 -0
- arkindex_worker/worker/dataset.py +71 -1
- arkindex_worker/worker/element.py +352 -91
- arkindex_worker/worker/entity.py +11 -11
- arkindex_worker/worker/image.py +21 -0
- arkindex_worker/worker/metadata.py +19 -9
- arkindex_worker/worker/process.py +92 -0
- arkindex_worker/worker/task.py +5 -4
- arkindex_worker/worker/training.py +25 -10
- arkindex_worker/worker/transcription.py +89 -68
- arkindex_worker/worker/version.py +3 -1
- tests/__init__.py +8 -0
- tests/conftest.py +36 -52
- tests/test_base_worker.py +212 -12
- tests/test_dataset_worker.py +21 -45
- tests/test_elements_worker/{test_classifications.py → test_classification.py} +216 -100
- tests/test_elements_worker/test_cli.py +3 -11
- tests/test_elements_worker/test_corpus.py +168 -0
- tests/test_elements_worker/test_dataset.py +7 -12
- tests/test_elements_worker/test_element.py +427 -0
- tests/test_elements_worker/test_element_create_multiple.py +715 -0
- tests/test_elements_worker/test_element_create_single.py +528 -0
- tests/test_elements_worker/test_element_list_children.py +969 -0
- tests/test_elements_worker/test_element_list_parents.py +530 -0
- tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
- tests/test_elements_worker/test_entity_list_and_check.py +160 -0
- tests/test_elements_worker/test_image.py +66 -0
- tests/test_elements_worker/test_metadata.py +230 -139
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_task.py +8 -18
- tests/test_elements_worker/test_training.py +17 -8
- tests/test_elements_worker/test_transcription_create.py +873 -0
- tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
- tests/test_elements_worker/test_transcription_list.py +450 -0
- tests/test_elements_worker/test_version.py +60 -0
- tests/test_elements_worker/test_worker.py +563 -279
- tests/test_image.py +432 -209
- tests/test_merge.py +1 -2
- tests/test_utils.py +66 -3
- arkindex_base_worker-0.3.7rc9.dist-info/RECORD +0 -47
- tests/test_elements_worker/test_elements.py +0 -2713
- tests/test_elements_worker/test_transcriptions.py +0 -2119
- {arkindex_base_worker-0.3.7rc9.dist-info → arkindex_base_worker-0.4.0.dist-info}/LICENSE +0 -0
- {arkindex_base_worker-0.3.7rc9.dist-info → arkindex_base_worker-0.4.0.dist-info}/top_level.txt +0 -0
arkindex_worker/worker/__init__.py
CHANGED

@@ -4,64 +4,47 @@ Base classes to implement Arkindex workers.
 
 import contextlib
 import json
-import os
 import sys
 import uuid
-from
-from
-from enum import Enum
+from collections.abc import Iterable
+from itertools import chain
 from pathlib import Path
 
-from
-
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
+from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
-from arkindex_worker.worker.
+from arkindex_worker.worker.corpus import CorpusMixin
+from arkindex_worker.worker.dataset import (
+    DatasetMixin,
+    DatasetState,
+    MissingDatasetArchive,
+)
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
+from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
+from arkindex_worker.worker.process import ActivityState, ProcessMixin, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin
 
 
-class ActivityState(Enum):
-    """
-    Processing state of an element.
-    """
-
-    Queued = "queued"
-    """
-    The element has not yet been processed by a worker.
-    """
-
-    Started = "started"
-    """
-    The element is being processed by a worker.
-    """
-
-    Processed = "processed"
-    """
-    The element has been successfully processed by a worker.
-    """
-
-    Error = "error"
-    """
-    An error occurred while processing this element.
-    """
-
-
 class ElementsWorker(
+    ElementMixin,
+    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
-
+    CorpusMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
     MetaDataMixin,
+    ImageMixin,
+    ProcessMixin,
 ):
     """
     Base class for ML workers that operate on Arkindex elements.
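For orientation, a minimal sketch of a worker built on the 0.4.0 `ElementsWorker`; the `process_element` hook and the `description`/`support_cache` constructor arguments appear in this diff, while the worker body itself is hypothetical.

from arkindex_worker.models import Element
from arkindex_worker.worker import ElementsWorker


class DemoWorker(ElementsWorker):
    """Hypothetical worker: prints the ID of every element it receives."""

    def process_element(self, element: Element):
        # Called once per element yielded by get_elements()
        print(element.id)


if __name__ == "__main__":
    DemoWorker(description="Demo worker").run()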
@@ -79,39 +62,41 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
 
-        # Add mandatory argument to process elements
-        self.parser.add_argument(
-            "--elements-list",
-            help="JSON elements list to use",
-            type=open,
-            default=os.environ.get("TASK_ELEMENTS"),
-        )
-        self.parser.add_argument(
-            "--element",
-            type=uuid.UUID,
-            nargs="+",
-            help="One or more Arkindex element ID",
-        )
-
         self.classes = {}
 
         self.entity_types = {}
         """Known and available entity types in processed corpus
         """
 
+        self.corpus_types = {}
+        """Known and available element types in processed corpus
+        """
+
         self._worker_version_cache = {}
 
-    def
+    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
 
         :return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
-
+            or a list of strings representing element IDs otherwise.
         """
         assert not (
             self.args.elements_list and self.args.element
         ), "elements-list and element CLI args shouldn't be both set"
+
+        def invalid_element_id(value: str) -> bool:
+            """
+            Return whether the ID of an element is a valid UUID or not
+            """
+            try:
+                uuid.UUID(value)
+            except Exception:
+                return True
+
+            return False
+
         out = []
 
         # Load from the cache when available
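The nested helper can be exercised on its own; this snippet replays the exact logic from the hunk above to show that it flags anything `uuid.UUID` refuses to parse.

import uuid


def invalid_element_id(value: str) -> bool:
    # Same body as the nested helper above: True means "not a valid UUID"
    try:
        uuid.UUID(value)
    except Exception:
        return True
    return False


assert invalid_element_id("not-a-uuid")
assert not invalid_element_id("12341234-1234-1234-1234-123412341234")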
@@ -121,15 +106,28 @@ class ElementsWorker(
         )
         if self.use_cache and cache_query.exists():
             return cache_query
-        # Process elements from JSON file
         elif self.args.elements_list:
+            # Process elements from JSON file
             data = json.load(self.args.elements_list)
             assert isinstance(data, list), "Elements list must be a list"
             assert len(data), "No elements in elements list"
             out += list(filter(None, [element.get("id") for element in data]))
-        # Add any extra element from CLI
         elif self.args.element:
+            # Add any extra element from CLI
            out += self.args.element
+        elif self.process_mode == ProcessMode.Dataset or self.args.set:
+            # Elements from datasets
+            return list(
+                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
+            )
+        elif self.process_mode == ProcessMode.Export:
+            # For export mode processes, use list_process_elements and return element IDs
+            return {item["id"] for item in self.list_process_elements()}
+
+        invalid_element_ids = list(filter(invalid_element_id, out))
+        assert (
+            not invalid_element_ids
+        ), f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
 
         return out
 
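Two details of the new branches are easy to miss: the dataset branch flattens one list of elements per set into a single list (duplicates kept), while the export branch builds a `set`, so duplicate element IDs are collapsed. A standalone sketch with made-up data:

from itertools import chain

# Stand-ins for self.list_sets() / self.list_set_elements(): elements per set
elements_per_set = [["a", "b"], ["b", "c"]]
flattened = list(chain.from_iterable(elements_per_set))
assert flattened == ["a", "b", "b", "c"]  # duplicates are kept

# Stand-in for self.list_process_elements(): the set comprehension deduplicates IDs
process_elements = [{"id": "a"}, {"id": "a"}, {"id": "b"}]
unique_ids = {item["id"] for item in process_elements}
assert unique_ids == {"a", "b"}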
@@ -139,40 +137,22 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only
+        if self.is_read_only or self.process_mode in [
+            ProcessMode.Dataset,
+            ProcessMode.Export,
+        ]:
+            # Worker activities are also disabled when running an ElementsWorker in a Dataset process
+            # and when running export processes.
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"
 
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-        super().configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        [
+        [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """
@@ -180,7 +160,7 @@ class ElementsWorker(
 
         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.
+        elements = self.get_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)
@@ -196,12 +176,14 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if
-                    # Just use the result of
+                if isinstance(item, CachedElement | Element):
+                    # Just use the result of get_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
-                    element = Element(
+                    element = Element(
+                        **self.api_client.request("RetrieveElement", id=item)
+                    )
 
                 logger.info(f"Processing {element} ({i}/{count})")
 
@@ -239,7 +221,7 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)
 
-        message = f'Ran on {count}
+        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
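`pluralize` is imported from `arkindex_worker.utils` but its body is not part of this diff; a naive sketch consistent with the call sites here ("element", "set", and "class" in classification.py) could look like:

def pluralize(word: str, count: int) -> str:
    """Assumed behavior only: return the singular for a count of 1."""
    if count == 1:
        return word
    # "class" -> "classes", "element" -> "elements"; the real helper may differ
    return f"{word}es" if word.endswith("s") else f"{word}s"


count, failed = 3, 1
print(f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed')
# Ran on 3 elements: 2 completed, 1 failed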
@@ -280,7 +262,7 @@ class ElementsWorker(
         assert isinstance(state, ActivityState), "state should be an ActivityState"
 
         try:
-            self.request(
+            self.api_client.request(
                 "UpdateWorkerActivity",
                 id=self.worker_run_id,
                 body={
@@ -310,29 +292,7 @@ class ElementsWorker(
         return True
 
 
-
-    values = value.split(":")
-    if len(values) != 2:
-        raise ArgumentTypeError(
-            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
-        )
-
-    dataset_id, set_name = values
-    try:
-        dataset_id = uuid.UUID(dataset_id)
-        return (dataset_id, set_name)
-    except (TypeError, ValueError) as e:
-        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
-
-
-class MissingDatasetArchive(Exception):
-    """
-    Exception raised when the compressed archive associated to
-    a dataset isn't found in its task artifacts.
-    """
-
-
-class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
+class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.
 
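The reordering of `DatasetWorker`'s bases (the mixin now listed before `BaseWorker`) matters because of Python's method resolution order: with the mixin first, its attributes win whenever both classes define the same name. A self-contained illustration with stand-in classes:

class Base:  # stand-in for BaseWorker
    def describe(self):
        return "base"


class Mixin:  # stand-in for DatasetMixin
    def describe(self):
        return "mixin"


class OldStyle(Base, Mixin):  # 0.3.x ordering
    pass


class NewStyle(Mixin, Base):  # 0.4.0 ordering
    pass


assert OldStyle().describe() == "base"
assert NewStyle().describe() == "mixin"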
@@ -355,40 +315,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
 
-        self.parser.add_argument(
-            "--set",
-            type=check_dataset_set,
-            nargs="+",
-            help="""
-            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
-            (e.g.: "12341234-1234-1234-1234-123412341234:train")
-            """,
-            default=[],
-        )
-
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-        super().configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any
@@ -436,30 +362,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         :param set: The set to process.
         """
 
-    def list_sets(self) -> Iterator[Set]:
-        """
-        List the sets to be processed, either from the CLI arguments or using the
-        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
-
-        :returns: An iterator of ``Set`` objects.
-        """
-        if not self.is_read_only:
-            yield from self.list_process_sets()
-
-        datasets: dict[uuid.UUID, Dataset] = {}
-        for dataset_id, set_name in self.args.set:
-            # Retrieving dataset information is not already cached
-            if dataset_id not in datasets:
-                datasets[dataset_id] = Dataset(
-                    **self.request("RetrieveDataset", id=dataset_id)
-                )
-
-            yield Set(name=set_name, dataset=datasets[dataset_id])
-
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.
+        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
 
         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.

@@ -499,7 +405,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Cleanup the latest downloaded dataset artifact
         self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count}
+        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
arkindex_worker/worker/base.py
CHANGED

@@ -12,15 +12,9 @@ from tempfile import mkdtemp
 
 import gnupg
 import yaml
-from apistar.exceptions import ErrorResponse
-from tenacity import (
-    before_sleep_log,
-    retry,
-    retry_if_exception,
-    stop_after_attempt,
-    wait_exponential,
-)
 
+from arkindex import options_from_env
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,

@@ -30,7 +24,8 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-from
+from arkindex_worker.worker.process import ProcessMode
+from teklia_toolbox.requests import get_arkindex_client
 
 
 class ExtrasDirNotFoundError(Exception):
@@ -162,6 +157,13 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
 
+    @property
+    def process_mode(self) -> ProcessMode | None:
+        """Mode of the process being run. Returns None when read-only."""
+        if self.is_read_only:
+            return
+        return ProcessMode(self.process_information["mode"])
+
     @property
     def is_read_only(self) -> bool:
         """
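The new property gives callers a single switch on the process type; for example, the `store_activity` logic shown earlier boils down to something like this (sketch only; `worker` stands for any configured `BaseWorker`):

from arkindex_worker.worker.process import ProcessMode


def activities_possible(worker) -> bool:
    # None (read-only), Dataset and Export processes never track activities
    return worker.process_mode not in (None, ProcessMode.Dataset, ProcessMode.Export)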
@@ -185,7 +187,7 @@ class BaseWorker:
         Create an ArkindexClient to make API requests towards Arkindex instances.
         """
         # Build Arkindex API client from environment variables
-        self.api_client =
+        self.api_client = get_arkindex_client(**options_from_env())
         logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
 
     def configure_for_developers(self):
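Both names in the new assignment come straight from this diff; `options_from_env()` presumably collects credentials such as the API URL and token from the environment, so a script could build the same client the worker uses:

from arkindex import options_from_env
from teklia_toolbox.requests import get_arkindex_client

# Assumes Arkindex credentials (URL, token) are exported in the environment
api_client = get_arkindex_client(**options_from_env())
# "RetrieveWorkerRun" is an endpoint used in this diff; the ID below is a placeholder
worker_run = api_client.request("RetrieveWorkerRun", id="00000000-0000-0000-0000-000000000000")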
@@ -225,7 +227,7 @@ class BaseWorker:
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
 
-    def
+    def configure_worker_run(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.

@@ -237,7 +239,7 @@ class BaseWorker:
         logger.debug("Debug output enabled")
 
         # Load worker run information
-        worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)
+        worker_run = self.api_client.request("RetrieveWorkerRun", id=self.worker_run_id)
 
         # Load process information
         self.process_information = worker_run["process"]
@@ -296,7 +298,7 @@ class BaseWorker:
         if self.support_cache and self.args.database is not None:
             self.use_cache = True
         elif self.support_cache and self.task_id:
-            task = self.request("
+            task = self.api_client.request("RetrieveTask", id=self.task_id)
             self.task_parents = task["parents"]
             paths = self.find_parents_file_paths(Path("db.sqlite"))
             self.use_cache = len(paths) > 0
@@ -326,6 +328,29 @@ class BaseWorker:
         else:
             logger.debug("Cache is disabled")
 
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            self.configure_for_developers()
+        else:
+            self.configure_worker_run()
+        self.configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def load_secret(self, name: Path):
         """
         Load a Ponos secret by name.
@@ -337,7 +362,7 @@ class BaseWorker:
 
         # Load from the backend
         try:
-            resp = self.request("RetrieveSecret", name=str(name))
+            resp = self.api_client.request("RetrieveSecret", name=str(name))
             secret = resp["content"]
             logging.info(f"Loaded API secret {name}")
         except ErrorResponse as e:
@@ -477,26 +502,6 @@ class BaseWorker:
         # Clean up
         shutil.rmtree(base_extracted_path)
 
-    @retry(
-        retry=retry_if_exception(_is_500_error),
-        wait=wait_exponential(multiplier=2, min=3),
-        reraise=True,
-        stop=stop_after_attempt(5),
-        before_sleep=before_sleep_log(logger, logging.INFO),
-    )
-    def request(self, *args, **kwargs):
-        """
-        Wrapper around the ``ArkindexClient.request`` method.
-
-        The API call will be retried up to 5 times in case of HTTP 5xx errors,
-        with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
-        If the 5th call still causes an HTTP 5xx error, the exception is re-raised
-        and the caller should catch it.
-
-        Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
-        """
-        return self.api_client.request(*args, **kwargs)
-
     def add_arguments(self):
         """Override this method to add ``argparse`` arguments to this worker"""
 
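For reference, the deleted wrapper is reproduced below as a standalone function, built only from the removed lines; the retry behavior is presumably provided by `get_arkindex_client` now, and `_is_500_error` is sketched since its body is not part of this diff.

import logging

from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
)

logger = logging.getLogger(__name__)


def _is_500_error(exc: BaseException) -> bool:
    # Sketch: the removed helper's body is not shown in this diff
    return getattr(exc, "status_code", 0) >= 500


@retry(
    retry=retry_if_exception(_is_500_error),
    wait=wait_exponential(multiplier=2, min=3),
    reraise=True,
    stop=stop_after_attempt(5),
    before_sleep=before_sleep_log(logger, logging.INFO),
)
def request(api_client, *args, **kwargs):
    """Retry an Arkindex API call up to 5 times on HTTP 5xx errors."""
    return api_client.request(*args, **kwargs)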
arkindex_worker/worker/classification.py
CHANGED

@@ -2,12 +2,18 @@
 ElementsWorker methods for classifications and ML classes.
 """
 
-from apistar.exceptions import ErrorResponse
 from peewee import IntegrityError
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class ClassificationMixin:
@@ -21,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f
+            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:

@@ -39,7 +45,7 @@ class ClassificationMixin:
         if ml_class_id is None:
             logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
             try:
-                response = self.request(
+                response = self.api_client.request(
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]

@@ -119,7 +125,7 @@ class ClassificationMixin:
             )
             return
         try:
-            created = self.request(
+            created = self.api_client.request(
                 "CreateClassification",
                 body={
                     "element": str(element.id),
@@ -167,10 +173,12 @@ class ClassificationMixin:
 
         return created
 
+    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.
@@ -185,6 +193,8 @@ class ClassificationMixin:
                 high_confidence (bool)
                     Optional. Whether or not the classification is of high confidence.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """
@@ -220,20 +230,26 @@ class ClassificationMixin:
             )
             return
 
-        created_cls =
-
-
-
-            "
-
-
-
-
-
-
-
-
+        created_cls = [
+            created_cl
+            for batch in make_batches(classifications, "classification", batch_size)
+            for created_cl in self.api_client.request(
+                "CreateClassifications",
+                body={
+                    "parent": str(element.id),
+                    "worker_run_id": self.worker_run_id,
+                    "classifications": [
+                        {
+                            **classification,
+                            "ml_class": self.get_ml_class_id(
+                                classification["ml_class"]
+                            ),
+                        }
+                        for classification in batch
+                    ],
+                },
+            )["classifications"]
+        ]
 
         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
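`make_batches` and `DEFAULT_BATCH_SIZE` come from `arkindex_worker.utils`, whose implementation is not shown in this diff; the comprehension above only needs them to yield successive chunks of the input list. A plausible sketch, with an assumed default size:

from collections.abc import Iterator

DEFAULT_BATCH_SIZE = 50  # assumption: the real default lives in arkindex_worker.utils


def make_batches(items: list, singular_name: str, batch_size: int) -> Iterator[list]:
    # Yield successive chunks of at most batch_size items
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


sizes = [len(batch) for batch in make_batches(list(range(120)), "classification", 50)]
assert sizes == [50, 50, 20]

Since `@batch_publication` presumably validates the `batch_size` argument, callers can pass e.g. `batch_size=100` to `create_classifications` when publishing large classification lists.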