arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (51)
  1. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/METADATA +13 -15
  2. arkindex_base_worker-0.4.0a2.dist-info/RECORD +51 -0
  3. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/WHEEL +1 -1
  4. arkindex_worker/cache.py +1 -1
  5. arkindex_worker/image.py +1 -120
  6. arkindex_worker/utils.py +0 -82
  7. arkindex_worker/worker/__init__.py +161 -46
  8. arkindex_worker/worker/base.py +11 -36
  9. arkindex_worker/worker/classification.py +18 -34
  10. arkindex_worker/worker/corpus.py +4 -21
  11. arkindex_worker/worker/dataset.py +1 -71
  12. arkindex_worker/worker/element.py +91 -352
  13. arkindex_worker/worker/entity.py +11 -11
  14. arkindex_worker/worker/metadata.py +9 -19
  15. arkindex_worker/worker/task.py +4 -5
  16. arkindex_worker/worker/training.py +6 -6
  17. arkindex_worker/worker/transcription.py +68 -89
  18. arkindex_worker/worker/version.py +1 -3
  19. tests/__init__.py +1 -1
  20. tests/conftest.py +45 -33
  21. tests/test_base_worker.py +3 -204
  22. tests/test_dataset_worker.py +4 -7
  23. tests/test_elements_worker/{test_classification.py → test_classifications.py} +61 -194
  24. tests/test_elements_worker/test_corpus.py +1 -32
  25. tests/test_elements_worker/test_dataset.py +1 -1
  26. tests/test_elements_worker/test_elements.py +2734 -0
  27. tests/test_elements_worker/{test_entity_create.py → test_entities.py} +160 -26
  28. tests/test_elements_worker/test_image.py +1 -2
  29. tests/test_elements_worker/test_metadata.py +99 -224
  30. tests/test_elements_worker/test_task.py +1 -1
  31. tests/test_elements_worker/test_training.py +2 -2
  32. tests/test_elements_worker/test_transcriptions.py +2102 -0
  33. tests/test_elements_worker/test_worker.py +280 -563
  34. tests/test_image.py +204 -429
  35. tests/test_merge.py +2 -1
  36. tests/test_utils.py +3 -66
  37. arkindex_base_worker-0.4.0.dist-info/RECORD +0 -61
  38. arkindex_worker/worker/process.py +0 -92
  39. tests/test_elements_worker/test_element.py +0 -427
  40. tests/test_elements_worker/test_element_create_multiple.py +0 -715
  41. tests/test_elements_worker/test_element_create_single.py +0 -528
  42. tests/test_elements_worker/test_element_list_children.py +0 -969
  43. tests/test_elements_worker/test_element_list_parents.py +0 -530
  44. tests/test_elements_worker/test_entity_list_and_check.py +0 -160
  45. tests/test_elements_worker/test_process.py +0 -89
  46. tests/test_elements_worker/test_transcription_create.py +0 -873
  47. tests/test_elements_worker/test_transcription_create_with_elements.py +0 -951
  48. tests/test_elements_worker/test_transcription_list.py +0 -450
  49. tests/test_elements_worker/test_version.py +0 -60
  50. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/LICENSE +0 -0
  51. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/top_level.txt +0 -0
arkindex_worker/worker/__init__.py

@@ -4,47 +4,68 @@ Base classes to implement Arkindex workers.
 
 import contextlib
 import json
+import os
 import sys
 import uuid
-from collections.abc import Iterable
-from itertools import chain
+from argparse import ArgumentTypeError
+from collections.abc import Iterable, Iterator
+from enum import Enum
 from pathlib import Path
 
-from arkindex.exceptions import ErrorResponse
+from apistar.exceptions import ErrorResponse
+
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
-from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
-from arkindex_worker.worker.dataset import (
-    DatasetMixin,
-    DatasetState,
-    MissingDatasetArchive,
-)
+from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
-from arkindex_worker.worker.process import ActivityState, ProcessMixin, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin
 
 
+class ActivityState(Enum):
+    """
+    Processing state of an element.
+    """
+
+    Queued = "queued"
+    """
+    The element has not yet been processed by a worker.
+    """
+
+    Started = "started"
+    """
+    The element is being processed by a worker.
+    """
+
+    Processed = "processed"
+    """
+    The element has been successfully processed by a worker.
+    """
+
+    Error = "error"
+    """
+    An error occurred while processing this element.
+    """
+
+
 class ElementsWorker(
-    ElementMixin,
-    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
     CorpusMixin,
+    ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
     MetaDataMixin,
     ImageMixin,
-    ProcessMixin,
 ):
     """
     Base class for ML workers that operate on Arkindex elements.
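For orientation, a downstream worker built on this class only needs to implement `process_element`; CLI parsing, activity tracking and error counting come from `run()`. A minimal sketch (`DemoWorker` and the log message are illustrative, not part of the package):

```python
from arkindex_worker.worker import ElementsWorker


class DemoWorker(ElementsWorker):
    def process_element(self, element):
        # Called once per element returned by list_elements(); exceptions
        # raised here are caught by run() and counted as failures.
        print(f"Would process element {element.id}")


if __name__ == "__main__":
    DemoWorker(description="Demo worker").run()
```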
@@ -62,19 +83,29 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
 
+        # Add mandatory argument to process elements
+        self.parser.add_argument(
+            "--elements-list",
+            help="JSON elements list to use",
+            type=open,
+            default=os.environ.get("TASK_ELEMENTS"),
+        )
+        self.parser.add_argument(
+            "--element",
+            type=str,
+            nargs="+",
+            help="One or more Arkindex element ID",
+        )
+
         self.classes = {}
 
         self.entity_types = {}
         """Known and available entity types in processed corpus
         """
 
-        self.corpus_types = {}
-        """Known and available element types in processed corpus
-        """
-
        self._worker_version_cache = {}
 
-    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
+    def list_elements(self) -> Iterable[CachedElement] | list[str]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -106,23 +137,15 @@ class ElementsWorker(
             )
             if self.use_cache and cache_query.exists():
                 return cache_query
+        # Process elements from JSON file
         elif self.args.elements_list:
-            # Process elements from JSON file
            data = json.load(self.args.elements_list)
            assert isinstance(data, list), "Elements list must be a list"
            assert len(data), "No elements in elements list"
            out += list(filter(None, [element.get("id") for element in data]))
+        # Add any extra element from CLI
         elif self.args.element:
-            # Add any extra element from CLI
            out += self.args.element
-        elif self.process_mode == ProcessMode.Dataset or self.args.set:
-            # Elements from datasets
-            return list(
-                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
-            )
-        elif self.process_mode == ProcessMode.Export:
-            # For export mode processes, use list_process_elements and return element IDs
-            return {item["id"] for item in self.list_process_elements()}
 
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert (
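The `--elements-list` file parsed here must be a non-empty JSON array of objects carrying an `id` key; entries without an `id` are filtered out. A plausible minimal input file, with made-up UUIDs:

```python
import json

# Shape accepted by list_elements() via --elements-list
# (or the file referenced by the TASK_ELEMENTS environment variable).
elements = [
    {"id": "11111111-1111-1111-1111-111111111111"},
    {"id": "22222222-2222-2222-2222-222222222222"},
]
with open("elements.json", "w") as f:
    json.dump(elements, f)
```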
@@ -137,22 +160,40 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only or self.process_mode in [
-            ProcessMode.Dataset,
-            ProcessMode.Export,
-        ]:
-            # Worker activities are also disabled when running an ElementsWorker in a Dataset process
-            # and when running export processes.
+        if self.is_read_only:
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"
 
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            super().configure_for_developers()
+        else:
+            super().configure()
+        super().configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
+        [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """
@@ -160,7 +201,7 @@ class ElementsWorker(
 
         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.get_elements()
+        elements = self.list_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)
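The `configure()` override added above calls `super().configure()` and `super().configure_for_developers()`, which resolve to `BaseWorker` methods because `BaseWorker` now comes first in the base list (it previously sat after `ElementMixin` and `DatasetMixin`). A quick illustrative check of the resolution order:

```python
from arkindex_worker.worker import ElementsWorker

# With BaseWorker listed first among the bases, it sits immediately after
# ElementsWorker in the MRO, so super().configure() inside
# ElementsWorker.configure() dispatches to BaseWorker.configure().
print([cls.__name__ for cls in ElementsWorker.__mro__[:3]])
# Expected with the 0.4.0a2 class layout:
# ['ElementsWorker', 'BaseWorker', 'ClassificationMixin']
```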
@@ -176,14 +217,12 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if isinstance(item, CachedElement | Element):
-                    # Just use the result of get_elements as the element
+                if self.use_cache:
+                    # Just use the result of list_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
-                    element = Element(
-                        **self.api_client.request("RetrieveElement", id=item)
-                    )
+                    element = Element(**self.request("RetrieveElement", id=item))
 
                 logger.info(f"Processing {element} ({i}/{count})")
 
@@ -221,7 +260,7 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)
 
-        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
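The removed `pluralize` helper is replaced inline by the `"s"[:count>1]` slicing idiom: `count > 1` is `False` (0) or `True` (1), so the slice yields `""` or `"s"`. Note that `count == 0` produces the singular form here. For instance:

```python
for count in (0, 1, 2):
    # "s"[:False] == "s"[:0] == "" and "s"[:True] == "s"[:1] == "s"
    print(f'Ran on {count} element{"s"[:count>1]}')
# Ran on 0 element
# Ran on 1 element
# Ran on 2 elements
```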
@@ -262,7 +301,7 @@ class ElementsWorker(
         assert isinstance(state, ActivityState), "state should be an ActivityState"
 
         try:
-            self.api_client.request(
+            self.request(
                 "UpdateWorkerActivity",
                 id=self.worker_run_id,
                 body={
@@ -292,7 +331,29 @@ class ElementsWorker(
             return True
 
 
-class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
+class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.
 
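`check_dataset_set` serves as the `argparse` type converter for the `--set` option added in the next hunk, so malformed values fail at parse time with `ArgumentTypeError` (either a value without exactly one `:`, or an invalid UUID). A small runnable demonstration (the UUID is made up):

```python
from arkindex_worker.worker import check_dataset_set

dataset_id, set_name = check_dataset_set(
    "12341234-1234-1234-1234-123412341234:train"
)
print(dataset_id, set_name)
# 12341234-1234-1234-1234-123412341234 train
```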
@@ -315,6 +376,40 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
 
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            super().configure_for_developers()
+        else:
+            super().configure()
+        super().configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any
@@ -362,10 +457,30 @@
         :param set: The set to process.
         """
 
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieve dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.request("RetrieveDataset", id=dataset_id)
+                )
+
+            yield Set(name=set_name, dataset=datasets[dataset_id])
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
+        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
 
         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.
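A concrete dataset worker then only has to implement `process_set`; `run()` drives artifact handling and error counting. A minimal sketch (`DemoDatasetWorker` is an illustrative name, not part of the package):

```python
from arkindex_worker.worker import DatasetWorker


class DemoDatasetWorker(DatasetWorker):
    def process_set(self, set):
        # Called once per Set yielded by list_sets(), coming from the
        # Arkindex process and/or --set <dataset_uuid>:<set_name> arguments.
        print(f"Would process set {set.name} of dataset {set.dataset}")


if __name__ == "__main__":
    # e.g. python demo.py --set 12341234-1234-1234-1234-123412341234:train
    DemoDatasetWorker(description="Demo dataset worker").run()
```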
@@ -405,7 +520,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
             # Cleanup the latest downloaded dataset artifact
             self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
arkindex_worker/worker/base.py

@@ -12,9 +12,9 @@ from tempfile import mkdtemp
 
 import gnupg
 import yaml
+from apistar.exceptions import ErrorResponse
 
 from arkindex import options_from_env
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -24,7 +24,6 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-from arkindex_worker.worker.process import ProcessMode
 from teklia_toolbox.requests import get_arkindex_client
 
 
@@ -157,13 +156,6 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
 
-    @property
-    def process_mode(self) -> ProcessMode | None:
-        """Mode of the process being run. Returns None when read-only."""
-        if self.is_read_only:
-            return
-        return ProcessMode(self.process_information["mode"])
-
     @property
     def is_read_only(self) -> bool:
         """
@@ -227,7 +219,7 @@
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
 
-    def configure_worker_run(self):
+    def configure(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.
@@ -239,7 +231,7 @@
         logger.debug("Debug output enabled")
 
         # Load worker run information
-        worker_run = self.api_client.request("RetrieveWorkerRun", id=self.worker_run_id)
+        worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)
 
         # Load process information
         self.process_information = worker_run["process"]
@@ -298,7 +290,7 @@
         if self.support_cache and self.args.database is not None:
             self.use_cache = True
         elif self.support_cache and self.task_id:
-            task = self.api_client.request("RetrieveTask", id=self.task_id)
+            task = self.request("RetrieveTaskFromAgent", id=self.task_id)
             self.task_parents = task["parents"]
             paths = self.find_parents_file_paths(Path("db.sqlite"))
             self.use_cache = len(paths) > 0
@@ -328,29 +320,6 @@
         else:
             logger.debug("Cache is disabled")
 
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            self.configure_for_developers()
-        else:
-            self.configure_worker_run()
-        self.configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def load_secret(self, name: Path):
         """
         Load a Ponos secret by name.
@@ -362,7 +331,7 @@
 
         # Load from the backend
         try:
-            resp = self.api_client.request("RetrieveSecret", name=str(name))
+            resp = self.request("RetrieveSecret", name=str(name))
             secret = resp["content"]
             logging.info(f"Loaded API secret {name}")
         except ErrorResponse as e:
@@ -502,6 +471,12 @@
         # Clean up
         shutil.rmtree(base_extracted_path)
 
+    def request(self, *args, **kwargs):
+        """
+        Wrapper around the ``ArkindexClient.request`` method.
+        """
+        return self.api_client.request(*args, **kwargs)
+
     def add_arguments(self):
         """Override this method to add ``argparse`` arguments to this worker"""
 
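Every `self.api_client.request(...)` call site in the mixins was rewritten to go through this new `request` wrapper, which incidentally gives tests a single spot to stub out API traffic. A minimal sketch, assuming an already-constructed `worker` instance:

```python
from unittest.mock import patch

# Stub all API calls through the single wrapper method.
with patch.object(worker, "request", return_value={"id": "fake"}) as mock_request:
    element = worker.request("RetrieveElement", id="fake")
    mock_request.assert_called_once_with("RetrieveElement", id="fake")
```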
arkindex_worker/worker/classification.py

@@ -2,18 +2,12 @@
 ElementsWorker methods for classifications and ML classes.
 """
 
+from apistar.exceptions import ErrorResponse
 from peewee import IntegrityError
 
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
-from arkindex_worker.utils import (
-    DEFAULT_BATCH_SIZE,
-    batch_publication,
-    make_batches,
-    pluralize,
-)
 
 
 class ClassificationMixin:
@@ -27,7 +21,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
+            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -45,7 +39,7 @@ class ClassificationMixin:
         if ml_class_id is None:
             logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
             try:
-                response = self.api_client.request(
+                response = self.request(
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]
@@ -125,7 +119,7 @@ class ClassificationMixin:
             )
             return
         try:
-            created = self.api_client.request(
+            created = self.request(
                 "CreateClassification",
                 body={
                     "element": str(element.id),
@@ -173,12 +167,10 @@ class ClassificationMixin:
 
         return created
 
-    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
-        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.
@@ -193,8 +185,6 @@ class ClassificationMixin:
             high_confidence (bool)
                 Optional. Whether or not the classification is of high confidence.
 
-        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
-
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """
@@ -230,26 +220,20 @@ class ClassificationMixin:
             )
             return
 
-        created_cls = [
-            created_cl
-            for batch in make_batches(classifications, "classification", batch_size)
-            for created_cl in self.api_client.request(
-                "CreateClassifications",
-                body={
-                    "parent": str(element.id),
-                    "worker_run_id": self.worker_run_id,
-                    "classifications": [
-                        {
-                            **classification,
-                            "ml_class": self.get_ml_class_id(
-                                classification["ml_class"]
-                            ),
-                        }
-                        for classification in batch
-                    ],
-                },
-            )["classifications"]
-        ]
+        created_cls = self.request(
+            "CreateClassifications",
+            body={
+                "parent": str(element.id),
+                "worker_run_id": self.worker_run_id,
+                "classifications": [
+                    {
+                        **classification,
+                        "ml_class": self.get_ml_class_id(classification["ml_class"]),
+                    }
+                    for classification in classifications
+                ],
+            },
+        )["classifications"]
 
         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
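With the batching decorator removed, `create_classifications` publishes the entire list in a single `CreateClassifications` API call. An illustrative call from inside a worker (`handwritten`/`printed` and the confidences are made-up values):

```python
# Inside an ElementsWorker method, with `element` in scope:
created = self.create_classifications(
    element,
    classifications=[
        {"ml_class": "handwritten", "confidence": 0.92, "high_confidence": True},
        {"ml_class": "printed", "confidence": 0.08},
    ],
)
# Each returned dict also carries the resolved "class_name".
```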
arkindex_worker/worker/corpus.py

@@ -5,7 +5,6 @@ BaseWorker methods for corpora.
 from enum import Enum
 from operator import itemgetter
 from tempfile import _TemporaryFileWrapper
-from uuid import UUID
 
 from arkindex_worker import logger
 
@@ -37,25 +36,6 @@
 
 
 class CorpusMixin:
-    def download_export(self, export_id: str) -> _TemporaryFileWrapper:
-        """
-        Download an export.
-
-        :param export_id: UUID of the export to download
-        :returns: The downloaded export stored in a temporary file.
-        """
-        try:
-            UUID(export_id)
-        except ValueError as e:
-            raise ValueError("export_id is not a valid uuid.") from e
-
-        logger.info(f"Downloading export ({export_id})...")
-        export: _TemporaryFileWrapper = self.api_client.request(
-            "DownloadExport", id=export_id
-        )
-        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
-        return export
-
     def download_latest_export(self) -> _TemporaryFileWrapper:
         """
         Download the latest export in `done` state of the current corpus.
@@ -82,5 +62,8 @@
 
         # Download latest export
         export_id: str = exports[0]["id"]
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
 
-        return self.download_export(export_id)
+        return export
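With the standalone `download_export` helper removed, callers go through `download_latest_export`, which returns the export as a named temporary file. A sketch of typical use from inside a worker, assuming the usual Arkindex export format (an SQLite database with an `element` table):

```python
import sqlite3

# Inside a worker using CorpusMixin (e.g. an ElementsWorker method):
export = self.download_latest_export()
with sqlite3.connect(export.name) as db:
    (element_count,) = db.execute("SELECT COUNT(*) FROM element").fetchone()
print(f"{element_count} elements in the latest export")
```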