arkindex-base-worker 0.3.6rc5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/METADATA +14 -13
  2. arkindex_base_worker-0.3.7.dist-info/RECORD +47 -0
  3. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/top_level.txt +2 -0
  5. arkindex_worker/cache.py +14 -0
  6. arkindex_worker/image.py +29 -19
  7. arkindex_worker/models.py +14 -2
  8. arkindex_worker/utils.py +17 -3
  9. arkindex_worker/worker/__init__.py +122 -125
  10. arkindex_worker/worker/base.py +24 -24
  11. arkindex_worker/worker/classification.py +18 -25
  12. arkindex_worker/worker/dataset.py +24 -18
  13. arkindex_worker/worker/element.py +45 -6
  14. arkindex_worker/worker/entity.py +35 -4
  15. arkindex_worker/worker/metadata.py +21 -11
  16. arkindex_worker/worker/training.py +13 -0
  17. arkindex_worker/worker/transcription.py +45 -5
  18. arkindex_worker/worker/version.py +22 -0
  19. hooks/pre_gen_project.py +3 -0
  20. tests/conftest.py +14 -6
  21. tests/test_base_worker.py +0 -6
  22. tests/test_dataset_worker.py +291 -409
  23. tests/test_elements_worker/test_classifications.py +365 -539
  24. tests/test_elements_worker/test_cli.py +1 -1
  25. tests/test_elements_worker/test_dataset.py +97 -116
  26. tests/test_elements_worker/test_elements.py +227 -61
  27. tests/test_elements_worker/test_entities.py +22 -2
  28. tests/test_elements_worker/test_metadata.py +53 -27
  29. tests/test_elements_worker/test_training.py +35 -0
  30. tests/test_elements_worker/test_transcriptions.py +149 -16
  31. tests/test_elements_worker/test_worker.py +19 -6
  32. tests/test_image.py +37 -0
  33. tests/test_utils.py +23 -1
  34. worker-demo/tests/__init__.py +0 -0
  35. worker-demo/tests/conftest.py +32 -0
  36. worker-demo/tests/test_worker.py +12 -0
  37. worker-demo/worker_demo/__init__.py +6 -0
  38. worker-demo/worker_demo/worker.py +19 -0
  39. arkindex_base_worker-0.3.6rc5.dist-info/RECORD +0 -41
  40. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/LICENSE +0 -0
{arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.3.6rc5
+Version: 0.3.7
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,22 +41,23 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: arkindex-client ==1.0.14
-Requires-Dist: peewee ==3.17.0
-Requires-Dist: Pillow ==10.1.0
-Requires-Dist: pymdown-extensions ==10.3.1
-Requires-Dist: python-gnupg ==0.5.1
-Requires-Dist: shapely ==2.0.2
-Requires-Dist: tenacity ==8.2.3
+Requires-Dist: peewee ==3.17.1
+Requires-Dist: Pillow ==10.3.0
+Requires-Dist: pymdown-extensions ==10.7.1
+Requires-Dist: python-gnupg ==0.5.2
+Requires-Dist: shapely ==2.0.3
+Requires-Dist: teklia-toolbox ==0.1.4
 Requires-Dist: zstandard ==0.22.0
 Provides-Extra: docs
-Requires-Dist: black ==23.11.0 ; extra == 'docs'
+Requires-Dist: black ==24.4.0 ; extra == 'docs'
 Requires-Dist: doc8 ==1.1.1 ; extra == 'docs'
-Requires-Dist: mkdocs ==1.5.3 ; extra == 'docs'
-Requires-Dist: mkdocs-material ==9.4.8 ; extra == 'docs'
-Requires-Dist: mkdocstrings ==0.23.0 ; extra == 'docs'
-Requires-Dist: mkdocstrings-python ==1.7.3 ; extra == 'docs'
+Requires-Dist: mkdocs-material ==9.5.17 ; extra == 'docs'
+Requires-Dist: mkdocstrings-python ==1.9.2 ; extra == 'docs'
 Requires-Dist: recommonmark ==0.7.1 ; extra == 'docs'
+Provides-Extra: tests
+Requires-Dist: pytest ==8.1.1 ; extra == 'tests'
+Requires-Dist: pytest-mock ==3.14.0 ; extra == 'tests'
+Requires-Dist: pytest-responses ==0.5.1 ; extra == 'tests'
 
 # Arkindex base Worker
 
arkindex_base_worker-0.3.7.dist-info/RECORD ADDED
@@ -0,0 +1,47 @@
+arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
+arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
+arkindex_worker/image.py,sha256=5ymIGaTm2D7Sp2YYQkbuheuGnx5VJo0_AzYAEIvNGhs,14267
+arkindex_worker/models.py,sha256=xSvOadkNg3rgccic1xLgonzP28ugzmcGw0IUqXn51Cc,9844
+arkindex_worker/utils.py,sha256=0Mu7Fa8DVcHn19pg-FIXqMDpfgzQkb7QR9IAlAi-x_k,7243
+arkindex_worker/worker/__init__.py,sha256=U-_zOrQ09xmpBF9SmrTVj_UwnsCjFueV5G2hJAFEwv0,18806
+arkindex_worker/worker/base.py,sha256=qtkCGfpGn7SWsQZRJ5cpW0gQ4tV_cyR_AHbuHZr53z4,19585
+arkindex_worker/worker/classification.py,sha256=JVz-6YEeuavOy7zGfQi4nE_wpj9hwMUZDXTem-hXQY8,10328
+arkindex_worker/worker/dataset.py,sha256=roX2IMMNA-icteTtRADiFSZiZSRPClqS62ZPJm9s2JI,2923
+arkindex_worker/worker/element.py,sha256=AWK3YJSHWy3j4ajntJloi_2X4zxsgXZ6c6dzphgq3OI,33848
+arkindex_worker/worker/entity.py,sha256=suhycfikC9oTPEWmX48_cnvFEw-Wu5zBA8n_00K4KUk,14714
+arkindex_worker/worker/metadata.py,sha256=Bouuc_JaXogKykVXOTKDVP3tX--OUQeHoazxIGrGrJI,6702
+arkindex_worker/worker/task.py,sha256=cz3wJNPgogZv1lm_3lm7WScitQtYQtL6H6I7Xokq208,1475
+arkindex_worker/worker/training.py,sha256=YYnLNi4lsB0fEDj8Xh73z2Amt1LIfPdpuGzagOEtgDE,10648
+arkindex_worker/worker/transcription.py,sha256=6R7ofcGnNqX4rjT0kRKIE-G9FHq2TJ1tfztNM5sTqYE,20464
+arkindex_worker/worker/version.py,sha256=cs2pdlDxpKRO2Oldvcu54w-D_DQhf1cdeEt4tKX_QYs,1927
+hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/conftest.py,sha256=Oi5SJic4TNwDj8Pm0WHgg657yB7_JKxbLC0HYPI3RUc,22134
+tests/test_base_worker.py,sha256=Uq6_MpLW23gmKFXkU-SyDUaA_4dlViLBGG4e3gpBBz0,24512
+tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
+tests/test_dataset_worker.py,sha256=1joFRFmkL6XfPL9y1NYB_5QO-5FF56rwigAHrqtJMMA,23848
+tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
+tests/test_image.py,sha256=FZv8njLxh45sVgmY71UFHt0lv1cHr0cK4rrtPhQleX8,16262
+tests/test_merge.py,sha256=Q4zCbtZbe0wBfqE56gvAD06c6pDuhqnjKaioFqIgAQw,8331
+tests/test_utils.py,sha256=vpeHMeL7bJQonv5ZEbJmlJikqVKn5VWlVEbvmYFzDYA,1650
+tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
+tests/test_elements_worker/test_classifications.py,sha256=vU6al1THtDSmERyVscMXaqiRPwTllcpRUHyeyBQ8M9U,26417
+tests/test_elements_worker/test_cli.py,sha256=BsFTswLti63WAZ2pf6ipiZKWJJyCQuSfuKnSlESuK8g,2878
+tests/test_elements_worker/test_dataset.py,sha256=hityecntzrldkuBHBWApYDkXSzSySdG3AZXJlM_sCOM,11777
+tests/test_elements_worker/test_elements.py,sha256=6XKtgXSVQJnTSgTHWwEVsAtIwLBapjYjUYPUdjxcHsY,84971
+tests/test_elements_worker/test_entities.py,sha256=yi1mXzvKvNwUNMzo0UZ56YOIJstYHcLyeepPJ8f10MQ,34557
+tests/test_elements_worker/test_metadata.py,sha256=YMYmkUSEp4WKNBm3QLcrg4yn6qVTWQ_aZzSu9Xygr80,18756
+tests/test_elements_worker/test_task.py,sha256=FCpxE9UpouKXgjGvWgNHEai_Hiy2d1YmqRG-_v2s27s,6312
+tests/test_elements_worker/test_training.py,sha256=3PGH6dAc2eSBD7w6ivrt1yAh6sCoici4nuIS9zdw6S8,9476
+tests/test_elements_worker/test_transcriptions.py,sha256=WVJG26sZyY66fu-Eka9A1_WWIeNI2scogjypzURnp8A,73468
+tests/test_elements_worker/test_worker.py,sha256=7-jGJVT3yMGpIyN96Uafz5eIUrO4ieNLgw0k1D8BhGc,17163
+worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
+worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
+worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
+worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
+arkindex_base_worker-0.3.7.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.3.7.dist-info/METADATA,sha256=AH2_i5Ne_vAPAYdQhlFhJQogSzDuLFtxueFsDMpkbMw,3458
+arkindex_base_worker-0.3.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+arkindex_base_worker-0.3.7.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
+arkindex_base_worker-0.3.7.dist-info/RECORD,,
{arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
{arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.dist-info}/top_level.txt CHANGED
@@ -1,2 +1,4 @@
 arkindex_worker
+hooks
 tests
+worker-demo
arkindex_worker/cache.py CHANGED
@@ -374,3 +374,17 @@ def merge_parents_cache(paths: list, current_database: Path):
     for statement in statements:
         cursor.execute(statement)
     connection.commit()
+
+
+def unsupported_cache(func):
+    def wrapper(self, *args, **kwargs):
+        results = func(self, *args, **kwargs)
+
+        if not (self.is_read_only or self.use_cache):
+            logger.warning(
+                f"This API helper `{func.__name__}` did not update the cache database"
+            )
+
+        return results
+
+    return wrapper
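
The new `unsupported_cache` decorator only wraps an API helper and logs a warning that the helper does not write to the local cache database. A minimal sketch of how a worker mixin might use it (the `CustomMixin` class and the `DoSomething` endpoint below are hypothetical, not part of this release):

    from arkindex_worker.cache import unsupported_cache

    class CustomMixin:
        @unsupported_cache
        def do_something(self, element_id: str) -> dict:
            # The API call goes through unchanged; the decorator only emits a
            # warning that this helper did not update the cache database.
            return self.request("DoSomething", id=element_id)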
arkindex_worker/image.py CHANGED
@@ -1,6 +1,7 @@
 """
 Helper methods to download and open IIIF images, and manage polygons.
 """
+
 import re
 from collections import namedtuple
 from io import BytesIO
@@ -20,6 +21,7 @@ from tenacity import (
 )
 
 from arkindex_worker import logger
+from teklia_toolbox.requests import should_verify_cert
 
 # Avoid circular imports error when type checking
 if TYPE_CHECKING:
@@ -114,32 +116,38 @@ def download_image(url: str) -> Image:
             )
         else:
             raise e
-    except requests.exceptions.SSLError:
-        logger.warning(
-            "An SSLError occurred during image download, retrying with a weaker and unsafe SSL configuration"
-        )
-
-        # Saving current ciphers
-        previous_ciphers = requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS
-
-        # Downgrading ciphers to download the image
-        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL:@SECLEVEL=1"
-        resp = _retried_request(url)
-
-        # Restoring previous ciphers
-        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = previous_ciphers
 
     # Preprocess the image and prepare it for classification
     image = Image.open(BytesIO(resp.content))
     logger.info(
-        "Downloaded image {} - size={}x{} in {}".format(
-            url, image.size[0], image.size[1], resp.elapsed
-        )
+        f"Downloaded image {url} - size={image.size[0]}x{image.size[1]} in {resp.elapsed}"
     )
 
     return image
 
 
+def upload_image(image: Image, url: str) -> requests.Response:
+    """
+    Upload a Pillow image to a URL.
+
+    :param image: Pillow image to upload.
+    :param url: Destination URL.
+    :returns: The upload response.
+    """
+    assert url.startswith("http"), "Destination URL for the image must be HTTP(S)"
+
+    # Retrieve a binarized version of the image
+    image_bytes = BytesIO()
+    image.save(image_bytes, format="jpeg")
+    image_bytes.seek(0)
+
+    # Upload the image
+    resp = _retried_request(url, method=requests.put, data=image_bytes)
+    logger.info(f"Uploaded image to {url} in {resp.elapsed}")
+
+    return resp
+
+
 def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
     """
     Compute the rectangle bounding box of a polygon.
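
The new `upload_image` helper and the reworked `_retried_request` (shown in the next hunk) share the same retry, timeout and certificate-verification logic, with the HTTP method now passed as a keyword. A minimal usage sketch, with placeholder URLs:

    from arkindex_worker.image import download_image, upload_image

    # Download an image with retries and certificate checks
    image = download_image("https://iiif.example.com/image/full/full/0/default.jpg")

    # Re-encode it as JPEG and PUT it to a destination URL
    response = upload_image(image, "https://storage.example.com/images/output.jpg")
    print(response.status_code)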
@@ -167,8 +175,10 @@ def _retry_log(retry_state, *args, **kwargs):
     before_sleep=_retry_log,
     reraise=True,
 )
-def _retried_request(url):
-    resp = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
+def _retried_request(url, *args, method=requests.get, **kwargs):
+    resp = method(
+        url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
+    )
     resp.raise_for_status()
     return resp
 
arkindex_worker/models.py CHANGED
@@ -20,6 +20,8 @@ class MagicDict(dict):
         Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
         Allows for nested access: foo.bar.baz
         """
+        if isinstance(item, Dataset):
+            return item
         if isinstance(item, list):
             return list(map(self._magify, item))
         if isinstance(item, dict):
@@ -75,10 +77,10 @@ class Element(MagicDict):
 
     def image_url(self, size: str = "full") -> str | None:
         """
-        Build an URL to access the image.
+        Build a URL to access the image.
         When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
         :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
-        :returns: An URL to the image, or None if the element does not have an image.
+        :returns: A URL to the image, or None if the element does not have an image.
         """
         if not self.get("zone"):
             return
@@ -272,6 +274,16 @@ class Dataset(ArkindexModel):
         return f"{self.id}.tar.zst"
 
 
+class Set(MagicDict):
+    """
+    Describes an Arkindex dataset set.
+    """
+
+    def __str__(self):
+        # Not using ArkindexModel.__str__ as we do not retrieve the Set ID
+        return f"{self.__class__.__name__} ({self.name}) from {self.dataset}"
+
+
 class Artifact(ArkindexModel):
     """
     Describes an Arkindex artifact.
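
The new `Set` model is a plain `MagicDict` pairing a set name with its parent `Dataset`; its `__str__` is what appears in worker logs, and `MagicDict._magify` now leaves nested `Dataset` instances untouched. A small illustration with made-up values:

    from arkindex_worker.models import Dataset, Set

    dataset = Dataset({"id": "12341234-1234-1234-1234-123412341234", "name": "My dataset"})
    train_set = Set(name="train", dataset=dataset)

    # Prints something like: Set (train) from Dataset (...)
    print(train_set)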
arkindex_worker/utils.py CHANGED
@@ -10,6 +10,19 @@ import zstandard as zstd
 
 logger = logging.getLogger(__name__)
 
+MANUAL_SOURCE = "manual"
+
+
+def parse_source_id(value: str) -> bool | str | None:
+    """
+    Parse a UUID argument (Worker Version, Worker Run, ...) to use it directly in the API.
+    Arkindex API filters generally expect `False` to filter manual sources.
+    """
+    if value == MANUAL_SOURCE:
+        return False
+    return value or None
+
+
 CHUNK_SIZE = 1024
 """Chunk Size used for ZSTD compression"""
 
@@ -31,9 +44,10 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
 
     logger.debug(f"Uncompressing file to {archive_path}")
     try:
-        with compressed_archive.open("rb") as compressed, archive_path.open(
-            "wb"
-        ) as decompressed:
+        with (
+            compressed_archive.open("rb") as compressed,
+            archive_path.open("wb") as decompressed,
+        ):
             dctx.copy_stream(compressed, decompressed)
         logger.debug(f"Successfully uncompressed archive {compressed_archive}")
     except zstandard.ZstdError as e:
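
`parse_source_id` maps the special `manual` source to the `False` value expected by Arkindex API filters, empty values to `None`, and passes any other identifier through unchanged:

    from arkindex_worker.utils import parse_source_id

    assert parse_source_id("manual") is False
    assert parse_source_id("") is None
    assert parse_source_id("12341234-1234-1234-1234-123412341234") == (
        "12341234-1234-1234-1234-123412341234"
    )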
arkindex_worker/worker/__init__.py CHANGED
@@ -1,31 +1,31 @@
 """
 Base classes to implement Arkindex workers.
 """
+
 import contextlib
 import json
 import os
 import sys
 import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterable, Iterator
 from enum import Enum
-from itertools import groupby
-from operator import itemgetter
 from pathlib import Path
 
 from apistar.exceptions import ErrorResponse
 
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
-from arkindex_worker.worker.entity import EntityMixin  # noqa: F401
+from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
-from arkindex_worker.worker.version import WorkerVersionMixin  # noqa: F401
+from arkindex_worker.worker.version import WorkerVersionMixin
 
 
 class ActivityState(Enum):
@@ -159,6 +159,16 @@ class ElementsWorker(
         super().configure()
         super().configure_cache()
 
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
@@ -229,12 +239,13 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)
 
+        message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
-            logger.error(
-                f"Ran on {count} elements: {count - failed} completed, {failed} failed"
-            )
+            logger.error(message)
             if failed >= count:  # Everything failed!
                 sys.exit(1)
+        else:
+            logger.info(message)
 
     def process_element(self, element: Element | CachedElement):
         """
@@ -299,6 +310,21 @@ class ElementsWorker(
         return True
 
 
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class MissingDatasetArchive(Exception):
     """
     Exception raised when the compressed archive associated to
@@ -308,7 +334,7 @@ class MissingDatasetArchive(Exception):
 
 class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
     """
-    Base class for ML workers that operate on Arkindex datasets.
+    Base class for ML workers that operate on Arkindex dataset sets.
 
     This class inherits from numerous mixin classes found in other modules of
     ``arkindex.worker``, which provide helpers to read and write to the Arkindex API.
@@ -318,24 +344,28 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         self,
         description: str = "Arkindex Dataset Worker",
         support_cache: bool = False,
-        generator: bool = False,
     ):
         """
         :param description: The worker's description.
         :param support_cache: Whether the worker supports cache.
-        :param generator: Whether the worker generates the dataset archive artifact.
         """
         super().__init__(description, support_cache)
 
+        # Path to the dataset compressed archive (containing images and a SQLite database)
+        # Set as an instance variable as dataset workers might use it to easily extract its content
+        self.downloaded_dataset_artifact: Path | None = None
+
         self.parser.add_argument(
-            "--dataset",
-            type=uuid.UUID,
+            "--set",
+            type=check_dataset_set,
             nargs="+",
-            help="One or more Arkindex dataset ID",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
         )
 
-        self.generator = generator
-
     def configure(self):
         """
         Setup the worker using CLI arguments and environment variables.
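
`check_dataset_set` is the argparse `type` behind the new `--set` option, so every value must be a `<dataset_id>:<set_name>` pair with a valid UUID. A quick sketch of the expected behaviour (the UUID is a placeholder):

    import uuid
    from argparse import ArgumentTypeError

    from arkindex_worker.worker import check_dataset_set

    dataset_id, set_name = check_dataset_set("12341234-1234-1234-1234-123412341234:train")
    assert isinstance(dataset_id, uuid.UUID) and set_name == "train"

    try:
        check_dataset_set("not-a-uuid:train")
    except ArgumentTypeError as error:
        print(error)  # 'not-a-uuid' should be a valid UUID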
@@ -349,163 +379,130 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         super().configure()
         super().configure_cache()
 
-    def download_dataset_artifact(self, dataset: Dataset) -> Path:
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
+    def cleanup_downloaded_artifact(self) -> None:
+        """
+        Cleanup the downloaded dataset artifact if any
+        """
+        if not self.downloaded_dataset_artifact:
+            return
+
+        self.downloaded_dataset_artifact.unlink(missing_ok=True)
+
+    def download_dataset_artifact(self, dataset: Dataset) -> None:
         """
         Find and download the compressed archive artifact describing a dataset using
         the [list_artifacts][arkindex_worker.worker.task.TaskMixin.list_artifacts] and
         [download_artifact][arkindex_worker.worker.task.TaskMixin.download_artifact] methods.
 
         :param dataset: The dataset to retrieve the compressed archive artifact for.
-        :returns: A path to the downloaded artifact.
         :raises MissingDatasetArchive: When the dataset artifact is not found.
         """
+        extra_dir = self.find_extras_directory()
+        archive = extra_dir / dataset.filepath
+        if archive.exists():
+            return
 
-        task_id = uuid.UUID(dataset.task_id)
+        # Cleanup the dataset artifact that was downloaded previously
+        self.cleanup_downloaded_artifact()
 
+        logger.info(f"Downloading artifact for {dataset}")
+        task_id = uuid.UUID(dataset.task_id)
         for artifact in self.list_artifacts(task_id):
             if artifact.path != dataset.filepath:
                 continue
 
-            extra_dir = self.find_extras_directory()
-            archive = extra_dir / dataset.filepath
             archive.write_bytes(self.download_artifact(task_id, artifact).read())
-            return archive
+            self.downloaded_dataset_artifact = archive
+            return
 
         raise MissingDatasetArchive(
             "The dataset compressed archive artifact was not found."
         )
 
-    def list_dataset_elements_per_split(
-        self, dataset: Dataset
-    ) -> Iterator[tuple[str, list[Element]]]:
-        """
-        List the elements in the dataset, grouped by split, using the
-        [list_dataset_elements][arkindex_worker.worker.dataset.DatasetMixin.list_dataset_elements] method.
-
-        :param dataset: The dataset to retrieve elements from.
-        :returns: An iterator of tuples containing the split name and the list of its elements.
-        """
-
-        def format_split(
-            split: tuple[str, Iterator[tuple[str, Element]]],
-        ) -> tuple[str, list[Element]]:
-            return (split[0], list(map(itemgetter(1), list(split[1]))))
-
-        return map(
-            format_split,
-            groupby(
-                sorted(self.list_dataset_elements(dataset), key=itemgetter(0)),
-                key=itemgetter(0),
-            ),
-        )
-
-    def process_dataset(self, dataset: Dataset):
+    def process_set(self, set: Set):
         """
-        Override this method to implement your worker and process a single Arkindex dataset at once.
+        Override this method to implement your worker and process a single Arkindex dataset set at once.
 
-        :param dataset: The dataset to process.
+        :param set: The set to process.
         """
 
-    def list_datasets(self) -> Iterator[Dataset] | Iterator[str]:
+    def list_sets(self) -> Iterator[Set]:
         """
-        List the datasets to be processed, either from the CLI arguments or using the
-        [list_process_datasets][arkindex_worker.worker.dataset.DatasetMixin.list_process_datasets] method.
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
 
-        :returns: An iterator of strings if the worker is in read-only mode,
-        else an iterator of ``Dataset`` objects.
+        :returns: An iterator of ``Set`` objects.
         """
-        if self.is_read_only:
-            return map(str, self.args.dataset)
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information is not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.request("RetrieveDataset", id=dataset_id)
+                )
 
-        return self.list_process_datasets()
+            yield Set(name=set_name, dataset=datasets[dataset_id])
 
     def run(self):
         """
-        Implements an Arkindex worker that goes through each dataset returned by
-        [list_datasets][arkindex_worker.worker.DatasetWorker.list_datasets].
+        Implements an Arkindex worker that goes through each dataset set returned by
+        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
 
-        It calls [process_dataset][arkindex_worker.worker.DatasetWorker.process_dataset],
-        catching exceptions, and handles updating the [DatasetState][arkindex_worker.worker.dataset.DatasetState]
-        when the worker is a generator.
+        It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
+        catching exceptions.
         """
         self.configure()
 
-        datasets: list[Dataset] | list[str] = list(self.list_datasets())
-        if not datasets:
-            logger.warning("No datasets to process, stopping.")
+        dataset_sets: list[Set] = list(self.list_sets())
+        if not dataset_sets:
+            logger.warning("No sets to process, stopping.")
             sys.exit(1)
 
-        # Process every dataset
-        count = len(datasets)
+        # Process every set
+        count = len(dataset_sets)
         failed = 0
-        for i, item in enumerate(datasets, start=1):
-            dataset = None
-            dataset_artifact = None
-
+        for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-                if not self.is_read_only:
-                    # Just use the result of list_datasets as the dataset
-                    dataset = item
-                else:
-                    # Load dataset using the Arkindex API
-                    dataset = Dataset(**self.request("RetrieveDataset", id=item))
-
-                if self.generator:
-                    assert (
-                        dataset.state == DatasetState.Open.value
-                    ), "When generating a new dataset, its state should be Open."
-                else:
-                    assert (
-                        dataset.state == DatasetState.Complete.value
-                    ), "When processing an existing dataset, its state should be Complete."
-
-                logger.info(f"Processing {dataset} ({i}/{count})")
-
-                if self.generator:
-                    # Update the dataset state to Building
-                    logger.info(f"Building {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Building)
-                else:
-                    logger.info(f"Downloading data for {dataset} ({i}/{count})")
-                    dataset_artifact = self.download_dataset_artifact(dataset)
+                assert (
+                    dataset_set.dataset.state == DatasetState.Complete.value
+                ), "When processing a set, its dataset state should be Complete."
 
-                # Process the dataset
-                self.process_dataset(dataset)
+                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                self.download_dataset_artifact(dataset_set.dataset)
 
-                if self.generator:
-                    # Update the dataset state to Complete
-                    logger.info(f"Completed {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Complete)
+                logger.info(f"Processing {dataset_set} ({i}/{count})")
+                self.process_set(dataset_set)
             except Exception as e:
-                # Handle errors occurring while retrieving, processing or patching the state for this dataset.
+                # Handle errors occurring while retrieving or processing this dataset set
                 failed += 1
 
-                # Handle the case where we failed retrieving the dataset
-                dataset_id = dataset.id if dataset else item
-
                 if isinstance(e, ErrorResponse):
-                    message = f"An API error occurred while processing dataset {dataset_id}: {e.title} - {e.content}"
+                    message = f"An API error occurred while processing {dataset_set}: {e.title} - {e.content}"
                 else:
-                    message = (
-                        f"Failed running worker on dataset {dataset_id}: {repr(e)}"
-                    )
+                    message = f"Failed running worker on {dataset_set}: {repr(e)}"
 
-                logger.warning(
-                    message,
-                    exc_info=e if self.args.verbose else None,
-                )
-                if dataset and self.generator:
-                    # Try to update the state to Error regardless of the response
-                    with contextlib.suppress(Exception):
-                        self.update_dataset_state(dataset, DatasetState.Error)
-            finally:
-                # Cleanup the dataset artifact if it was downloaded, no matter what
-                if dataset_artifact:
-                    dataset_artifact.unlink(missing_ok=True)
+                logger.warning(message, exc_info=e if self.args.verbose else None)
+
+        # Cleanup the latest downloaded dataset artifact
+        self.cleanup_downloaded_artifact()
 
+        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
-            logger.error(
-                f"Ran on {count} datasets: {count - failed} completed, {failed} failed"
-            )
+            logger.error(message)
             if failed >= count:  # Everything failed!
                 sys.exit(1)
+        else:
+            logger.info(message)
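
Taken together, these changes mean a dataset worker now overrides `process_set` instead of `process_dataset` and is pointed at dataset sets rather than whole datasets. A hedged sketch of what a minimal 0.3.7-style worker could look like (the `MySetWorker` class and its body are illustrative only):

    from arkindex_worker import logger
    from arkindex_worker.models import Set
    from arkindex_worker.worker import DatasetWorker


    class MySetWorker(DatasetWorker):
        def process_set(self, set: Set):
            # run() has already downloaded the dataset archive; its path is
            # available as self.downloaded_dataset_artifact
            logger.info(f"Working on {set} using {self.downloaded_dataset_artifact}")


    if __name__ == "__main__":
        # Typical invocation (placeholder UUID):
        #   python worker.py --set 12341234-1234-1234-1234-123412341234:train
        MySetWorker(description="Demo set worker").run()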