arkindex-base-worker 0.3.7rc3__tar.gz → 0.3.7rc5__tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (49)
  1. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/PKG-INFO +1 -1
  2. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_base_worker.egg-info/PKG-INFO +1 -1
  3. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/image.py +26 -19
  4. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/models.py +2 -2
  5. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/utils.py +4 -3
  6. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/__init__.py +9 -6
  7. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/base.py +1 -0
  8. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/classification.py +18 -18
  9. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/dataset.py +14 -8
  10. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/element.py +1 -0
  11. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/metadata.py +1 -1
  12. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/version.py +1 -0
  13. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/pyproject.toml +5 -3
  14. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_dataset_worker.py +59 -105
  15. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_classifications.py +365 -539
  16. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_dataset.py +97 -103
  17. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_elements.py +26 -14
  18. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_transcriptions.py +15 -8
  19. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_worker.py +5 -4
  20. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_image.py +37 -0
  21. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/LICENSE +0 -0
  22. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/README.md +0 -0
  23. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
  24. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  25. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_base_worker.egg-info/requires.txt +0 -0
  26. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  27. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/__init__.py +0 -0
  28. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/cache.py +0 -0
  29. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/entity.py +0 -0
  30. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/task.py +0 -0
  31. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/training.py +0 -0
  32. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/arkindex_worker/worker/transcription.py +0 -0
  33. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/docs-requirements.txt +0 -0
  34. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/requirements.txt +0 -0
  35. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/setup.cfg +0 -0
  36. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/setup.py +0 -0
  37. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/__init__.py +0 -0
  38. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/conftest.py +0 -0
  39. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_base_worker.py +0 -0
  40. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_cache.py +0 -0
  41. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_element.py +0 -0
  42. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/__init__.py +0 -0
  43. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_cli.py +0 -0
  44. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_entities.py +0 -0
  45. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_metadata.py +0 -0
  46. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_task.py +0 -0
  47. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_elements_worker/test_training.py +0 -0
  48. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_merge.py +0 -0
  49. {arkindex-base-worker-0.3.7rc3 → arkindex-base-worker-0.3.7rc5}/tests/test_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: arkindex-base-worker
- Version: 0.3.7rc3
+ Version: 0.3.7rc5
  Summary: Base Worker to easily build Arkindex ML workflows
  Author-email: Teklia <contact@teklia.com>
  Maintainer-email: Teklia <contact@teklia.com>
arkindex_base_worker.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: arkindex-base-worker
- Version: 0.3.7rc3
+ Version: 0.3.7rc5
  Summary: Base Worker to easily build Arkindex ML workflows
  Author-email: Teklia <contact@teklia.com>
  Maintainer-email: Teklia <contact@teklia.com>
arkindex_worker/image.py
@@ -1,6 +1,7 @@
  """
  Helper methods to download and open IIIF images, and manage polygons.
  """
+
  import re
  from collections import namedtuple
  from io import BytesIO
@@ -114,32 +115,38 @@ def download_image(url: str) -> Image:
              )
          else:
              raise e
-     except requests.exceptions.SSLError:
-         logger.warning(
-             "An SSLError occurred during image download, retrying with a weaker and unsafe SSL configuration"
-         )
-
-         # Saving current ciphers
-         previous_ciphers = requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS
-
-         # Downgrading ciphers to download the image
-         requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL:@SECLEVEL=1"
-         resp = _retried_request(url)
-
-         # Restoring previous ciphers
-         requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = previous_ciphers

      # Preprocess the image and prepare it for classification
      image = Image.open(BytesIO(resp.content))
      logger.info(
-         "Downloaded image {} - size={}x{} in {}".format(
-             url, image.size[0], image.size[1], resp.elapsed
-         )
+         f"Downloaded image {url} - size={image.size[0]}x{image.size[1]} in {resp.elapsed}"
      )

      return image


+ def upload_image(image: Image, url: str) -> requests.Response:
+     """
+     Upload a Pillow image to a URL.
+
+     :param image: Pillow image to upload.
+     :param url: Destination URL.
+     :returns: The upload response.
+     """
+     assert url.startswith("http"), "Destination URL for the image must be HTTP(S)"
+
+     # Retrieve a binarized version of the image
+     image_bytes = BytesIO()
+     image.save(image_bytes, format="jpeg")
+     image_bytes.seek(0)
+
+     # Upload the image
+     resp = _retried_request(url, method=requests.put, data=image_bytes)
+     logger.info(f"Uploaded image to {url} in {resp.elapsed}")
+
+     return resp
+
+
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
      """
      Compute the rectangle bounding box of a polygon.
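The two image.py changes above drop the unsafe SSL cipher-downgrade fallback and add upload_image. As a minimal round-trip sketch (the URLs below are placeholders, not real endpoints; note that upload_image always re-encodes the image as JPEG before the PUT):

    from arkindex_worker.image import download_image, upload_image

    # Hypothetical URLs, for illustration only
    iiif_url = "https://iiif.example.com/image/full/full/0/default.jpg"
    destination_url = "https://storage.example.com/processed.jpg"

    image = download_image(iiif_url)  # retried GET, returns a Pillow image
    image = image.convert("L")  # any intermediate processing step
    resp = upload_image(image, destination_url)  # JPEG-encoded, then PUT
    assert resp.ok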
@@ -167,8 +174,8 @@ def _retry_log(retry_state, *args, **kwargs):
      before_sleep=_retry_log,
      reraise=True,
  )
- def _retried_request(url):
-     resp = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
+ def _retried_request(url, *args, method=requests.get, **kwargs):
+     resp = method(url, *args, timeout=DOWNLOAD_TIMEOUT, **kwargs)
      resp.raise_for_status()
      return resp

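The helper keeps a retried GET as its default, but now accepts any requests method plus extra keyword arguments, which is exactly how upload_image issues its PUT above. A sketch of both call shapes (_retried_request is module-internal, and the URLs are placeholders):

    import requests
    from io import BytesIO

    from arkindex_worker.image import _retried_request  # private helper

    # Unchanged default behaviour: a retried GET
    resp = _retried_request("https://example.com/image.jpg")

    # New: any requests method, with its keyword arguments passed through
    resp = _retried_request(
        "https://example.com/upload", method=requests.put, data=BytesIO(b"...")
    )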
arkindex_worker/models.py
@@ -75,10 +75,10 @@ class Element(MagicDict):

      def image_url(self, size: str = "full") -> str | None:
          """
-         Build an URL to access the image.
+         Build a URL to access the image.
          When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
          :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
-         :returns: An URL to the image, or None if the element does not have an image.
+         :returns: A URL to the image, or None if the element does not have an image.
          """
          if not self.get("zone"):
              return
arkindex_worker/utils.py
@@ -31,9 +31,10 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:

      logger.debug(f"Uncompressing file to {archive_path}")
      try:
-         with compressed_archive.open("rb") as compressed, archive_path.open(
-             "wb"
-         ) as decompressed:
+         with (
+             compressed_archive.open("rb") as compressed,
+             archive_path.open("wb") as decompressed,
+         ):
              dctx.copy_stream(compressed, decompressed)
          logger.debug(f"Successfully uncompressed archive {compressed_archive}")
      except zstandard.ZstdError as e:
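The rewritten with statement uses parenthesized context managers, a syntax only officially supported from Python 3.10 onwards; behaviour is unchanged. A standalone sketch of the same pattern, with placeholder paths:

    from pathlib import Path

    src, dst = Path("archive.zst"), Path("archive.tar")  # placeholders
    with (
        src.open("rb") as compressed,
        dst.open("wb") as decompressed,
    ):
        decompressed.write(compressed.read())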
arkindex_worker/worker/__init__.py
@@ -1,6 +1,7 @@
  """
  Base classes to implement Arkindex workers.
  """
+
  import contextlib
  import json
  import os
@@ -229,12 +230,13 @@ class ElementsWorker(
                  with contextlib.suppress(Exception):
                      self.update_activity(element.id, ActivityState.Error)

+         message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
          if failed:
-             logger.error(
-                 f"Ran on {count} elements: {count - failed} completed, {failed} failed"
-             )
+             logger.error(message)
              if failed >= count:  # Everything failed!
                  sys.exit(1)
+         else:
+             logger.info(message)

      def process_element(self, element: Element | CachedElement):
          """
@@ -504,9 +506,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
              if dataset_artifact:
                  dataset_artifact.unlink(missing_ok=True)

+         message = f'Ran on {count} dataset{"s"[:count>1]}: {count - failed} completed, {failed} failed'
          if failed:
-             logger.error(
-                 f"Ran on {count} datasets: {count - failed} completed, {failed} failed"
-             )
+             logger.error(message)
              if failed >= count:  # Everything failed!
                  sys.exit(1)
+         else:
+             logger.info(message)
arkindex_worker/worker/base.py
@@ -1,6 +1,7 @@
  """
  The base class for all Arkindex workers.
  """
+
  import argparse
  import json
  import logging
arkindex_worker/worker/classification.py
@@ -2,8 +2,6 @@
  ElementsWorker methods for classifications and ML classes.
  """

- from uuid import UUID
-
  from apistar.exceptions import ErrorResponse
  from peewee import IntegrityError

@@ -178,10 +176,14 @@ class ClassificationMixin:
          Create multiple classifications at once on the given element through the API.

          :param element: The element to create classifications on.
-         :param classifications: The classifications to create, a list of dicts. Each of them contains
-             a **ml_class_id** (str), the ID of the MLClass for this classification;
-             a **confidence** (float), the confidence score, between 0 and 1;
-             a **high_confidence** (bool), the high confidence state of the classification.
+         :param classifications: A list of dicts representing a classification each, with the following keys:
+
+             ml_class (str)
+                 Required. Name of the MLClass to use.
+             confidence (float)
+                 Required. Confidence score for the classification. Must be between 0 and 1.
+             high_confidence (bool)
+                 Optional. Whether or not the classification is of high confidence.

          :returns: List of created classifications, as returned in the ``classifications`` field by
              the ``CreateClassifications`` API endpoint.
@@ -194,18 +196,10 @@ class ClassificationMixin:
          ), "classifications shouldn't be null and should be of type list"

          for index, classification in enumerate(classifications):
-             ml_class_id = classification.get("ml_class_id")
+             ml_class = classification.get("ml_class")
              assert (
-                 ml_class_id and isinstance(ml_class_id, str)
-             ), f"Classification at index {index} in classifications: ml_class_id shouldn't be null and should be of type str"
-
-             # Make sure it's a valid UUID
-             try:
-                 UUID(ml_class_id)
-             except ValueError as e:
-                 raise ValueError(
-                     f"Classification at index {index} in classifications: ml_class_id is not a valid uuid."
-                 ) from e
+                 ml_class and isinstance(ml_class, str)
+             ), f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"

              confidence = classification.get("confidence")
              assert (
@@ -231,7 +225,13 @@ class ClassificationMixin:
              body={
                  "parent": str(element.id),
                  "worker_run_id": self.worker_run_id,
-                 "classifications": classifications,
+                 "classifications": [
+                     {
+                         **classification,
+                         "ml_class": self.get_ml_class_id(classification["ml_class"]),
+                     }
+                     for classification in classifications
+                 ],
              },
          )["classifications"]

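Callers now pass MLClass names rather than UUIDs, and the mixin resolves each name through get_ml_class_id before calling the CreateClassifications endpoint. A call sketch inside an ElementsWorker subclass; the class names here are made up for the example:

    created = self.create_classifications(
        element,  # comes from the worker's processing loop
        classifications=[
            {"ml_class": "table", "confidence": 0.95, "high_confidence": True},
            {"ml_class": "illustration", "confidence": 0.42},
        ],
    )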
arkindex_worker/worker/dataset.py
@@ -51,7 +51,7 @@ class DatasetMixin:

          return map(
              lambda result: Dataset(**result["dataset"], selected_sets=result["sets"]),
-             list(results),
+             results,
          )

      def list_dataset_elements(self, dataset: Dataset) -> Iterator[tuple[str, Element]]:
@@ -65,14 +65,20 @@ class DatasetMixin:

              dataset, Dataset
          ), "dataset shouldn't be null and should be a Dataset"
-         results = self.api_client.paginate("ListDatasetElements", id=dataset.id)
+         if dataset.sets == dataset.selected_sets:
+             results = self.api_client.paginate("ListDatasetElements", id=dataset.id)
+         else:
+             results = iter(
+                 element
+                 for selected_set in dataset.selected_sets
+                 for element in self.api_client.paginate(
+                     "ListDatasetElements", id=dataset.id, set=selected_set
+                 )
+             )

-         def format_result(result):
-             if result["set"] not in dataset.selected_sets:
-                 return
-             return (result["set"], Element(**result["element"]))
-
-         return filter(None, map(format_result, list(results)))
+         return map(
+             lambda result: (result["set"], Element(**result["element"])), results
+         )

      @unsupported_cache
      def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
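When every set of the dataset is selected, a single paginated call is kept; otherwise filtering moves server-side, with one paginated call per selected set. Consumption is unchanged, since the method still returns a lazy iterator of (set name, element) pairs; worker and dataset below are assumed to already exist:

    for set_name, element in worker.list_dataset_elements(dataset):
        print(set_name, element.id)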
arkindex_worker/worker/element.py
@@ -1,6 +1,7 @@
  """
  ElementsWorker methods for elements and element types.
  """
+
  from collections.abc import Iterable
  from typing import NamedTuple
  from uuid import UUID
arkindex_worker/worker/metadata.py
@@ -50,7 +50,7 @@ class MetaType(Enum):

      URL = "url"
      """
-     A metadata with a string value that should be interpreted as an URL.
+     A metadata with a string value that should be interpreted as a URL.
      Only the ``http`` and ``https`` schemes are allowed.
      """

arkindex_worker/worker/version.py
@@ -1,6 +1,7 @@
  """
  ElementsWorker methods for worker versions.
  """
+
  import functools
  from warnings import warn

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "arkindex-base-worker"
- version = "0.3.7rc3"
+ version = "0.3.7rc5"
  description = "Base Worker to easily build Arkindex ML workflows"
  license = { file = "LICENSE" }
  dynamic = ["dependencies", "optional-dependencies"]
@@ -41,6 +41,8 @@ optional-dependencies = { docs = { file = ["docs-requirements.txt"] } }

  [tool.ruff]
  exclude = [".git", "__pycache__"]
+
+ [tool.ruff.lint]
  ignore = ["E501"]
  select = [
      # pycodestyle
@@ -68,11 +70,11 @@ select = [
      "PTH",
  ]

- [tool.ruff.per-file-ignores]
+ [tool.ruff.lint.per-file-ignores]
  # Ignore `pytest-composite-assertion` rules of `flake8-pytest-style` linter for non-test files
  "arkindex_worker/**/*.py" = ["PT018"]

- [tool.ruff.isort]
+ [tool.ruff.lint.isort]
  known-first-party = ["arkindex", "arkindex_common", "arkindex_worker"]
  known-third-party = [
      "PIL",
tests/test_dataset_worker.py
@@ -195,7 +195,7 @@ def test_list_dataset_elements_per_split_api_error(
  ):
      responses.add(
          responses.GET,
-         f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
+         f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          status=500,
      )

@@ -211,23 +211,23 @@ def test_list_dataset_elements_per_split_api_error(
          # The API call is retried 5 times
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
      ]

@@ -235,110 +235,60 @@ def test_list_dataset_elements_per_split_api_error(

  def test_list_dataset_elements_per_split(
      responses, mock_dataset_worker, default_dataset
  ):
-     expected_results = [
-         {
-             "set": "set_1",
-             "element": {
-                 "id": "0000",
-                 "type": "page",
-                 "name": "Test",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         {
-             "set": "set_1",
-             "element": {
-                 "id": "1111",
-                 "type": "page",
-                 "name": "Test 2",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         {
-             "set": "set_2",
-             "element": {
-                 "id": "2222",
-                 "type": "page",
-                 "name": "Test 3",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         {
-             "set": "set_3",
-             "element": {
-                 "id": "3333",
-                 "type": "page",
-                 "name": "Test 4",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         # `set_4` is not in `default_dataset.selected_sets`
-         {
-             "set": "set_4",
-             "element": {
-                 "id": "4444",
-                 "type": "page",
-                 "name": "Test 5",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
+     expected_results = []
+     for selected_set in default_dataset.selected_sets:
+         index = selected_set[-1]
+         expected_results.append(
+             {
+                 "set": selected_set,
+                 "element": {
+                     "id": str(index) * 4,
+                     "type": "page",
+                     "name": f"Test {index}",
+                     "corpus": {},
+                     "thumbnail_url": None,
+                     "zone": {},
+                     "best_classes": None,
+                     "has_children": None,
+                     "worker_version_id": None,
+                     "worker_run_id": None,
+                 },
+             }
+         )
+         responses.add(
+             responses.GET,
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set={selected_set}&with_count=true",
+             status=200,
+             json={
+                 "count": 1,
+                 "next": None,
+                 "results": [expected_results[-1]],
              },
-         },
-     ]
-     responses.add(
-         responses.GET,
-         f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
-         status=200,
-         json={
-             "count": 4,
-             "next": None,
-             "results": expected_results,
-         },
-     )
+         )

      assert list(
          mock_dataset_worker.list_dataset_elements_per_split(default_dataset)
      ) == [
-         ("set_1", [expected_results[0]["element"], expected_results[1]["element"]]),
-         ("set_2", [expected_results[2]["element"]]),
-         ("set_3", [expected_results[3]["element"]]),
+         ("set_1", [expected_results[0]["element"]]),
+         ("set_2", [expected_results[1]["element"]]),
+         ("set_3", [expected_results[2]["element"]]),
      ]

-     assert len(responses.calls) == len(BASE_API_CALLS) + 1
+     assert len(responses.calls) == len(BASE_API_CALLS) + 3
      assert [
          (call.request.method, call.request.url) for call in responses.calls
      ] == BASE_API_CALLS + [
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
+         ),
+         (
+             "GET",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_2&with_count=true",
+         ),
+         (
+             "GET",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_3&with_count=true",
          ),
      ]

@@ -360,7 +310,7 @@ def test_list_datasets_api_error(responses, mock_dataset_worker):
      with pytest.raises(
          Exception, match="Stopping pagination as data will be incomplete"
      ):
-         mock_dataset_worker.list_datasets()
+         next(mock_dataset_worker.list_datasets())

      assert len(responses.calls) == len(BASE_API_CALLS) + 5
      assert [
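Since list_datasets now appears to return a lazy iterator (the map over paginated results is no longer forced with list(), per the dataset.py hunk above), no API call happens until the iterator is consumed, hence the added next(...). Schematically:

    datasets = mock_dataset_worker.list_datasets()  # no API call made yet
    next(datasets)  # first page fetched here, so pagination errors surface here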
@@ -512,7 +462,7 @@ def test_run_initial_dataset_state_error(
          if generator
          else []
      ) + [
-         (logging.ERROR, "Ran on 1 datasets: 0 completed, 1 failed"),
+         (logging.ERROR, "Ran on 1 dataset: 0 completed, 1 failed"),
      ]

@@ -577,7 +527,7 @@ def test_run_update_dataset_state_api_error(
          ],
          (
              logging.ERROR,
-             "Ran on 1 datasets: 0 completed, 1 failed",
+             "Ran on 1 dataset: 0 completed, 1 failed",
          ),
      ]

@@ -639,7 +589,7 @@ def test_run_download_dataset_artifact_api_error(
          ),
          (
              logging.ERROR,
-             "Ran on 1 datasets: 0 completed, 1 failed",
+             "Ran on 1 dataset: 0 completed, 1 failed",
          ),
      ]

@@ -690,7 +640,7 @@ def test_run_no_downloaded_artifact_error(
          ),
          (
              logging.ERROR,
-             "Ran on 1 datasets: 0 completed, 1 failed",
+             "Ran on 1 dataset: 0 completed, 1 failed",
          ),
      ]

@@ -792,7 +742,9 @@ def test_run(
      assert [(level, message) for _, level, message in caplog.record_tuples] == [
          (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
          (logging.INFO, "Processing Dataset (dataset_id) (1/1)"),
-     ] + extra_logs
+         *extra_logs,
+         (logging.INFO, "Ran on 1 dataset: 1 completed, 0 failed"),
+     ]


  @pytest.mark.parametrize(
@@ -890,4 +842,6 @@ def test_run_read_only(
      assert [(level, message) for _, level, message in caplog.record_tuples] == [
          (logging.WARNING, "Running without any extra configuration"),
          (logging.INFO, "Processing Dataset (dataset_id) (1/1)"),
-     ] + extra_logs
+         *extra_logs,
+         (logging.INFO, "Ran on 1 dataset: 1 completed, 0 failed"),
+     ]