PyPI - arkindex-base-worker - Versions diffs - 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +9 -12
arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
arkindex_worker/__init__.py +3 -0
arkindex_worker/cache.py +6 -25
arkindex_worker/image.py +105 -66
arkindex_worker/utils.py +2 -1
arkindex_worker/worker/__init__.py +17 -31
arkindex_worker/worker/base.py +16 -9
arkindex_worker/worker/classification.py +36 -34
arkindex_worker/worker/corpus.py +3 -3
arkindex_worker/worker/dataset.py +9 -9
arkindex_worker/worker/element.py +261 -231
arkindex_worker/worker/entity.py +137 -206
arkindex_worker/worker/image.py +3 -3
arkindex_worker/worker/metadata.py +27 -38
arkindex_worker/worker/task.py +9 -9
arkindex_worker/worker/training.py +15 -11
arkindex_worker/worker/transcription.py +77 -71
examples/standalone/python/worker.py +171 -0
examples/tooled/python/worker.py +50 -0
tests/conftest.py +22 -36
tests/test_base_worker.py +1 -1
tests/test_cache.py +1 -2
tests/test_dataset_worker.py +1 -1
tests/test_elements_worker/test_element.py +200 -26
tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
tests/test_elements_worker/test_metadata.py +0 -47
tests/test_elements_worker/test_training.py +8 -8
tests/test_elements_worker/test_worker.py +15 -14
tests/test_image.py +244 -126
tests/test_merge.py +0 -7
tests/test_utils.py +37 -0
arkindex_base_worker-0.4.0rc6.dist-info/RECORD +0 -61
arkindex_worker/worker/version.py +0 -58
tests/test_elements_worker/test_entity_list_and_check.py +0 -160
tests/test_elements_worker/test_version.py +0 -60
{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0

arkindex_worker/worker/training.py CHANGED Viewed

@@ -122,9 +122,9 @@ class TrainingMixin:
             )
         elif tag or description or configuration or parent:
-            assert (
-                self.model_version.get("model_id") == model_id
-            ), "Given `model_id` does not match the current model version"
+            assert self.model_version.get("model_id") == model_id, (
+                "Given `model_id` does not match the current model version"
+            )
             # If any attribute field has been defined, PATCH the current model version
             self.update_model_version(
                 tag=tag,
@@ -237,15 +237,17 @@ class TrainingMixin:
         Upload the archive of the model's files to an Amazon s3 compatible storage
         """
-        assert (
-            self.model_version
-        ), "You must create the model version before uploading an archive."
-        assert (
-            self.model_version["state"] != "Available"
-        ), "The model is already marked as available."
+        assert self.model_version, (
+            "You must create the model version before uploading an archive."
+        )
+        assert self.model_version["state"] != "Available", (
+            "The model is already marked as available."
+        )
         s3_put_url = self.model_version.get("s3_put_url")
-        assert s3_put_url, "S3 PUT URL is not set, please ensure you have the right to validate a model version."
+        assert s3_put_url, (
+            "S3 PUT URL is not set, please ensure you have the right to validate a model version."
+        )
         logger.info("Uploading to s3...")
         # Upload the archive on s3
@@ -271,7 +273,9 @@ class TrainingMixin:
         :param size: The size of the uploaded archive
         :param archive_hash: MD5 hash of the uploaded archive
         """
-        assert self.model_version, "You must create the model version and upload its archive before validating it."
+        assert self.model_version, (
+            "You must create the model version and upload its archive before validating it."
+        )
         try:
             self.model_version = self.api_client.request(
                 "PartialUpdateModelVersion",

arkindex_worker/worker/transcription.py CHANGED Viewed

@@ -59,18 +59,18 @@ class TranscriptionMixin:
         :returns: A dict as returned by the ``CreateTranscription`` API endpoint,
            or None if the worker is in read-only mode.
         """
-        assert element and isinstance(
-            element, Element | CachedElement
-        ), "element shouldn't be null and should be an Element or CachedElement"
-        assert text and isinstance(
-            text, str
-        ), "text shouldn't be null and should be of type str"
-        assert orientation and isinstance(
-            orientation, TextOrientation
-        ), "orientation shouldn't be null and should be of type TextOrientation"
-        assert (
-            isinstance(confidence, float) and 0 <= confidence <= 1
-        ), "confidence shouldn't be null and should be a float in [0..1] range"
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
+        assert text and isinstance(text, str), (
+            "text shouldn't be null and should be of type str"
+        )
+        assert orientation and isinstance(orientation, TextOrientation), (
+            "orientation shouldn't be null and should be of type TextOrientation"
+        )
+        assert isinstance(confidence, float) and 0 <= confidence <= 1, (
+            "confidence shouldn't be null and should be a float in [0..1] range"
+        )
         if self.is_read_only:
             logger.warning(
@@ -136,37 +136,39 @@ class TranscriptionMixin:
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """
-        assert transcriptions and isinstance(
-            transcriptions, list
-        ), "transcriptions shouldn't be null and should be of type list"
+        assert transcriptions and isinstance(transcriptions, list), (
+            "transcriptions shouldn't be null and should be of type list"
+        )
         # Create shallow copies of every transcription to avoid mutating the original payload
         transcriptions_payload = list(map(dict, transcriptions))
         for index, transcription in enumerate(transcriptions_payload):
             element_id = transcription.get("element_id")
-            assert (
-                element_id and isinstance(element_id, str)
-            ), f"Transcription at index {index} in transcriptions: element_id shouldn't be null and should be of type str"
+            assert element_id and isinstance(element_id, str), (
+                f"Transcription at index {index} in transcriptions: element_id shouldn't be null and should be of type str"
+            )
             text = transcription.get("text")
-            assert (
-                text and isinstance(text, str)
-            ), f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
+            assert text and isinstance(text, str), (
+                f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
+            )
             confidence = transcription.get("confidence")
             assert (
                 confidence is not None
                 and isinstance(confidence, float)
                 and 0 <= confidence <= 1
-            ), f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
+            ), (
+                f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
+            )
             orientation = transcription.get(
                 "orientation", TextOrientation.HorizontalLeftToRight
             )
-            assert (
-                orientation and isinstance(orientation, TextOrientation)
-            ), f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
+            assert orientation and isinstance(orientation, TextOrientation), (
+                f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
+            )
             if orientation:
                 transcription["orientation"] = orientation.value
@@ -242,63 +244,67 @@ class TranscriptionMixin:
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
-        assert element and isinstance(
-            element, Element | CachedElement
-        ), "element shouldn't be null and should be an Element or CachedElement"
-        assert sub_element_type and isinstance(
-            sub_element_type, str
-        ), "sub_element_type shouldn't be null and should be of type str"
-        assert transcriptions and isinstance(
-            transcriptions, list
-        ), "transcriptions shouldn't be null and should be of type list"
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
+        assert sub_element_type and isinstance(sub_element_type, str), (
+            "sub_element_type shouldn't be null and should be of type str"
+        )
+        assert transcriptions and isinstance(transcriptions, list), (
+            "transcriptions shouldn't be null and should be of type list"
+        )
         # Create shallow copies of every transcription to avoid mutating the original payload
         transcriptions_payload = list(map(dict, transcriptions))
         for index, transcription in enumerate(transcriptions_payload):
             text = transcription.get("text")
-            assert (
-                text and isinstance(text, str)
-            ), f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
+            assert text and isinstance(text, str), (
+                f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
+            )
             confidence = transcription.get("confidence")
             assert (
                 confidence is not None
                 and isinstance(confidence, float)
                 and 0 <= confidence <= 1
-            ), f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
+            ), (
+                f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
+            )
             orientation = transcription.get(
                 "orientation", TextOrientation.HorizontalLeftToRight
             )
-            assert (
-                orientation and isinstance(orientation, TextOrientation)
-            ), f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
+            assert orientation and isinstance(orientation, TextOrientation), (
+                f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
+            )
             if orientation:
                 transcription["orientation"] = orientation.value
             polygon = transcription.get("polygon")
-            assert (
-                polygon and isinstance(polygon, list)
-            ), f"Transcription at index {index} in transcriptions: polygon shouldn't be null and should be of type list"
-            assert (
-                len(polygon) >= 3
-            ), f"Transcription at index {index} in transcriptions: polygon should have at least three points"
+            assert polygon and isinstance(polygon, list), (
+                f"Transcription at index {index} in transcriptions: polygon shouldn't be null and should be of type list"
+            )
+            assert len(polygon) >= 3, (
+                f"Transcription at index {index} in transcriptions: polygon should have at least three points"
+            )
             assert all(
                 isinstance(point, list) and len(point) == 2 for point in polygon
-            ), f"Transcription at index {index} in transcriptions: polygon points should be lists of two items"
+            ), (
+                f"Transcription at index {index} in transcriptions: polygon points should be lists of two items"
+            )
             assert all(
                 isinstance(coord, int | float) for point in polygon for coord in point
-            ), f"Transcription at index {index} in transcriptions: polygon points should be lists of two numbers"
+            ), (
+                f"Transcription at index {index} in transcriptions: polygon points should be lists of two numbers"
+            )
             element_confidence = transcription.get("element_confidence")
-            assert (
-                element_confidence is None
-                or (
-                    isinstance(element_confidence, float)
-                    and 0 <= element_confidence <= 1
-                )
-            ), f"Transcription at index {index} in transcriptions: element_confidence should be either null or a float in [0..1] range"
+            assert element_confidence is None or (
+                isinstance(element_confidence, float) and 0 <= element_confidence <= 1
+            ), (
+                f"Transcription at index {index} in transcriptions: element_confidence should be either null or a float in [0..1] range"
+            )
         if self.is_read_only:
             logger.warning(
@@ -407,9 +413,9 @@ class TranscriptionMixin:
         :returns: An iterable of dicts representing each transcription,
            or an iterable of CachedTranscription when cache support is enabled.
         """
-        assert element and isinstance(
-            element, Element | CachedElement
-        ), "element shouldn't be null and should be an Element or CachedElement"
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
         query_params = {}
         if element_type:
             assert isinstance(element_type, str), "element_type should be of type str"
@@ -423,22 +429,22 @@ class TranscriptionMixin:
                 DeprecationWarning,
                 stacklevel=1,
             )
-            assert isinstance(
-                worker_version, str | bool
-            ), "worker_version should be of type str or bool"
+            assert isinstance(worker_version, str | bool), (
+                "worker_version should be of type str or bool"
+            )
             if isinstance(worker_version, bool):
-                assert (
-                    worker_version is False
-                ), "if of type bool, worker_version can only be set to False"
+                assert worker_version is False, (
+                    "if of type bool, worker_version can only be set to False"
+                )
             query_params["worker_version"] = worker_version
         if worker_run is not None:
-            assert isinstance(
-                worker_run, str | bool
-            ), "worker_run should be of type str or bool"
+            assert isinstance(worker_run, str | bool), (
+                "worker_run should be of type str or bool"
+            )
             if isinstance(worker_run, bool):
-                assert (
-                    worker_run is False
-                ), "if of type bool, worker_run can only be set to False"
+                assert worker_run is False, (
+                    "if of type bool, worker_run can only be set to False"
+                )
             query_params["worker_run"] = worker_run
         if not self.use_cache:

examples/standalone/python/worker.py ADDED Viewed

@@ -0,0 +1,171 @@
+"""Standalone Python worker to create a transcription on Arkindex elements"""
+import logging
+import os
+from argparse import ArgumentParser, Namespace
+from typing import Any
+from urllib.parse import urljoin
+import requests
+# Initialize the logger to provide feedback about the worker's execution to the final user
+logging.basicConfig(
+    format="%(asctime)s %(levelname)s/%(name)s: %(message)s", level=logging.INFO
+)
+logger = logging.getLogger(__name__)
+# Below are listed the environment variables which are mandatory to run this worker
+ARKINDEX_API_URL = "ARKINDEX_API_URL"
+"""URL that points to the root of the Arkindex instance.
+"""
+ARKINDEX_API_TOKEN = "ARKINDEX_API_TOKEN"
+"""Personal token to authenticate to the Arkindex instance, useful when running locally.
+"""
+ARKINDEX_TASK_TOKEN = "ARKINDEX_TASK_TOKEN"
+"""Machine token to authenticate to the Arkindex instance, useful when running from Arkindex.
+"""
+ARKINDEX_WORKER_RUN_ID = "ARKINDEX_WORKER_RUN_ID"
+"""Identifier to publish worker results.
+"""
+def parse_args() -> Namespace:
+    """Read the command line arguments given to the worker.
+    A single optional argument is supported, `--element`, which takes one or more element IDs to process.
+    :return Namespace: The parsed arguments, `element` is None when the argument was not provided.
+    """
+    cli = ArgumentParser("python worker.py")
+    # `nargs="+"` requires at least one value whenever the flag is present
+    cli.add_argument("--element", nargs="+", help="One or more Arkindex element ID")
+    arguments = cli.parse_args()
+    return arguments
+def arkindex_request(
+    method: str,
+    endpoint_path: str,
+    body: dict[str, Any] | None = None,
+    timeout: float | None = 60.0,
+) -> dict:
+    """Helper to query any endpoint from the Arkindex API.
+    The environment variables named `ARKINDEX_API_URL` and `ARKINDEX_API_TOKEN` (or `ARKINDEX_TASK_TOKEN`) are required to use this helper.
+    :param str method: The HTTP request method to use https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Methods
+    :param str endpoint_path: The path of the API endpoint to query, resolved against `ARKINDEX_API_URL`
+    :param dict[str, Any] | None body: A JSON body to send to the API, defaults to None (an empty JSON object is then sent)
+    :param float | None timeout: Number of seconds to wait for the server, defaults to 60. Use None to wait indefinitely
+    :raises requests.HTTPError: When the API responds with an error status code (logged before being re-raised)
+    :return dict: The JSON response from the API endpoint
+    """
+    if body is None:
+        body = {}
+    # Use the `ARKINDEX_API_URL` environment variable to define the full endpoint URL.
+    # `urljoin` drops the last path segment of a base URL that does not end with a slash
+    # (`https://host/api/v1` + `process/` gives `https://host/api/process/`), so add the slash when it is missing
+    base_url = os.getenv(ARKINDEX_API_URL)
+    if base_url and not base_url.endswith("/"):
+        base_url += "/"
+    url = urljoin(base_url, endpoint_path)
+    # The authorization varies when running locally or in Arkindex.
+    # An empty task token is treated as unset, like the truthiness checks performed in `main`
+    task_token = os.getenv(ARKINDEX_TASK_TOKEN)
+    if task_token:
+        authorization = f"Ponos {task_token}"
+    else:
+        authorization = f"Token {os.getenv(ARKINDEX_API_TOKEN)}"
+    # Query the endpoint URL using the `requests` Python package.
+    # A timeout is always provided, otherwise an unresponsive server would block the worker forever
+    response = requests.request(
+        method=method,
+        url=url,
+        headers={"Authorization": authorization},
+        json=body,
+        timeout=timeout,
+    )
+    # Raise an exception if anything went wrong while querying the endpoint
+    try:
+        response.raise_for_status()
+    except requests.HTTPError:
+        logger.error(
+            f"Request `{endpoint_path}` failed with code {response.status_code}: {response.content}"
+        )
+        raise
+    # Return the response in JSON format if it was successful
+    return response.json()
+def main() -> None:
+    """Standalone Python worker to create a transcription on Arkindex elements.
+    The elements come from the `--element` command argument when the process mode is `local`,
+    and from the elements of the running process otherwise.
+    A summary of completed and failed elements is logged at the end of the run.
+    :raises AssertionError: When a required environment variable is missing, or no element is given in local mode
+    """
+    # Check that the required environment variables are available
+    for variable in (ARKINDEX_API_URL, ARKINDEX_WORKER_RUN_ID):
+        assert os.getenv(variable), (
+            f"Missing required variable `{variable}` in the environment."
+        )
+    assert os.getenv(ARKINDEX_API_TOKEN) or os.getenv(ARKINDEX_TASK_TOKEN), (
+        f"Either `{ARKINDEX_API_TOKEN}` or `{ARKINDEX_TASK_TOKEN}` variable must be set in the environment."
+    )
+    # Read the worker run ID once, it is used both to fetch the configuration and to publish results
+    worker_run_id = os.getenv(ARKINDEX_WORKER_RUN_ID)
+    # Retrieve the worker configuration from Arkindex
+    # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/RetrieveWorkerRun
+    configuration = arkindex_request(
+        method="get",
+        endpoint_path=f"process/workers/{worker_run_id}/",
+    )
+    # Build the list of elements to process
+    elements = []
+    # Option 1: The worker is running locally, on your machine, we use the value of the `--element` command argument
+    if configuration["process"]["mode"] == "local":
+        # Parse the provided command arguments
+        args = parse_args()
+        # Retrieve the list of elements from the `--element` argument
+        elements = args.element
+        # Assert that at least one element was provided to run the worker on
+        assert elements, (
+            "Missing at least one element ID to process while running the worker locally."
+        )
+    # Option 2: The worker is running on Arkindex, in a process, we list process elements
+    else:
+        # Retrieve the list of elements from the process which is currently running
+        # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/ListProcessElements
+        next_page = f"process/{configuration['process']['id']}/elements/"
+        while next_page:
+            json_response = arkindex_request(method="get", endpoint_path=next_page)
+            # We only need the ID of each element to process, other information is not necessary
+            elements.extend(element["id"] for element in json_response["results"])
+            # NOTE(review): assumes the paginated response gives the absolute URL of the following page
+            # under `next` (None on the last page) - confirm against the API documentation.
+            # When the key is absent only the first page is read, as before
+            next_page = json_response.get("next")
+    total = len(elements)
+    failed = 0
+    # Iterate over all elements to create a basic transcription
+    for element_id in elements:
+        try:
+            # Create the "Hello world!" transcription on the current element
+            # API endpoint: https://arkindex.teklia.com/api-docs/#tag/transcriptions/operation/CreateTranscription
+            transcription = arkindex_request(
+                method="post",
+                endpoint_path=f"element/{element_id}/transcription/",
+                body={
+                    "text": "Hello world!",
+                    "worker_run_id": worker_run_id,
+                    "confidence": 1.0,
+                },
+            )
+            # Output feedback when a transcription is successfully created
+            logger.info(
+                f"A transcription with the ID {transcription['id']} was successfully created on element {element_id}."
+            )
+        except Exception:
+            # A failure on one element must not stop the others: log it with its traceback, so the cause
+            # is not lost, and increment the `failed` counter
+            logger.exception(
+                f"Failed to create a transcription on element {element_id}."
+            )
+            failed += 1
+    completed = total - failed
+    # Output a summary of the worker execution over all provided elements
+    logger.info(f"Ran on {total} element(s): {completed} completed, {failed} error(s).")
+if __name__ == "__main__":
+    main()

examples/tooled/python/worker.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Tooled Python worker to create a transcription on Arkindex elements"""
+import logging
+from arkindex_worker.models import Element
+from arkindex_worker.worker import ElementsWorker
+# Initialize the logger to provide feedback about the worker's execution to the final user
+logging.basicConfig(
+    format="%(asctime)s %(levelname)s/%(name)s: %(message)s", level=logging.INFO
+)
+logger = logging.getLogger(__name__)
+# Create a worker inheriting from the `ElementsWorker` class provided by the `arkindex-base-worker` package
+class BasicWorker(ElementsWorker):
+    """Minimal worker publishing a "Hello world!" transcription on every processed element."""
+    def process_element(self, element: Element) -> None:
+        """Process a single Arkindex element at once and publish a simple transcription on it.
+        Failures are logged and not re-raised.
+        :param Element element: The element currently being processed from the element list
+        """
+        try:
+            # Create the "Hello world!" transcription on the current element
+            # Helper: `TranscriptionMixin.create_transcription` from the `arkindex-base-worker` package
+            transcription = self.create_transcription(
+                element=element,
+                text="Hello world!",
+                confidence=1.0,
+            )
+            # The helper returns None when the worker is in read-only mode: nothing was created,
+            # which must not be reported as a failure
+            if transcription is None:
+                logger.info(
+                    f"No transcription was created on element {element.id}: the worker is in read-only mode."
+                )
+                return
+            # Output feedback when a transcription is successfully created
+            logger.info(
+                f"A transcription with the ID {transcription['id']} was successfully created on element {element.id}."
+            )
+        except Exception as e:
+            # Output feedback when failing to create a transcription
+            logger.error(
+                f"Failed to create a transcription on element {element.id}: {e}"
+            )
+def main() -> None:
+    """Instantiate the `BasicWorker` and run it."""
+    worker = BasicWorker(
+        description="Tooled Python worker to create a transcription on Arkindex elements"
+    )
+    worker.run()
+if __name__ == "__main__":
+    main()

arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl

arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl