PyPI - arkindex-base-worker - Versions diffs - 0.5.0a3__py3-none-any.whl → 0.5.0b1__py3-none-any.whl - Mend

arkindex-base-worker 0.5.0a3__py3-none-any.whl → 0.5.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

examples/standalone/python/worker.py ADDED Viewed

@@ -0,0 +1,171 @@
+"""Standalone Python worker to create a transcription on Arkindex elements"""
+import logging
+import os
+from argparse import ArgumentParser, Namespace
+from typing import Any
+from urllib.parse import urljoin
+import requests
+# Configure logging so that the final user gets feedback while the worker runs
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+# Names of the environment variables read by this worker
+ARKINDEX_API_URL = "ARKINDEX_API_URL"
+"""Name of the variable holding the URL that points to the root of the Arkindex instance.
+"""
+ARKINDEX_API_TOKEN = "ARKINDEX_API_TOKEN"
+"""Name of the variable holding a personal token, used to authenticate when running locally.
+"""
+ARKINDEX_TASK_TOKEN = "ARKINDEX_TASK_TOKEN"
+"""Name of the variable holding a machine token, used to authenticate when running from Arkindex.
+"""
+ARKINDEX_WORKER_RUN_ID = "ARKINDEX_WORKER_RUN_ID"
+"""Name of the variable holding the worker run identifier used to publish results.
+"""
+def parse_args() -> Namespace:
+    """Parse the command line arguments of the worker.
+    A single optional argument is supported: `--element`, followed by one or more element IDs.
+    :return Namespace: The parsed arguments; `element` is `None` when the option is not given.
+    """
+    cli = ArgumentParser(prog="python worker.py")
+    cli.add_argument("--element", help="One or more Arkindex element ID", nargs="+")
+    arguments = cli.parse_args()
+    return arguments
+def arkindex_request(
+    method: str, endpoint_path: str, body: dict[str, Any] | None = None
+) -> dict:
+    """Helper to query any endpoint from the Arkindex API.
+    The environment variables named `ARKINDEX_API_URL` and `ARKINDEX_API_TOKEN` (or `ARKINDEX_TASK_TOKEN`) are required to use this helper.
+    :param str method: The HTTP request method to use https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Methods
+    :param str endpoint_path: The path of the API endpoint to query, relative to `ARKINDEX_API_URL` (an absolute URL is used as is)
+    :param dict[str, Any] | None body: A JSON body to send to the API, defaults to None
+    :return dict: The JSON response from the API endpoint
+    :raises requests.HTTPError: When the API answers with an error status code
+    """
+    if body is None:
+        body = {}
+    # `urljoin` replaces the last path segment of a base URL without a trailing slash
+    # (`https://host/api/v1` + `process/` gives `https://host/api/process/`),
+    # so make sure the base URL ends with a slash before joining
+    base_url = os.getenv(ARKINDEX_API_URL) or ""
+    if base_url and not base_url.endswith("/"):
+        base_url += "/"
+    url = urljoin(base_url, endpoint_path)
+    # The authorization varies when running locally or in Arkindex.
+    # An empty task token is treated as missing, like in the checks done by `main`
+    task_token = os.getenv(ARKINDEX_TASK_TOKEN)
+    if task_token:
+        authorization = f"Ponos {task_token}"
+    else:
+        authorization = f"Token {os.getenv(ARKINDEX_API_TOKEN)}"
+    # Query the endpoint URL using the `requests` Python package
+    response = requests.request(
+        method=method,
+        url=url,
+        headers={"Authorization": authorization},
+        json=body,
+    )
+    # Log the failure, then let the exception propagate to the caller
+    try:
+        response.raise_for_status()
+    except requests.HTTPError:
+        logger.error(
+            f"Request `{endpoint_path}` failed with code {response.status_code}: {response.content}"
+        )
+        raise
+    # Return the response in JSON format if it was successful
+    return response.json()
+def main() -> None:
+    """Standalone Python worker to create a transcription on Arkindex elements.
+    The worker run configuration is retrieved first. The elements to process then come from the `--element` argument (local mode) or from the running process, and a "Hello world!" transcription is created on each of them.
+    :raises AssertionError: When a required environment variable is missing, or when no element is provided in local mode
+    :raises requests.HTTPError: When retrieving the configuration or listing the process elements fails
+    """
+    # Check that the required environment variables are available
+    for variable in (ARKINDEX_API_URL, ARKINDEX_WORKER_RUN_ID):
+        assert os.getenv(variable), (
+            f"Missing required variable `{variable}` in the environment."
+        )
+    assert os.getenv(ARKINDEX_API_TOKEN) or os.getenv(ARKINDEX_TASK_TOKEN), (
+        f"Either `{ARKINDEX_API_TOKEN}` or `{ARKINDEX_TASK_TOKEN}` variable must be set in the environment."
+    )
+    worker_run_id = os.getenv(ARKINDEX_WORKER_RUN_ID)
+    # Retrieve the worker configuration from Arkindex
+    # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/RetrieveWorkerRun
+    configuration = arkindex_request(
+        method="get",
+        endpoint_path=f"process/workers/{worker_run_id}/",
+    )
+    # Build the list of elements to process
+    # Option 1: The worker is running locally, on your machine, we use the value of the `--element` command argument
+    if configuration["process"]["mode"] == "local":
+        # Parse the provided command arguments
+        args = parse_args()
+        # Retrieve the list of elements from the `--element` argument
+        elements = args.element
+        # Assert that at least one element was provided to run the worker on
+        assert elements, (
+            "Missing at least one element ID to process while running the worker locally."
+        )
+    # Option 2: The worker is running on Arkindex, in a process, we list process elements
+    else:
+        # Retrieve the list of elements from the process which is currently running
+        # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/ListProcessElements
+        elements = []
+        next_page = f"process/{configuration['process']['id']}/elements/"
+        while next_page:
+            json_response = arkindex_request(method="get", endpoint_path=next_page)
+            # We only need the ID of each element to process, other information is not necessary
+            elements.extend(element["id"] for element in json_response["results"])
+            # NOTE(review): assumes the paginated response exposes the absolute URL of the
+            # following page under `next` (empty on the last page) - confirm against the API docs.
+            # When the key is absent only the first page is read, as before.
+            next_page = json_response.get("next")
+    total = len(elements)
+    failed = 0
+    # Iterate over all elements to create a basic transcription
+    for element_id in elements:
+        try:
+            # Create the "Hello world!" transcription on the current element
+            # API endpoint: https://arkindex.teklia.com/api-docs/#tag/transcriptions/operation/CreateTranscription
+            transcription = arkindex_request(
+                method="post",
+                endpoint_path=f"element/{element_id}/transcription/",
+                body={
+                    "text": "Hello world!",
+                    "worker_run_id": worker_run_id,
+                    "confidence": 1.0,
+                },
+            )
+            # Output feedback when a transcription is successfully created
+            logger.info(
+                f"A transcription with the ID {transcription['id']} was successfully created on element {element_id}."
+            )
+        except Exception:
+            # Keep processing the other elements, but log the traceback so the cause
+            # of the failure is not lost, and increment the `failed` counter
+            logger.exception(f"Failed to create a transcription on element {element_id}.")
+            failed += 1
+    completed = total - failed
+    # Output a summary of the worker execution over all provided elements
+    logger.info(f"Ran on {total} element(s): {completed} completed, {failed} error(s).")
+if __name__ == "__main__":
+    main()

examples/tooled/python/worker.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Tooled Python worker to create a transcription on Arkindex elements"""
+import logging
+from arkindex_worker.models import Element
+from arkindex_worker.worker import ElementsWorker
+# Configure logging so that the final user gets feedback while the worker runs
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+# Create a worker inheriting from the `ElementsWorker` class provided by the `arkindex-base-worker` package
+class BasicWorker(ElementsWorker):
+    """Worker publishing a "Hello world!" transcription on each element it processes."""
+    def process_element(self, element: Element) -> None:
+        """Publish a simple transcription on one Arkindex element.
+        Exceptions raised while creating the transcription are logged instead of propagated.
+        :param Element element: The element currently being processed from the element list
+        """
+        try:
+            # Helper: `TranscriptionMixin.create_transcription` from the `arkindex-base-worker` package
+            created = self.create_transcription(
+                text="Hello world!",
+                confidence=1.0,
+                element=element,
+            )
+            # Report the ID of the newly created transcription
+            success_message = f"A transcription with the ID {created['id']} was successfully created on element {element.id}."
+            logger.info(success_message)
+        except Exception as error:
+            # Report the failure along with its cause
+            logger.error(
+                f"Failed to create a transcription on element {element.id}: {error}"
+            )
+def main() -> None:
+    """Instantiate the `BasicWorker` and run it."""
+    worker = BasicWorker(
+        description="Tooled Python worker to create a transcription on Arkindex elements"
+    )
+    worker.run()
+if __name__ == "__main__":
+    main()

tests/test_elements_worker/test_element.py CHANGED Viewed

@@ -10,7 +10,7 @@ from arkindex_worker.cache import (
     CachedImage,
 )
 from arkindex_worker.models import Element
-from arkindex_worker.worker.element import MissingTypeError
+from arkindex_worker.worker.element import MissingElementType
 from tests import CORPUS_ID
 from . import BASE_API_CALLS
@@ -34,73 +34,247 @@ def test_list_corpus_types(responses, mock_elements_worker):
     }
-def test_check_required_types_argument_types(mock_elements_worker):
+def test_create_element_type_wrong_slug(mock_elements_worker):
     with pytest.raises(
-        AssertionError, match="At least one element type slug is required."
+        AssertionError, match="slug shouldn't be null and should be of type str"
     ):
-        mock_elements_worker.check_required_types()
+        mock_elements_worker.create_element_type(slug=None, name="page")
-    with pytest.raises(AssertionError, match="Element type slugs must be strings."):
-        mock_elements_worker.check_required_types("lol", 42)
+    with pytest.raises(
+        AssertionError, match="slug shouldn't be null and should be of type str"
+    ):
+        mock_elements_worker.create_element_type(slug=1234, name="page")
-def test_check_required_types(mock_elements_worker):
-    mock_elements_worker.corpus_types = {
-        "folder": {"slug": "folder"},
-        "page": {"slug": "page"},
-    }
+def test_create_element_type_wrong_name(mock_elements_worker):
+    """`create_element_type` must reject a name that is null or not a string."""
+    expected_error = "name shouldn't be null and should be of type str"
+    for invalid_name in (None, 1234):
+        with pytest.raises(AssertionError, match=expected_error):
+            mock_elements_worker.create_element_type(slug="page", name=invalid_name)
-    assert mock_elements_worker.check_required_types("page")
-    assert mock_elements_worker.check_required_types("page", "folder")
+def test_create_element_type_wrong_is_folder(mock_elements_worker):
     with pytest.raises(
-        MissingTypeError,
-        match=re.escape(
-            "Element types act, text_line were not found in corpus (11111111-1111-1111-1111-111111111111)."
-        ),
+        AssertionError, match="is_folder shouldn't be null and should be of type bool"
+    ):
+        mock_elements_worker.create_element_type(
+            slug="page", name="page", is_folder=None
+        )
+    with pytest.raises(
+        AssertionError, match="is_folder shouldn't be null and should be of type bool"
     ):
-        assert mock_elements_worker.check_required_types("page", "text_line", "act")
+        mock_elements_worker.create_element_type(
+            slug="page", name="page", is_folder=1234
+        )
+def test_create_element_type_api_error(responses, mock_elements_worker):
+    """An error status returned by the type creation endpoint must raise an `ErrorResponse`."""
+    type_url = "http://testserver/api/v1/elements/type/"
+    responses.add(responses.POST, type_url, status=418)
+    with pytest.raises(ErrorResponse):
+        mock_elements_worker.create_element_type(slug="page", name="page")
+    # Exactly one extra call, the failed POST, must follow the base API calls
+    performed_calls = [
+        (call.request.method, call.request.url) for call in responses.calls
+    ]
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert performed_calls == BASE_API_CALLS + [("POST", type_url)]
+def test_create_element_type_already_exists(responses, mock_elements_worker):
+    assert mock_elements_worker.corpus_types == {}
-def test_check_required_types_create_missing(responses, mock_elements_worker):
-    mock_elements_worker.corpus_types = {
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/elements/type/",
+        status=400,
+        match=[
+            matchers.json_params_matcher(
+                {
+                    "slug": "page",
+                    "display_name": "page",
+                    "folder": False,
+                    "corpus": CORPUS_ID,
+                }
+            )
+        ],
+    )
+    responses.add(
+        responses.GET,
+        f"http://testserver/api/v1/corpus/{CORPUS_ID}/",
+        status=200,
+        json={
+            "id": CORPUS_ID,
+            "types": [{"slug": "folder"}, {"slug": "page"}],
+        },
+    )
+    mock_elements_worker.create_element_type(slug="page", name="page")
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/elements/type/"),
+        ("GET", f"http://testserver/api/v1/corpus/{CORPUS_ID}/"),
+    ]
+    # Make sure the corpus_types attribute has been updated
+    assert mock_elements_worker.corpus_types == {
         "folder": {"slug": "folder"},
         "page": {"slug": "page"},
     }
+def test_create_element_type(responses, mock_elements_worker):
+    assert mock_elements_worker.corpus_types == {}
     responses.add(
         responses.POST,
         "http://testserver/api/v1/elements/type/",
+        status=200,
         match=[
             matchers.json_params_matcher(
                 {
-                    "slug": "text_line",
-                    "display_name": "text_line",
+                    "slug": "page",
+                    "display_name": "page",
                     "folder": False,
                     "corpus": CORPUS_ID,
                 }
             )
         ],
+        json={"id": "page-id", "slug": "page", "display_name": "page", "folder": False},
     )
+    mock_elements_worker.create_element_type(slug="page", name="page")
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/elements/type/"),
+    ]
+    # Make sure the corpus_types attribute has been updated
+    assert mock_elements_worker.corpus_types == {
+        "page": {
+            "id": "page-id",
+            "slug": "page",
+            "display_name": "page",
+            "folder": False,
+        }
+    }
+def test_check_required_types_wrong_type_slugs(mock_elements_worker):
+    """`check_required_types` must reject `type_slugs` that is not a list of strings."""
+    not_a_list_error = "type_slugs shouldn't be null and should be of type list"
+    invalid_cases = (
+        (None, not_a_list_error),
+        (1234, not_a_list_error),
+        (
+            ["page", 1234],
+            "Element type at index 1 in type_slugs: Should be of type str",
+        ),
+    )
+    for invalid_slugs, expected_error in invalid_cases:
+        with pytest.raises(AssertionError, match=expected_error):
+            mock_elements_worker.check_required_types(type_slugs=invalid_slugs)
+def test_check_required_types_wrong_create_missing(mock_elements_worker):
+    """`check_required_types` must reject a `create_missing` that is null or not a boolean."""
+    expected_error = "create_missing shouldn't be null and should be of type bool"
+    for invalid_flag in (None, 1234):
+        with pytest.raises(AssertionError, match=expected_error):
+            mock_elements_worker.check_required_types(
+                type_slugs=["page"], create_missing=invalid_flag
+            )
+def test_check_required_types_do_not_create_missing(responses, mock_elements_worker):
+    """A missing type must raise `MissingElementType` when `create_missing` is disabled."""
+    # Only the `folder` type is known to the worker
+    mock_elements_worker.corpus_types = {"folder": {"slug": "folder"}}
+    with pytest.raises(
+        MissingElementType, match="Element type `page` was not in the corpus."
+    ):
+        mock_elements_worker.check_required_types(
+            create_missing=False, type_slugs=["folder", "page"]
+        )
+    # No API call other than the base ones must have been made
+    performed_calls = [
+        (call.request.method, call.request.url) for call in responses.calls
+    ]
+    assert len(responses.calls) == len(BASE_API_CALLS)
+    assert performed_calls == BASE_API_CALLS
+def test_check_required_types(responses, mock_elements_worker):
+    # Set one element type
+    mock_elements_worker.corpus_types = {"folder": {"slug": "folder"}}
+    # Call to create a new element type
     responses.add(
         responses.POST,
         "http://testserver/api/v1/elements/type/",
+        status=200,
         match=[
             matchers.json_params_matcher(
                 {
-                    "slug": "act",
-                    "display_name": "act",
+                    "slug": "page",
+                    "display_name": "page",
                     "folder": False,
                     "corpus": CORPUS_ID,
                 }
             )
         ],
+        json={"id": "page-id", "slug": "page", "display_name": "page", "folder": False},
     )
-    assert mock_elements_worker.check_required_types(
-        "page", "text_line", "act", create_missing=True
+    mock_elements_worker.check_required_types(
+        type_slugs=["folder", "page"], create_missing=True
     )
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        (
+            "POST",
+            "http://testserver/api/v1/elements/type/",
+        ),
+    ]
+    # Make sure the element_types attribute has been updated
+    assert mock_elements_worker.corpus_types == {
+        "folder": {"slug": "folder"},
+        "page": {
+            "id": "page-id",
+            "slug": "page",
+            "display_name": "page",
+            "folder": False,
+        },
+    }
 @pytest.mark.parametrize(
     ("payload", "error"),

arkindex-base-worker 0.5.0a3__py3-none-any.whl → 0.5.0b1__py3-none-any.whl

arkindex-base-worker 0.5.0a3__py3-none-any.whl → 0.5.0b1__py3-none-any.whl