arkindex-base-worker 0.5.0a3__py3-none-any.whl → 0.5.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,171 @@
1
+ """Standalone Python worker to create a transcription on Arkindex elements"""
2
+
3
+ import logging
4
+ import os
5
+ from argparse import ArgumentParser, Namespace
6
+ from typing import Any
7
+ from urllib.parse import urljoin
8
+
9
+ import requests
10
+
11
# Configure logging once at import time so that the worker reports its progress to the user
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# Names of the environment variables read by this worker.
# The API URL and the worker run ID are always required, along with at least one of the two tokens.
ARKINDEX_API_URL = "ARKINDEX_API_URL"
"""Name of the variable holding the URL that points to the root of the Arkindex instance.
"""
ARKINDEX_API_TOKEN = "ARKINDEX_API_TOKEN"
"""Name of the variable holding the personal token used to authenticate when running locally.
"""
ARKINDEX_TASK_TOKEN = "ARKINDEX_TASK_TOKEN"
"""Name of the variable holding the machine token used to authenticate when running from Arkindex.
"""
ARKINDEX_WORKER_RUN_ID = "ARKINDEX_WORKER_RUN_ID"
"""Name of the variable holding the identifier used to publish worker results.
"""
30
+
31
+
32
+ def parse_args() -> Namespace:
33
+ """Helper to parse command line arguments.
34
+ This worker only supports one optional argument, a list of element IDs to process.
35
+
36
+ :return Namespace: A namespace containing the provided command arguments and their value.
37
+ """
38
+ parser = ArgumentParser("python worker.py")
39
+ parser.add_argument(
40
+ "--element",
41
+ nargs="+",
42
+ help="One or more Arkindex element ID",
43
+ )
44
+ return parser.parse_args()
45
+
46
+
47
def arkindex_request(
    method: str,
    endpoint_path: str,
    body: dict[str, Any] | None = None,
    timeout: float | tuple[float, float] | None = 60.0,
) -> dict:
    """Helper to query any endpoint from the Arkindex API.
    The environment variables named `ARKINDEX_API_URL` and `ARKINDEX_API_TOKEN` (or `ARKINDEX_TASK_TOKEN`) are required to use this helper.

    :param str method: The HTTP request method to use https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Methods
    :param str endpoint_path: The path of the API endpoint to query, relative to `ARKINDEX_API_URL`.
        An absolute URL is used as is.
    :param dict[str, Any] | None body: A JSON body to send to the API, defaults to None
    :param float | tuple[float, float] | None timeout: Timeout in seconds forwarded to `requests`,
        defaults to 60 seconds. Use None to wait indefinitely.
    :raises requests.HTTPError: When the API answers with an error status code.
    :return dict: The JSON response from the API endpoint, or an empty dict when the response has no body
    """
    if body is None:
        body = {}

    # Use the `ARKINDEX_API_URL` environment variable to define the full endpoint URL.
    # `urljoin` drops the last path segment of a base URL that does not end with a slash
    # (`https://host/api/v1` + `process/` would give `https://host/api/process/`),
    # so the trailing slash is added when it is missing.
    base_url = os.getenv(ARKINDEX_API_URL)
    if base_url and not base_url.endswith("/"):
        base_url += "/"
    url = urljoin(base_url, endpoint_path)

    # The authorization varies when running locally or in Arkindex.
    # An empty task token is ignored, in line with the truthiness check made in `main`.
    task_token = os.getenv(ARKINDEX_TASK_TOKEN)
    if task_token:
        authorization = f"Ponos {task_token}"
    else:
        authorization = f"Token {os.getenv(ARKINDEX_API_TOKEN)}"

    # Query the endpoint URL using the `requests` Python package.
    # A timeout is always set, as `requests` would otherwise wait forever on an unresponsive server.
    response = requests.request(
        method=method,
        url=url,
        headers={"Authorization": authorization},
        json=body,
        timeout=timeout,
    )

    # Raise an exception if anything went wrong while querying the endpoint
    try:
        response.raise_for_status()
    except requests.HTTPError:
        logger.error(
            f"Request `{endpoint_path}` failed with code {response.status_code}: {response.content}"
        )
        raise

    # A successful response without any content cannot be decoded as JSON
    if not response.content:
        return {}

    # Return the response in JSON format if it was successful
    return response.json()
89
+
90
+
91
def main() -> None:
    """Standalone Python worker to create a transcription on Arkindex elements.

    The worker checks its environment, retrieves its configuration from Arkindex,
    builds the list of elements to process (from the `--element` argument when the process
    mode is `local`, from the process elements otherwise), then creates a "Hello world!"
    transcription on each element and logs a summary.

    :raises AssertionError: When a required environment variable is missing, or when no element
        was provided while running locally.
    """
    # Check that the required environment variables are available.
    # Explicit raises are used because `assert` statements are removed when Python runs with `-O`;
    # the exception type is kept as `AssertionError` for backward compatibility.
    for variable in (ARKINDEX_API_URL, ARKINDEX_WORKER_RUN_ID):
        if not os.getenv(variable):
            raise AssertionError(
                f"Missing required variable `{variable}` in the environment."
            )

    if not (os.getenv(ARKINDEX_API_TOKEN) or os.getenv(ARKINDEX_TASK_TOKEN)):
        raise AssertionError(
            f"Either `{ARKINDEX_API_TOKEN}` or `{ARKINDEX_TASK_TOKEN}` variable must be set in the environment."
        )

    worker_run_id = os.getenv(ARKINDEX_WORKER_RUN_ID)

    # Retrieve the worker configuration from Arkindex
    # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/RetrieveWorkerRun
    configuration = arkindex_request(
        method="get",
        endpoint_path=f"process/workers/{worker_run_id}/",
    )
    process = configuration["process"]

    # Build the list of elements to process
    elements = []

    # Option 1: The worker is running locally, on your machine, we use the value of the `--element` command argument
    if process["mode"] == "local":
        # Retrieve the list of elements from the `--element` argument
        elements = parse_args().element

        # At least one element must be provided to run the worker on
        if not elements:
            raise AssertionError(
                "Missing at least one element ID to process while running the worker locally."
            )

    # Option 2: The worker is running on Arkindex, in a process, we list process elements
    else:
        # Retrieve the list of elements from the process which is currently running
        # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/ListProcessElements
        next_page = f"process/{process['id']}/elements/"
        while next_page:
            json_response = arkindex_request(method="get", endpoint_path=next_page)

            # We only need the ID of each element to process, other information is not necessary
            elements.extend(element["id"] for element in json_response["results"])

            # Follow the pagination so that elements beyond the first page are processed too.
            # NOTE(review): assumes the response exposes the absolute URL of the following page
            # under `next` (null or absent on the last page) - confirm against the API documentation.
            next_page = json_response.get("next")

    total = len(elements)
    failed = 0
    # Iterate over all elements to create a basic transcription
    for element_id in elements:
        try:
            # Create the "Hello world!" transcription on the current element
            # API endpoint: https://arkindex.teklia.com/api-docs/#tag/transcriptions/operation/CreateTranscription
            transcription = arkindex_request(
                method="post",
                endpoint_path=f"element/{element_id}/transcription/",
                body={
                    "text": "Hello world!",
                    "worker_run_id": worker_run_id,
                    "confidence": 1.0,
                },
            )

            # Output feedback when a transcription is successfully created
            logger.info(
                f"A transcription with the ID {transcription['id']} was successfully created on element {element_id}."
            )

        except Exception as e:
            # Output feedback when failing to create a transcription, and increment the `failed` counter.
            # The error is part of the message so that failures which are not HTTP errors
            # (network issues, unexpected payloads) can be diagnosed from the logs.
            logger.error(
                f"Failed to create a transcription on element {element_id}: {e}"
            )
            failed += 1

    completed = total - failed
    # Output a summary of the worker execution over all provided elements
    logger.info(f"Ran on {total} element(s): {completed} completed, {failed} error(s).")
168
+
169
+
170
+ if __name__ == "__main__":
171
+ main()
@@ -0,0 +1,50 @@
1
+ """Tooled Python worker to create a transcription on Arkindex elements"""
2
+
3
+ import logging
4
+
5
+ from arkindex_worker.models import Element
6
+ from arkindex_worker.worker import ElementsWorker
7
+
8
# Configure logging once at import time so that the worker reports its progress to the user
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
)
logger = logging.getLogger(__name__)
13
+
14
+
15
# Subclass the `ElementsWorker` class provided by the `arkindex-base-worker` package
class BasicWorker(ElementsWorker):
    """Worker publishing a "Hello world!" transcription on each element it processes."""

    def process_element(self, element: Element) -> None:
        """Publish a simple transcription on a single Arkindex element.

        Any error raised while creating the transcription is logged and not propagated.

        :param Element element: The element currently being processed from the element list
        """
        try:
            # Publish the "Hello world!" transcription through the
            # `TranscriptionMixin.create_transcription` helper from the `arkindex-base-worker` package
            created = self.create_transcription(
                text="Hello world!",
                confidence=1.0,
                element=element,
            )

            # Report the successful creation to the user
            logger.info(
                f"A transcription with the ID {created['id']} was successfully created on element {element.id}."
            )

        except Exception as e:
            # Report the failure to the user
            logger.error(
                f"Failed to create a transcription on element {element.id}: {e}"
            )
+
42
+
43
+ def main() -> None:
44
+ BasicWorker(
45
+ description="Tooled Python worker to create a transcription on Arkindex elements"
46
+ ).run()
47
+
48
+
49
+ if __name__ == "__main__":
50
+ main()
@@ -10,7 +10,7 @@ from arkindex_worker.cache import (
10
10
  CachedImage,
11
11
  )
12
12
  from arkindex_worker.models import Element
13
- from arkindex_worker.worker.element import MissingTypeError
13
+ from arkindex_worker.worker.element import MissingElementType
14
14
  from tests import CORPUS_ID
15
15
 
16
16
  from . import BASE_API_CALLS
@@ -34,73 +34,247 @@ def test_list_corpus_types(responses, mock_elements_worker):
34
34
  }
35
35
 
36
36
 
37
- def test_check_required_types_argument_types(mock_elements_worker):
37
+ def test_create_element_type_wrong_slug(mock_elements_worker):
38
38
  with pytest.raises(
39
- AssertionError, match="At least one element type slug is required."
39
+ AssertionError, match="slug shouldn't be null and should be of type str"
40
40
  ):
41
- mock_elements_worker.check_required_types()
41
+ mock_elements_worker.create_element_type(slug=None, name="page")
42
42
 
43
- with pytest.raises(AssertionError, match="Element type slugs must be strings."):
44
- mock_elements_worker.check_required_types("lol", 42)
43
+ with pytest.raises(
44
+ AssertionError, match="slug shouldn't be null and should be of type str"
45
+ ):
46
+ mock_elements_worker.create_element_type(slug=1234, name="page")
45
47
 
46
48
 
47
- def test_check_required_types(mock_elements_worker):
48
- mock_elements_worker.corpus_types = {
49
- "folder": {"slug": "folder"},
50
- "page": {"slug": "page"},
51
- }
49
def test_create_element_type_wrong_name(mock_elements_worker):
    """`create_element_type` rejects a null or non-string `name`."""
    expected_error = "name shouldn't be null and should be of type str"

    # Both invalid values are expected to trigger the same assertion message
    for invalid_name in (None, 1234):
        with pytest.raises(AssertionError, match=expected_error):
            mock_elements_worker.create_element_type(slug="page", name=invalid_name)
52
59
 
53
- assert mock_elements_worker.check_required_types("page")
54
- assert mock_elements_worker.check_required_types("page", "folder")
55
60
 
61
+ def test_create_element_type_wrong_is_folder(mock_elements_worker):
56
62
  with pytest.raises(
57
- MissingTypeError,
58
- match=re.escape(
59
- "Element types act, text_line were not found in corpus (11111111-1111-1111-1111-111111111111)."
60
- ),
63
+ AssertionError, match="is_folder shouldn't be null and should be of type bool"
64
+ ):
65
+ mock_elements_worker.create_element_type(
66
+ slug="page", name="page", is_folder=None
67
+ )
68
+
69
+ with pytest.raises(
70
+ AssertionError, match="is_folder shouldn't be null and should be of type bool"
61
71
  ):
62
- assert mock_elements_worker.check_required_types("page", "text_line", "act")
72
+ mock_elements_worker.create_element_type(
73
+ slug="page", name="page", is_folder=1234
74
+ )
75
+
76
+
77
def test_create_element_type_api_error(responses, mock_elements_worker):
    """`create_element_type` raises `ErrorResponse` when the API answers with an error status."""
    type_endpoint = "http://testserver/api/v1/elements/type/"
    responses.add(responses.POST, type_endpoint, status=418)

    with pytest.raises(ErrorResponse):
        mock_elements_worker.create_element_type(slug="page", name="page")

    # Exactly one request is expected on top of the base API calls
    assert len(responses.calls) == len(BASE_API_CALLS) + 1
    performed_calls = [
        (call.request.method, call.request.url) for call in responses.calls
    ]
    assert performed_calls == BASE_API_CALLS + [("POST", type_endpoint)]
91
+
63
92
 
93
+ def test_create_element_type_already_exists(responses, mock_elements_worker):
94
+ assert mock_elements_worker.corpus_types == {}
64
95
 
65
- def test_check_required_types_create_missing(responses, mock_elements_worker):
66
- mock_elements_worker.corpus_types = {
96
+ responses.add(
97
+ responses.POST,
98
+ "http://testserver/api/v1/elements/type/",
99
+ status=400,
100
+ match=[
101
+ matchers.json_params_matcher(
102
+ {
103
+ "slug": "page",
104
+ "display_name": "page",
105
+ "folder": False,
106
+ "corpus": CORPUS_ID,
107
+ }
108
+ )
109
+ ],
110
+ )
111
+ responses.add(
112
+ responses.GET,
113
+ f"http://testserver/api/v1/corpus/{CORPUS_ID}/",
114
+ status=200,
115
+ json={
116
+ "id": CORPUS_ID,
117
+ "types": [{"slug": "folder"}, {"slug": "page"}],
118
+ },
119
+ )
120
+
121
+ mock_elements_worker.create_element_type(slug="page", name="page")
122
+
123
+ assert len(responses.calls) == len(BASE_API_CALLS) + 2
124
+ assert [
125
+ (call.request.method, call.request.url) for call in responses.calls
126
+ ] == BASE_API_CALLS + [
127
+ ("POST", "http://testserver/api/v1/elements/type/"),
128
+ ("GET", f"http://testserver/api/v1/corpus/{CORPUS_ID}/"),
129
+ ]
130
+
131
+ # Make sure the corpus_types attribute has been updated
132
+ assert mock_elements_worker.corpus_types == {
67
133
  "folder": {"slug": "folder"},
68
134
  "page": {"slug": "page"},
69
135
  }
70
136
 
137
+
138
+ def test_create_element_type(responses, mock_elements_worker):
139
+ assert mock_elements_worker.corpus_types == {}
140
+
71
141
  responses.add(
72
142
  responses.POST,
73
143
  "http://testserver/api/v1/elements/type/",
144
+ status=200,
74
145
  match=[
75
146
  matchers.json_params_matcher(
76
147
  {
77
- "slug": "text_line",
78
- "display_name": "text_line",
148
+ "slug": "page",
149
+ "display_name": "page",
79
150
  "folder": False,
80
151
  "corpus": CORPUS_ID,
81
152
  }
82
153
  )
83
154
  ],
155
+ json={"id": "page-id", "slug": "page", "display_name": "page", "folder": False},
84
156
  )
157
+
158
+ mock_elements_worker.create_element_type(slug="page", name="page")
159
+
160
+ assert len(responses.calls) == len(BASE_API_CALLS) + 1
161
+ assert [
162
+ (call.request.method, call.request.url) for call in responses.calls
163
+ ] == BASE_API_CALLS + [
164
+ ("POST", "http://testserver/api/v1/elements/type/"),
165
+ ]
166
+
167
+ # Make sure the corpus_types attribute has been updated
168
+ assert mock_elements_worker.corpus_types == {
169
+ "page": {
170
+ "id": "page-id",
171
+ "slug": "page",
172
+ "display_name": "page",
173
+ "folder": False,
174
+ }
175
+ }
176
+
177
+
178
def test_check_required_types_wrong_type_slugs(mock_elements_worker):
    """`check_required_types` rejects a `type_slugs` value that is not a list of strings."""
    not_a_list = "type_slugs shouldn't be null and should be of type list"
    invalid_cases = (
        (None, not_a_list),
        (1234, not_a_list),
        (
            ["page", 1234],
            "Element type at index 1 in type_slugs: Should be of type str",
        ),
    )

    # Each invalid value is checked in turn against its expected assertion message
    for type_slugs, expected_error in invalid_cases:
        with pytest.raises(AssertionError, match=expected_error):
            mock_elements_worker.check_required_types(type_slugs=type_slugs)
194
+
195
+
196
def test_check_required_types_wrong_create_missing(mock_elements_worker):
    """`check_required_types` rejects a null or non-boolean `create_missing`."""
    expected_error = "create_missing shouldn't be null and should be of type bool"

    # Both invalid values are expected to trigger the same assertion message
    for invalid_flag in (None, 1234):
        with pytest.raises(AssertionError, match=expected_error):
            mock_elements_worker.check_required_types(
                type_slugs=["page"], create_missing=invalid_flag
            )
212
+
213
+
214
def test_check_required_types_do_not_create_missing(responses, mock_elements_worker):
    """`check_required_types` raises `MissingElementType` when creating missing types is disabled."""
    # Only the `folder` type is known by the worker
    mock_elements_worker.corpus_types = {"folder": {"slug": "folder"}}

    with pytest.raises(
        MissingElementType, match="Element type `page` was not in the corpus."
    ):
        mock_elements_worker.check_required_types(
            create_missing=False, type_slugs=["folder", "page"]
        )

    # No request is expected besides the base API calls
    assert len(responses.calls) == len(BASE_API_CALLS)
    performed_calls = [
        (call.request.method, call.request.url) for call in responses.calls
    ]
    assert performed_calls == BASE_API_CALLS
229
+
230
+
231
+ def test_check_required_types(responses, mock_elements_worker):
232
+ # Set one element type
233
+ mock_elements_worker.corpus_types = {"folder": {"slug": "folder"}}
234
+
235
+ # Call to create a new element type
85
236
  responses.add(
86
237
  responses.POST,
87
238
  "http://testserver/api/v1/elements/type/",
239
+ status=200,
88
240
  match=[
89
241
  matchers.json_params_matcher(
90
242
  {
91
- "slug": "act",
92
- "display_name": "act",
243
+ "slug": "page",
244
+ "display_name": "page",
93
245
  "folder": False,
94
246
  "corpus": CORPUS_ID,
95
247
  }
96
248
  )
97
249
  ],
250
+ json={"id": "page-id", "slug": "page", "display_name": "page", "folder": False},
98
251
  )
99
252
 
100
- assert mock_elements_worker.check_required_types(
101
- "page", "text_line", "act", create_missing=True
253
+ mock_elements_worker.check_required_types(
254
+ type_slugs=["folder", "page"], create_missing=True
102
255
  )
103
256
 
257
+ assert len(responses.calls) == len(BASE_API_CALLS) + 1
258
+ assert [
259
+ (call.request.method, call.request.url) for call in responses.calls
260
+ ] == BASE_API_CALLS + [
261
+ (
262
+ "POST",
263
+ "http://testserver/api/v1/elements/type/",
264
+ ),
265
+ ]
266
+
267
+ # Make sure the element_types attribute has been updated
268
+ assert mock_elements_worker.corpus_types == {
269
+ "folder": {"slug": "folder"},
270
+ "page": {
271
+ "id": "page-id",
272
+ "slug": "page",
273
+ "display_name": "page",
274
+ "folder": False,
275
+ },
276
+ }
277
+
104
278
 
105
279
  @pytest.mark.parametrize(
106
280
  ("payload", "error"),