arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +9 -12
- arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
- {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
- arkindex_worker/__init__.py +3 -0
- arkindex_worker/cache.py +6 -25
- arkindex_worker/image.py +105 -66
- arkindex_worker/utils.py +2 -1
- arkindex_worker/worker/__init__.py +17 -31
- arkindex_worker/worker/base.py +16 -9
- arkindex_worker/worker/classification.py +36 -34
- arkindex_worker/worker/corpus.py +3 -3
- arkindex_worker/worker/dataset.py +9 -9
- arkindex_worker/worker/element.py +261 -231
- arkindex_worker/worker/entity.py +137 -206
- arkindex_worker/worker/image.py +3 -3
- arkindex_worker/worker/metadata.py +27 -38
- arkindex_worker/worker/task.py +9 -9
- arkindex_worker/worker/training.py +15 -11
- arkindex_worker/worker/transcription.py +77 -71
- examples/standalone/python/worker.py +171 -0
- examples/tooled/python/worker.py +50 -0
- tests/conftest.py +22 -36
- tests/test_base_worker.py +1 -1
- tests/test_cache.py +1 -2
- tests/test_dataset_worker.py +1 -1
- tests/test_elements_worker/test_element.py +200 -26
- tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
- tests/test_elements_worker/test_metadata.py +0 -47
- tests/test_elements_worker/test_training.py +8 -8
- tests/test_elements_worker/test_worker.py +15 -14
- tests/test_image.py +244 -126
- tests/test_merge.py +0 -7
- tests/test_utils.py +37 -0
- arkindex_base_worker-0.4.0rc6.dist-info/RECORD +0 -61
- arkindex_worker/worker/version.py +0 -58
- tests/test_elements_worker/test_entity_list_and_check.py +0 -160
- tests/test_elements_worker/test_version.py +0 -60
- {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -122,9 +122,9 @@ class TrainingMixin:
|
|
|
122
122
|
)
|
|
123
123
|
|
|
124
124
|
elif tag or description or configuration or parent:
|
|
125
|
-
assert (
|
|
126
|
-
|
|
127
|
-
)
|
|
125
|
+
assert self.model_version.get("model_id") == model_id, (
|
|
126
|
+
"Given `model_id` does not match the current model version"
|
|
127
|
+
)
|
|
128
128
|
# If any attribute field has been defined, PATCH the current model version
|
|
129
129
|
self.update_model_version(
|
|
130
130
|
tag=tag,
|
|
@@ -237,15 +237,17 @@ class TrainingMixin:
|
|
|
237
237
|
Upload the archive of the model's files to an Amazon s3 compatible storage
|
|
238
238
|
"""
|
|
239
239
|
|
|
240
|
-
assert (
|
|
241
|
-
|
|
242
|
-
)
|
|
243
|
-
assert (
|
|
244
|
-
|
|
245
|
-
)
|
|
240
|
+
assert self.model_version, (
|
|
241
|
+
"You must create the model version before uploading an archive."
|
|
242
|
+
)
|
|
243
|
+
assert self.model_version["state"] != "Available", (
|
|
244
|
+
"The model is already marked as available."
|
|
245
|
+
)
|
|
246
246
|
|
|
247
247
|
s3_put_url = self.model_version.get("s3_put_url")
|
|
248
|
-
assert s3_put_url,
|
|
248
|
+
assert s3_put_url, (
|
|
249
|
+
"S3 PUT URL is not set, please ensure you have the right to validate a model version."
|
|
250
|
+
)
|
|
249
251
|
|
|
250
252
|
logger.info("Uploading to s3...")
|
|
251
253
|
# Upload the archive on s3
|
|
@@ -271,7 +273,9 @@ class TrainingMixin:
|
|
|
271
273
|
:param size: The size of the uploaded archive
|
|
272
274
|
:param archive_hash: MD5 hash of the uploaded archive
|
|
273
275
|
"""
|
|
274
|
-
assert self.model_version,
|
|
276
|
+
assert self.model_version, (
|
|
277
|
+
"You must create the model version and upload its archive before validating it."
|
|
278
|
+
)
|
|
275
279
|
try:
|
|
276
280
|
self.model_version = self.api_client.request(
|
|
277
281
|
"PartialUpdateModelVersion",
|
|
@@ -59,18 +59,18 @@ class TranscriptionMixin:
|
|
|
59
59
|
:returns: A dict as returned by the ``CreateTranscription`` API endpoint,
|
|
60
60
|
or None if the worker is in read-only mode.
|
|
61
61
|
"""
|
|
62
|
-
assert element and isinstance(
|
|
63
|
-
element
|
|
64
|
-
)
|
|
65
|
-
assert text and isinstance(
|
|
66
|
-
text
|
|
67
|
-
)
|
|
68
|
-
assert orientation and isinstance(
|
|
69
|
-
orientation
|
|
70
|
-
)
|
|
71
|
-
assert (
|
|
72
|
-
|
|
73
|
-
)
|
|
62
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
63
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
64
|
+
)
|
|
65
|
+
assert text and isinstance(text, str), (
|
|
66
|
+
"text shouldn't be null and should be of type str"
|
|
67
|
+
)
|
|
68
|
+
assert orientation and isinstance(orientation, TextOrientation), (
|
|
69
|
+
"orientation shouldn't be null and should be of type TextOrientation"
|
|
70
|
+
)
|
|
71
|
+
assert isinstance(confidence, float) and 0 <= confidence <= 1, (
|
|
72
|
+
"confidence shouldn't be null and should be a float in [0..1] range"
|
|
73
|
+
)
|
|
74
74
|
|
|
75
75
|
if self.is_read_only:
|
|
76
76
|
logger.warning(
|
|
@@ -136,37 +136,39 @@ class TranscriptionMixin:
|
|
|
136
136
|
:returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
|
|
137
137
|
"""
|
|
138
138
|
|
|
139
|
-
assert transcriptions and isinstance(
|
|
140
|
-
transcriptions
|
|
141
|
-
)
|
|
139
|
+
assert transcriptions and isinstance(transcriptions, list), (
|
|
140
|
+
"transcriptions shouldn't be null and should be of type list"
|
|
141
|
+
)
|
|
142
142
|
|
|
143
143
|
# Create shallow copies of every transcription to avoid mutating the original payload
|
|
144
144
|
transcriptions_payload = list(map(dict, transcriptions))
|
|
145
145
|
|
|
146
146
|
for index, transcription in enumerate(transcriptions_payload):
|
|
147
147
|
element_id = transcription.get("element_id")
|
|
148
|
-
assert (
|
|
149
|
-
element_id and
|
|
150
|
-
)
|
|
148
|
+
assert element_id and isinstance(element_id, str), (
|
|
149
|
+
f"Transcription at index {index} in transcriptions: element_id shouldn't be null and should be of type str"
|
|
150
|
+
)
|
|
151
151
|
|
|
152
152
|
text = transcription.get("text")
|
|
153
|
-
assert (
|
|
154
|
-
text and
|
|
155
|
-
)
|
|
153
|
+
assert text and isinstance(text, str), (
|
|
154
|
+
f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
|
|
155
|
+
)
|
|
156
156
|
|
|
157
157
|
confidence = transcription.get("confidence")
|
|
158
158
|
assert (
|
|
159
159
|
confidence is not None
|
|
160
160
|
and isinstance(confidence, float)
|
|
161
161
|
and 0 <= confidence <= 1
|
|
162
|
-
),
|
|
162
|
+
), (
|
|
163
|
+
f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
|
|
164
|
+
)
|
|
163
165
|
|
|
164
166
|
orientation = transcription.get(
|
|
165
167
|
"orientation", TextOrientation.HorizontalLeftToRight
|
|
166
168
|
)
|
|
167
|
-
assert (
|
|
168
|
-
orientation and
|
|
169
|
-
)
|
|
169
|
+
assert orientation and isinstance(orientation, TextOrientation), (
|
|
170
|
+
f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
|
|
171
|
+
)
|
|
170
172
|
if orientation:
|
|
171
173
|
transcription["orientation"] = orientation.value
|
|
172
174
|
|
|
@@ -242,63 +244,67 @@ class TranscriptionMixin:
|
|
|
242
244
|
|
|
243
245
|
:returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
|
|
244
246
|
"""
|
|
245
|
-
assert element and isinstance(
|
|
246
|
-
element
|
|
247
|
-
)
|
|
248
|
-
assert sub_element_type and isinstance(
|
|
249
|
-
sub_element_type
|
|
250
|
-
)
|
|
251
|
-
assert transcriptions and isinstance(
|
|
252
|
-
transcriptions
|
|
253
|
-
)
|
|
247
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
248
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
249
|
+
)
|
|
250
|
+
assert sub_element_type and isinstance(sub_element_type, str), (
|
|
251
|
+
"sub_element_type shouldn't be null and should be of type str"
|
|
252
|
+
)
|
|
253
|
+
assert transcriptions and isinstance(transcriptions, list), (
|
|
254
|
+
"transcriptions shouldn't be null and should be of type list"
|
|
255
|
+
)
|
|
254
256
|
|
|
255
257
|
# Create shallow copies of every transcription to avoid mutating the original payload
|
|
256
258
|
transcriptions_payload = list(map(dict, transcriptions))
|
|
257
259
|
|
|
258
260
|
for index, transcription in enumerate(transcriptions_payload):
|
|
259
261
|
text = transcription.get("text")
|
|
260
|
-
assert (
|
|
261
|
-
text and
|
|
262
|
-
)
|
|
262
|
+
assert text and isinstance(text, str), (
|
|
263
|
+
f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
|
|
264
|
+
)
|
|
263
265
|
|
|
264
266
|
confidence = transcription.get("confidence")
|
|
265
267
|
assert (
|
|
266
268
|
confidence is not None
|
|
267
269
|
and isinstance(confidence, float)
|
|
268
270
|
and 0 <= confidence <= 1
|
|
269
|
-
),
|
|
271
|
+
), (
|
|
272
|
+
f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
|
|
273
|
+
)
|
|
270
274
|
|
|
271
275
|
orientation = transcription.get(
|
|
272
276
|
"orientation", TextOrientation.HorizontalLeftToRight
|
|
273
277
|
)
|
|
274
|
-
assert (
|
|
275
|
-
orientation and
|
|
276
|
-
)
|
|
278
|
+
assert orientation and isinstance(orientation, TextOrientation), (
|
|
279
|
+
f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
|
|
280
|
+
)
|
|
277
281
|
if orientation:
|
|
278
282
|
transcription["orientation"] = orientation.value
|
|
279
283
|
|
|
280
284
|
polygon = transcription.get("polygon")
|
|
281
|
-
assert (
|
|
282
|
-
polygon and
|
|
283
|
-
)
|
|
284
|
-
assert (
|
|
285
|
-
|
|
286
|
-
)
|
|
285
|
+
assert polygon and isinstance(polygon, list), (
|
|
286
|
+
f"Transcription at index {index} in transcriptions: polygon shouldn't be null and should be of type list"
|
|
287
|
+
)
|
|
288
|
+
assert len(polygon) >= 3, (
|
|
289
|
+
f"Transcription at index {index} in transcriptions: polygon should have at least three points"
|
|
290
|
+
)
|
|
287
291
|
assert all(
|
|
288
292
|
isinstance(point, list) and len(point) == 2 for point in polygon
|
|
289
|
-
),
|
|
293
|
+
), (
|
|
294
|
+
f"Transcription at index {index} in transcriptions: polygon points should be lists of two items"
|
|
295
|
+
)
|
|
290
296
|
assert all(
|
|
291
297
|
isinstance(coord, int | float) for point in polygon for coord in point
|
|
292
|
-
),
|
|
298
|
+
), (
|
|
299
|
+
f"Transcription at index {index} in transcriptions: polygon points should be lists of two numbers"
|
|
300
|
+
)
|
|
293
301
|
|
|
294
302
|
element_confidence = transcription.get("element_confidence")
|
|
295
|
-
assert (
|
|
296
|
-
element_confidence
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
)
|
|
301
|
-
), f"Transcription at index {index} in transcriptions: element_confidence should be either null or a float in [0..1] range"
|
|
303
|
+
assert element_confidence is None or (
|
|
304
|
+
isinstance(element_confidence, float) and 0 <= element_confidence <= 1
|
|
305
|
+
), (
|
|
306
|
+
f"Transcription at index {index} in transcriptions: element_confidence should be either null or a float in [0..1] range"
|
|
307
|
+
)
|
|
302
308
|
|
|
303
309
|
if self.is_read_only:
|
|
304
310
|
logger.warning(
|
|
@@ -407,9 +413,9 @@ class TranscriptionMixin:
|
|
|
407
413
|
:returns: An iterable of dicts representing each transcription,
|
|
408
414
|
or an iterable of CachedTranscription when cache support is enabled.
|
|
409
415
|
"""
|
|
410
|
-
assert element and isinstance(
|
|
411
|
-
element
|
|
412
|
-
)
|
|
416
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
417
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
418
|
+
)
|
|
413
419
|
query_params = {}
|
|
414
420
|
if element_type:
|
|
415
421
|
assert isinstance(element_type, str), "element_type should be of type str"
|
|
@@ -423,22 +429,22 @@ class TranscriptionMixin:
|
|
|
423
429
|
DeprecationWarning,
|
|
424
430
|
stacklevel=1,
|
|
425
431
|
)
|
|
426
|
-
assert isinstance(
|
|
427
|
-
worker_version
|
|
428
|
-
)
|
|
432
|
+
assert isinstance(worker_version, str | bool), (
|
|
433
|
+
"worker_version should be of type str or bool"
|
|
434
|
+
)
|
|
429
435
|
if isinstance(worker_version, bool):
|
|
430
|
-
assert (
|
|
431
|
-
worker_version
|
|
432
|
-
)
|
|
436
|
+
assert worker_version is False, (
|
|
437
|
+
"if of type bool, worker_version can only be set to False"
|
|
438
|
+
)
|
|
433
439
|
query_params["worker_version"] = worker_version
|
|
434
440
|
if worker_run is not None:
|
|
435
|
-
assert isinstance(
|
|
436
|
-
worker_run
|
|
437
|
-
)
|
|
441
|
+
assert isinstance(worker_run, str | bool), (
|
|
442
|
+
"worker_run should be of type str or bool"
|
|
443
|
+
)
|
|
438
444
|
if isinstance(worker_run, bool):
|
|
439
|
-
assert (
|
|
440
|
-
worker_run
|
|
441
|
-
)
|
|
445
|
+
assert worker_run is False, (
|
|
446
|
+
"if of type bool, worker_run can only be set to False"
|
|
447
|
+
)
|
|
442
448
|
query_params["worker_run"] = worker_run
|
|
443
449
|
|
|
444
450
|
if not self.use_cache:
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Standalone Python worker to create a transcription on Arkindex elements"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
from argparse import ArgumentParser, Namespace
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import urljoin
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
# Initialize the logger to provide feedback about the worker's execution to the final user
|
|
12
|
+
logging.basicConfig(
|
|
13
|
+
format="%(asctime)s %(levelname)s/%(name)s: %(message)s", level=logging.INFO
|
|
14
|
+
)
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# Below are listed the environment variables which are mandatory to run this worker
|
|
18
|
+
ARKINDEX_API_URL = "ARKINDEX_API_URL"
|
|
19
|
+
"""URL that points to the root of the Arkindex instance.
|
|
20
|
+
"""
|
|
21
|
+
ARKINDEX_API_TOKEN = "ARKINDEX_API_TOKEN"
|
|
22
|
+
"""Personal token to authenticate to the Arkindex instance, useful when running locally.
|
|
23
|
+
"""
|
|
24
|
+
ARKINDEX_TASK_TOKEN = "ARKINDEX_TASK_TOKEN"
|
|
25
|
+
"""Machine token to authenticate to the Arkindex instance, useful when running from Arkindex.
|
|
26
|
+
"""
|
|
27
|
+
ARKINDEX_WORKER_RUN_ID = "ARKINDEX_WORKER_RUN_ID"
|
|
28
|
+
"""Identifier to publish worker results.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_args() -> Namespace:
    """Read the command line of the worker.

    A single option is declared, `--element`. It is optional and takes one or more values.

    :return Namespace: The parsed arguments. `element` holds the list of provided values,
        or `None` when the option was not given.
    """
    cli = ArgumentParser(prog="python worker.py")
    cli.add_argument("--element", nargs="+", help="One or more Arkindex element ID")
    arguments = cli.parse_args()
    return arguments
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def arkindex_request(
    method: str, endpoint_path: str, body: dict[str, Any] | None = None
) -> dict:
    """Helper to query any endpoint from the Arkindex API.
    The environment variables named `ARKINDEX_API_URL` and `ARKINDEX_API_TOKEN` (or `ARKINDEX_TASK_TOKEN`) are required to use this helper.

    :param str method: The HTTP request method to use https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Methods
    :param str endpoint_path: The path of the API endpoint to query, relative to `ARKINDEX_API_URL`.
        An absolute URL is used as is.
    :param dict[str, Any] | None body: A JSON body to send to the API, defaults to None
    :raises requests.HTTPError: When the API answers with an error status code
    :return dict: The JSON response from the API endpoint
    """
    if body is None:
        body = {}

    # Use the `ARKINDEX_API_URL` environment variable to define the full endpoint URL.
    # `urljoin` replaces the last path segment of a base URL that has no trailing slash
    # (`.../api/v1` + `process/` gives `.../api/process/`), so make sure the base ends with one.
    base_url = os.getenv(ARKINDEX_API_URL, "")
    if base_url and not base_url.endswith("/"):
        base_url += "/"
    url = urljoin(base_url, endpoint_path)

    # The authorization varies when running locally or in Arkindex.
    # Test the value rather than the presence of the variable, like the check done in `main`:
    # an empty task token must not shadow a valid personal token.
    task_token = os.getenv(ARKINDEX_TASK_TOKEN)
    if task_token:
        authorization = f"Ponos {task_token}"
    else:
        authorization = f"Token {os.getenv(ARKINDEX_API_TOKEN)}"

    # Query the endpoint URL using the `requests` Python package
    response = requests.request(
        method=method,
        url=url,
        headers={"Authorization": authorization},
        json=body,
    )

    # Raise an exception if anything went wrong while querying the endpoint
    try:
        response.raise_for_status()
    except requests.HTTPError:
        logger.error(
            f"Request `{endpoint_path}` failed with code {response.status_code}: {response.content}"
        )
        raise

    # Return the response in JSON format if it was successful
    return response.json()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def main() -> None:
    """Standalone Python worker to create a transcription on Arkindex elements

    Checks the required environment variables, retrieves the worker run from Arkindex,
    builds the list of element IDs to process (from the `--element` argument when the process
    is local, from the process elements otherwise), then creates a "Hello world!" transcription
    on each of them and logs a summary.

    :raises AssertionError: When a required environment variable is missing, or when no element ID
        was provided while running locally.
    :raises requests.HTTPError: When retrieving the worker run or listing the process elements fails.
    """
    # Check that the required environment variables are available
    for variable in (ARKINDEX_API_URL, ARKINDEX_WORKER_RUN_ID):
        assert os.getenv(variable), (
            f"Missing required variable `{variable}` in the environment."
        )

    assert os.getenv(ARKINDEX_API_TOKEN) or os.getenv(ARKINDEX_TASK_TOKEN), (
        f"Either `{ARKINDEX_API_TOKEN}` or `{ARKINDEX_TASK_TOKEN}` variable must be set in the environment."
    )

    # Retrieve the worker configuration from Arkindex
    # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/RetrieveWorkerRun
    configuration = arkindex_request(
        method="get",
        endpoint_path=f"process/workers/{os.getenv(ARKINDEX_WORKER_RUN_ID)}/",
    )

    # Build the list of elements to process
    elements = []

    # Option 1: The worker is running locally, on your machine, we use the value of the `--element` command argument
    if configuration["process"]["mode"] == "local":
        # Parse the provided command arguments
        args = parse_args()

        # Retrieve the list of elements from the `--element` argument
        elements = args.element

        # Assert that at least one element was provided to run the worker on
        assert elements, (
            "Missing at least one element ID to process while running the worker locally."
        )

    # Option 2: The worker is running on Arkindex, in a process, we list process elements
    else:
        # Retrieve the list of elements from the process which is currently running
        # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/ListProcessElements
        endpoint_path = f"process/{configuration['process']['id']}/elements/"
        while endpoint_path:
            json_response = arkindex_request(method="get", endpoint_path=endpoint_path)

            # We only need the ID of each element to process, other information is not necessary
            elements.extend(element["id"] for element in json_response["results"])

            # Follow the pagination instead of stopping after the first page.
            # NOTE(review): assumes the paginated response exposes the URL of the following page
            # in a `next` field; when the field is missing only the first page is read, as before.
            # `arkindex_request` builds its URL with `urljoin`, which keeps an absolute URL unchanged.
            endpoint_path = json_response.get("next")

    total = len(elements)
    failed = 0
    # Iterate over all elements to create a basic transcription
    for element_id in elements:
        try:
            # Create the "Hello world!" transcription on the current element
            # API endpoint: https://arkindex.teklia.com/api-docs/#tag/transcriptions/operation/CreateTranscription
            transcription = arkindex_request(
                method="post",
                endpoint_path=f"element/{element_id}/transcription/",
                body={
                    "text": "Hello world!",
                    "worker_run_id": os.getenv(ARKINDEX_WORKER_RUN_ID),
                    "confidence": 1.0,
                },
            )

            # Output feedback when a transcription is successfully created
            logger.info(
                f"A transcription with the ID {transcription['id']} was successfully created on element {element_id}."
            )

        except Exception as e:
            # Output feedback when failing to create a transcription, and increment the `failed` counter.
            # The error is part of the message so that failures which are not HTTP errors
            # (connection issue, unexpected response) can be diagnosed from the logs too.
            logger.error(
                f"Failed to create a transcription on element {element_id}: {e}"
            )
            failed += 1

    completed = total - failed
    # Output a summary of the worker execution over all provided elements
    logger.info(f"Ran on {total} element(s): {completed} completed, {failed} error(s).")
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
if __name__ == "__main__":
|
|
171
|
+
main()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Tooled Python worker to create a transcription on Arkindex elements"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from arkindex_worker.models import Element
|
|
6
|
+
from arkindex_worker.worker import ElementsWorker
|
|
7
|
+
|
|
8
|
+
# Initialize the logger to provide feedback about the worker's execution to the final user
|
|
9
|
+
logging.basicConfig(
|
|
10
|
+
format="%(asctime)s %(levelname)s/%(name)s: %(message)s", level=logging.INFO
|
|
11
|
+
)
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Create a worker inheriting from the `ElementsWorker` class provided by the `arkindex-base-worker` package
|
|
16
|
+
class BasicWorker(ElementsWorker):
    """Worker publishing a "Hello world!" transcription on every element it processes."""

    def process_element(self, element: Element) -> None:
        """Process a single Arkindex element at once and publish a simple transcription on it.

        Any exception raised while creating the transcription is logged and not propagated.

        :param Element element: The element currently being processed from the element list
        """
        try:
            # Create the "Hello world!" transcription on the current element
            # Helper: `TranscriptionMixin.create_transcription` from the `arkindex-base-worker` package
            transcription = self.create_transcription(
                element=element,
                text="Hello world!",
                confidence=1.0,
            )

            # The helper is documented to return None when the worker is in read-only mode:
            # nothing was published, so there is no ID to report and this is not a failure.
            # NOTE(review): the helper appears to log its own warning in that case, so no
            # message is added here; confirm against `TranscriptionMixin.create_transcription`.
            if transcription is None:
                return

            # Output feedback when a transcription is successfully created
            logger.info(
                f"A transcription with the ID {transcription['id']} was successfully created on element {element.id}."
            )

        except Exception as e:
            # Output feedback when failing to create a transcription
            logger.error(
                f"Failed to create a transcription on element {element.id}: {e}"
            )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main() -> None:
    """Build the `BasicWorker` with its description, then run it."""
    worker = BasicWorker(
        description="Tooled Python worker to create a transcription on Arkindex elements"
    )
    worker.run()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
if __name__ == "__main__":
|
|
50
|
+
main()
|