arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +9 -12
  2. arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
  3. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
  5. arkindex_worker/__init__.py +3 -0
  6. arkindex_worker/cache.py +6 -25
  7. arkindex_worker/image.py +105 -66
  8. arkindex_worker/utils.py +2 -1
  9. arkindex_worker/worker/__init__.py +17 -31
  10. arkindex_worker/worker/base.py +16 -9
  11. arkindex_worker/worker/classification.py +36 -34
  12. arkindex_worker/worker/corpus.py +3 -3
  13. arkindex_worker/worker/dataset.py +9 -9
  14. arkindex_worker/worker/element.py +261 -231
  15. arkindex_worker/worker/entity.py +137 -206
  16. arkindex_worker/worker/image.py +3 -3
  17. arkindex_worker/worker/metadata.py +27 -38
  18. arkindex_worker/worker/task.py +9 -9
  19. arkindex_worker/worker/training.py +15 -11
  20. arkindex_worker/worker/transcription.py +77 -71
  21. examples/standalone/python/worker.py +171 -0
  22. examples/tooled/python/worker.py +50 -0
  23. tests/conftest.py +22 -36
  24. tests/test_base_worker.py +1 -1
  25. tests/test_cache.py +1 -2
  26. tests/test_dataset_worker.py +1 -1
  27. tests/test_elements_worker/test_element.py +200 -26
  28. tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
  29. tests/test_elements_worker/test_metadata.py +0 -47
  30. tests/test_elements_worker/test_training.py +8 -8
  31. tests/test_elements_worker/test_worker.py +15 -14
  32. tests/test_image.py +244 -126
  33. tests/test_merge.py +0 -7
  34. tests/test_utils.py +37 -0
  35. arkindex_base_worker-0.4.0rc6.dist-info/RECORD +0 -61
  36. arkindex_worker/worker/version.py +0 -58
  37. tests/test_elements_worker/test_entity_list_and_check.py +0 -160
  38. tests/test_elements_worker/test_version.py +0 -60
  39. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0
arkindex_worker/worker/training.py +15 -11
@@ -122,9 +122,9 @@ class TrainingMixin:
122
122
  )
123
123
 
124
124
  elif tag or description or configuration or parent:
125
- assert (
126
- self.model_version.get("model_id") == model_id
127
- ), "Given `model_id` does not match the current model version"
125
+ assert self.model_version.get("model_id") == model_id, (
126
+ "Given `model_id` does not match the current model version"
127
+ )
128
128
  # If any attribute field has been defined, PATCH the current model version
129
129
  self.update_model_version(
130
130
  tag=tag,
@@ -237,15 +237,17 @@ class TrainingMixin:
237
237
  Upload the archive of the model's files to an Amazon s3 compatible storage
238
238
  """
239
239
 
240
- assert (
241
- self.model_version
242
- ), "You must create the model version before uploading an archive."
243
- assert (
244
- self.model_version["state"] != "Available"
245
- ), "The model is already marked as available."
240
+ assert self.model_version, (
241
+ "You must create the model version before uploading an archive."
242
+ )
243
+ assert self.model_version["state"] != "Available", (
244
+ "The model is already marked as available."
245
+ )
246
246
 
247
247
  s3_put_url = self.model_version.get("s3_put_url")
248
- assert s3_put_url, "S3 PUT URL is not set, please ensure you have the right to validate a model version."
248
+ assert s3_put_url, (
249
+ "S3 PUT URL is not set, please ensure you have the right to validate a model version."
250
+ )
249
251
 
250
252
  logger.info("Uploading to s3...")
251
253
  # Upload the archive on s3
@@ -271,7 +273,9 @@ class TrainingMixin:
271
273
  :param size: The size of the uploaded archive
272
274
  :param archive_hash: MD5 hash of the uploaded archive
273
275
  """
274
- assert self.model_version, "You must create the model version and upload its archive before validating it."
276
+ assert self.model_version, (
277
+ "You must create the model version and upload its archive before validating it."
278
+ )
275
279
  try:
276
280
  self.model_version = self.api_client.request(
277
281
  "PartialUpdateModelVersion",
arkindex_worker/worker/transcription.py +77 -71
@@ -59,18 +59,18 @@ class TranscriptionMixin:
59
59
  :returns: A dict as returned by the ``CreateTranscription`` API endpoint,
60
60
  or None if the worker is in read-only mode.
61
61
  """
62
- assert element and isinstance(
63
- element, Element | CachedElement
64
- ), "element shouldn't be null and should be an Element or CachedElement"
65
- assert text and isinstance(
66
- text, str
67
- ), "text shouldn't be null and should be of type str"
68
- assert orientation and isinstance(
69
- orientation, TextOrientation
70
- ), "orientation shouldn't be null and should be of type TextOrientation"
71
- assert (
72
- isinstance(confidence, float) and 0 <= confidence <= 1
73
- ), "confidence shouldn't be null and should be a float in [0..1] range"
62
+ assert element and isinstance(element, Element | CachedElement), (
63
+ "element shouldn't be null and should be an Element or CachedElement"
64
+ )
65
+ assert text and isinstance(text, str), (
66
+ "text shouldn't be null and should be of type str"
67
+ )
68
+ assert orientation and isinstance(orientation, TextOrientation), (
69
+ "orientation shouldn't be null and should be of type TextOrientation"
70
+ )
71
+ assert isinstance(confidence, float) and 0 <= confidence <= 1, (
72
+ "confidence shouldn't be null and should be a float in [0..1] range"
73
+ )
74
74
 
75
75
  if self.is_read_only:
76
76
  logger.warning(
@@ -136,37 +136,39 @@ class TranscriptionMixin:
136
136
  :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
137
137
  """
138
138
 
139
- assert transcriptions and isinstance(
140
- transcriptions, list
141
- ), "transcriptions shouldn't be null and should be of type list"
139
+ assert transcriptions and isinstance(transcriptions, list), (
140
+ "transcriptions shouldn't be null and should be of type list"
141
+ )
142
142
 
143
143
  # Create shallow copies of every transcription to avoid mutating the original payload
144
144
  transcriptions_payload = list(map(dict, transcriptions))
145
145
 
146
146
  for index, transcription in enumerate(transcriptions_payload):
147
147
  element_id = transcription.get("element_id")
148
- assert (
149
- element_id and isinstance(element_id, str)
150
- ), f"Transcription at index {index} in transcriptions: element_id shouldn't be null and should be of type str"
148
+ assert element_id and isinstance(element_id, str), (
149
+ f"Transcription at index {index} in transcriptions: element_id shouldn't be null and should be of type str"
150
+ )
151
151
 
152
152
  text = transcription.get("text")
153
- assert (
154
- text and isinstance(text, str)
155
- ), f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
153
+ assert text and isinstance(text, str), (
154
+ f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
155
+ )
156
156
 
157
157
  confidence = transcription.get("confidence")
158
158
  assert (
159
159
  confidence is not None
160
160
  and isinstance(confidence, float)
161
161
  and 0 <= confidence <= 1
162
- ), f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
162
+ ), (
163
+ f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
164
+ )
163
165
 
164
166
  orientation = transcription.get(
165
167
  "orientation", TextOrientation.HorizontalLeftToRight
166
168
  )
167
- assert (
168
- orientation and isinstance(orientation, TextOrientation)
169
- ), f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
169
+ assert orientation and isinstance(orientation, TextOrientation), (
170
+ f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
171
+ )
170
172
  if orientation:
171
173
  transcription["orientation"] = orientation.value
172
174
 
@@ -242,63 +244,67 @@ class TranscriptionMixin:
242
244
 
243
245
  :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
244
246
  """
245
- assert element and isinstance(
246
- element, Element | CachedElement
247
- ), "element shouldn't be null and should be an Element or CachedElement"
248
- assert sub_element_type and isinstance(
249
- sub_element_type, str
250
- ), "sub_element_type shouldn't be null and should be of type str"
251
- assert transcriptions and isinstance(
252
- transcriptions, list
253
- ), "transcriptions shouldn't be null and should be of type list"
247
+ assert element and isinstance(element, Element | CachedElement), (
248
+ "element shouldn't be null and should be an Element or CachedElement"
249
+ )
250
+ assert sub_element_type and isinstance(sub_element_type, str), (
251
+ "sub_element_type shouldn't be null and should be of type str"
252
+ )
253
+ assert transcriptions and isinstance(transcriptions, list), (
254
+ "transcriptions shouldn't be null and should be of type list"
255
+ )
254
256
 
255
257
  # Create shallow copies of every transcription to avoid mutating the original payload
256
258
  transcriptions_payload = list(map(dict, transcriptions))
257
259
 
258
260
  for index, transcription in enumerate(transcriptions_payload):
259
261
  text = transcription.get("text")
260
- assert (
261
- text and isinstance(text, str)
262
- ), f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
262
+ assert text and isinstance(text, str), (
263
+ f"Transcription at index {index} in transcriptions: text shouldn't be null and should be of type str"
264
+ )
263
265
 
264
266
  confidence = transcription.get("confidence")
265
267
  assert (
266
268
  confidence is not None
267
269
  and isinstance(confidence, float)
268
270
  and 0 <= confidence <= 1
269
- ), f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
271
+ ), (
272
+ f"Transcription at index {index} in transcriptions: confidence shouldn't be null and should be a float in [0..1] range"
273
+ )
270
274
 
271
275
  orientation = transcription.get(
272
276
  "orientation", TextOrientation.HorizontalLeftToRight
273
277
  )
274
- assert (
275
- orientation and isinstance(orientation, TextOrientation)
276
- ), f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
278
+ assert orientation and isinstance(orientation, TextOrientation), (
279
+ f"Transcription at index {index} in transcriptions: orientation shouldn't be null and should be of type TextOrientation"
280
+ )
277
281
  if orientation:
278
282
  transcription["orientation"] = orientation.value
279
283
 
280
284
  polygon = transcription.get("polygon")
281
- assert (
282
- polygon and isinstance(polygon, list)
283
- ), f"Transcription at index {index} in transcriptions: polygon shouldn't be null and should be of type list"
284
- assert (
285
- len(polygon) >= 3
286
- ), f"Transcription at index {index} in transcriptions: polygon should have at least three points"
285
+ assert polygon and isinstance(polygon, list), (
286
+ f"Transcription at index {index} in transcriptions: polygon shouldn't be null and should be of type list"
287
+ )
288
+ assert len(polygon) >= 3, (
289
+ f"Transcription at index {index} in transcriptions: polygon should have at least three points"
290
+ )
287
291
  assert all(
288
292
  isinstance(point, list) and len(point) == 2 for point in polygon
289
- ), f"Transcription at index {index} in transcriptions: polygon points should be lists of two items"
293
+ ), (
294
+ f"Transcription at index {index} in transcriptions: polygon points should be lists of two items"
295
+ )
290
296
  assert all(
291
297
  isinstance(coord, int | float) for point in polygon for coord in point
292
- ), f"Transcription at index {index} in transcriptions: polygon points should be lists of two numbers"
298
+ ), (
299
+ f"Transcription at index {index} in transcriptions: polygon points should be lists of two numbers"
300
+ )
293
301
 
294
302
  element_confidence = transcription.get("element_confidence")
295
- assert (
296
- element_confidence is None
297
- or (
298
- isinstance(element_confidence, float)
299
- and 0 <= element_confidence <= 1
300
- )
301
- ), f"Transcription at index {index} in transcriptions: element_confidence should be either null or a float in [0..1] range"
303
+ assert element_confidence is None or (
304
+ isinstance(element_confidence, float) and 0 <= element_confidence <= 1
305
+ ), (
306
+ f"Transcription at index {index} in transcriptions: element_confidence should be either null or a float in [0..1] range"
307
+ )
302
308
 
303
309
  if self.is_read_only:
304
310
  logger.warning(
@@ -407,9 +413,9 @@ class TranscriptionMixin:
407
413
  :returns: An iterable of dicts representing each transcription,
408
414
  or an iterable of CachedTranscription when cache support is enabled.
409
415
  """
410
- assert element and isinstance(
411
- element, Element | CachedElement
412
- ), "element shouldn't be null and should be an Element or CachedElement"
416
+ assert element and isinstance(element, Element | CachedElement), (
417
+ "element shouldn't be null and should be an Element or CachedElement"
418
+ )
413
419
  query_params = {}
414
420
  if element_type:
415
421
  assert isinstance(element_type, str), "element_type should be of type str"
@@ -423,22 +429,22 @@ class TranscriptionMixin:
423
429
  DeprecationWarning,
424
430
  stacklevel=1,
425
431
  )
426
- assert isinstance(
427
- worker_version, str | bool
428
- ), "worker_version should be of type str or bool"
432
+ assert isinstance(worker_version, str | bool), (
433
+ "worker_version should be of type str or bool"
434
+ )
429
435
  if isinstance(worker_version, bool):
430
- assert (
431
- worker_version is False
432
- ), "if of type bool, worker_version can only be set to False"
436
+ assert worker_version is False, (
437
+ "if of type bool, worker_version can only be set to False"
438
+ )
433
439
  query_params["worker_version"] = worker_version
434
440
  if worker_run is not None:
435
- assert isinstance(
436
- worker_run, str | bool
437
- ), "worker_run should be of type str or bool"
441
+ assert isinstance(worker_run, str | bool), (
442
+ "worker_run should be of type str or bool"
443
+ )
438
444
  if isinstance(worker_run, bool):
439
- assert (
440
- worker_run is False
441
- ), "if of type bool, worker_run can only be set to False"
445
+ assert worker_run is False, (
446
+ "if of type bool, worker_run can only be set to False"
447
+ )
442
448
  query_params["worker_run"] = worker_run
443
449
 
444
450
  if not self.use_cache:
examples/standalone/python/worker.py +171 -0
@@ -0,0 +1,171 @@
1
+ """Standalone Python worker to create a transcription on Arkindex elements"""
2
+
3
+ import logging
4
+ import os
5
+ from argparse import ArgumentParser, Namespace
6
+ from typing import Any
7
+ from urllib.parse import urljoin
8
+
9
+ import requests
10
+
11
+ # Initialize the logger to provide feedback about the worker's execution to the final user
12
+ logging.basicConfig(
13
+ format="%(asctime)s %(levelname)s/%(name)s: %(message)s", level=logging.INFO
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Below are listed the environment variables which are mandatory to run this worker
18
+ ARKINDEX_API_URL = "ARKINDEX_API_URL"
19
+ """URL that points to the root of the Arkindex instance.
20
+ """
21
+ ARKINDEX_API_TOKEN = "ARKINDEX_API_TOKEN"
22
+ """Personal token to authenticate to the Arkindex instance, useful when running locally.
23
+ """
24
+ ARKINDEX_TASK_TOKEN = "ARKINDEX_TASK_TOKEN"
25
+ """Machine token to authenticate to the Arkindex instance, useful when running from Arkindex.
26
+ """
27
+ ARKINDEX_WORKER_RUN_ID = "ARKINDEX_WORKER_RUN_ID"
28
+ """Identifier to publish worker results.
29
+ """
30
+
31
+
32
+ def parse_args() -> Namespace:
33
+ """Helper to parse command line arguments.
34
+ This worker only supports one optional argument, a list of element IDs to process.
35
+
36
+ :return Namespace: A namespace containing the provided command arguments and their value.
37
+ """
38
+ parser = ArgumentParser("python worker.py")
39
+ parser.add_argument(
40
+ "--element",
41
+ nargs="+",
42
+ help="One or more Arkindex element ID",
43
+ )
44
+ return parser.parse_args()
45
+
46
+
47
+ def arkindex_request(
48
+ method: str, endpoint_path: str, body: dict[str, Any] | None = None
49
+ ) -> dict:
50
+ """Helper to query any endpoint from the Arkindex API.
51
+ The environment variables named `ARKINDEX_API_URL` and `ARKINDEX_API_TOKEN` (or `ARKINDEX_TASK_TOKEN`) are required to use this helper.
52
+
53
+ :param str method: The HTTP request method to use https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Methods
54
+ :param str endpoint_path: The path of the API endpoint to query
55
+ :param dict[str, Any] | None body: A JSON body to send to the API, defaults to None
56
+ :return dict: The JSON response from the API endpoint
57
+ """
58
+ if body is None:
59
+ body = {}
60
+
61
+ # Use the `ARKINDEX_API_URL` environment variable to define the full endpoint URL
62
+ url = urljoin(os.getenv(ARKINDEX_API_URL), endpoint_path)
63
+
64
+ # The authorization varies when running locally or in Arkindex
65
+ if "ARKINDEX_TASK_TOKEN" in os.environ:
66
+ authorization = f"Ponos {os.getenv(ARKINDEX_TASK_TOKEN)}"
67
+ else:
68
+ authorization = f"Token {os.getenv(ARKINDEX_API_TOKEN)}"
69
+
70
+ # Query the endpoint URL using the `requests` Python package
71
+ response = requests.request(
72
+ method=method,
73
+ url=url,
74
+ headers={"Authorization": authorization},
75
+ json=body,
76
+ )
77
+
78
+ # Raise an exception if anything went wrong while querying the endpoint
79
+ try:
80
+ response.raise_for_status()
81
+ except requests.HTTPError:
82
+ logger.error(
83
+ f"Request `{endpoint_path}` failed with code {response.status_code}: {response.content}"
84
+ )
85
+ raise
86
+
87
+ # Return the response in JSON format if it was successful
88
+ return response.json()
89
+
90
+
91
+ def main() -> None:
92
+ """Standalone Python worker to create a transcription on Arkindex elements"""
93
+ # Check that the required environment variables are available
94
+ for variable in (ARKINDEX_API_URL, ARKINDEX_WORKER_RUN_ID):
95
+ assert os.getenv(variable), (
96
+ f"Missing required variable `{variable}` in the environment."
97
+ )
98
+
99
+ assert os.getenv(ARKINDEX_API_TOKEN) or os.getenv(ARKINDEX_TASK_TOKEN), (
100
+ f"Either `{ARKINDEX_API_TOKEN}` or `{ARKINDEX_TASK_TOKEN}` variable must be set in the environment."
101
+ )
102
+
103
+ # Retrieve the worker configuration from Arkindex
104
+ # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/RetrieveWorkerRun
105
+ configuration = arkindex_request(
106
+ method="get",
107
+ endpoint_path=f"process/workers/{os.getenv(ARKINDEX_WORKER_RUN_ID)}/",
108
+ )
109
+
110
+ # Build the list of elements to process
111
+ elements = []
112
+
113
+ # Option 1: The worker is running locally, on your machine, we use the value of the `--element` command argument
114
+ if configuration["process"]["mode"] == "local":
115
+ # Parse the provided command arguments
116
+ args = parse_args()
117
+
118
+ # Retrieve the list of elements from the `--element` argument
119
+ elements = args.element
120
+
121
+ # Assert that at least one element was provided to run the worker on
122
+ assert elements, (
123
+ "Missing at least one element ID to process while running the worker locally."
124
+ )
125
+
126
+ # Option 2: The worker is running on Arkindex, in a process, we list process elements
127
+ else:
128
+ # Retrieve the list of elements from the process which is currently running
129
+ # API endpoint: https://arkindex.teklia.com/api-docs/#tag/process/operation/ListProcessElements
130
+ json_response = arkindex_request(
131
+ method="get",
132
+ endpoint_path=f"process/{configuration['process']['id']}/elements/",
133
+ )
134
+
135
+ # We only need the ID of each element to process, other information is not necessary
136
+ elements = [element["id"] for element in json_response["results"]]
137
+
138
+ total = len(elements)
139
+ failed = 0
140
+ # Iterate over all elements to create a basic transcription
141
+ for element_id in elements:
142
+ try:
143
+ # Create the "Hello world!" transcription on the current element
144
+ # API endpoint: https://arkindex.teklia.com/api-docs/#tag/transcriptions/operation/CreateTranscription
145
+ transcription = arkindex_request(
146
+ method="post",
147
+ endpoint_path=f"element/{element_id}/transcription/",
148
+ body={
149
+ "text": "Hello world!",
150
+ "worker_run_id": os.getenv(ARKINDEX_WORKER_RUN_ID),
151
+ "confidence": 1.0,
152
+ },
153
+ )
154
+
155
+ # Output feedback when a transcription is successfully created
156
+ logger.info(
157
+ f"A transcription with the ID {transcription['id']} was successfully created on element {element_id}."
158
+ )
159
+
160
+ except Exception:
161
+ # Output feedback when failing to create a transcription, and increment the `failed` counter
162
+ logger.error(f"Failed to create a transcription on element {element_id}.")
163
+ failed += 1
164
+
165
+ completed = total - failed
166
+ # Output a summary of the worker execution over all provided elements
167
+ logger.info(f"Ran on {total} element(s): {completed} completed, {failed} error(s).")
168
+
169
+
170
+ if __name__ == "__main__":
171
+ main()
examples/tooled/python/worker.py +50 -0
@@ -0,0 +1,50 @@
1
+ """Tooled Python worker to create a transcription on Arkindex elements"""
2
+
3
+ import logging
4
+
5
+ from arkindex_worker.models import Element
6
+ from arkindex_worker.worker import ElementsWorker
7
+
8
+ # Initialize the logger to provide feedback about the worker's execution to the final user
9
+ logging.basicConfig(
10
+ format="%(asctime)s %(levelname)s/%(name)s: %(message)s", level=logging.INFO
11
+ )
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ # Create a worker inheriting from the `ElementsWorker` class provided by the `arkindex-base-worker` package
16
+ class BasicWorker(ElementsWorker):
17
+ def process_element(self, element: Element) -> None:
18
+ """Process a single Arkindex element at once and publish a simple transcription on it.
19
+
20
+ :param Element element: The element currently being processed from the element list
21
+ """
22
+ try:
23
+ # Create the "Hello world!" transcription on the current element
24
+ # Helper: `TranscriptionMixin.create_transcription` from the `arkindex-base-worker` package
25
+ transcription = self.create_transcription(
26
+ element=element,
27
+ text="Hello world!",
28
+ confidence=1.0,
29
+ )
30
+
31
+ # Output feedback when a transcription is successfully created
32
+ logger.info(
33
+ f"A transcription with the ID {transcription['id']} was successfully created on element {element.id}."
34
+ )
35
+
36
+ except Exception as e:
37
+ # Output feedback when failing to create a transcription
38
+ logger.error(
39
+ f"Failed to create a transcription on element {element.id}: {e}"
40
+ )
41
+
42
+
43
+ def main() -> None:
44
+ BasicWorker(
45
+ description="Tooled Python worker to create a transcription on Arkindex elements"
46
+ ).run()
47
+
48
+
49
+ if __name__ == "__main__":
50
+ main()