arkindex-base-worker 0.4.0b3__tar.gz → 0.4.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/PKG-INFO +4 -3
  2. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/PKG-INFO +4 -3
  3. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/SOURCES.txt +1 -0
  4. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/requires.txt +3 -2
  5. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/image.py +118 -0
  6. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/__init__.py +26 -158
  7. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/base.py +32 -1
  8. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/dataset.py +70 -0
  9. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/element.py +260 -75
  10. arkindex_base_worker-0.4.0rc2/arkindex_worker/worker/process.py +63 -0
  11. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/transcription.py +50 -50
  12. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/pyproject.toml +4 -3
  13. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/__init__.py +1 -1
  14. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/conftest.py +11 -23
  15. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_base_worker.py +203 -2
  16. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_dataset_worker.py +5 -2
  17. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_elements.py +712 -18
  18. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_worker.py +0 -200
  19. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_image.py +248 -6
  20. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_merge.py +0 -1
  21. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_utils.py +2 -4
  22. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/LICENSE +0 -0
  23. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/README.md +0 -0
  24. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  25. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  26. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/__init__.py +0 -0
  27. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/cache.py +0 -0
  28. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/models.py +0 -0
  29. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/utils.py +0 -0
  30. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/classification.py +0 -0
  31. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/corpus.py +0 -0
  32. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/entity.py +0 -0
  33. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/image.py +0 -0
  34. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/metadata.py +0 -0
  35. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/task.py +0 -0
  36. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/training.py +0 -0
  37. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/version.py +0 -0
  38. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/hooks/pre_gen_project.py +0 -0
  39. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/setup.cfg +0 -0
  40. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_cache.py +0 -0
  41. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_element.py +0 -0
  42. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/__init__.py +0 -0
  43. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_classifications.py +0 -0
  44. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_cli.py +0 -0
  45. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_corpus.py +0 -0
  46. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_dataset.py +0 -0
  47. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_entities.py +0 -0
  48. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_image.py +0 -0
  49. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_metadata.py +0 -0
  50. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_task.py +0 -0
  51. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_training.py +0 -0
  52. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_transcriptions.py +0 -0
  53. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/tests/__init__.py +0 -0
  54. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/tests/conftest.py +0 -0
  55. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/tests/test_worker.py +0 -0
  56. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/worker_demo/__init__.py +0 -0
  57. {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/worker_demo/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.4.0b3
3
+ Version: 0.4.0rc2
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
40
40
  Requires-Python: >=3.10
41
41
  Description-Content-Type: text/markdown
42
42
  License-File: LICENSE
43
+ Requires-Dist: humanize==4.10.0
43
44
  Requires-Dist: peewee~=3.17
44
45
  Requires-Dist: Pillow==10.4.0
45
46
  Requires-Dist: python-gnupg==0.5.2
@@ -48,8 +49,8 @@ Requires-Dist: teklia-toolbox==0.1.5
48
49
  Requires-Dist: zstandard==0.22.0
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: black==24.4.2; extra == "docs"
51
- Requires-Dist: mkdocs-material==9.5.31; extra == "docs"
52
- Requires-Dist: mkdocstrings-python==1.10.7; extra == "docs"
52
+ Requires-Dist: mkdocs-material==9.5.33; extra == "docs"
53
+ Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
53
54
  Provides-Extra: tests
54
55
  Requires-Dist: pytest==8.3.2; extra == "tests"
55
56
  Requires-Dist: pytest-mock==3.14.0; extra == "tests"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.4.0b3
3
+ Version: 0.4.0rc2
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
40
40
  Requires-Python: >=3.10
41
41
  Description-Content-Type: text/markdown
42
42
  License-File: LICENSE
43
+ Requires-Dist: humanize==4.10.0
43
44
  Requires-Dist: peewee~=3.17
44
45
  Requires-Dist: Pillow==10.4.0
45
46
  Requires-Dist: python-gnupg==0.5.2
@@ -48,8 +49,8 @@ Requires-Dist: teklia-toolbox==0.1.5
48
49
  Requires-Dist: zstandard==0.22.0
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: black==24.4.2; extra == "docs"
51
- Requires-Dist: mkdocs-material==9.5.31; extra == "docs"
52
- Requires-Dist: mkdocstrings-python==1.10.7; extra == "docs"
52
+ Requires-Dist: mkdocs-material==9.5.33; extra == "docs"
53
+ Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
53
54
  Provides-Extra: tests
54
55
  Requires-Dist: pytest==8.3.2; extra == "tests"
55
56
  Requires-Dist: pytest-mock==3.14.0; extra == "tests"
@@ -20,6 +20,7 @@ arkindex_worker/worker/element.py
20
20
  arkindex_worker/worker/entity.py
21
21
  arkindex_worker/worker/image.py
22
22
  arkindex_worker/worker/metadata.py
23
+ arkindex_worker/worker/process.py
23
24
  arkindex_worker/worker/task.py
24
25
  arkindex_worker/worker/training.py
25
26
  arkindex_worker/worker/transcription.py
@@ -1,3 +1,4 @@
1
+ humanize==4.10.0
1
2
  peewee~=3.17
2
3
  Pillow==10.4.0
3
4
  python-gnupg==0.5.2
@@ -7,8 +8,8 @@ zstandard==0.22.0
7
8
 
8
9
  [docs]
9
10
  black==24.4.2
10
- mkdocs-material==9.5.31
11
- mkdocstrings-python==1.10.7
11
+ mkdocs-material==9.5.33
12
+ mkdocstrings-python==1.10.8
12
13
 
13
14
  [tests]
14
15
  pytest==8.3.2
@@ -2,13 +2,18 @@
2
2
  Helper methods to download and open IIIF images, and manage polygons.
3
3
  """
4
4
 
5
+ import functools
6
+ import os
5
7
  import re
8
+ import tempfile
6
9
  from collections import namedtuple
10
+ from collections.abc import Generator, Iterator
7
11
  from io import BytesIO
8
12
  from math import ceil
9
13
  from pathlib import Path
10
14
  from typing import TYPE_CHECKING
11
15
 
16
+ import humanize
12
17
  import requests
13
18
  from PIL import Image
14
19
  from shapely.affinity import rotate, scale, translate
@@ -40,8 +45,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
40
45
  IIIF_FULL = "full"
41
46
  # Maximum size available
42
47
  IIIF_MAX = "max"
48
+ # Ratio to resize image
49
+ IMAGE_RATIO = [1, 0.9, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50, 0.40, 0.30]
43
50
 
44
51
 
52
+ def update_pillow_image_size_limit(func):
53
+ """
54
+ Update Pillow Image size limit
55
+ """
56
+
57
+ @functools.wraps(func)
58
+ def wrapper(
59
+ *args,
60
+ max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
61
+ **kwargs,
62
+ ):
63
+ """
64
+ Wrapper to update Pillow Image size limit and restore it at the end of the function.
65
+
66
+ :param *args: Positional arguments passed to the function.
67
+ :param max_image_pixels: Pillow Image size limit to use.
68
+ :param **kwargs: Keyword arguments passed to the function.
69
+ """
70
+ MAX_IMAGE_PIXELS = Image.MAX_IMAGE_PIXELS
71
+
72
+ # Override Pillow Image size limit
73
+ if max_image_pixels is not None:
74
+ max_image_pixels = int(max_image_pixels)
75
+ # Override Pillow limit for detecting decompression bombs, disabled if set to 0
76
+ if max_image_pixels == 0:
77
+ logger.warning(
78
+ "Pillow Image size limit is completely disabled, make sure you trust the image source."
79
+ )
80
+ Image.MAX_IMAGE_PIXELS = None
81
+ else:
82
+ Image.MAX_IMAGE_PIXELS = max_image_pixels
83
+
84
+ try:
85
+ results = func(*args, **kwargs)
86
+ except:
87
+ # Restore initial Pillow Image size limit
88
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
89
+ raise
90
+
91
+ # Restore initial Pillow Image size limit
92
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
93
+ return results
94
+
95
+ return wrapper
96
+
97
+
98
+ @update_pillow_image_size_limit
45
99
  def open_image(
46
100
  path: str,
47
101
  mode: str | None = "RGB",
@@ -149,6 +203,70 @@ def upload_image(image: Image, url: str) -> requests.Response:
149
203
  return resp
150
204
 
151
205
 
206
+ def resized_images(
207
+ *args,
208
+ element: "Element",
209
+ max_pixels: int | None = None,
210
+ max_bytes: int | None = None,
211
+ **kwargs,
212
+ ) -> Iterator[Generator[tempfile.NamedTemporaryFile, None, None]]:
213
+ """
214
+ Build resized images according to the pixel and byte limits.
215
+
216
+ :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
217
+ :param element: Element whose image needs to be resized.
218
+ :param max_pixels: Maximum pixel size of the resized images.
219
+ :param max_bytes: Maximum byte size of the resized images.
220
+ :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
221
+ :returns: An iterator of the temporary file of the resized image.
222
+ """
223
+ _, _, element_width, element_height = polygon_bounding_box(element.polygon)
224
+
225
+ logger.info(f"This element's image sizes are ({element_width} x {element_height}).")
226
+ if max_pixels and max(element_width, element_height) > max_pixels:
227
+ logger.warning(
228
+ f"Maximum image input size supported is ({max_pixels} x {max_pixels})."
229
+ )
230
+ logger.warning("The image will be resized.")
231
+
232
+ element_pixel, param = (
233
+ (element_width, "max_width")
234
+ if element_width > element_height
235
+ else (element_height, "max_height")
236
+ )
237
+
238
+ for resized_pixel in sorted(
239
+ set(
240
+ min(round(ratio * element_pixel), max_pixels or element_pixel)
241
+ for ratio in IMAGE_RATIO
242
+ ),
243
+ reverse=True,
244
+ ):
245
+ with element.open_image_tempfile(
246
+ *args, **{**kwargs, param: resized_pixel}
247
+ ) as image:
248
+ pillow_image = Image.open(image)
249
+ if (
250
+ pillow_image.width != element_width
251
+ or pillow_image.height != element_height
252
+ ):
253
+ logger.warning(
254
+ f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
255
+ )
256
+
257
+ # The image is still too large
258
+ image_size = Path(image.name).stat().st_size
259
+ if max_bytes and image_size > max_bytes:
260
+ logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
261
+ logger.warning(
262
+ f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
263
+ )
264
+ logger.warning("The image will be resized.")
265
+ continue
266
+
267
+ yield image
268
+
269
+
152
270
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
153
271
  """
154
272
  Compute the rectangle bounding box of a polygon.
@@ -4,12 +4,10 @@ Base classes to implement Arkindex workers.
4
4
 
5
5
  import contextlib
6
6
  import json
7
- import os
8
7
  import sys
9
8
  import uuid
10
- from argparse import ArgumentTypeError
11
- from collections.abc import Iterable, Iterator
12
- from enum import Enum
9
+ from collections.abc import Iterable
10
+ from itertools import chain
13
11
  from pathlib import Path
14
12
 
15
13
  from apistar.exceptions import ErrorResponse
@@ -21,47 +19,27 @@ from arkindex_worker.utils import pluralize
21
19
  from arkindex_worker.worker.base import BaseWorker
22
20
  from arkindex_worker.worker.classification import ClassificationMixin
23
21
  from arkindex_worker.worker.corpus import CorpusMixin
24
- from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
22
+ from arkindex_worker.worker.dataset import (
23
+ DatasetMixin,
24
+ DatasetState,
25
+ MissingDatasetArchive,
26
+ )
25
27
  from arkindex_worker.worker.element import ElementMixin
26
28
  from arkindex_worker.worker.entity import EntityMixin
27
29
  from arkindex_worker.worker.image import ImageMixin
28
30
  from arkindex_worker.worker.metadata import MetaDataMixin, MetaType # noqa: F401
31
+ from arkindex_worker.worker.process import ActivityState, ProcessMode
29
32
  from arkindex_worker.worker.task import TaskMixin
30
33
  from arkindex_worker.worker.transcription import TranscriptionMixin
31
34
  from arkindex_worker.worker.version import WorkerVersionMixin
32
35
 
33
36
 
34
- class ActivityState(Enum):
35
- """
36
- Processing state of an element.
37
- """
38
-
39
- Queued = "queued"
40
- """
41
- The element has not yet been processed by a worker.
42
- """
43
-
44
- Started = "started"
45
- """
46
- The element is being processed by a worker.
47
- """
48
-
49
- Processed = "processed"
50
- """
51
- The element has been successfully processed by a worker.
52
- """
53
-
54
- Error = "error"
55
- """
56
- An error occurred while processing this element.
57
- """
58
-
59
-
60
37
  class ElementsWorker(
38
+ ElementMixin,
39
+ DatasetMixin,
61
40
  BaseWorker,
62
41
  ClassificationMixin,
63
42
  CorpusMixin,
64
- ElementMixin,
65
43
  TranscriptionMixin,
66
44
  WorkerVersionMixin,
67
45
  EntityMixin,
@@ -96,22 +74,7 @@ class ElementsWorker(
96
74
 
97
75
  self._worker_version_cache = {}
98
76
 
99
- def add_arguments(self):
100
- """Define specific ``argparse`` arguments for this worker"""
101
- self.parser.add_argument(
102
- "--elements-list",
103
- help="JSON elements list to use",
104
- type=open,
105
- default=os.environ.get("TASK_ELEMENTS"),
106
- )
107
- self.parser.add_argument(
108
- "--element",
109
- type=str,
110
- nargs="+",
111
- help="One or more Arkindex element ID",
112
- )
113
-
114
- def list_elements(self) -> Iterable[CachedElement] | list[str]:
77
+ def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
115
78
  """
116
79
  List the elements to be processed, either from the CLI arguments or
117
80
  the cache database when enabled.
@@ -143,15 +106,20 @@ class ElementsWorker(
143
106
  )
144
107
  if self.use_cache and cache_query.exists():
145
108
  return cache_query
146
- # Process elements from JSON file
147
109
  elif self.args.elements_list:
110
+ # Process elements from JSON file
148
111
  data = json.load(self.args.elements_list)
149
112
  assert isinstance(data, list), "Elements list must be a list"
150
113
  assert len(data), "No elements in elements list"
151
114
  out += list(filter(None, [element.get("id") for element in data]))
152
- # Add any extra element from CLI
153
115
  elif self.args.element:
116
+ # Add any extra element from CLI
154
117
  out += self.args.element
118
+ elif self.process_mode == ProcessMode.Dataset or self.args.set:
119
+ # Elements from datasets
120
+ return list(
121
+ chain.from_iterable(map(self.list_set_elements, self.list_sets()))
122
+ )
155
123
 
156
124
  invalid_element_ids = list(filter(invalid_element_id, out))
157
125
  assert (
@@ -166,40 +134,18 @@ class ElementsWorker(
166
134
  Whether or not WorkerActivity support has been enabled on the DataImport
167
135
  used to run this worker.
168
136
  """
169
- if self.is_read_only:
137
+ if self.is_read_only or self.process_mode == ProcessMode.Dataset:
138
+ # Worker activities are also disabled when running an ElementsWorker in a Dataset process.
170
139
  return False
171
140
  assert (
172
141
  self.process_information
173
142
  ), "Worker must be configured to access its process activity state"
174
143
  return self.process_information.get("activity_state") == "ready"
175
144
 
176
- def configure(self):
177
- """
178
- Setup the worker using CLI arguments and environment variables.
179
- """
180
- # CLI args are stored on the instance so that implementations can access them
181
- self.args = self.parser.parse_args()
182
-
183
- if self.is_read_only:
184
- super().configure_for_developers()
185
- else:
186
- super().configure()
187
- super().configure_cache()
188
-
189
- # Retrieve the model configuration
190
- if self.model_configuration:
191
- self.config.update(self.model_configuration)
192
- logger.info("Model version configuration retrieved")
193
-
194
- # Retrieve the user configuration
195
- if self.user_configuration:
196
- self.config.update(self.user_configuration)
197
- logger.info("User configuration retrieved")
198
-
199
145
  def run(self):
200
146
  """
201
147
  Implements an Arkindex worker that goes through each element returned by
202
- [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
148
+ [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
203
149
  It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
204
150
  catching exceptions, and handles saving WorkerActivity updates when enabled.
205
151
  """
@@ -207,7 +153,7 @@ class ElementsWorker(
207
153
 
208
154
  # List all elements either from JSON file
209
155
  # or direct list of elements on CLI
210
- elements = self.list_elements()
156
+ elements = self.get_elements()
211
157
  if not elements:
212
158
  logger.warning("No elements to process, stopping.")
213
159
  sys.exit(1)
@@ -223,8 +169,8 @@ class ElementsWorker(
223
169
  for i, item in enumerate(elements, start=1):
224
170
  element = None
225
171
  try:
226
- if self.use_cache:
227
- # Just use the result of list_elements as the element
172
+ if isinstance(item, CachedElement | Element):
173
+ # Just use the result of get_elements as the element
228
174
  element = item
229
175
  else:
230
176
  # Load element using the Arkindex API
@@ -339,29 +285,7 @@ class ElementsWorker(
339
285
  return True
340
286
 
341
287
 
342
- def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
343
- values = value.split(":")
344
- if len(values) != 2:
345
- raise ArgumentTypeError(
346
- f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
347
- )
348
-
349
- dataset_id, set_name = values
350
- try:
351
- dataset_id = uuid.UUID(dataset_id)
352
- return (dataset_id, set_name)
353
- except (TypeError, ValueError) as e:
354
- raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
355
-
356
-
357
- class MissingDatasetArchive(Exception):
358
- """
359
- Exception raised when the compressed archive associated to
360
- a dataset isn't found in its task artifacts.
361
- """
362
-
363
-
364
- class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
288
+ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
365
289
  """
366
290
  Base class for ML workers that operate on Arkindex dataset sets.
367
291
 
@@ -384,42 +308,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
384
308
  # Set as an instance variable as dataset workers might use it to easily extract its content
385
309
  self.downloaded_dataset_artifact: Path | None = None
386
310
 
387
- def add_arguments(self):
388
- """Define specific ``argparse`` arguments for this worker"""
389
- self.parser.add_argument(
390
- "--set",
391
- type=check_dataset_set,
392
- nargs="+",
393
- help="""
394
- One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
395
- (e.g.: "12341234-1234-1234-1234-123412341234:train")
396
- """,
397
- default=[],
398
- )
399
-
400
- def configure(self):
401
- """
402
- Setup the worker using CLI arguments and environment variables.
403
- """
404
- # CLI args are stored on the instance so that implementations can access them
405
- self.args = self.parser.parse_args()
406
-
407
- if self.is_read_only:
408
- super().configure_for_developers()
409
- else:
410
- super().configure()
411
- super().configure_cache()
412
-
413
- # Retrieve the model configuration
414
- if self.model_configuration:
415
- self.config.update(self.model_configuration)
416
- logger.info("Model version configuration retrieved")
417
-
418
- # Retrieve the user configuration
419
- if self.user_configuration:
420
- self.config.update(self.user_configuration)
421
- logger.info("User configuration retrieved")
422
-
423
311
  def cleanup_downloaded_artifact(self) -> None:
424
312
  """
425
313
  Cleanup the downloaded dataset artifact if any
@@ -467,30 +355,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
467
355
  :param set: The set to process.
468
356
  """
469
357
 
470
- def list_sets(self) -> Iterator[Set]:
471
- """
472
- List the sets to be processed, either from the CLI arguments or using the
473
- [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
474
-
475
- :returns: An iterator of ``Set`` objects.
476
- """
477
- if not self.is_read_only:
478
- yield from self.list_process_sets()
479
-
480
- datasets: dict[uuid.UUID, Dataset] = {}
481
- for dataset_id, set_name in self.args.set:
482
- # Retrieving dataset information is not already cached
483
- if dataset_id not in datasets:
484
- datasets[dataset_id] = Dataset(
485
- **self.api_client.request("RetrieveDataset", id=dataset_id)
486
- )
487
-
488
- yield Set(name=set_name, dataset=datasets[dataset_id])
489
-
490
358
  def run(self):
491
359
  """
492
360
  Implements an Arkindex worker that goes through each dataset set returned by
493
- [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
361
+ [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
494
362
 
495
363
  It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
496
364
  catching exceptions.
@@ -24,6 +24,7 @@ from arkindex_worker.cache import (
24
24
  merge_parents_cache,
25
25
  )
26
26
  from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
27
+ from arkindex_worker.worker.process import ProcessMode
27
28
  from teklia_toolbox.requests import get_arkindex_client
28
29
 
29
30
 
@@ -156,6 +157,13 @@ class BaseWorker:
156
157
  raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
157
158
  return self._corpus_id
158
159
 
160
+ @property
161
+ def process_mode(self) -> ProcessMode | None:
162
+ """Mode of the process being run. Returns None when read-only."""
163
+ if self.is_read_only:
164
+ return
165
+ return ProcessMode(self.process_information["mode"])
166
+
159
167
  @property
160
168
  def is_read_only(self) -> bool:
161
169
  """
@@ -219,7 +227,7 @@ class BaseWorker:
219
227
  # Load all required secrets
220
228
  self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
221
229
 
222
- def configure(self):
230
+ def configure_worker_run(self):
223
231
  """
224
232
  Setup the necessary configuration needed using CLI args and environment variables.
225
233
  This is the method called when running a worker on Arkindex.
@@ -320,6 +328,29 @@ class BaseWorker:
320
328
  else:
321
329
  logger.debug("Cache is disabled")
322
330
 
331
+ def configure(self):
332
+ """
333
+ Setup the worker using CLI arguments and environment variables.
334
+ """
335
+ # CLI args are stored on the instance so that implementations can access them
336
+ self.args = self.parser.parse_args()
337
+
338
+ if self.is_read_only:
339
+ self.configure_for_developers()
340
+ else:
341
+ self.configure_worker_run()
342
+ self.configure_cache()
343
+
344
+ # Retrieve the model configuration
345
+ if self.model_configuration:
346
+ self.config.update(self.model_configuration)
347
+ logger.info("Model version configuration retrieved")
348
+
349
+ # Retrieve the user configuration
350
+ if self.user_configuration:
351
+ self.config.update(self.user_configuration)
352
+ logger.info("User configuration retrieved")
353
+
323
354
  def load_secret(self, name: Path):
324
355
  """
325
356
  Load a Ponos secret by name.
@@ -2,6 +2,8 @@
2
2
  BaseWorker methods for datasets.
3
3
  """
4
4
 
5
+ import uuid
6
+ from argparse import ArgumentTypeError
5
7
  from collections.abc import Iterator
6
8
  from enum import Enum
7
9
 
@@ -36,7 +38,55 @@ class DatasetState(Enum):
36
38
  """
37
39
 
38
40
 
41
+ class MissingDatasetArchive(Exception):
42
+ """
43
+ Exception raised when the compressed archive associated to
44
+ a dataset isn't found in its task artifacts.
45
+ """
46
+
47
+
48
+ def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
49
+ """The `--set` argument should have the following format:
50
+ <dataset_id>:<set_name>
51
+
52
+ Args:
53
+ value (str): Provided argument.
54
+
55
+ Raises:
56
+ ArgumentTypeError: When the value is invalid.
57
+
58
+ Returns:
59
+ tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
60
+ """
61
+ values = value.split(":")
62
+ if len(values) != 2:
63
+ raise ArgumentTypeError(
64
+ f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
65
+ )
66
+
67
+ dataset_id, set_name = values
68
+ try:
69
+ dataset_id = uuid.UUID(dataset_id)
70
+ return (dataset_id, set_name)
71
+ except (TypeError, ValueError) as e:
72
+ raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
73
+
74
+
39
75
  class DatasetMixin:
76
+ def add_arguments(self) -> None:
77
+ """Define specific ``argparse`` arguments for the worker using this mixin"""
78
+ self.parser.add_argument(
79
+ "--set",
80
+ type=check_dataset_set,
81
+ nargs="+",
82
+ help="""
83
+ One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
84
+ (e.g.: "12341234-1234-1234-1234-123412341234:train")
85
+ """,
86
+ default=[],
87
+ )
88
+ super().add_arguments()
89
+
40
90
  def list_process_sets(self) -> Iterator[Set]:
41
91
  """
42
92
  List dataset sets associated to the worker's process. This helper is not available in developer mode.
@@ -73,6 +123,26 @@ class DatasetMixin:
73
123
 
74
124
  return map(lambda result: Element(**result["element"]), results)
75
125
 
126
+ def list_sets(self) -> Iterator[Set]:
127
+ """
128
+ List the sets to be processed, either from the CLI arguments or using the
129
+ [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
130
+
131
+ :returns: An iterator of ``Set`` objects.
132
+ """
133
+ if not self.is_read_only:
134
+ yield from self.list_process_sets()
135
+
136
+ datasets: dict[uuid.UUID, Dataset] = {}
137
+ for dataset_id, set_name in self.args.set:
138
+ # Retrieving dataset information if not already cached
139
+ if dataset_id not in datasets:
140
+ datasets[dataset_id] = Dataset(
141
+ **self.api_client.request("RetrieveDataset", id=dataset_id)
142
+ )
143
+
144
+ yield Set(name=set_name, dataset=datasets[dataset_id])
145
+
76
146
  @unsupported_cache
77
147
  def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
78
148
  """