arkindex-base-worker 0.5.0rc1__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/PKG-INFO +7 -8
  2. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/PKG-INFO +7 -8
  3. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/SOURCES.txt +1 -0
  4. arkindex_base_worker-0.5.1/arkindex_base_worker.egg-info/requires.txt +11 -0
  5. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/cache.py +6 -1
  6. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/image.py +5 -1
  7. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/models.py +5 -0
  8. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/utils.py +27 -0
  9. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/__init__.py +62 -6
  10. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/base.py +53 -1
  11. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/element.py +20 -0
  12. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/metadata.py +3 -3
  13. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/pyproject.toml +7 -8
  14. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/conftest.py +113 -12
  15. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_base_worker.py +99 -125
  16. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_cache.py +1 -1
  17. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_dataset_worker.py +5 -0
  18. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_element.py +52 -12
  19. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/__init__.py +4 -0
  20. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_worker.py +106 -0
  21. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_image.py +19 -3
  22. arkindex_base_worker-0.5.1/tests/test_modern_config.py +81 -0
  23. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_utils.py +42 -0
  24. arkindex_base_worker-0.5.0rc1/arkindex_base_worker.egg-info/requires.txt +0 -12
  25. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/LICENSE +0 -0
  26. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/README.md +0 -0
  27. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  28. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  29. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/__init__.py +0 -0
  30. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/classification.py +0 -0
  31. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/corpus.py +0 -0
  32. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/dataset.py +0 -0
  33. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/entity.py +0 -0
  34. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/image.py +0 -0
  35. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/process.py +0 -0
  36. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/task.py +0 -0
  37. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/training.py +0 -0
  38. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/transcription.py +0 -0
  39. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/examples/standalone/python/worker.py +0 -0
  40. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/examples/tooled/python/worker.py +0 -0
  41. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/hooks/pre_gen_project.py +0 -0
  42. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/setup.cfg +0 -0
  43. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/__init__.py +0 -0
  44. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_classification.py +0 -0
  45. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_cli.py +0 -0
  46. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_corpus.py +0 -0
  47. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_dataset.py +0 -0
  48. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element.py +0 -0
  49. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_create_multiple.py +0 -0
  50. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_create_single.py +0 -0
  51. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_list_children.py +0 -0
  52. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_list_parents.py +0 -0
  53. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_entity.py +0 -0
  54. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_image.py +0 -0
  55. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_metadata.py +0 -0
  56. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_process.py +0 -0
  57. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_task.py +0 -0
  58. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_training.py +0 -0
  59. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_transcription_create.py +0 -0
  60. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_transcription_create_with_elements.py +0 -0
  61. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_transcription_list.py +0 -0
  62. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_merge.py +0 -0
  63. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/tests/__init__.py +0 -0
  64. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/tests/conftest.py +0 -0
  65. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/tests/test_worker.py +0 -0
  66. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/worker_demo/__init__.py +0 -0
  67. {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/worker_demo/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arkindex-base-worker
3
- Version: 0.5.0rc1
3
+ Version: 0.5.1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: humanize==4.12.3
44
+ Requires-Dist: humanize==4.14.0
45
45
  Requires-Dist: peewee~=3.17
46
- Requires-Dist: Pillow==11.2.1
47
- Requires-Dist: python-gnupg==0.5.4
46
+ Requires-Dist: Pillow==11.3.0
47
+ Requires-Dist: python-gnupg==0.5.5
48
48
  Requires-Dist: shapely==2.0.6
49
- Requires-Dist: teklia-toolbox==0.1.9
50
- Requires-Dist: zstandard==0.23.0
49
+ Requires-Dist: teklia-toolbox==0.1.11
50
+ Requires-Dist: zstandard==0.25.0
51
51
  Provides-Extra: tests
52
- Requires-Dist: pytest==8.3.5; extra == "tests"
53
- Requires-Dist: pytest-mock==3.14.0; extra == "tests"
52
+ Requires-Dist: pytest-mock==3.15.1; extra == "tests"
54
53
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
55
54
  Dynamic: license-file
56
55
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arkindex-base-worker
3
- Version: 0.5.0rc1
3
+ Version: 0.5.1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: humanize==4.12.3
44
+ Requires-Dist: humanize==4.14.0
45
45
  Requires-Dist: peewee~=3.17
46
- Requires-Dist: Pillow==11.2.1
47
- Requires-Dist: python-gnupg==0.5.4
46
+ Requires-Dist: Pillow==11.3.0
47
+ Requires-Dist: python-gnupg==0.5.5
48
48
  Requires-Dist: shapely==2.0.6
49
- Requires-Dist: teklia-toolbox==0.1.9
50
- Requires-Dist: zstandard==0.23.0
49
+ Requires-Dist: teklia-toolbox==0.1.11
50
+ Requires-Dist: zstandard==0.25.0
51
51
  Provides-Extra: tests
52
- Requires-Dist: pytest==8.3.5; extra == "tests"
53
- Requires-Dist: pytest-mock==3.14.0; extra == "tests"
52
+ Requires-Dist: pytest-mock==3.15.1; extra == "tests"
54
53
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
55
54
  Dynamic: license-file
56
55
 
@@ -35,6 +35,7 @@ tests/test_dataset_worker.py
35
35
  tests/test_element.py
36
36
  tests/test_image.py
37
37
  tests/test_merge.py
38
+ tests/test_modern_config.py
38
39
  tests/test_utils.py
39
40
  tests/test_elements_worker/__init__.py
40
41
  tests/test_elements_worker/test_classification.py
@@ -0,0 +1,11 @@
1
+ humanize==4.14.0
2
+ peewee~=3.17
3
+ Pillow==11.3.0
4
+ python-gnupg==0.5.5
5
+ shapely==2.0.6
6
+ teklia-toolbox==0.1.11
7
+ zstandard==0.25.0
8
+
9
+ [tests]
10
+ pytest-mock==3.15.1
11
+ pytest-responses==0.5.1
@@ -73,6 +73,7 @@ class CachedImage(Model):
73
73
  width = IntegerField()
74
74
  height = IntegerField()
75
75
  url = TextField()
76
+ version = IntegerField(default=2)
76
77
 
77
78
  class Meta:
78
79
  database = db
@@ -157,6 +158,10 @@ class CachedElement(Model):
157
158
  else:
158
159
  resize = f"{max_width or ''},{max_height or ''}"
159
160
 
161
+ # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
162
+ if self.image.version == 3 and resize == "full":
163
+ resize = "max"
164
+
160
165
  url = self.image.url
161
166
  if not url.endswith("/"):
162
167
  url += "/"
@@ -259,7 +264,7 @@ MODELS = [
259
264
  CachedDataset,
260
265
  CachedDatasetElement,
261
266
  ]
262
- SQL_VERSION = 4
267
+ SQL_VERSION = 5
263
268
 
264
269
 
265
270
  def init_cache_db(path: Path):
@@ -366,6 +366,10 @@ def download_tiles(url: str) -> Image:
366
366
  logger.debug("Downloading image information")
367
367
  info = _retried_request(url + "info.json").json()
368
368
 
369
+ # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
370
+ # With IIIF 3, the image's ID will be at `id`, while IIIF 2 will use `@id`
371
+ resize = "max" if "id" in info else "full"
372
+
369
373
  image_width, image_height = info.get("width"), info.get("height")
370
374
  assert image_width and image_height, "Missing image dimensions in info.json"
371
375
  assert info.get("tiles"), (
@@ -391,7 +395,7 @@ def download_tiles(url: str) -> Image:
391
395
 
392
396
  logger.debug(f"Downloading tile {tile_x},{tile_y}")
393
397
  resp = _retried_request(
394
- f"{url}{region_x},{region_y},{region_width},{region_height}/full/0/default.jpg"
398
+ f"{url}{region_x},{region_y},{region_width},{region_height}/{resize}/0/default.jpg"
395
399
  )
396
400
 
397
401
  tile_img = Image.open(BytesIO(resp.content))
@@ -87,6 +87,11 @@ class Element(MagicDict):
87
87
  url = self.zone.image.get("s3_url")
88
88
  if url:
89
89
  return url
90
+
91
+ # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
92
+ if self.zone.image.server.get("version", 2) == 3 and size == "full":
93
+ size = "max"
94
+
90
95
  url = self.zone.image.url
91
96
  if not url.endswith("/"):
92
97
  url += "/"
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import tarfile
6
6
  import tempfile
7
+ import zipfile
7
8
  from collections.abc import Callable, Generator
8
9
  from itertools import islice
9
10
  from pathlib import Path
@@ -225,6 +226,32 @@ def create_tar_zst_archive(
225
226
  return zst_fd, zst_archive, zst_hash, tar_hash
226
227
 
227
228
 
229
+ def create_zip_archive(source: Path, destination: Path | None = None) -> Path:
230
+ """Helper to create a ZIP archive from a source folder.
231
+
232
+ :param source: Path to the folder whose content should be archived.
233
+ :param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
234
+ :return: The file descriptor of the created tempfile (if one was created), path to the archive.
235
+ """
236
+ # Parse destination and create a tmpfile if none was specified
237
+ file_d, destination = (
238
+ tempfile.mkstemp(prefix="teklia-", suffix=".zip")
239
+ if destination is None
240
+ else (None, destination)
241
+ )
242
+ destination = Path(destination)
243
+ logger.debug(f"Compressing file to {destination}")
244
+
245
+ with zipfile.ZipFile(
246
+ destination, mode="w", compression=zipfile.ZIP_BZIP2
247
+ ) as archive:
248
+ for p in source.rglob("*"):
249
+ relpath = p.relative_to(source)
250
+ archive.write(p, arcname=relpath)
251
+
252
+ return archive, destination
253
+
254
+
228
255
  DEFAULT_BATCH_SIZE = 50
229
256
  """Batch size used for bulk publication to Arkindex"""
230
257
 
@@ -32,6 +32,41 @@ from arkindex_worker.worker.task import TaskMixin
32
32
  from arkindex_worker.worker.transcription import TranscriptionMixin
33
33
 
34
34
 
35
class WorkerActivityIterator:
    """
    Iterator yielding one ID per successful `StartWorkerActivity` API call.

    Iteration stops when the API answers with a 400 or 404 error; any other
    API error is logged as a warning and re-raised.
    """

    def __init__(self, api_client):
        logger.info(
            "Using StartWorkerActivity instead of reading init_elements JSON file"
        )

        # Share the API client of the worker that owns this iterator
        self.api_client = api_client

    def __iter__(self):
        return self

    def __bool__(self):
        # Always truthy, so that a `not elements` emptiness check is bypassed
        return True

    def __next__(self):
        """
        Start a new worker activity and return the `id` field of the response
        """
        try:
            activity = self.api_client.request("StartWorkerActivity")
        except ErrorResponse as err:
            # Arkindex will provide a 404 or 400 when there are no worker activities left or the task has completed
            if err.status_code not in (400, 404):
                logger.warning(
                    f"Failed to start a new worker activity of element due to an API error: {err.content}"
                )
                raise err

            raise StopIteration from err

        return activity["id"]
68
+
69
+
35
70
  class ElementsWorker(
36
71
  ElementMixin,
37
72
  DatasetMixin,
@@ -60,7 +95,9 @@ class ElementsWorker(
60
95
  """
61
96
  super().__init__(description, support_cache)
62
97
 
63
- def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
98
+ def get_elements(
99
+ self,
100
+ ) -> Iterable[CachedElement] | list[str] | list[Element] | WorkerActivityIterator:
64
101
  """
65
102
  List the elements to be processed, either from the CLI arguments or
66
103
  the cache database when enabled.
@@ -109,6 +146,9 @@ class ElementsWorker(
109
146
  elif self.process_mode == ProcessMode.Export:
110
147
  # For export mode processes, use list_process_elements and return element IDs
111
148
  return {item["id"] for item in self.list_process_elements()}
149
+ elif self.consume_worker_activities:
150
+ # Consume worker activities one by one
151
+ return WorkerActivityIterator(self.api_client)
112
152
 
113
153
  invalid_element_ids = list(filter(invalid_element_id, out))
114
154
  assert not invalid_element_ids, (
@@ -135,6 +175,15 @@ class ElementsWorker(
135
175
  )
136
176
  return self.process_information.get("activity_state") == "ready"
137
177
 
178
@property
def unknown_nb_elements(self) -> bool:
    """
    Tell whether the total number of elements to process is unknown.

    This mirrors `consume_worker_activities`: a static elements list has a
    known length, whereas worker activities are consumed from a queue whose
    size is not known in advance.
    """
    consuming_activities = self.consume_worker_activities
    return consuming_activities
186
+
138
187
  def run(self):
139
188
  """
140
189
  Implements an Arkindex worker that goes through each element returned by
@@ -157,7 +206,8 @@ class ElementsWorker(
157
206
  )
158
207
 
159
208
  # Process every element
160
- count = len(elements)
209
+ # We cannot know the number of elements when consuming a list of worker activities
210
+ count = None if self.unknown_nb_elements else len(elements)
161
211
  failed = 0
162
212
  for i, item in enumerate(elements, start=1):
163
213
  element = None
@@ -171,10 +221,16 @@ class ElementsWorker(
171
221
  **self.api_client.request("RetrieveElement", id=item)
172
222
  )
173
223
 
174
- logger.info(f"Processing {element} ({i}/{count})")
224
+ if self.unknown_nb_elements:
225
+ logger.info(f"Processing {element} (n°{i})")
226
+ else:
227
+ logger.info(f"Processing {element} ({i}/{count})")
175
228
 
176
229
  # Process the element and report its progress if activities are enabled
177
- if self.update_activity(element.id, ActivityState.Started):
230
+ # We do not update the worker activity to "Started" state when consuming them
231
+ if self.consume_worker_activities or self.update_activity(
232
+ element.id, ActivityState.Started
233
+ ):
178
234
  self.process_element(element)
179
235
  self.update_activity(element.id, ActivityState.Processed)
180
236
  else:
@@ -207,10 +263,10 @@ class ElementsWorker(
207
263
  with contextlib.suppress(Exception):
208
264
  self.update_activity(element.id, ActivityState.Error)
209
265
 
210
- message = f"Ran on {count} {pluralize('element', count)}: {count - failed} completed, {failed} failed"
266
+ message = f"Ran on {i} {pluralize('element', i)}: {i - failed} completed, {failed} failed"
211
267
  if failed:
212
268
  logger.error(message)
213
- if failed >= count: # Everything failed!
269
+ if failed >= i: # Everything failed!
214
270
  sys.exit(1)
215
271
  else:
216
272
  logger.info(message)
@@ -9,12 +9,13 @@ import os
9
9
  import shutil
10
10
  from pathlib import Path
11
11
  from tempfile import mkdtemp
12
+ from typing import Any
12
13
 
13
14
  import gnupg
14
15
  import yaml
15
16
 
16
17
  from arkindex import options_from_env
17
- from arkindex.exceptions import ErrorResponse
18
+ from arkindex.exceptions import ClientError, ErrorResponse
18
19
  from arkindex_worker import logger
19
20
  from arkindex_worker.cache import (
20
21
  check_version,
@@ -260,7 +261,28 @@ class BaseWorker:
260
261
 
261
262
  logger.info(f"Loaded {worker_run['summary']} from API")
262
263
 
264
+ def _process_config_item(item: dict) -> tuple[str, Any]:
265
+ if not item["secret"]:
266
+ return (item["key"], item["value"])
267
+
268
+ # The secret may not be picked by the user
269
+ if item["value"] is None:
270
+ logger.info(f"Optional secret `{item['key']}` is not set")
271
+ return (item["key"], None)
272
+
273
+ # Load secret, only available in Arkindex EE
274
+ try:
275
+ secret = self.load_secret(Path(item["value"]))
276
+ except ClientError as e:
277
+ logger.error(
278
+ f"Failed to retrieve the secret {item['value']}, probably an Arkindex Community Edition: {e}"
279
+ )
280
+ return (item["key"], None)
281
+
282
+ return (item["key"], secret)
283
+
263
284
  # Load model version configuration when available
285
+ # Workers will use model version ID and details to download the model
264
286
  model_version = worker_run.get("model_version")
265
287
  if model_version:
266
288
  logger.info("Loaded model version configuration from WorkerRun")
@@ -272,6 +294,36 @@ class BaseWorker:
272
294
  # Set model details as worker attribute
273
295
  self.model_details = model_version["model"]
274
296
 
297
+ # Load worker run information
298
+ try:
299
+ config = self.api_client.request(
300
+ "RetrieveWorkerRunConfiguration", id=self.worker_run_id
301
+ )
302
+
303
+ # Provide the same configuration through all previous attributes
304
+ self.config = self.user_configuration = dict(
305
+ map(_process_config_item, config["configuration"])
306
+ )
307
+
308
+ # Provide secret values through the previous attribute
309
+ self.secrets = {
310
+ item["key"]: self.config[item["key"]]
311
+ for item in config["configuration"]
312
+ if item["secret"]
313
+ }
314
+ logger.info("Using modern configuration")
315
+
316
+ # Reset the model configuration to make sure workers rely on the single new source
317
+ self.model_configuration = {}
318
+
319
+ return # Stop here once we have modern configuration
320
+
321
+ except ErrorResponse as e:
322
+ if e.status_code != 400:
323
+ raise
324
+ logger.info("Modern configuration is not available")
325
+
326
+ # Use old-style configuration with local merge
275
327
  # Retrieve initial configuration from API
276
328
  self.config = worker_version["configuration"].get("configuration", {})
277
329
  if "user_configuration" in worker_version["configuration"]:
@@ -38,6 +38,15 @@ class ElementMixin:
38
38
  type=open,
39
39
  default=os.environ.get("TASK_ELEMENTS"),
40
40
  )
41
+ self.parser.add_argument(
42
+ "--no-elements-list",
43
+ help=(
44
+ "Consume worker activities from Arkindex API instead of using a static elements list"
45
+ ),
46
+ dest="consume_worker_activities",
47
+ action="store_true",
48
+ default=os.environ.get("SKIP_TASK_ELEMENTS") is not None,
49
+ )
41
50
  self.parser.add_argument(
42
51
  "--element",
43
52
  type=str,
@@ -46,6 +55,17 @@ class ElementMixin:
46
55
  )
47
56
  super().add_arguments()
48
57
 
58
@property
def consume_worker_activities(self) -> bool:
    """
    Tell whether the worker consumes worker activities directly instead of
    relying on a static elements list.

    When process information is available, its `skip_elements_json` flag
    decides; otherwise the value parsed from the CLI arguments is used.
    """
    process_info = self.process_information
    if process_info is None:
        return self.args.consume_worker_activities

    return process_info.get("skip_elements_json") is True
68
+
49
69
  def list_corpus_types(self):
50
70
  """
51
71
  Loads available element types in corpus.
@@ -20,10 +20,10 @@ class MetaType(Enum):
20
20
  A regular string with no special interpretation.
21
21
  """
22
22
 
23
- HTML = "html"
23
+ Markdown = "markdown"
24
24
  """
25
- A metadata with a string value that should be interpreted as HTML content.
26
- The allowed HTML tags are restricted for security reasons.
25
+ A metadata with a string value that should be interpreted as Markdown content.
26
+ HTML is allowed, but the allowed HTML tags are restricted for security reasons.
27
27
  """
28
28
 
29
29
  Date = "date"
@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arkindex-base-worker"
7
- version = "0.5.0rc1"
7
+ version = "0.5.1"
8
8
  description = "Base Worker to easily build Arkindex ML workflows"
9
9
  license = { file = "LICENSE" }
10
10
  dependencies = [
11
- "humanize==4.12.3",
11
+ "humanize==4.14.0",
12
12
  "peewee~=3.17",
13
- "Pillow==11.2.1",
14
- "python-gnupg==0.5.4",
13
+ "Pillow==11.3.0",
14
+ "python-gnupg==0.5.5",
15
15
  "shapely==2.0.6",
16
- "teklia-toolbox==0.1.9",
17
- "zstandard==0.23.0",
16
+ "teklia-toolbox==0.1.11",
17
+ "zstandard==0.25.0",
18
18
  ]
19
19
  authors = [
20
20
  { name = "Teklia", email = "contact@teklia.com" },
@@ -44,8 +44,7 @@ Authors = "https://teklia.com"
44
44
 
45
45
  [project.optional-dependencies]
46
46
  tests = [
47
- "pytest==8.3.5",
48
- "pytest-mock==3.14.0",
47
+ "pytest-mock==3.15.1",
49
48
  "pytest-responses==0.5.1",
50
49
  ]
51
50
 
@@ -103,12 +103,6 @@ def _mock_worker_run_api(responses):
103
103
  payload = {
104
104
  "id": "56785678-5678-5678-5678-567856785678",
105
105
  "parents": [],
106
- "worker": {
107
- "id": "deadbeef-1234-5678-1234-worker",
108
- "name": "Fake worker",
109
- "slug": "fake_worker",
110
- "type": "classifier",
111
- },
112
106
  "worker_version": {
113
107
  "id": "12341234-1234-1234-1234-123412341234",
114
108
  "configuration": {
@@ -153,6 +147,7 @@ def _mock_worker_run_api(responses):
153
147
  "train_folder_id": None,
154
148
  "validation_folder_id": None,
155
149
  "test_folder_id": None,
150
+ "skip_elements_json": False,
156
151
  },
157
152
  "summary": "Worker Fake worker @ 123412",
158
153
  }
@@ -165,6 +160,13 @@ def _mock_worker_run_api(responses):
165
160
  content_type="application/json",
166
161
  )
167
162
 
163
+ # By default, stick to classic configuration
164
+ responses.add(
165
+ responses.GET,
166
+ "http://testserver/api/v1/workers/runs/56785678-5678-5678-5678-567856785678/configuration/",
167
+ status=400,
168
+ )
169
+
168
170
 
169
171
  @pytest.fixture
170
172
  def _mock_worker_run_no_revision_api(responses):
@@ -172,12 +174,6 @@ def _mock_worker_run_no_revision_api(responses):
172
174
  payload = {
173
175
  "id": "56785678-5678-5678-5678-567856785678",
174
176
  "parents": [],
175
- "worker": {
176
- "id": "deadbeef-1234-5678-1234-worker",
177
- "name": "Fake worker",
178
- "slug": "fake_worker",
179
- "type": "classifier",
180
- },
181
177
  "worker_version": {
182
178
  "id": "12341234-1234-1234-1234-123412341234",
183
179
  "configuration": {
@@ -233,6 +229,56 @@ def _mock_worker_run_no_revision_api(responses):
233
229
  )
234
230
 
235
231
 
232
@pytest.fixture
def mock_base_worker_modern_conf(mocker, responses):
    """
    Build a base worker and register the worker run API response needed to
    exercise modern configuration.

    The modern configuration endpoint itself is not mocked by this fixture.
    """
    worker = BaseWorker()
    mocker.patch.object(sys, "argv")
    worker.args = worker.parser.parse_args()

    # Legacy-style configuration attached to the worker version
    worker_version = {
        "id": "12341234-1234-1234-1234-123412341234",
        "worker": {
            "id": "deadbeef-1234-5678-1234-worker",
            "name": "Fake worker",
            "slug": "fake_worker",
            "type": "classifier",
        },
        "revision": {"hash": "deadbeef1234"},
        "configuration": {
            "configuration": {"extra_key1": "not showing up"},
            "user_configuration": {"extra_key2": "not showing up"},
        },
    }
    # Legacy-style configuration attached to the worker run
    run_configuration = {
        "id": "af0daaf4-983e-4703-a7ed-a10f146d6684",
        "name": "my-userconfig",
        "configuration": {
            "extra_key3": "not showing up",
        },
    }
    process = {
        "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
        "corpus": CORPUS_ID,
    }

    responses.add(
        responses.GET,
        "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
        status=200,
        json={
            "id": "56785678-5678-5678-5678-567856785678",
            "parents": [],
            "worker_version": worker_version,
            "configuration": run_configuration,
            "model_version": None,
            "process": process,
            "summary": "Worker Fake worker @ 123412",
        },
    )

    return worker
280
+
281
+
236
282
  @pytest.fixture
237
283
  def _mock_activity_calls(responses):
238
284
  """
@@ -282,6 +328,61 @@ def mock_elements_worker_with_list(monkeypatch, responses, mock_elements_worker)
282
328
  return mock_elements_worker
283
329
 
284
330
 
331
@pytest.fixture
def mock_elements_worker_consume_wa(monkeypatch, responses, mock_elements_worker):
    """
    Provide an elements worker whose process has `skip_elements_json` enabled,
    i.e. one that consumes worker activities through StartWorkerActivity
    rather than reading a JSON elements file.
    """
    worker_version = {
        "id": "12341234-1234-1234-1234-123412341234",
        "configuration": {
            "docker": {"image": "python:3"},
            "configuration": {"someKey": "someValue"},
            "secrets": [],
        },
        "worker": {
            "id": "deadbeef-1234-5678-1234-worker",
            "name": "Fake worker",
            "slug": "fake_worker",
            "type": "classifier",
        },
    }
    # The process is what switches the worker to activity consumption
    process = {
        "name": None,
        "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
        "state": "running",
        "mode": "workers",
        "corpus": CORPUS_ID,
        "use_cache": False,
        "activity_state": "ready",
        "model_id": None,
        "train_folder_id": None,
        "validation_folder_id": None,
        "test_folder_id": None,
        "skip_elements_json": True,
    }
    worker_run = {
        "id": "56785678-5678-5678-5678-567856785678",
        "parents": [],
        "worker_version": worker_version,
        "configuration": None,
        "model_version": None,
        "process": process,
        "summary": "Worker Fake worker @ 123412",
    }

    # Swap the already-registered worker run response for the one above
    responses.replace(
        responses.GET,
        "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
        status=200,
        json=worker_run,
    )

    # Configure again so that the worker picks up the updated process information
    mock_elements_worker.configure()

    return mock_elements_worker
384
+
385
+
285
386
  @pytest.fixture
286
387
  def mock_cache_db(tmp_path):
287
388
  cache_path = tmp_path / "db.sqlite"