arkindex-base-worker 0.5.0rc1__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/PKG-INFO +7 -8
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/PKG-INFO +7 -8
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/SOURCES.txt +1 -0
- arkindex_base_worker-0.5.1/arkindex_base_worker.egg-info/requires.txt +11 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/cache.py +6 -1
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/image.py +5 -1
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/models.py +5 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/utils.py +27 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/__init__.py +62 -6
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/base.py +53 -1
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/element.py +20 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/metadata.py +3 -3
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/pyproject.toml +7 -8
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/conftest.py +113 -12
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_base_worker.py +99 -125
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_cache.py +1 -1
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_dataset_worker.py +5 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_element.py +52 -12
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/__init__.py +4 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_worker.py +106 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_image.py +19 -3
- arkindex_base_worker-0.5.1/tests/test_modern_config.py +81 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_utils.py +42 -0
- arkindex_base_worker-0.5.0rc1/arkindex_base_worker.egg-info/requires.txt +0 -12
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/LICENSE +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/README.md +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/__init__.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/classification.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/corpus.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/dataset.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/entity.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/image.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/process.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/task.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/training.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/transcription.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/examples/standalone/python/worker.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/examples/tooled/python/worker.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/setup.cfg +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/__init__.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_classification.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_create_multiple.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_create_single.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_list_children.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_element_list_parents.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_entity.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_metadata.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_process.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_training.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_transcription_create.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_transcription_create_with_elements.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_elements_worker/test_transcription_list.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/tests/test_merge.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/worker-demo/worker_demo/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.5.0rc1
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Description-Content-Type: text/markdown
|
|
43
43
|
License-File: LICENSE
|
|
44
|
-
Requires-Dist: humanize==4.
|
|
44
|
+
Requires-Dist: humanize==4.14.0
|
|
45
45
|
Requires-Dist: peewee~=3.17
|
|
46
|
-
Requires-Dist: Pillow==11.
|
|
47
|
-
Requires-Dist: python-gnupg==0.5.
|
|
46
|
+
Requires-Dist: Pillow==11.3.0
|
|
47
|
+
Requires-Dist: python-gnupg==0.5.5
|
|
48
48
|
Requires-Dist: shapely==2.0.6
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
50
|
-
Requires-Dist: zstandard==0.
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.11
|
|
50
|
+
Requires-Dist: zstandard==0.25.0
|
|
51
51
|
Provides-Extra: tests
|
|
52
|
-
Requires-Dist: pytest==
|
|
53
|
-
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
52
|
+
Requires-Dist: pytest-mock==3.15.1; extra == "tests"
|
|
54
53
|
Requires-Dist: pytest-responses==0.5.1; extra == "tests"
|
|
55
54
|
Dynamic: license-file
|
|
56
55
|
|
{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_base_worker.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.5.0rc1
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Description-Content-Type: text/markdown
|
|
43
43
|
License-File: LICENSE
|
|
44
|
-
Requires-Dist: humanize==4.
|
|
44
|
+
Requires-Dist: humanize==4.14.0
|
|
45
45
|
Requires-Dist: peewee~=3.17
|
|
46
|
-
Requires-Dist: Pillow==11.
|
|
47
|
-
Requires-Dist: python-gnupg==0.5.
|
|
46
|
+
Requires-Dist: Pillow==11.3.0
|
|
47
|
+
Requires-Dist: python-gnupg==0.5.5
|
|
48
48
|
Requires-Dist: shapely==2.0.6
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
50
|
-
Requires-Dist: zstandard==0.
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.11
|
|
50
|
+
Requires-Dist: zstandard==0.25.0
|
|
51
51
|
Provides-Extra: tests
|
|
52
|
-
Requires-Dist: pytest==
|
|
53
|
-
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
52
|
+
Requires-Dist: pytest-mock==3.15.1; extra == "tests"
|
|
54
53
|
Requires-Dist: pytest-responses==0.5.1; extra == "tests"
|
|
55
54
|
Dynamic: license-file
|
|
56
55
|
|
|
@@ -73,6 +73,7 @@ class CachedImage(Model):
|
|
|
73
73
|
width = IntegerField()
|
|
74
74
|
height = IntegerField()
|
|
75
75
|
url = TextField()
|
|
76
|
+
version = IntegerField(default=2)
|
|
76
77
|
|
|
77
78
|
class Meta:
|
|
78
79
|
database = db
|
|
@@ -157,6 +158,10 @@ class CachedElement(Model):
|
|
|
157
158
|
else:
|
|
158
159
|
resize = f"{max_width or ''},{max_height or ''}"
|
|
159
160
|
|
|
161
|
+
# Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
|
|
162
|
+
if self.image.version == 3 and resize == "full":
|
|
163
|
+
resize = "max"
|
|
164
|
+
|
|
160
165
|
url = self.image.url
|
|
161
166
|
if not url.endswith("/"):
|
|
162
167
|
url += "/"
|
|
@@ -259,7 +264,7 @@ MODELS = [
|
|
|
259
264
|
CachedDataset,
|
|
260
265
|
CachedDatasetElement,
|
|
261
266
|
]
|
|
262
|
-
SQL_VERSION =
|
|
267
|
+
SQL_VERSION = 5
|
|
263
268
|
|
|
264
269
|
|
|
265
270
|
def init_cache_db(path: Path):
|
|
@@ -366,6 +366,10 @@ def download_tiles(url: str) -> Image:
|
|
|
366
366
|
logger.debug("Downloading image information")
|
|
367
367
|
info = _retried_request(url + "info.json").json()
|
|
368
368
|
|
|
369
|
+
# Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
|
|
370
|
+
# With IIIF 3, the image's ID will be at `id`, while IIIF 2 will use `@id`
|
|
371
|
+
resize = "max" if "id" in info else "full"
|
|
372
|
+
|
|
369
373
|
image_width, image_height = info.get("width"), info.get("height")
|
|
370
374
|
assert image_width and image_height, "Missing image dimensions in info.json"
|
|
371
375
|
assert info.get("tiles"), (
|
|
@@ -391,7 +395,7 @@ def download_tiles(url: str) -> Image:
|
|
|
391
395
|
|
|
392
396
|
logger.debug(f"Downloading tile {tile_x},{tile_y}")
|
|
393
397
|
resp = _retried_request(
|
|
394
|
-
f"{url}{region_x},{region_y},{region_width},{region_height}/full/0/default.jpg"
|
|
398
|
+
f"{url}{region_x},{region_y},{region_width},{region_height}/{resize}/0/default.jpg"
|
|
395
399
|
)
|
|
396
400
|
|
|
397
401
|
tile_img = Image.open(BytesIO(resp.content))
|
|
@@ -87,6 +87,11 @@ class Element(MagicDict):
|
|
|
87
87
|
url = self.zone.image.get("s3_url")
|
|
88
88
|
if url:
|
|
89
89
|
return url
|
|
90
|
+
|
|
91
|
+
# Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
|
|
92
|
+
if self.zone.image.server.get("version", 2) == 3 and size == "full":
|
|
93
|
+
size = "max"
|
|
94
|
+
|
|
90
95
|
url = self.zone.image.url
|
|
91
96
|
if not url.endswith("/"):
|
|
92
97
|
url += "/"
|
|
@@ -4,6 +4,7 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
import tarfile
|
|
6
6
|
import tempfile
|
|
7
|
+
import zipfile
|
|
7
8
|
from collections.abc import Callable, Generator
|
|
8
9
|
from itertools import islice
|
|
9
10
|
from pathlib import Path
|
|
@@ -225,6 +226,32 @@ def create_tar_zst_archive(
|
|
|
225
226
|
return zst_fd, zst_archive, zst_hash, tar_hash
|
|
226
227
|
|
|
227
228
|
|
|
229
|
+
def create_zip_archive(source: Path, destination: Path | None = None) -> Path:
|
|
230
|
+
"""Helper to create a ZIP archive from a source folder.
|
|
231
|
+
|
|
232
|
+
:param source: Path to the folder whose content should be archived.
|
|
233
|
+
:param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
|
|
234
|
+
:return: The file descriptor of the created tempfile (if one was created), path to the archive.
|
|
235
|
+
"""
|
|
236
|
+
# Parse destination and create a tmpfile if none was specified
|
|
237
|
+
file_d, destination = (
|
|
238
|
+
tempfile.mkstemp(prefix="teklia-", suffix=".zip")
|
|
239
|
+
if destination is None
|
|
240
|
+
else (None, destination)
|
|
241
|
+
)
|
|
242
|
+
destination = Path(destination)
|
|
243
|
+
logger.debug(f"Compressing file to {destination}")
|
|
244
|
+
|
|
245
|
+
with zipfile.ZipFile(
|
|
246
|
+
destination, mode="w", compression=zipfile.ZIP_BZIP2
|
|
247
|
+
) as archive:
|
|
248
|
+
for p in source.rglob("*"):
|
|
249
|
+
relpath = p.relative_to(source)
|
|
250
|
+
archive.write(p, arcname=relpath)
|
|
251
|
+
|
|
252
|
+
return archive, destination
|
|
253
|
+
|
|
254
|
+
|
|
228
255
|
DEFAULT_BATCH_SIZE = 50
|
|
229
256
|
"""Batch size used for bulk publication to Arkindex"""
|
|
230
257
|
|
{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/__init__.py
RENAMED
|
@@ -32,6 +32,41 @@ from arkindex_worker.worker.task import TaskMixin
|
|
|
32
32
|
from arkindex_worker.worker.transcription import TranscriptionMixin
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
class WorkerActivityIterator:
|
|
36
|
+
def __init__(self, api_client):
|
|
37
|
+
# Use same api client as main class
|
|
38
|
+
self.api_client = api_client
|
|
39
|
+
|
|
40
|
+
logger.info(
|
|
41
|
+
"Using StartWorkerActivity instead of reading init_elements JSON file"
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
def __bool__(self):
|
|
45
|
+
# Needed to bypass `not elements` check
|
|
46
|
+
return True
|
|
47
|
+
|
|
48
|
+
def __iter__(self):
|
|
49
|
+
return self
|
|
50
|
+
|
|
51
|
+
def __next__(self):
|
|
52
|
+
"""
|
|
53
|
+
Provide a new element ID from a worker activity upon each iteration
|
|
54
|
+
"""
|
|
55
|
+
try:
|
|
56
|
+
data = self.api_client.request("StartWorkerActivity")
|
|
57
|
+
except ErrorResponse as e:
|
|
58
|
+
# Arkindex will provide a 404 or 400 when there are no worker activities left or the task has completed
|
|
59
|
+
if e.status_code in (400, 404):
|
|
60
|
+
raise StopIteration from e
|
|
61
|
+
|
|
62
|
+
logger.warning(
|
|
63
|
+
f"Failed to start a new worker activity of element due to an API error: {e.content}"
|
|
64
|
+
)
|
|
65
|
+
raise e
|
|
66
|
+
|
|
67
|
+
return data["id"]
|
|
68
|
+
|
|
69
|
+
|
|
35
70
|
class ElementsWorker(
|
|
36
71
|
ElementMixin,
|
|
37
72
|
DatasetMixin,
|
|
@@ -60,7 +95,9 @@ class ElementsWorker(
|
|
|
60
95
|
"""
|
|
61
96
|
super().__init__(description, support_cache)
|
|
62
97
|
|
|
63
|
-
def get_elements(
|
|
98
|
+
def get_elements(
|
|
99
|
+
self,
|
|
100
|
+
) -> Iterable[CachedElement] | list[str] | list[Element] | WorkerActivityIterator:
|
|
64
101
|
"""
|
|
65
102
|
List the elements to be processed, either from the CLI arguments or
|
|
66
103
|
the cache database when enabled.
|
|
@@ -109,6 +146,9 @@ class ElementsWorker(
|
|
|
109
146
|
elif self.process_mode == ProcessMode.Export:
|
|
110
147
|
# For export mode processes, use list_process_elements and return element IDs
|
|
111
148
|
return {item["id"] for item in self.list_process_elements()}
|
|
149
|
+
elif self.consume_worker_activities:
|
|
150
|
+
# Consume worker activities one by one
|
|
151
|
+
return WorkerActivityIterator(self.api_client)
|
|
112
152
|
|
|
113
153
|
invalid_element_ids = list(filter(invalid_element_id, out))
|
|
114
154
|
assert not invalid_element_ids, (
|
|
@@ -135,6 +175,15 @@ class ElementsWorker(
|
|
|
135
175
|
)
|
|
136
176
|
return self.process_information.get("activity_state") == "ready"
|
|
137
177
|
|
|
178
|
+
@property
|
|
179
|
+
def unknown_nb_elements(self) -> bool:
|
|
180
|
+
"""
|
|
181
|
+
Whether or not the worker knows the total number of elements to process
|
|
182
|
+
- when running with init_elements, we have a known list
|
|
183
|
+
- when running with StartWorkerActivity, we have a queue of unknown size
|
|
184
|
+
"""
|
|
185
|
+
return self.consume_worker_activities
|
|
186
|
+
|
|
138
187
|
def run(self):
|
|
139
188
|
"""
|
|
140
189
|
Implements an Arkindex worker that goes through each element returned by
|
|
@@ -157,7 +206,8 @@ class ElementsWorker(
|
|
|
157
206
|
)
|
|
158
207
|
|
|
159
208
|
# Process every element
|
|
160
|
-
|
|
209
|
+
# We cannot know the number of elements when consuming a list of worker activities
|
|
210
|
+
count = None if self.unknown_nb_elements else len(elements)
|
|
161
211
|
failed = 0
|
|
162
212
|
for i, item in enumerate(elements, start=1):
|
|
163
213
|
element = None
|
|
@@ -171,10 +221,16 @@ class ElementsWorker(
|
|
|
171
221
|
**self.api_client.request("RetrieveElement", id=item)
|
|
172
222
|
)
|
|
173
223
|
|
|
174
|
-
|
|
224
|
+
if self.unknown_nb_elements:
|
|
225
|
+
logger.info(f"Processing {element} (n°{i})")
|
|
226
|
+
else:
|
|
227
|
+
logger.info(f"Processing {element} ({i}/{count})")
|
|
175
228
|
|
|
176
229
|
# Process the element and report its progress if activities are enabled
|
|
177
|
-
|
|
230
|
+
# We do not update the worker activity to "Started" state when consuming them
|
|
231
|
+
if self.consume_worker_activities or self.update_activity(
|
|
232
|
+
element.id, ActivityState.Started
|
|
233
|
+
):
|
|
178
234
|
self.process_element(element)
|
|
179
235
|
self.update_activity(element.id, ActivityState.Processed)
|
|
180
236
|
else:
|
|
@@ -207,10 +263,10 @@ class ElementsWorker(
|
|
|
207
263
|
with contextlib.suppress(Exception):
|
|
208
264
|
self.update_activity(element.id, ActivityState.Error)
|
|
209
265
|
|
|
210
|
-
message = f"Ran on {
|
|
266
|
+
message = f"Ran on {i} {pluralize('element', i)}: {i - failed} completed, {failed} failed"
|
|
211
267
|
if failed:
|
|
212
268
|
logger.error(message)
|
|
213
|
-
if failed >=
|
|
269
|
+
if failed >= i: # Everything failed!
|
|
214
270
|
sys.exit(1)
|
|
215
271
|
else:
|
|
216
272
|
logger.info(message)
|
|
@@ -9,12 +9,13 @@ import os
|
|
|
9
9
|
import shutil
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from tempfile import mkdtemp
|
|
12
|
+
from typing import Any
|
|
12
13
|
|
|
13
14
|
import gnupg
|
|
14
15
|
import yaml
|
|
15
16
|
|
|
16
17
|
from arkindex import options_from_env
|
|
17
|
-
from arkindex.exceptions import ErrorResponse
|
|
18
|
+
from arkindex.exceptions import ClientError, ErrorResponse
|
|
18
19
|
from arkindex_worker import logger
|
|
19
20
|
from arkindex_worker.cache import (
|
|
20
21
|
check_version,
|
|
@@ -260,7 +261,28 @@ class BaseWorker:
|
|
|
260
261
|
|
|
261
262
|
logger.info(f"Loaded {worker_run['summary']} from API")
|
|
262
263
|
|
|
264
|
+
def _process_config_item(item: dict) -> tuple[str, Any]:
|
|
265
|
+
if not item["secret"]:
|
|
266
|
+
return (item["key"], item["value"])
|
|
267
|
+
|
|
268
|
+
# The secret may not be picked by the user
|
|
269
|
+
if item["value"] is None:
|
|
270
|
+
logger.info(f"Optional secret `{item['key']}` is not set")
|
|
271
|
+
return (item["key"], None)
|
|
272
|
+
|
|
273
|
+
# Load secret, only available in Arkindex EE
|
|
274
|
+
try:
|
|
275
|
+
secret = self.load_secret(Path(item["value"]))
|
|
276
|
+
except ClientError as e:
|
|
277
|
+
logger.error(
|
|
278
|
+
f"Failed to retrieve the secret {item['value']}, probably an Arkindex Community Edition: {e}"
|
|
279
|
+
)
|
|
280
|
+
return (item["key"], None)
|
|
281
|
+
|
|
282
|
+
return (item["key"], secret)
|
|
283
|
+
|
|
263
284
|
# Load model version configuration when available
|
|
285
|
+
# Workers will use model version ID and details to download the model
|
|
264
286
|
model_version = worker_run.get("model_version")
|
|
265
287
|
if model_version:
|
|
266
288
|
logger.info("Loaded model version configuration from WorkerRun")
|
|
@@ -272,6 +294,36 @@ class BaseWorker:
|
|
|
272
294
|
# Set model details as worker attribute
|
|
273
295
|
self.model_details = model_version["model"]
|
|
274
296
|
|
|
297
|
+
# Load worker run information
|
|
298
|
+
try:
|
|
299
|
+
config = self.api_client.request(
|
|
300
|
+
"RetrieveWorkerRunConfiguration", id=self.worker_run_id
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
# Provide the same configuration through all previous attributes
|
|
304
|
+
self.config = self.user_configuration = dict(
|
|
305
|
+
map(_process_config_item, config["configuration"])
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
# Provide secret values through the previous attribute
|
|
309
|
+
self.secrets = {
|
|
310
|
+
item["key"]: self.config[item["key"]]
|
|
311
|
+
for item in config["configuration"]
|
|
312
|
+
if item["secret"]
|
|
313
|
+
}
|
|
314
|
+
logger.info("Using modern configuration")
|
|
315
|
+
|
|
316
|
+
# Reset the model configuration to make sure workers rely on the single new source
|
|
317
|
+
self.model_configuration = {}
|
|
318
|
+
|
|
319
|
+
return # Stop here once we have modern configuration
|
|
320
|
+
|
|
321
|
+
except ErrorResponse as e:
|
|
322
|
+
if e.status_code != 400:
|
|
323
|
+
raise
|
|
324
|
+
logger.info("Modern configuration is not available")
|
|
325
|
+
|
|
326
|
+
# Use old-style configuration with local merge
|
|
275
327
|
# Retrieve initial configuration from API
|
|
276
328
|
self.config = worker_version["configuration"].get("configuration", {})
|
|
277
329
|
if "user_configuration" in worker_version["configuration"]:
|
{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/element.py
RENAMED
|
@@ -38,6 +38,15 @@ class ElementMixin:
|
|
|
38
38
|
type=open,
|
|
39
39
|
default=os.environ.get("TASK_ELEMENTS"),
|
|
40
40
|
)
|
|
41
|
+
self.parser.add_argument(
|
|
42
|
+
"--no-elements-list",
|
|
43
|
+
help=(
|
|
44
|
+
"Consume worker activities from Arkindex API instead of using a static elements list"
|
|
45
|
+
),
|
|
46
|
+
dest="consume_worker_activities",
|
|
47
|
+
action="store_true",
|
|
48
|
+
default=os.environ.get("SKIP_TASK_ELEMENTS") is not None,
|
|
49
|
+
)
|
|
41
50
|
self.parser.add_argument(
|
|
42
51
|
"--element",
|
|
43
52
|
type=str,
|
|
@@ -46,6 +55,17 @@ class ElementMixin:
|
|
|
46
55
|
)
|
|
47
56
|
super().add_arguments()
|
|
48
57
|
|
|
58
|
+
@property
|
|
59
|
+
def consume_worker_activities(self) -> bool:
|
|
60
|
+
"""
|
|
61
|
+
Helper to detect whether the worker relies on an elements.json file or directly consumes worker activities.
|
|
62
|
+
Uses the process information when available, and falls back to CLI args otherwise.
|
|
63
|
+
"""
|
|
64
|
+
if self.process_information is not None:
|
|
65
|
+
return self.process_information.get("skip_elements_json") is True
|
|
66
|
+
|
|
67
|
+
return self.args.consume_worker_activities
|
|
68
|
+
|
|
49
69
|
def list_corpus_types(self):
|
|
50
70
|
"""
|
|
51
71
|
Loads available element types in corpus.
|
{arkindex_base_worker-0.5.0rc1 → arkindex_base_worker-0.5.1}/arkindex_worker/worker/metadata.py
RENAMED
|
@@ -20,10 +20,10 @@ class MetaType(Enum):
|
|
|
20
20
|
A regular string with no special interpretation.
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
Markdown = "markdown"
|
|
24
24
|
"""
|
|
25
|
-
A metadata with a string value that should be interpreted as
|
|
26
|
-
|
|
25
|
+
A metadata with a string value that should be interpreted as Markdown content.
|
|
26
|
+
HTML is allowed, but the allowed HTML tags are restricted for security reasons.
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
29
|
Date = "date"
|
|
@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "arkindex-base-worker"
|
|
7
|
-
version = "0.5.0rc1"
|
|
7
|
+
version = "0.5.1"
|
|
8
8
|
description = "Base Worker to easily build Arkindex ML workflows"
|
|
9
9
|
license = { file = "LICENSE" }
|
|
10
10
|
dependencies = [
|
|
11
|
-
"humanize==4.
|
|
11
|
+
"humanize==4.14.0",
|
|
12
12
|
"peewee~=3.17",
|
|
13
|
-
"Pillow==11.
|
|
14
|
-
"python-gnupg==0.5.
|
|
13
|
+
"Pillow==11.3.0",
|
|
14
|
+
"python-gnupg==0.5.5",
|
|
15
15
|
"shapely==2.0.6",
|
|
16
|
-
"teklia-toolbox==0.1.
|
|
17
|
-
"zstandard==0.
|
|
16
|
+
"teklia-toolbox==0.1.11",
|
|
17
|
+
"zstandard==0.25.0",
|
|
18
18
|
]
|
|
19
19
|
authors = [
|
|
20
20
|
{ name = "Teklia", email = "contact@teklia.com" },
|
|
@@ -44,8 +44,7 @@ Authors = "https://teklia.com"
|
|
|
44
44
|
|
|
45
45
|
[project.optional-dependencies]
|
|
46
46
|
tests = [
|
|
47
|
-
"pytest==
|
|
48
|
-
"pytest-mock==3.14.0",
|
|
47
|
+
"pytest-mock==3.15.1",
|
|
49
48
|
"pytest-responses==0.5.1",
|
|
50
49
|
]
|
|
51
50
|
|
|
@@ -103,12 +103,6 @@ def _mock_worker_run_api(responses):
|
|
|
103
103
|
payload = {
|
|
104
104
|
"id": "56785678-5678-5678-5678-567856785678",
|
|
105
105
|
"parents": [],
|
|
106
|
-
"worker": {
|
|
107
|
-
"id": "deadbeef-1234-5678-1234-worker",
|
|
108
|
-
"name": "Fake worker",
|
|
109
|
-
"slug": "fake_worker",
|
|
110
|
-
"type": "classifier",
|
|
111
|
-
},
|
|
112
106
|
"worker_version": {
|
|
113
107
|
"id": "12341234-1234-1234-1234-123412341234",
|
|
114
108
|
"configuration": {
|
|
@@ -153,6 +147,7 @@ def _mock_worker_run_api(responses):
|
|
|
153
147
|
"train_folder_id": None,
|
|
154
148
|
"validation_folder_id": None,
|
|
155
149
|
"test_folder_id": None,
|
|
150
|
+
"skip_elements_json": False,
|
|
156
151
|
},
|
|
157
152
|
"summary": "Worker Fake worker @ 123412",
|
|
158
153
|
}
|
|
@@ -165,6 +160,13 @@ def _mock_worker_run_api(responses):
|
|
|
165
160
|
content_type="application/json",
|
|
166
161
|
)
|
|
167
162
|
|
|
163
|
+
# By default, stick to classic configuration
|
|
164
|
+
responses.add(
|
|
165
|
+
responses.GET,
|
|
166
|
+
"http://testserver/api/v1/workers/runs/56785678-5678-5678-5678-567856785678/configuration/",
|
|
167
|
+
status=400,
|
|
168
|
+
)
|
|
169
|
+
|
|
168
170
|
|
|
169
171
|
@pytest.fixture
|
|
170
172
|
def _mock_worker_run_no_revision_api(responses):
|
|
@@ -172,12 +174,6 @@ def _mock_worker_run_no_revision_api(responses):
|
|
|
172
174
|
payload = {
|
|
173
175
|
"id": "56785678-5678-5678-5678-567856785678",
|
|
174
176
|
"parents": [],
|
|
175
|
-
"worker": {
|
|
176
|
-
"id": "deadbeef-1234-5678-1234-worker",
|
|
177
|
-
"name": "Fake worker",
|
|
178
|
-
"slug": "fake_worker",
|
|
179
|
-
"type": "classifier",
|
|
180
|
-
},
|
|
181
177
|
"worker_version": {
|
|
182
178
|
"id": "12341234-1234-1234-1234-123412341234",
|
|
183
179
|
"configuration": {
|
|
@@ -233,6 +229,56 @@ def _mock_worker_run_no_revision_api(responses):
|
|
|
233
229
|
)
|
|
234
230
|
|
|
235
231
|
|
|
232
|
+
@pytest.fixture
|
|
233
|
+
def mock_base_worker_modern_conf(mocker, responses):
|
|
234
|
+
"""
|
|
235
|
+
Provide a base worker to test modern configuration with (not provided in the fixture)
|
|
236
|
+
"""
|
|
237
|
+
worker = BaseWorker()
|
|
238
|
+
mocker.patch.object(sys, "argv")
|
|
239
|
+
worker.args = worker.parser.parse_args()
|
|
240
|
+
|
|
241
|
+
payload = {
|
|
242
|
+
"id": "56785678-5678-5678-5678-567856785678",
|
|
243
|
+
"parents": [],
|
|
244
|
+
"worker_version": {
|
|
245
|
+
"id": "12341234-1234-1234-1234-123412341234",
|
|
246
|
+
"worker": {
|
|
247
|
+
"id": "deadbeef-1234-5678-1234-worker",
|
|
248
|
+
"name": "Fake worker",
|
|
249
|
+
"slug": "fake_worker",
|
|
250
|
+
"type": "classifier",
|
|
251
|
+
},
|
|
252
|
+
"revision": {"hash": "deadbeef1234"},
|
|
253
|
+
"configuration": {
|
|
254
|
+
"configuration": {"extra_key1": "not showing up"},
|
|
255
|
+
"user_configuration": {"extra_key2": "not showing up"},
|
|
256
|
+
},
|
|
257
|
+
},
|
|
258
|
+
"configuration": {
|
|
259
|
+
"id": "af0daaf4-983e-4703-a7ed-a10f146d6684",
|
|
260
|
+
"name": "my-userconfig",
|
|
261
|
+
"configuration": {
|
|
262
|
+
"extra_key3": "not showing up",
|
|
263
|
+
},
|
|
264
|
+
},
|
|
265
|
+
"model_version": None,
|
|
266
|
+
"process": {
|
|
267
|
+
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
268
|
+
"corpus": CORPUS_ID,
|
|
269
|
+
},
|
|
270
|
+
"summary": "Worker Fake worker @ 123412",
|
|
271
|
+
}
|
|
272
|
+
responses.add(
|
|
273
|
+
responses.GET,
|
|
274
|
+
"http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
|
|
275
|
+
status=200,
|
|
276
|
+
json=payload,
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
return worker
|
|
280
|
+
|
|
281
|
+
|
|
236
282
|
@pytest.fixture
|
|
237
283
|
def _mock_activity_calls(responses):
|
|
238
284
|
"""
|
|
@@ -282,6 +328,61 @@ def mock_elements_worker_with_list(monkeypatch, responses, mock_elements_worker)
|
|
|
282
328
|
return mock_elements_worker
|
|
283
329
|
|
|
284
330
|
|
|
331
|
+
@pytest.fixture
|
|
332
|
+
def mock_elements_worker_consume_wa(monkeypatch, responses, mock_elements_worker):
|
|
333
|
+
"""
|
|
334
|
+
Mock a worker instance to use StartWorkerActivity to consume worker activities
|
|
335
|
+
instead of reading a JSON file
|
|
336
|
+
"""
|
|
337
|
+
|
|
338
|
+
# Enable consume worker activities through the process configuration
|
|
339
|
+
responses.replace(
|
|
340
|
+
responses.GET,
|
|
341
|
+
"http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
|
|
342
|
+
status=200,
|
|
343
|
+
json={
|
|
344
|
+
"id": "56785678-5678-5678-5678-567856785678",
|
|
345
|
+
"parents": [],
|
|
346
|
+
"worker_version": {
|
|
347
|
+
"id": "12341234-1234-1234-1234-123412341234",
|
|
348
|
+
"configuration": {
|
|
349
|
+
"docker": {"image": "python:3"},
|
|
350
|
+
"configuration": {"someKey": "someValue"},
|
|
351
|
+
"secrets": [],
|
|
352
|
+
},
|
|
353
|
+
"worker": {
|
|
354
|
+
"id": "deadbeef-1234-5678-1234-worker",
|
|
355
|
+
"name": "Fake worker",
|
|
356
|
+
"slug": "fake_worker",
|
|
357
|
+
"type": "classifier",
|
|
358
|
+
},
|
|
359
|
+
},
|
|
360
|
+
"configuration": None,
|
|
361
|
+
"model_version": None,
|
|
362
|
+
"process": {
|
|
363
|
+
"name": None,
|
|
364
|
+
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
365
|
+
"state": "running",
|
|
366
|
+
"mode": "workers",
|
|
367
|
+
"corpus": CORPUS_ID,
|
|
368
|
+
"use_cache": False,
|
|
369
|
+
"activity_state": "ready",
|
|
370
|
+
"model_id": None,
|
|
371
|
+
"train_folder_id": None,
|
|
372
|
+
"validation_folder_id": None,
|
|
373
|
+
"test_folder_id": None,
|
|
374
|
+
"skip_elements_json": True,
|
|
375
|
+
},
|
|
376
|
+
"summary": "Worker Fake worker @ 123412",
|
|
377
|
+
},
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
# Call configure again to use updated process infos
|
|
381
|
+
mock_elements_worker.configure()
|
|
382
|
+
|
|
383
|
+
return mock_elements_worker
|
|
384
|
+
|
|
385
|
+
|
|
285
386
|
@pytest.fixture
|
|
286
387
|
def mock_cache_db(tmp_path):
|
|
287
388
|
cache_path = tmp_path / "db.sqlite"
|