arkindex-base-worker 0.4.0b3__tar.gz → 0.4.0rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/PKG-INFO +4 -3
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/PKG-INFO +4 -3
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/SOURCES.txt +1 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/requires.txt +3 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/image.py +118 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/__init__.py +26 -158
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/base.py +32 -1
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/dataset.py +70 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/element.py +260 -75
- arkindex_base_worker-0.4.0rc2/arkindex_worker/worker/process.py +63 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/transcription.py +50 -50
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/pyproject.toml +4 -3
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/__init__.py +1 -1
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/conftest.py +11 -23
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_base_worker.py +203 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_dataset_worker.py +5 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_elements.py +712 -18
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_worker.py +0 -200
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_image.py +248 -6
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_merge.py +0 -1
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_utils.py +2 -4
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/README.md +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/cache.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/models.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/utils.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/classification.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/corpus.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/entity.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/image.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/metadata.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/task.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/training.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/version.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/setup.cfg +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_cache.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_element.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_classifications.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_entities.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_metadata.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_training.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/tests/test_elements_worker/test_transcriptions.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/worker-demo/worker_demo/worker.py +0 -0
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0b3
+Version: 0.4.0rc2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: humanize==4.10.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==10.4.0
 Requires-Dist: python-gnupg==0.5.2
@@ -48,8 +49,8 @@ Requires-Dist: teklia-toolbox==0.1.5
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.4.2; extra == "docs"
-Requires-Dist: mkdocs-material==9.5.
-Requires-Dist: mkdocstrings-python==1.10.
+Requires-Dist: mkdocs-material==9.5.33; extra == "docs"
+Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
 Provides-Extra: tests
 Requires-Dist: pytest==8.3.2; extra == "tests"
 Requires-Dist: pytest-mock==3.14.0; extra == "tests"
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0b3
+Version: 0.4.0rc2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: humanize==4.10.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==10.4.0
 Requires-Dist: python-gnupg==0.5.2
@@ -48,8 +49,8 @@ Requires-Dist: teklia-toolbox==0.1.5
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.4.2; extra == "docs"
-Requires-Dist: mkdocs-material==9.5.
-Requires-Dist: mkdocstrings-python==1.10.
+Requires-Dist: mkdocs-material==9.5.33; extra == "docs"
+Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
 Provides-Extra: tests
 Requires-Dist: pytest==8.3.2; extra == "tests"
 Requires-Dist: pytest-mock==3.14.0; extra == "tests"
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/SOURCES.txt
RENAMED
@@ -20,6 +20,7 @@ arkindex_worker/worker/element.py
 arkindex_worker/worker/entity.py
 arkindex_worker/worker/image.py
 arkindex_worker/worker/metadata.py
+arkindex_worker/worker/process.py
 arkindex_worker/worker/task.py
 arkindex_worker/worker/training.py
 arkindex_worker/worker/transcription.py
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_base_worker.egg-info/requires.txt
RENAMED
@@ -1,3 +1,4 @@
+humanize==4.10.0
 peewee~=3.17
 Pillow==10.4.0
 python-gnupg==0.5.2
@@ -7,8 +8,8 @@ zstandard==0.22.0
 
 [docs]
 black==24.4.2
-mkdocs-material==9.5.
-mkdocstrings-python==1.10.
+mkdocs-material==9.5.33
+mkdocstrings-python==1.10.8
 
 [tests]
 pytest==8.3.2
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/image.py
RENAMED
@@ -2,13 +2,18 @@
 Helper methods to download and open IIIF images, and manage polygons.
 """
 
+import functools
+import os
 import re
+import tempfile
 from collections import namedtuple
+from collections.abc import Generator, Iterator
 from io import BytesIO
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+import humanize
 import requests
 from PIL import Image
 from shapely.affinity import rotate, scale, translate
@@ -40,8 +45,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
 IIIF_FULL = "full"
 # Maximum size available
 IIIF_MAX = "max"
+# Ratio to resize image
+IMAGE_RATIO = [1, 0.9, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50, 0.40, 0.30]
 
 
+def update_pillow_image_size_limit(func):
+    """
+    Update Pillow Image size limit
+    """
+
+    @functools.wraps(func)
+    def wrapper(
+        *args,
+        max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
+        **kwargs,
+    ):
+        """
+        Wrapper to update Pillow Image size limit and restore it at the end of the function.
+
+        :param *args: Positional arguments passed to the function.
+        :param max_image_pixels: Pillow Image size limit to use.
+        :param **kwargs: Keyword arguments passed to the function.
+        """
+        MAX_IMAGE_PIXELS = Image.MAX_IMAGE_PIXELS
+
+        # Override Pillow Image size limit
+        if max_image_pixels is not None:
+            max_image_pixels = int(max_image_pixels)
+            # Override Pillow limit for detecting decompression bombs, disabled if set to 0
+            if max_image_pixels == 0:
+                logger.warning(
+                    "Pillow Image size limit is completely disabled, make sure you trust the image source."
+                )
+                Image.MAX_IMAGE_PIXELS = None
+            else:
+                Image.MAX_IMAGE_PIXELS = max_image_pixels
+
+        try:
+            results = func(*args, **kwargs)
+        except:
+            # Restore initial Pillow Image size limit
+            Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+            raise
+
+        # Restore initial Pillow Image size limit
+        Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+        return results
+
+    return wrapper
+
+
+@update_pillow_image_size_limit
 def open_image(
     path: str,
     mode: str | None = "RGB",
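
The decorator above gives any wrapped helper an extra `max_image_pixels` keyword, defaulting to the `ARKINDEX_MAX_IMAGE_PIXELS` environment variable. A minimal usage sketch, not authoritative; "page.jpg" is a hypothetical local path:

# Usage sketch for the decorated open_image; "page.jpg" is a hypothetical path.
from arkindex_worker.image import open_image

# Raise Pillow's decompression-bomb threshold to 500 million pixels for this
# call only; the previous Image.MAX_IMAGE_PIXELS is restored afterwards,
# even if the call raises.
image = open_image("page.jpg", max_image_pixels=500_000_000)

# 0 disables the check entirely (a warning is logged); omitting the keyword
# falls back to the ARKINDEX_MAX_IMAGE_PIXELS environment variable, if set.
image = open_image("page.jpg", max_image_pixels=0)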
@@ -149,6 +203,70 @@ def upload_image(image: Image, url: str) -> requests.Response:
     return resp
 
 
+def resized_images(
+    *args,
+    element: "Element",
+    max_pixels: int | None = None,
+    max_bytes: int | None = None,
+    **kwargs,
+) -> Iterator[Generator[tempfile.NamedTemporaryFile, None, None]]:
+    """
+    Build resized images according to the pixel and byte limits.
+
+    :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
+    :param element: Element whose image needs to be resized.
+    :param max_pixels: Maximum pixel size of the resized images.
+    :param max_bytes: Maximum byte size of the resized images.
+    :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
+    :returns: An iterator of the temporary file of the resized image.
+    """
+    _, _, element_width, element_height = polygon_bounding_box(element.polygon)
+
+    logger.info(f"This element's image sizes are ({element_width} x {element_height}).")
+    if max_pixels and max(element_width, element_height) > max_pixels:
+        logger.warning(
+            f"Maximum image input size supported is ({max_pixels} x {max_pixels})."
+        )
+        logger.warning("The image will be resized.")
+
+    element_pixel, param = (
+        (element_width, "max_width")
+        if element_width > element_height
+        else (element_height, "max_height")
+    )
+
+    for resized_pixel in sorted(
+        set(
+            min(round(ratio * element_pixel), max_pixels or element_pixel)
+            for ratio in IMAGE_RATIO
+        ),
+        reverse=True,
+    ):
+        with element.open_image_tempfile(
+            *args, **{**kwargs, param: resized_pixel}
+        ) as image:
+            pillow_image = Image.open(image)
+            if (
+                pillow_image.width != element_width
+                or pillow_image.height != element_height
+            ):
+                logger.warning(
+                    f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
+                )
+
+            # The image is still too large
+            image_size = Path(image.name).stat().st_size
+            if max_bytes and image_size > max_bytes:
+                logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
+                logger.warning(
+                    f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
+                )
+                logger.warning("The image will be resized.")
+                continue
+
+            yield image
+
+
 def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
     """
     Compute the rectangle bounding box of a polygon.
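
`resized_images` tries the `IMAGE_RATIO` downscales from largest to smallest and only yields candidates that fit the byte budget. A hedged usage sketch; `element` (an `arkindex_worker.models.Element`) and `send_to_model` are assumed to exist:

# Usage sketch for resized_images(); `element` and send_to_model() are assumed.
from arkindex_worker.image import resized_images

for image_file in resized_images(
    element=element,
    max_pixels=5000,            # cap on the longest side, in pixels
    max_bytes=4 * 1024 * 1024,  # e.g. a 4 MiB upload limit
):
    # Candidates over max_bytes are skipped, so the first yielded temporary
    # file already satisfies both limits.
    send_to_model(image_file.name)
    break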
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/__init__.py
RENAMED
@@ -4,12 +4,10 @@ Base classes to implement Arkindex workers.
 
 import contextlib
 import json
-import os
 import sys
 import uuid
-from
-from
-from enum import Enum
+from collections.abc import Iterable
+from itertools import chain
 from pathlib import Path
 
 from apistar.exceptions import ErrorResponse
@@ -21,47 +19,27 @@ from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
-from arkindex_worker.worker.dataset import
+from arkindex_worker.worker.dataset import (
+    DatasetMixin,
+    DatasetState,
+    MissingDatasetArchive,
+)
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
+from arkindex_worker.worker.process import ActivityState, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin
 
 
-class ActivityState(Enum):
-    """
-    Processing state of an element.
-    """
-
-    Queued = "queued"
-    """
-    The element has not yet been processed by a worker.
-    """
-
-    Started = "started"
-    """
-    The element is being processed by a worker.
-    """
-
-    Processed = "processed"
-    """
-    The element has been successfully processed by a worker.
-    """
-
-    Error = "error"
-    """
-    An error occurred while processing this element.
-    """
-
-
 class ElementsWorker(
+    ElementMixin,
+    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
     CorpusMixin,
-    ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
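
The new `arkindex_worker/worker/process.py` module (+63 lines) does not appear in this diff. A plausible minimal sketch, inferred from the import above and the `ActivityState` values removed in the same hunk; only the `Dataset` member of `ProcessMode` is confirmed here, everything else is an assumption:

# Inferred sketch of arkindex_worker/worker/process.py; not the actual file.
from enum import Enum

class ActivityState(Enum):
    """Processing state of an element (moved out of worker/__init__.py)."""
    Queued = "queued"
    Started = "started"
    Processed = "processed"
    Error = "error"

class ProcessMode(Enum):
    """Mode of an Arkindex process, built from process_information["mode"]."""
    Dataset = "dataset"  # the only member this diff relies on; others omitted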
@@ -96,22 +74,7 @@ class ElementsWorker(
 
         self._worker_version_cache = {}
 
-    def add_arguments(self):
-        """Define specific ``argparse`` arguments for this worker"""
-        self.parser.add_argument(
-            "--elements-list",
-            help="JSON elements list to use",
-            type=open,
-            default=os.environ.get("TASK_ELEMENTS"),
-        )
-        self.parser.add_argument(
-            "--element",
-            type=str,
-            nargs="+",
-            help="One or more Arkindex element ID",
-        )
-
-    def list_elements(self) -> Iterable[CachedElement] | list[str]:
+    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -143,15 +106,20 @@ class ElementsWorker(
         )
         if self.use_cache and cache_query.exists():
             return cache_query
-        # Process elements from JSON file
         elif self.args.elements_list:
+            # Process elements from JSON file
            data = json.load(self.args.elements_list)
            assert isinstance(data, list), "Elements list must be a list"
            assert len(data), "No elements in elements list"
            out += list(filter(None, [element.get("id") for element in data]))
-        # Add any extra element from CLI
         elif self.args.element:
+            # Add any extra element from CLI
            out += self.args.element
+        elif self.process_mode == ProcessMode.Dataset or self.args.set:
+            # Elements from datasets
+            return list(
+                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
+            )
 
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert (
@@ -166,40 +134,18 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only:
+        if self.is_read_only or self.process_mode == ProcessMode.Dataset:
+            # Worker activities are also disabled when running an ElementsWorker in a Dataset process.
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"
 
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-        super().configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
+        [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """
@@ -207,7 +153,7 @@ class ElementsWorker(
 
         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.list_elements()
+        elements = self.get_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)
@@ -223,8 +169,8 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if isinstance(item, CachedElement):
-                    # Just use the result of list_elements as the element
+                if isinstance(item, CachedElement | Element):
+                    # Just use the result of get_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
@@ -339,29 +285,7 @@ class ElementsWorker(
             return True
 
 
-
-    values = value.split(":")
-    if len(values) != 2:
-        raise ArgumentTypeError(
-            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
-        )
-
-    dataset_id, set_name = values
-    try:
-        dataset_id = uuid.UUID(dataset_id)
-        return (dataset_id, set_name)
-    except (TypeError, ValueError) as e:
-        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
-
-
-class MissingDatasetArchive(Exception):
-    """
-    Exception raised when the compressed archive associated to
-    a dataset isn't found in its task artifacts.
-    """
-
-
-class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
+class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.
 
@@ -384,42 +308,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
 
-    def add_arguments(self):
-        """Define specific ``argparse`` arguments for this worker"""
-        self.parser.add_argument(
-            "--set",
-            type=check_dataset_set,
-            nargs="+",
-            help="""
-            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
-            (e.g.: "12341234-1234-1234-1234-123412341234:train")
-            """,
-            default=[],
-        )
-
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-        super().configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any
@@ -467,30 +355,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         :param set: The set to process.
         """
 
-    def list_sets(self) -> Iterator[Set]:
-        """
-        List the sets to be processed, either from the CLI arguments or using the
-        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
-
-        :returns: An iterator of ``Set`` objects.
-        """
-        if not self.is_read_only:
-            yield from self.list_process_sets()
-
-        datasets: dict[uuid.UUID, Dataset] = {}
-        for dataset_id, set_name in self.args.set:
-            # Retrieving dataset information is not already cached
-            if dataset_id not in datasets:
-                datasets[dataset_id] = Dataset(
-                    **self.api_client.request("RetrieveDataset", id=dataset_id)
-                )
-
-            yield Set(name=set_name, dataset=datasets[dataset_id])
-
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
+        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
 
         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.
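
A minimal sketch of an `ElementsWorker` implementation after this refactor: subclasses still only override `process_element`, while `run()` now draws elements from `get_elements()` (CLI list, cache database, or dataset sets via `--set` / Dataset processes). The class name and description are illustrative:

# Minimal ElementsWorker sketch; "DemoWorker" and the description are illustrative.
from arkindex_worker.worker import ElementsWorker

class DemoWorker(ElementsWorker):
    def process_element(self, element):
        # Called by run() for each element returned by get_elements()
        print(f"Processing element {element.id}")

if __name__ == "__main__":
    DemoWorker(description="Demo worker").run()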
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/base.py
RENAMED
@@ -24,6 +24,7 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
+from arkindex_worker.worker.process import ProcessMode
 from teklia_toolbox.requests import get_arkindex_client
 
 
@@ -156,6 +157,13 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
 
+    @property
+    def process_mode(self) -> ProcessMode | None:
+        """Mode of the process being run. Returns None when read-only."""
+        if self.is_read_only:
+            return
+        return ProcessMode(self.process_information["mode"])
+
     @property
     def is_read_only(self) -> bool:
         """
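
A short sketch of branching on the new property from worker code; it assumes `ProcessMode` is a string-valued enum, of which only the `Dataset` member is shown in this diff:

# Sketch: process_mode is None in read-only (developer) mode,
# otherwise a ProcessMode member.
from arkindex_worker.worker.process import ProcessMode

def uses_worker_activities(worker) -> bool:
    # Mirrors the ElementsWorker logic: activities are disabled both in
    # developer mode and in Dataset processes.
    return worker.process_mode is not None and worker.process_mode != ProcessMode.Dataset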
@@ -219,7 +227,7 @@ class BaseWorker:
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
 
-    def configure(self):
+    def configure_worker_run(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.
|
|
|
320
328
|
else:
|
|
321
329
|
logger.debug("Cache is disabled")
|
|
322
330
|
|
|
331
|
+
def configure(self):
|
|
332
|
+
"""
|
|
333
|
+
Setup the worker using CLI arguments and environment variables.
|
|
334
|
+
"""
|
|
335
|
+
# CLI args are stored on the instance so that implementations can access them
|
|
336
|
+
self.args = self.parser.parse_args()
|
|
337
|
+
|
|
338
|
+
if self.is_read_only:
|
|
339
|
+
self.configure_for_developers()
|
|
340
|
+
else:
|
|
341
|
+
self.configure_worker_run()
|
|
342
|
+
self.configure_cache()
|
|
343
|
+
|
|
344
|
+
# Retrieve the model configuration
|
|
345
|
+
if self.model_configuration:
|
|
346
|
+
self.config.update(self.model_configuration)
|
|
347
|
+
logger.info("Model version configuration retrieved")
|
|
348
|
+
|
|
349
|
+
# Retrieve the user configuration
|
|
350
|
+
if self.user_configuration:
|
|
351
|
+
self.config.update(self.user_configuration)
|
|
352
|
+
logger.info("User configuration retrieved")
|
|
353
|
+
|
|
323
354
|
def load_secret(self, name: Path):
|
|
324
355
|
"""
|
|
325
356
|
Load a Ponos secret by name.
|
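
A sketch of the `configure()` flow now shared by both worker types; the `description` keyword is assumed from the existing worker templates:

# Sketch of the shared BaseWorker.configure() flow (description kwarg assumed).
from arkindex_worker.worker.base import BaseWorker

worker = BaseWorker(description="Demo worker")
# Parses CLI args, then runs configure_for_developers() when read-only or
# configure_worker_run() on Arkindex, then configure_cache(), and finally
# applies the model and user configurations on top of the defaults.
worker.configure()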
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc2}/arkindex_worker/worker/dataset.py
RENAMED
@@ -2,6 +2,8 @@
 BaseWorker methods for datasets.
 """
 
+import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterator
 from enum import Enum
 
@@ -36,7 +38,55 @@ class DatasetState(Enum):
     """
 
 
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    """The `--set` argument should have the following format:
+    <dataset_id>:<set_name>
+
+    Args:
+        value (str): Provided argument.
+
+    Raises:
+        ArgumentTypeError: When the value is invalid.
+
+    Returns:
+        tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
+    """
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class DatasetMixin:
+    def add_arguments(self) -> None:
+        """Define specific ``argparse`` arguments for the worker using this mixin"""
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+        super().add_arguments()
+
     def list_process_sets(self) -> Iterator[Set]:
         """
         List dataset sets associated to the worker's process. This helper is not available in developer mode.
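
The parsing contract of `check_dataset_set`, shown on concrete values (the UUID is the one from the `--set` help text above):

# Usage sketch of check_dataset_set(), the argparse `type` of --set.
import uuid
from arkindex_worker.worker.dataset import check_dataset_set

dataset_id, set_name = check_dataset_set("12341234-1234-1234-1234-123412341234:train")
assert dataset_id == uuid.UUID("12341234-1234-1234-1234-123412341234")
assert set_name == "train"

# Both raise argparse.ArgumentTypeError:
#   check_dataset_set("train")            -> not in <dataset_id>:<set_name> format
#   check_dataset_set("not-a-uuid:train") -> dataset ID is not a valid UUID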
@@ -73,6 +123,26 @@ class DatasetMixin:
 
         return map(lambda result: Element(**result["element"]), results)
 
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.api_client.request("RetrieveDataset", id=dataset_id)
+                )
+
+            yield Set(name=set_name, dataset=datasets[dataset_id])
+
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
         """
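
A minimal sketch of a `DatasetWorker` in developer mode, where `list_sets()` resolves each `--set` argument through the `RetrieveDataset` endpoint; the class name is illustrative:

# Minimal DatasetWorker sketch; "DemoDatasetWorker" is illustrative.
from arkindex_worker.worker import DatasetWorker

class DemoDatasetWorker(DatasetWorker):
    def process_set(self, set):
        print(f"Processing set {set.name} of dataset {set.dataset.id}")

# Invoked e.g. as: worker-demo --set 12341234-1234-1234-1234-123412341234:train
if __name__ == "__main__":
    DemoDatasetWorker(description="Demo dataset worker").run()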