arkindex-base-worker 0.4.0b3__tar.gz → 0.4.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/PKG-INFO +3 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_base_worker.egg-info/PKG-INFO +3 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_base_worker.egg-info/requires.txt +2 -1
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/image.py +118 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/__init__.py +4 -50
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/base.py +24 -1
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/element.py +243 -75
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/transcription.py +50 -50
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/pyproject.toml +3 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/conftest.py +2 -21
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_base_worker.py +203 -2
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_elements.py +443 -16
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_worker.py +0 -200
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_image.py +248 -6
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_merge.py +0 -1
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_utils.py +2 -4
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/README.md +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/cache.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/models.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/utils.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/classification.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/corpus.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/dataset.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/entity.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/image.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/metadata.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/task.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/training.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/version.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/setup.cfg +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_cache.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_dataset_worker.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_element.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_classifications.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_entities.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_metadata.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_training.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/tests/test_elements_worker/test_transcriptions.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/worker-demo/worker_demo/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.4.0b3
|
|
3
|
+
Version: 0.4.0rc1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
40
40
|
Requires-Python: >=3.10
|
|
41
41
|
Description-Content-Type: text/markdown
|
|
42
42
|
License-File: LICENSE
|
|
43
|
+
Requires-Dist: humanize==4.9.0
|
|
43
44
|
Requires-Dist: peewee~=3.17
|
|
44
45
|
Requires-Dist: Pillow==10.4.0
|
|
45
46
|
Requires-Dist: python-gnupg==0.5.2
|
|
@@ -49,7 +50,7 @@ Requires-Dist: zstandard==0.22.0
|
|
|
49
50
|
Provides-Extra: docs
|
|
50
51
|
Requires-Dist: black==24.4.2; extra == "docs"
|
|
51
52
|
Requires-Dist: mkdocs-material==9.5.31; extra == "docs"
|
|
52
|
-
Requires-Dist: mkdocstrings-python==1.10.
|
|
53
|
+
Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
|
|
53
54
|
Provides-Extra: tests
|
|
54
55
|
Requires-Dist: pytest==8.3.2; extra == "tests"
|
|
55
56
|
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.4.0b3
|
|
3
|
+
Version: 0.4.0rc1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
40
40
|
Requires-Python: >=3.10
|
|
41
41
|
Description-Content-Type: text/markdown
|
|
42
42
|
License-File: LICENSE
|
|
43
|
+
Requires-Dist: humanize==4.9.0
|
|
43
44
|
Requires-Dist: peewee~=3.17
|
|
44
45
|
Requires-Dist: Pillow==10.4.0
|
|
45
46
|
Requires-Dist: python-gnupg==0.5.2
|
|
@@ -49,7 +50,7 @@ Requires-Dist: zstandard==0.22.0
|
|
|
49
50
|
Provides-Extra: docs
|
|
50
51
|
Requires-Dist: black==24.4.2; extra == "docs"
|
|
51
52
|
Requires-Dist: mkdocs-material==9.5.31; extra == "docs"
|
|
52
|
-
Requires-Dist: mkdocstrings-python==1.10.
|
|
53
|
+
Requires-Dist: mkdocstrings-python==1.10.8; extra == "docs"
|
|
53
54
|
Provides-Extra: tests
|
|
54
55
|
Requires-Dist: pytest==8.3.2; extra == "tests"
|
|
55
56
|
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
@@ -2,13 +2,18 @@
|
|
|
2
2
|
Helper methods to download and open IIIF images, and manage polygons.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import functools
|
|
6
|
+
import os
|
|
5
7
|
import re
|
|
8
|
+
import tempfile
|
|
6
9
|
from collections import namedtuple
|
|
10
|
+
from collections.abc import Generator, Iterator
|
|
7
11
|
from io import BytesIO
|
|
8
12
|
from math import ceil
|
|
9
13
|
from pathlib import Path
|
|
10
14
|
from typing import TYPE_CHECKING
|
|
11
15
|
|
|
16
|
+
import humanize
|
|
12
17
|
import requests
|
|
13
18
|
from PIL import Image
|
|
14
19
|
from shapely.affinity import rotate, scale, translate
|
|
@@ -40,8 +45,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
|
|
|
40
45
|
IIIF_FULL = "full"
|
|
41
46
|
# Maximum size available
|
|
42
47
|
IIIF_MAX = "max"
|
|
48
|
+
# Ratio to resize image
|
|
49
|
+
IMAGE_RATIO = [1, 0.9, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50, 0.40, 0.30]
|
|
43
50
|
|
|
44
51
|
|
|
52
|
+
def update_pillow_image_size_limit(func):
|
|
53
|
+
"""
|
|
54
|
+
Update Pillow Image size limit
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
@functools.wraps(func)
|
|
58
|
+
def wrapper(
|
|
59
|
+
*args,
|
|
60
|
+
max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
|
|
61
|
+
**kwargs,
|
|
62
|
+
):
|
|
63
|
+
"""
|
|
64
|
+
Wrapper to update Pillow Image size limit and restore it at the end of the function.
|
|
65
|
+
|
|
66
|
+
:param *args: Positional arguments passed to the function.
|
|
67
|
+
:param max_image_pixels: Pillow Image size limit to use.
|
|
68
|
+
:param **kwargs: Keyword arguments passed to the function.
|
|
69
|
+
"""
|
|
70
|
+
MAX_IMAGE_PIXELS = Image.MAX_IMAGE_PIXELS
|
|
71
|
+
|
|
72
|
+
# Override Pillow Image size limit
|
|
73
|
+
if max_image_pixels is not None:
|
|
74
|
+
max_image_pixels = int(max_image_pixels)
|
|
75
|
+
# Override Pillow limit for detecting decompression bombs, disabled if set to 0
|
|
76
|
+
if max_image_pixels == 0:
|
|
77
|
+
logger.warning(
|
|
78
|
+
"Pillow Image size limit is completely disabled, make sure you trust the image source."
|
|
79
|
+
)
|
|
80
|
+
Image.MAX_IMAGE_PIXELS = None
|
|
81
|
+
else:
|
|
82
|
+
Image.MAX_IMAGE_PIXELS = max_image_pixels
|
|
83
|
+
|
|
84
|
+
try:
|
|
85
|
+
results = func(*args, **kwargs)
|
|
86
|
+
except:
|
|
87
|
+
# Restore initial Pillow Image size limit
|
|
88
|
+
Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
|
|
89
|
+
raise
|
|
90
|
+
|
|
91
|
+
# Restore initial Pillow Image size limit
|
|
92
|
+
Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
|
|
93
|
+
return results
|
|
94
|
+
|
|
95
|
+
return wrapper
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@update_pillow_image_size_limit
|
|
45
99
|
def open_image(
|
|
46
100
|
path: str,
|
|
47
101
|
mode: str | None = "RGB",
|
|
@@ -149,6 +203,70 @@ def upload_image(image: Image, url: str) -> requests.Response:
|
|
|
149
203
|
return resp
|
|
150
204
|
|
|
151
205
|
|
|
206
|
+
def resized_images(
|
|
207
|
+
*args,
|
|
208
|
+
element: "Element",
|
|
209
|
+
max_pixels: int | None = None,
|
|
210
|
+
max_bytes: int | None = None,
|
|
211
|
+
**kwargs,
|
|
212
|
+
) -> Iterator[Generator[tempfile.NamedTemporaryFile, None, None]]:
|
|
213
|
+
"""
|
|
214
|
+
Build resized images according to the pixel and byte limits.
|
|
215
|
+
|
|
216
|
+
:param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
|
|
217
|
+
:param element: Element whose image needs to be resized.
|
|
218
|
+
:param max_pixels: Maximum pixel size of the resized images.
|
|
219
|
+
:param max_bytes: Maximum byte size of the resized images.
|
|
220
|
+
:param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
|
|
221
|
+
:returns: An iterator over the temporary files holding the resized image, from the largest size to the smallest.
|
|
222
|
+
"""
|
|
223
|
+
_, _, element_width, element_height = polygon_bounding_box(element.polygon)
|
|
224
|
+
|
|
225
|
+
logger.info(f"This element's image sizes are ({element_width} x {element_height}).")
|
|
226
|
+
if max_pixels and max(element_width, element_height) > max_pixels:
|
|
227
|
+
logger.warning(
|
|
228
|
+
f"Maximum image input size supported is ({max_pixels} x {max_pixels})."
|
|
229
|
+
)
|
|
230
|
+
logger.warning("The image will be resized.")
|
|
231
|
+
|
|
232
|
+
element_pixel, param = (
|
|
233
|
+
(element_width, "max_width")
|
|
234
|
+
if element_width > element_height
|
|
235
|
+
else (element_height, "max_height")
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
for resized_pixel in sorted(
|
|
239
|
+
set(
|
|
240
|
+
min(round(ratio * element_pixel), max_pixels or element_pixel)
|
|
241
|
+
for ratio in IMAGE_RATIO
|
|
242
|
+
),
|
|
243
|
+
reverse=True,
|
|
244
|
+
):
|
|
245
|
+
with element.open_image_tempfile(
|
|
246
|
+
*args, **{**kwargs, param: resized_pixel}
|
|
247
|
+
) as image:
|
|
248
|
+
pillow_image = Image.open(image)
|
|
249
|
+
if (
|
|
250
|
+
pillow_image.width != element_width
|
|
251
|
+
or pillow_image.height != element_height
|
|
252
|
+
):
|
|
253
|
+
logger.warning(
|
|
254
|
+
f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
# The image is still too large
|
|
258
|
+
image_size = Path(image.name).stat().st_size
|
|
259
|
+
if max_bytes and image_size > max_bytes:
|
|
260
|
+
logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
|
|
261
|
+
logger.warning(
|
|
262
|
+
f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
|
|
263
|
+
)
|
|
264
|
+
logger.warning("The image will be resized.")
|
|
265
|
+
continue
|
|
266
|
+
|
|
267
|
+
yield image
|
|
268
|
+
|
|
269
|
+
|
|
152
270
|
def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
|
|
153
271
|
"""
|
|
154
272
|
Compute the rectangle bounding box of a polygon.
|
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/__init__.py
RENAMED
|
@@ -111,7 +111,7 @@ class ElementsWorker(
|
|
|
111
111
|
help="One or more Arkindex element ID",
|
|
112
112
|
)
|
|
113
113
|
|
|
114
|
-
def list_elements(self) -> Iterable[CachedElement] | list[str]:
|
|
114
|
+
def get_elements(self) -> Iterable[CachedElement] | list[str]:
|
|
115
115
|
"""
|
|
116
116
|
List the elements to be processed, either from the CLI arguments or
|
|
117
117
|
the cache database when enabled.
|
|
@@ -173,33 +173,10 @@ class ElementsWorker(
|
|
|
173
173
|
), "Worker must be configured to access its process activity state"
|
|
174
174
|
return self.process_information.get("activity_state") == "ready"
|
|
175
175
|
|
|
176
|
-
def configure(self):
|
|
177
|
-
"""
|
|
178
|
-
Setup the worker using CLI arguments and environment variables.
|
|
179
|
-
"""
|
|
180
|
-
# CLI args are stored on the instance so that implementations can access them
|
|
181
|
-
self.args = self.parser.parse_args()
|
|
182
|
-
|
|
183
|
-
if self.is_read_only:
|
|
184
|
-
super().configure_for_developers()
|
|
185
|
-
else:
|
|
186
|
-
super().configure()
|
|
187
|
-
super().configure_cache()
|
|
188
|
-
|
|
189
|
-
# Retrieve the model configuration
|
|
190
|
-
if self.model_configuration:
|
|
191
|
-
self.config.update(self.model_configuration)
|
|
192
|
-
logger.info("Model version configuration retrieved")
|
|
193
|
-
|
|
194
|
-
# Retrieve the user configuration
|
|
195
|
-
if self.user_configuration:
|
|
196
|
-
self.config.update(self.user_configuration)
|
|
197
|
-
logger.info("User configuration retrieved")
|
|
198
|
-
|
|
199
176
|
def run(self):
|
|
200
177
|
"""
|
|
201
178
|
Implements an Arkindex worker that goes through each element returned by
|
|
202
|
-
[list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
|
|
179
|
+
[get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
|
|
203
180
|
It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
|
|
204
181
|
catching exceptions, and handles saving WorkerActivity updates when enabled.
|
|
205
182
|
"""
|
|
@@ -207,7 +184,7 @@ class ElementsWorker(
|
|
|
207
184
|
|
|
208
185
|
# List all elements either from JSON file
|
|
209
186
|
# or direct list of elements on CLI
|
|
210
|
-
elements = self.list_elements()
|
|
187
|
+
elements = self.get_elements()
|
|
211
188
|
if not elements:
|
|
212
189
|
logger.warning("No elements to process, stopping.")
|
|
213
190
|
sys.exit(1)
|
|
@@ -224,7 +201,7 @@ class ElementsWorker(
|
|
|
224
201
|
element = None
|
|
225
202
|
try:
|
|
226
203
|
if self.use_cache:
|
|
227
|
-
# Just use the result of list_elements as the element
|
|
204
|
+
# Just use the result of get_elements as the element
|
|
228
205
|
element = item
|
|
229
206
|
else:
|
|
230
207
|
# Load element using the Arkindex API
|
|
@@ -397,29 +374,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
|
|
|
397
374
|
default=[],
|
|
398
375
|
)
|
|
399
376
|
|
|
400
|
-
def configure(self):
|
|
401
|
-
"""
|
|
402
|
-
Setup the worker using CLI arguments and environment variables.
|
|
403
|
-
"""
|
|
404
|
-
# CLI args are stored on the instance so that implementations can access them
|
|
405
|
-
self.args = self.parser.parse_args()
|
|
406
|
-
|
|
407
|
-
if self.is_read_only:
|
|
408
|
-
super().configure_for_developers()
|
|
409
|
-
else:
|
|
410
|
-
super().configure()
|
|
411
|
-
super().configure_cache()
|
|
412
|
-
|
|
413
|
-
# Retrieve the model configuration
|
|
414
|
-
if self.model_configuration:
|
|
415
|
-
self.config.update(self.model_configuration)
|
|
416
|
-
logger.info("Model version configuration retrieved")
|
|
417
|
-
|
|
418
|
-
# Retrieve the user configuration
|
|
419
|
-
if self.user_configuration:
|
|
420
|
-
self.config.update(self.user_configuration)
|
|
421
|
-
logger.info("User configuration retrieved")
|
|
422
|
-
|
|
423
377
|
def cleanup_downloaded_artifact(self) -> None:
|
|
424
378
|
"""
|
|
425
379
|
Cleanup the downloaded dataset artifact if any
|
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/base.py
RENAMED
|
@@ -219,7 +219,7 @@ class BaseWorker:
|
|
|
219
219
|
# Load all required secrets
|
|
220
220
|
self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
|
|
221
221
|
|
|
222
|
-
def configure(self):
|
|
222
|
+
def configure_worker_run(self):
|
|
223
223
|
"""
|
|
224
224
|
Setup the necessary configuration needed using CLI args and environment variables.
|
|
225
225
|
This is the method called when running a worker on Arkindex.
|
|
@@ -320,6 +320,29 @@ class BaseWorker:
|
|
|
320
320
|
else:
|
|
321
321
|
logger.debug("Cache is disabled")
|
|
322
322
|
|
|
323
|
+
def configure(self):
|
|
324
|
+
"""
|
|
325
|
+
Setup the worker using CLI arguments and environment variables.
|
|
326
|
+
"""
|
|
327
|
+
# CLI args are stored on the instance so that implementations can access them
|
|
328
|
+
self.args = self.parser.parse_args()
|
|
329
|
+
|
|
330
|
+
if self.is_read_only:
|
|
331
|
+
self.configure_for_developers()
|
|
332
|
+
else:
|
|
333
|
+
self.configure_worker_run()
|
|
334
|
+
self.configure_cache()
|
|
335
|
+
|
|
336
|
+
# Retrieve the model configuration
|
|
337
|
+
if self.model_configuration:
|
|
338
|
+
self.config.update(self.model_configuration)
|
|
339
|
+
logger.info("Model version configuration retrieved")
|
|
340
|
+
|
|
341
|
+
# Retrieve the user configuration
|
|
342
|
+
if self.user_configuration:
|
|
343
|
+
self.config.update(self.user_configuration)
|
|
344
|
+
logger.info("User configuration retrieved")
|
|
345
|
+
|
|
323
346
|
def load_secret(self, name: Path):
|
|
324
347
|
"""
|
|
325
348
|
Load a Ponos secret by name.
|
{arkindex_base_worker-0.4.0b3 → arkindex_base_worker-0.4.0rc1}/arkindex_worker/worker/element.py
RENAMED
|
@@ -483,6 +483,178 @@ class ElementMixin:
|
|
|
483
483
|
|
|
484
484
|
return updated_element
|
|
485
485
|
|
|
486
|
+
def list_elements(
|
|
487
|
+
self,
|
|
488
|
+
folder: bool | None = None,
|
|
489
|
+
name: str | None = None,
|
|
490
|
+
top_level: bool | None = None,
|
|
491
|
+
transcription_worker_version: str | bool | None = None,
|
|
492
|
+
transcription_worker_run: str | bool | None = None,
|
|
493
|
+
type: str | None = None,
|
|
494
|
+
with_classes: bool | None = None,
|
|
495
|
+
with_corpus: bool | None = None,
|
|
496
|
+
with_metadata: bool | None = None,
|
|
497
|
+
with_has_children: bool | None = None,
|
|
498
|
+
with_zone: bool | None = None,
|
|
499
|
+
worker_version: str | bool | None = None,
|
|
500
|
+
worker_run: str | bool | None = None,
|
|
501
|
+
) -> Iterable[dict] | Iterable[CachedElement]:
|
|
502
|
+
"""
|
|
503
|
+
List elements in a corpus.
|
|
504
|
+
|
|
505
|
+
Warns:
|
|
506
|
+
----
|
|
507
|
+
The following parameters are **deprecated**:
|
|
508
|
+
|
|
509
|
+
- `transcription_worker_version` in favor of `transcription_worker_run`
|
|
510
|
+
- `worker_version` in favor of `worker_run`
|
|
511
|
+
|
|
512
|
+
:param folder: Restrict to or exclude elements with folder types.
|
|
513
|
+
This parameter is not supported when caching is enabled.
|
|
514
|
+
:param name: Restrict to elements whose name contains a substring (case-insensitive).
|
|
515
|
+
This parameter is not supported when caching is enabled.
|
|
516
|
+
:param top_level: Restrict to or exclude folder elements without parent elements (top-level elements).
|
|
517
|
+
This parameter is not supported when caching is enabled.
|
|
518
|
+
:param transcription_worker_version: **Deprecated** Restrict to elements that have a transcription created by a worker version with this UUID. Set to False to look for elements that have a manual transcription.
|
|
519
|
+
This parameter is not supported when caching is enabled.
|
|
520
|
+
:param transcription_worker_run: Restrict to elements that have a transcription created by a worker run with this UUID. Set to False to look for elements that have a manual transcription.
|
|
521
|
+
This parameter is not supported when caching is enabled.
|
|
522
|
+
:param type: Restrict to elements with a specific type slug
|
|
523
|
+
This parameter is supported when caching is enabled.
|
|
524
|
+
:param with_classes: Include each element's classifications in the response.
|
|
525
|
+
This parameter is not supported when caching is enabled.
|
|
526
|
+
:param with_corpus: Include each element's corpus in the response.
|
|
527
|
+
This parameter is not supported when caching is enabled.
|
|
528
|
+
:param with_has_children: Include the ``has_children`` attribute in the response,
|
|
529
|
+
indicating if this element has child elements of its own.
|
|
530
|
+
This parameter is not supported when caching is enabled.
|
|
531
|
+
:param with_metadata: Include each element's metadata in the response.
|
|
532
|
+
This parameter is not supported when caching is enabled.
|
|
533
|
+
:param with_zone: Include the ``zone`` attribute in the response,
|
|
534
|
+
holding the element's image and polygon.
|
|
535
|
+
This parameter is not supported when caching is enabled.
|
|
536
|
+
:param worker_version: **Deprecated** Restrict to elements created by a worker version with this UUID.
|
|
537
|
+
:param worker_run: Restrict to elements created by a worker run with this UUID.
|
|
538
|
+
:return: An iterable of dicts from the ``ListElements`` API endpoint,
|
|
539
|
+
or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
|
|
540
|
+
"""
|
|
541
|
+
query_params = {}
|
|
542
|
+
if folder is not None:
|
|
543
|
+
assert isinstance(folder, bool), "folder should be of type bool"
|
|
544
|
+
query_params["folder"] = folder
|
|
545
|
+
if name:
|
|
546
|
+
assert isinstance(name, str), "name should be of type str"
|
|
547
|
+
query_params["name"] = name
|
|
548
|
+
if top_level is not None:
|
|
549
|
+
assert isinstance(top_level, bool), "top_level should be of type bool"
|
|
550
|
+
query_params["top_level"] = top_level
|
|
551
|
+
if transcription_worker_version is not None:
|
|
552
|
+
warn(
|
|
553
|
+
"`transcription_worker_version` usage is deprecated. Consider using `transcription_worker_run` instead.",
|
|
554
|
+
DeprecationWarning,
|
|
555
|
+
stacklevel=1,
|
|
556
|
+
)
|
|
557
|
+
assert isinstance(
|
|
558
|
+
transcription_worker_version, str | bool
|
|
559
|
+
), "transcription_worker_version should be of type str or bool"
|
|
560
|
+
if isinstance(transcription_worker_version, bool):
|
|
561
|
+
assert (
|
|
562
|
+
transcription_worker_version is False
|
|
563
|
+
), "if of type bool, transcription_worker_version can only be set to False"
|
|
564
|
+
query_params["transcription_worker_version"] = transcription_worker_version
|
|
565
|
+
if transcription_worker_run is not None:
|
|
566
|
+
assert isinstance(
|
|
567
|
+
transcription_worker_run, str | bool
|
|
568
|
+
), "transcription_worker_run should be of type str or bool"
|
|
569
|
+
if isinstance(transcription_worker_run, bool):
|
|
570
|
+
assert (
|
|
571
|
+
transcription_worker_run is False
|
|
572
|
+
), "if of type bool, transcription_worker_run can only be set to False"
|
|
573
|
+
query_params["transcription_worker_run"] = transcription_worker_run
|
|
574
|
+
if type:
|
|
575
|
+
assert isinstance(type, str), "type should be of type str"
|
|
576
|
+
query_params["type"] = type
|
|
577
|
+
if with_classes is not None:
|
|
578
|
+
assert isinstance(with_classes, bool), "with_classes should be of type bool"
|
|
579
|
+
query_params["with_classes"] = with_classes
|
|
580
|
+
if with_corpus is not None:
|
|
581
|
+
assert isinstance(with_corpus, bool), "with_corpus should be of type bool"
|
|
582
|
+
query_params["with_corpus"] = with_corpus
|
|
583
|
+
if with_has_children is not None:
|
|
584
|
+
assert isinstance(
|
|
585
|
+
with_has_children, bool
|
|
586
|
+
), "with_has_children should be of type bool"
|
|
587
|
+
query_params["with_has_children"] = with_has_children
|
|
588
|
+
if with_metadata is not None:
|
|
589
|
+
assert isinstance(
|
|
590
|
+
with_metadata, bool
|
|
591
|
+
), "with_metadata should be of type bool"
|
|
592
|
+
query_params["with_metadata"] = with_metadata
|
|
593
|
+
if with_zone is not None:
|
|
594
|
+
assert isinstance(with_zone, bool), "with_zone should be of type bool"
|
|
595
|
+
query_params["with_zone"] = with_zone
|
|
596
|
+
if worker_version is not None:
|
|
597
|
+
warn(
|
|
598
|
+
"`worker_version` usage is deprecated. Consider using `worker_run` instead.",
|
|
599
|
+
DeprecationWarning,
|
|
600
|
+
stacklevel=1,
|
|
601
|
+
)
|
|
602
|
+
assert isinstance(
|
|
603
|
+
worker_version, str | bool
|
|
604
|
+
), "worker_version should be of type str or bool"
|
|
605
|
+
if isinstance(worker_version, bool):
|
|
606
|
+
assert (
|
|
607
|
+
worker_version is False
|
|
608
|
+
), "if of type bool, worker_version can only be set to False"
|
|
609
|
+
query_params["worker_version"] = worker_version
|
|
610
|
+
if worker_run is not None:
|
|
611
|
+
assert isinstance(
|
|
612
|
+
worker_run, str | bool
|
|
613
|
+
), "worker_run should be of type str or bool"
|
|
614
|
+
if isinstance(worker_run, bool):
|
|
615
|
+
assert (
|
|
616
|
+
worker_run is False
|
|
617
|
+
), "if of type bool, worker_run can only be set to False"
|
|
618
|
+
query_params["worker_run"] = worker_run
|
|
619
|
+
|
|
620
|
+
if not self.use_cache:
|
|
621
|
+
return self.api_client.paginate(
|
|
622
|
+
"ListElements", corpus=self.corpus_id, **query_params
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
# Checking that we only received query_params handled by the cache
|
|
626
|
+
assert (
|
|
627
|
+
set(query_params.keys())
|
|
628
|
+
<= {
|
|
629
|
+
"type",
|
|
630
|
+
"worker_version",
|
|
631
|
+
"worker_run",
|
|
632
|
+
}
|
|
633
|
+
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
634
|
+
|
|
635
|
+
query = CachedElement.select()
|
|
636
|
+
if type:
|
|
637
|
+
query = query.where(CachedElement.type == type)
|
|
638
|
+
if worker_version is not None:
|
|
639
|
+
# If worker_version=False, filter by manual worker_version e.g. None
|
|
640
|
+
worker_version_id = worker_version or None
|
|
641
|
+
if worker_version_id:
|
|
642
|
+
query = query.where(
|
|
643
|
+
CachedElement.worker_version_id == worker_version_id
|
|
644
|
+
)
|
|
645
|
+
else:
|
|
646
|
+
query = query.where(CachedElement.worker_version_id.is_null())
|
|
647
|
+
|
|
648
|
+
if worker_run is not None:
|
|
649
|
+
# If worker_run=False, filter by manual worker_run e.g. None
|
|
650
|
+
worker_run_id = worker_run or None
|
|
651
|
+
if worker_run_id:
|
|
652
|
+
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
653
|
+
else:
|
|
654
|
+
query = query.where(CachedElement.worker_run_id.is_null())
|
|
655
|
+
|
|
656
|
+
return query
|
|
657
|
+
|
|
486
658
|
def list_element_children(
|
|
487
659
|
self,
|
|
488
660
|
element: Element | CachedElement,
|
|
@@ -622,45 +794,43 @@ class ElementMixin:
|
|
|
622
794
|
), "if of type bool, worker_run can only be set to False"
|
|
623
795
|
query_params["worker_run"] = worker_run
|
|
624
796
|
|
|
625
|
-
if self.use_cache:
|
|
626
|
-
|
|
627
|
-
assert (
|
|
628
|
-
set(query_params.keys())
|
|
629
|
-
<= {
|
|
630
|
-
"type",
|
|
631
|
-
"worker_version",
|
|
632
|
-
"worker_run",
|
|
633
|
-
}
|
|
634
|
-
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
635
|
-
|
|
636
|
-
query = CachedElement.select().where(CachedElement.parent_id == element.id)
|
|
637
|
-
if type:
|
|
638
|
-
query = query.where(CachedElement.type == type)
|
|
639
|
-
if worker_version is not None:
|
|
640
|
-
# If worker_version=False, filter by manual worker_version e.g. None
|
|
641
|
-
worker_version_id = worker_version or None
|
|
642
|
-
if worker_version_id:
|
|
643
|
-
query = query.where(
|
|
644
|
-
CachedElement.worker_version_id == worker_version_id
|
|
645
|
-
)
|
|
646
|
-
else:
|
|
647
|
-
query = query.where(CachedElement.worker_version_id.is_null())
|
|
648
|
-
|
|
649
|
-
if worker_run is not None:
|
|
650
|
-
# If worker_run=False, filter by manual worker_run e.g. None
|
|
651
|
-
worker_run_id = worker_run or None
|
|
652
|
-
if worker_run_id:
|
|
653
|
-
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
654
|
-
else:
|
|
655
|
-
query = query.where(CachedElement.worker_run_id.is_null())
|
|
656
|
-
|
|
657
|
-
return query
|
|
658
|
-
else:
|
|
659
|
-
children = self.api_client.paginate(
|
|
797
|
+
if not self.use_cache:
|
|
798
|
+
return self.api_client.paginate(
|
|
660
799
|
"ListElementChildren", id=element.id, **query_params
|
|
661
800
|
)
|
|
662
801
|
|
|
663
|
-
|
|
802
|
+
# Checking that we only received query_params handled by the cache
|
|
803
|
+
assert (
|
|
804
|
+
set(query_params.keys())
|
|
805
|
+
<= {
|
|
806
|
+
"type",
|
|
807
|
+
"worker_version",
|
|
808
|
+
"worker_run",
|
|
809
|
+
}
|
|
810
|
+
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
811
|
+
|
|
812
|
+
query = CachedElement.select().where(CachedElement.parent_id == element.id)
|
|
813
|
+
if type:
|
|
814
|
+
query = query.where(CachedElement.type == type)
|
|
815
|
+
if worker_version is not None:
|
|
816
|
+
# If worker_version=False, filter by manual worker_version e.g. None
|
|
817
|
+
worker_version_id = worker_version or None
|
|
818
|
+
if worker_version_id:
|
|
819
|
+
query = query.where(
|
|
820
|
+
CachedElement.worker_version_id == worker_version_id
|
|
821
|
+
)
|
|
822
|
+
else:
|
|
823
|
+
query = query.where(CachedElement.worker_version_id.is_null())
|
|
824
|
+
|
|
825
|
+
if worker_run is not None:
|
|
826
|
+
# If worker_run=False, filter by manual worker_run e.g. None
|
|
827
|
+
worker_run_id = worker_run or None
|
|
828
|
+
if worker_run_id:
|
|
829
|
+
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
830
|
+
else:
|
|
831
|
+
query = query.where(CachedElement.worker_run_id.is_null())
|
|
832
|
+
|
|
833
|
+
return query
|
|
664
834
|
|
|
665
835
|
def list_element_parents(
|
|
666
836
|
self,
|
|
@@ -801,45 +971,43 @@ class ElementMixin:
|
|
|
801
971
|
), "if of type bool, worker_run can only be set to False"
|
|
802
972
|
query_params["worker_run"] = worker_run
|
|
803
973
|
|
|
804
|
-
if self.use_cache:
|
|
805
|
-
|
|
806
|
-
assert (
|
|
807
|
-
set(query_params.keys())
|
|
808
|
-
<= {
|
|
809
|
-
"type",
|
|
810
|
-
"worker_version",
|
|
811
|
-
"worker_run",
|
|
812
|
-
}
|
|
813
|
-
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
814
|
-
|
|
815
|
-
parent_ids = CachedElement.select(CachedElement.parent_id).where(
|
|
816
|
-
CachedElement.id == element.id
|
|
817
|
-
)
|
|
818
|
-
query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
|
|
819
|
-
if type:
|
|
820
|
-
query = query.where(CachedElement.type == type)
|
|
821
|
-
if worker_version is not None:
|
|
822
|
-
# If worker_version=False, filter by manual worker_version e.g. None
|
|
823
|
-
worker_version_id = worker_version or None
|
|
824
|
-
if worker_version_id:
|
|
825
|
-
query = query.where(
|
|
826
|
-
CachedElement.worker_version_id == worker_version_id
|
|
827
|
-
)
|
|
828
|
-
else:
|
|
829
|
-
query = query.where(CachedElement.worker_version_id.is_null())
|
|
830
|
-
|
|
831
|
-
if worker_run is not None:
|
|
832
|
-
# If worker_run=False, filter by manual worker_run e.g. None
|
|
833
|
-
worker_run_id = worker_run or None
|
|
834
|
-
if worker_run_id:
|
|
835
|
-
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
836
|
-
else:
|
|
837
|
-
query = query.where(CachedElement.worker_run_id.is_null())
|
|
838
|
-
|
|
839
|
-
return query
|
|
840
|
-
else:
|
|
841
|
-
parents = self.api_client.paginate(
|
|
974
|
+
if not self.use_cache:
|
|
975
|
+
return self.api_client.paginate(
|
|
842
976
|
"ListElementParents", id=element.id, **query_params
|
|
843
977
|
)
|
|
844
978
|
|
|
845
|
-
|
|
979
|
+
# Checking that we only received query_params handled by the cache
|
|
980
|
+
assert (
|
|
981
|
+
set(query_params.keys())
|
|
982
|
+
<= {
|
|
983
|
+
"type",
|
|
984
|
+
"worker_version",
|
|
985
|
+
"worker_run",
|
|
986
|
+
}
|
|
987
|
+
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
988
|
+
|
|
989
|
+
parent_ids = CachedElement.select(CachedElement.parent_id).where(
|
|
990
|
+
CachedElement.id == element.id
|
|
991
|
+
)
|
|
992
|
+
query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
|
|
993
|
+
if type:
|
|
994
|
+
query = query.where(CachedElement.type == type)
|
|
995
|
+
if worker_version is not None:
|
|
996
|
+
# If worker_version=False, filter by manual worker_version e.g. None
|
|
997
|
+
worker_version_id = worker_version or None
|
|
998
|
+
if worker_version_id:
|
|
999
|
+
query = query.where(
|
|
1000
|
+
CachedElement.worker_version_id == worker_version_id
|
|
1001
|
+
)
|
|
1002
|
+
else:
|
|
1003
|
+
query = query.where(CachedElement.worker_version_id.is_null())
|
|
1004
|
+
|
|
1005
|
+
if worker_run is not None:
|
|
1006
|
+
# If worker_run=False, filter by manual worker_run e.g. None
|
|
1007
|
+
worker_run_id = worker_run or None
|
|
1008
|
+
if worker_run_id:
|
|
1009
|
+
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
1010
|
+
else:
|
|
1011
|
+
query = query.where(CachedElement.worker_run_id.is_null())
|
|
1012
|
+
|
|
1013
|
+
return query
|