arkindex-base-worker 0.4.0rc6__tar.gz → 0.5.0a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/PKG-INFO +3 -3
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_base_worker.egg-info/PKG-INFO +3 -3
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_base_worker.egg-info/requires.txt +1 -1
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/__init__.py +3 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/cache.py +3 -3
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/image.py +98 -48
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/utils.py +2 -1
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/__init__.py +17 -17
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/base.py +6 -6
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/classification.py +34 -32
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/corpus.py +3 -3
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/dataset.py +9 -9
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/element.py +193 -189
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/entity.py +62 -63
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/image.py +3 -3
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/metadata.py +27 -27
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/task.py +9 -9
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/training.py +15 -11
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/transcription.py +77 -71
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/pyproject.toml +3 -3
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/conftest.py +22 -22
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_dataset_worker.py +1 -1
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_training.py +8 -8
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_worker.py +15 -14
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_image.py +234 -124
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_utils.py +37 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/README.md +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/models.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/process.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/version.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/setup.cfg +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_base_worker.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_cache.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_element.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_classification.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_element.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_element_create_multiple.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_element_create_single.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_element_list_children.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_element_list_parents.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_entity_create.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_entity_list_and_check.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_metadata.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_process.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_transcription_create.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_transcription_create_with_elements.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_transcription_list.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_elements_worker/test_version.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/tests/test_merge.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/worker-demo/worker_demo/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.4.0rc6
|
|
3
|
+
Version: 0.5.0a2
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -46,7 +46,7 @@ Requires-Dist: peewee~=3.17
|
|
|
46
46
|
Requires-Dist: Pillow==11.0.0
|
|
47
47
|
Requires-Dist: python-gnupg==0.5.3
|
|
48
48
|
Requires-Dist: shapely==2.0.6
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.7
|
|
50
50
|
Requires-Dist: zstandard==0.23.0
|
|
51
51
|
Provides-Extra: docs
|
|
52
52
|
Requires-Dist: black==24.10.0; extra == "docs"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.4.0rc6
|
|
3
|
+
Version: 0.5.0a2
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -46,7 +46,7 @@ Requires-Dist: peewee~=3.17
|
|
|
46
46
|
Requires-Dist: Pillow==11.0.0
|
|
47
47
|
Requires-Dist: python-gnupg==0.5.3
|
|
48
48
|
Requires-Dist: shapely==2.0.6
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.7
|
|
50
50
|
Requires-Dist: zstandard==0.23.0
|
|
51
51
|
Provides-Extra: docs
|
|
52
52
|
Requires-Dist: black==24.10.0; extra == "docs"
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import importlib.metadata
|
|
1
2
|
import logging
|
|
2
3
|
|
|
3
4
|
logging.basicConfig(
|
|
@@ -5,3 +6,5 @@ logging.basicConfig(
|
|
|
5
6
|
format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
|
|
6
7
|
)
|
|
7
8
|
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
VERSION = importlib.metadata.version("arkindex-base-worker")
|
|
@@ -327,9 +327,9 @@ def check_version(cache_path: str | Path):
|
|
|
327
327
|
except OperationalError:
|
|
328
328
|
version = None
|
|
329
329
|
|
|
330
|
-
assert (
|
|
331
|
-
version
|
|
332
|
-
)
|
|
330
|
+
assert version == SQL_VERSION, (
|
|
331
|
+
f"The SQLite database {cache_path} does not have the correct cache version, it should be {SQL_VERSION}"
|
|
332
|
+
)
|
|
333
333
|
|
|
334
334
|
|
|
335
335
|
def merge_parents_cache(paths: list, current_database: Path):
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Helper methods to download and open IIIF images, and manage polygons.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import base64
|
|
5
6
|
import functools
|
|
6
7
|
import os
|
|
7
8
|
import re
|
|
@@ -14,6 +15,7 @@ from pathlib import Path
|
|
|
14
15
|
from typing import TYPE_CHECKING
|
|
15
16
|
|
|
16
17
|
import humanize
|
|
18
|
+
import numpy as np
|
|
17
19
|
import requests
|
|
18
20
|
from PIL import Image
|
|
19
21
|
from shapely.affinity import rotate, scale, translate
|
|
@@ -25,7 +27,7 @@ from tenacity import (
|
|
|
25
27
|
wait_exponential,
|
|
26
28
|
)
|
|
27
29
|
|
|
28
|
-
from arkindex_worker import logger
|
|
30
|
+
from arkindex_worker import VERSION, logger
|
|
29
31
|
from arkindex_worker.utils import pluralize
|
|
30
32
|
from teklia_toolbox.requests import should_verify_cert
|
|
31
33
|
|
|
@@ -39,14 +41,16 @@ DOWNLOAD_TIMEOUT = (30, 60)
|
|
|
39
41
|
|
|
40
42
|
BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])
|
|
41
43
|
|
|
44
|
+
# Specific User-Agent to bypass potential server limitations
|
|
45
|
+
IIIF_USER_AGENT = f"Teklia/Workers {VERSION}"
|
|
42
46
|
# To parse IIIF Urls
|
|
43
47
|
IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
|
|
44
48
|
# Full size of the region
|
|
45
49
|
IIIF_FULL = "full"
|
|
46
50
|
# Maximum size available
|
|
47
51
|
IIIF_MAX = "max"
|
|
48
|
-
#
|
|
49
|
-
|
|
52
|
+
# Ratios to resize images: 1.0, 0.95, [...], 0.1, 0.05
|
|
53
|
+
IMAGE_RATIOS = np.arange(1, 0, -0.05).round(2).tolist()
|
|
50
54
|
|
|
51
55
|
|
|
52
56
|
def update_pillow_image_size_limit(func):
|
|
@@ -206,44 +210,81 @@ def upload_image(image: Image, url: str) -> requests.Response:
|
|
|
206
210
|
def resized_images(
|
|
207
211
|
*args,
|
|
208
212
|
element: "Element",
|
|
209
|
-
|
|
213
|
+
max_pixels_short: int | None = None,
|
|
214
|
+
max_pixels_long: int | None = None,
|
|
210
215
|
max_bytes: int | None = None,
|
|
216
|
+
use_base64: bool = False,
|
|
211
217
|
**kwargs,
|
|
212
|
-
) -> Iterator[Generator[tempfile.
|
|
218
|
+
) -> Iterator[Generator[tempfile._TemporaryFileWrapper | str]]:
|
|
213
219
|
"""
|
|
214
|
-
Build resized images according to
|
|
220
|
+
Build resized images according to pixel and byte limits.
|
|
215
221
|
|
|
216
222
|
:param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
|
|
217
223
|
:param element: Element whose image needs to be resized.
|
|
218
|
-
:param
|
|
224
|
+
:param max_pixels_short: Maximum pixel size of the resized images' short side.
|
|
225
|
+
:param max_pixels_long: Maximum pixel size of the resized images' long side.
|
|
219
226
|
:param max_bytes: Maximum byte size of the resized images.
|
|
227
|
+
:param use_base64: Whether or not to encode resized images in base64 before calculating their size.
|
|
220
228
|
:param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
|
|
221
|
-
:returns: An iterator of
|
|
229
|
+
:returns: An iterator of temporary files for resized images OR an iterator of base64-encoded strings if `use_base64` is set.
|
|
222
230
|
"""
|
|
223
231
|
_, _, element_width, element_height = polygon_bounding_box(element.polygon)
|
|
232
|
+
logger.info(
|
|
233
|
+
f"This element's image dimensions are ({element_width} x {element_height})."
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
portrait_format = element_width <= element_height
|
|
237
|
+
max_pixels_width, max_pixels_height = (
|
|
238
|
+
(max_pixels_short, max_pixels_long)
|
|
239
|
+
if portrait_format
|
|
240
|
+
else (max_pixels_long, max_pixels_short)
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# The image dimension is already within the pixel limitation, no need to resize the image
|
|
244
|
+
if max_pixels_width and max_pixels_width >= element_width:
|
|
245
|
+
max_pixels_width = None
|
|
246
|
+
if max_pixels_height and max_pixels_height >= element_height:
|
|
247
|
+
max_pixels_height = None
|
|
224
248
|
|
|
225
|
-
|
|
226
|
-
|
|
249
|
+
if (max_pixels_width and element_width > max_pixels_width) or (
|
|
250
|
+
max_pixels_height and element_height > max_pixels_height
|
|
251
|
+
):
|
|
227
252
|
logger.warning(
|
|
228
|
-
f"Maximum image
|
|
253
|
+
f"Maximum image dimensions supported are ({max_pixels_width or element_width} x {max_pixels_height or element_height})."
|
|
229
254
|
)
|
|
230
255
|
logger.warning("The image will be resized.")
|
|
231
256
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
257
|
+
# No limitations provided, we keep the image initial dimensions
|
|
258
|
+
if max_pixels_width is None and max_pixels_height is None:
|
|
259
|
+
open_image_param, max_value = (
|
|
260
|
+
("max_height", element_height)
|
|
261
|
+
if portrait_format
|
|
262
|
+
else ("max_width", element_width)
|
|
263
|
+
)
|
|
264
|
+
# A limitation is only given for the height, we resize it
|
|
265
|
+
elif max_pixels_width is None:
|
|
266
|
+
open_image_param, max_value = ("max_height", max_pixels_height)
|
|
267
|
+
# A limitation is only given for the width, we resize it
|
|
268
|
+
elif max_pixels_height is None:
|
|
269
|
+
open_image_param, max_value = ("max_width", max_pixels_width)
|
|
270
|
+
# Limitations are provided for both sides:
|
|
271
|
+
# - we resize only the one with the biggest scale factor
|
|
272
|
+
# - the remaining one will automatically fall within the other limitation
|
|
273
|
+
else:
|
|
274
|
+
width_rescaling_factor = element_width / max_pixels_width
|
|
275
|
+
height_rescaling_factor = element_height / max_pixels_height
|
|
276
|
+
open_image_param, max_value = (
|
|
277
|
+
("max_height", max_pixels_height)
|
|
278
|
+
if height_rescaling_factor > width_rescaling_factor
|
|
279
|
+
else ("max_width", max_pixels_width)
|
|
280
|
+
)
|
|
237
281
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
),
|
|
243
|
-
reverse=True,
|
|
244
|
-
):
|
|
282
|
+
resized_pixels = set(
|
|
283
|
+
min(round(ratio * max_value), max_value) for ratio in IMAGE_RATIOS
|
|
284
|
+
)
|
|
285
|
+
for resized_pixel in sorted(resized_pixels, reverse=True):
|
|
245
286
|
with element.open_image_tempfile(
|
|
246
|
-
*args, **{**kwargs,
|
|
287
|
+
*args, **{**kwargs, open_image_param: resized_pixel}
|
|
247
288
|
) as image:
|
|
248
289
|
pillow_image = Image.open(image)
|
|
249
290
|
if (
|
|
@@ -254,8 +295,12 @@ def resized_images(
|
|
|
254
295
|
f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
|
|
255
296
|
)
|
|
256
297
|
|
|
257
|
-
# The image is still too large
|
|
258
298
|
image_size = Path(image.name).stat().st_size
|
|
299
|
+
if use_base64:
|
|
300
|
+
image = base64.b64encode(Path(image.name).read_bytes()).decode("utf-8")
|
|
301
|
+
image_size = len(image)
|
|
302
|
+
|
|
303
|
+
# The image is still too heavy
|
|
259
304
|
if max_bytes and image_size > max_bytes:
|
|
260
305
|
logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
|
|
261
306
|
logger.warning(
|
|
@@ -283,7 +328,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
|
|
|
283
328
|
def _retry_log(retry_state, *args, **kwargs):
|
|
284
329
|
logger.warning(
|
|
285
330
|
f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
|
|
286
|
-
f
|
|
331
|
+
f"retrying in {retry_state.idle_for} {pluralize('second', retry_state.idle_for)}"
|
|
287
332
|
)
|
|
288
333
|
|
|
289
334
|
|
|
@@ -296,7 +341,12 @@ def _retry_log(retry_state, *args, **kwargs):
|
|
|
296
341
|
)
|
|
297
342
|
def _retried_request(url, *args, method=requests.get, **kwargs):
|
|
298
343
|
resp = method(
|
|
299
|
-
url,
|
|
344
|
+
url,
|
|
345
|
+
*args,
|
|
346
|
+
headers={"User-Agent": IIIF_USER_AGENT},
|
|
347
|
+
timeout=DOWNLOAD_TIMEOUT,
|
|
348
|
+
verify=should_verify_cert(url),
|
|
349
|
+
**kwargs,
|
|
300
350
|
)
|
|
301
351
|
resp.raise_for_status()
|
|
302
352
|
return resp
|
|
@@ -316,9 +366,9 @@ def download_tiles(url: str) -> Image:
|
|
|
316
366
|
|
|
317
367
|
image_width, image_height = info.get("width"), info.get("height")
|
|
318
368
|
assert image_width and image_height, "Missing image dimensions in info.json"
|
|
319
|
-
assert info.get(
|
|
320
|
-
"tiles"
|
|
321
|
-
)
|
|
369
|
+
assert info.get("tiles"), (
|
|
370
|
+
"Image cannot be retrieved at full size and tiles are not supported"
|
|
371
|
+
)
|
|
322
372
|
|
|
323
373
|
# Take the biggest available tile size
|
|
324
374
|
tile = sorted(info["tiles"], key=lambda tile: tile.get("width", 0), reverse=True)[0]
|
|
@@ -392,15 +442,15 @@ def trim_polygon(
|
|
|
392
442
|
is entirely outside of the image's bounds.
|
|
393
443
|
"""
|
|
394
444
|
|
|
395
|
-
assert isinstance(
|
|
396
|
-
polygon
|
|
397
|
-
)
|
|
398
|
-
assert all(
|
|
399
|
-
|
|
400
|
-
)
|
|
401
|
-
assert all(
|
|
402
|
-
|
|
403
|
-
)
|
|
445
|
+
assert isinstance(polygon, list | tuple), (
|
|
446
|
+
"Input polygon must be a valid list or tuple of points."
|
|
447
|
+
)
|
|
448
|
+
assert all(isinstance(point, list | tuple) for point in polygon), (
|
|
449
|
+
"Polygon points must be tuples or lists."
|
|
450
|
+
)
|
|
451
|
+
assert all(len(point) == 2 for point in polygon), (
|
|
452
|
+
"Polygon points must be tuples or lists of 2 elements."
|
|
453
|
+
)
|
|
404
454
|
assert all(
|
|
405
455
|
isinstance(point[0], int) and isinstance(point[1], int) for point in polygon
|
|
406
456
|
), "Polygon point coordinates must be integers."
|
|
@@ -451,12 +501,12 @@ def revert_orientation(
|
|
|
451
501
|
from arkindex_worker.cache import CachedElement
|
|
452
502
|
from arkindex_worker.models import Element
|
|
453
503
|
|
|
454
|
-
assert element and isinstance(
|
|
455
|
-
element
|
|
456
|
-
)
|
|
457
|
-
assert polygon and isinstance(
|
|
458
|
-
polygon
|
|
459
|
-
)
|
|
504
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
505
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
506
|
+
)
|
|
507
|
+
assert polygon and isinstance(polygon, list), (
|
|
508
|
+
"polygon shouldn't be null and should be a list"
|
|
509
|
+
)
|
|
460
510
|
assert isinstance(reverse, bool), "Reverse should be a bool"
|
|
461
511
|
# Rotating with Pillow can cause it to move the image around, as the image cannot have negative coordinates
|
|
462
512
|
# and must be a rectangle. This means the origin point of any coordinates from an image is invalid, and the
|
|
@@ -464,9 +514,9 @@ def revert_orientation(
|
|
|
464
514
|
# To properly undo the mirroring and rotation implicitly applied by open_image, we first need to find the center
|
|
465
515
|
# of the rotated bounding box.
|
|
466
516
|
if isinstance(element, Element):
|
|
467
|
-
assert (
|
|
468
|
-
element
|
|
469
|
-
)
|
|
517
|
+
assert element.zone and element.zone.polygon, (
|
|
518
|
+
"element should have a zone and a polygon"
|
|
519
|
+
)
|
|
470
520
|
parent_ring = LinearRing(element.zone.polygon)
|
|
471
521
|
elif isinstance(element, CachedElement):
|
|
472
522
|
assert element.polygon, "cached element should have a polygon"
|
|
@@ -243,11 +243,12 @@ def batch_publication(func: Callable) -> Callable:
|
|
|
243
243
|
bound_func.apply_defaults()
|
|
244
244
|
batch_size = bound_func.arguments.get("batch_size")
|
|
245
245
|
assert (
|
|
246
|
-
batch_size and isinstance(batch_size, int) and batch_size > 0
|
|
246
|
+
batch_size is not None and isinstance(batch_size, int) and batch_size > 0
|
|
247
247
|
), "batch_size shouldn't be null and should be a strictly positive integer"
|
|
248
248
|
|
|
249
249
|
return func(self, *args, **kwargs)
|
|
250
250
|
|
|
251
|
+
wrapper.__name__ = func.__name__
|
|
251
252
|
return wrapper
|
|
252
253
|
|
|
253
254
|
|
{arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/__init__.py
RENAMED
|
@@ -82,9 +82,9 @@ class ElementsWorker(
|
|
|
82
82
|
:return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
|
|
83
83
|
or a list of strings representing element IDs otherwise.
|
|
84
84
|
"""
|
|
85
|
-
assert not (
|
|
86
|
-
|
|
87
|
-
)
|
|
85
|
+
assert not (self.args.elements_list and self.args.element), (
|
|
86
|
+
"elements-list and element CLI args shouldn't be both set"
|
|
87
|
+
)
|
|
88
88
|
|
|
89
89
|
def invalid_element_id(value: str) -> bool:
|
|
90
90
|
"""
|
|
@@ -125,9 +125,9 @@ class ElementsWorker(
|
|
|
125
125
|
return {item["id"] for item in self.list_process_elements()}
|
|
126
126
|
|
|
127
127
|
invalid_element_ids = list(filter(invalid_element_id, out))
|
|
128
|
-
assert (
|
|
129
|
-
|
|
130
|
-
)
|
|
128
|
+
assert not invalid_element_ids, (
|
|
129
|
+
f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
|
|
130
|
+
)
|
|
131
131
|
|
|
132
132
|
return out
|
|
133
133
|
|
|
@@ -144,9 +144,9 @@ class ElementsWorker(
|
|
|
144
144
|
# Worker activities are also disabled when running an ElementsWorker in a Dataset process
|
|
145
145
|
# and when running export processes.
|
|
146
146
|
return False
|
|
147
|
-
assert (
|
|
148
|
-
|
|
149
|
-
)
|
|
147
|
+
assert self.process_information, (
|
|
148
|
+
"Worker must be configured to access its process activity state"
|
|
149
|
+
)
|
|
150
150
|
return self.process_information.get("activity_state") == "ready"
|
|
151
151
|
|
|
152
152
|
def run(self):
|
|
@@ -221,7 +221,7 @@ class ElementsWorker(
|
|
|
221
221
|
with contextlib.suppress(Exception):
|
|
222
222
|
self.update_activity(element.id, ActivityState.Error)
|
|
223
223
|
|
|
224
|
-
message = f
|
|
224
|
+
message = f"Ran on {count} {pluralize('element', count)}: {count - failed} completed, {failed} failed"
|
|
225
225
|
if failed:
|
|
226
226
|
logger.error(message)
|
|
227
227
|
if failed >= count: # Everything failed!
|
|
@@ -256,9 +256,9 @@ class ElementsWorker(
|
|
|
256
256
|
)
|
|
257
257
|
return True
|
|
258
258
|
|
|
259
|
-
assert element_id and isinstance(
|
|
260
|
-
element_id
|
|
261
|
-
)
|
|
259
|
+
assert element_id and isinstance(element_id, uuid.UUID | str), (
|
|
260
|
+
"element_id shouldn't be null and should be an UUID or str"
|
|
261
|
+
)
|
|
262
262
|
assert isinstance(state, ActivityState), "state should be an ActivityState"
|
|
263
263
|
|
|
264
264
|
try:
|
|
@@ -382,9 +382,9 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
|
|
|
382
382
|
failed = 0
|
|
383
383
|
for i, dataset_set in enumerate(dataset_sets, start=1):
|
|
384
384
|
try:
|
|
385
|
-
assert (
|
|
386
|
-
|
|
387
|
-
)
|
|
385
|
+
assert dataset_set.dataset.state == DatasetState.Complete.value, (
|
|
386
|
+
"When processing a set, its dataset state should be Complete."
|
|
387
|
+
)
|
|
388
388
|
|
|
389
389
|
logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
|
|
390
390
|
self.download_dataset_artifact(dataset_set.dataset)
|
|
@@ -405,7 +405,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
|
|
|
405
405
|
# Cleanup the latest downloaded dataset artifact
|
|
406
406
|
self.cleanup_downloaded_artifact()
|
|
407
407
|
|
|
408
|
-
message = f
|
|
408
|
+
message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"
|
|
409
409
|
if failed:
|
|
410
410
|
logger.error(message)
|
|
411
411
|
if failed >= count: # Everything failed!
|
{arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/base.py
RENAMED
|
@@ -305,9 +305,9 @@ class BaseWorker:
|
|
|
305
305
|
|
|
306
306
|
if self.use_cache:
|
|
307
307
|
if self.args.database is not None:
|
|
308
|
-
assert (
|
|
309
|
-
self.args.database
|
|
310
|
-
)
|
|
308
|
+
assert self.args.database.is_file(), (
|
|
309
|
+
f"Database in {self.args.database} does not exist"
|
|
310
|
+
)
|
|
311
311
|
self.cache_path = self.args.database
|
|
312
312
|
else:
|
|
313
313
|
cache_dir = self.task_data_dir / self.task_id
|
|
@@ -378,9 +378,9 @@ class BaseWorker:
|
|
|
378
378
|
gpg = gnupg.GPG()
|
|
379
379
|
with path.open("rb") as gpg_file:
|
|
380
380
|
decrypted = gpg.decrypt_file(gpg_file)
|
|
381
|
-
assert (
|
|
382
|
-
decrypted.
|
|
383
|
-
)
|
|
381
|
+
assert decrypted.ok, (
|
|
382
|
+
f"GPG error: {decrypted.status} - {decrypted.stderr}"
|
|
383
|
+
)
|
|
384
384
|
secret = decrypted.data.decode("utf-8")
|
|
385
385
|
logging.info(f"Loaded local secret {name}")
|
|
386
386
|
except Exception as e:
|
|
@@ -27,7 +27,7 @@ class ClassificationMixin:
|
|
|
27
27
|
)
|
|
28
28
|
self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
|
|
29
29
|
logger.info(
|
|
30
|
-
f
|
|
30
|
+
f"Loaded {len(self.classes)} ML {pluralize('class', len(self.classes))} in corpus ({self.corpus_id})"
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
def get_ml_class_id(self, ml_class: str) -> str:
|
|
@@ -60,9 +60,9 @@ class ClassificationMixin:
|
|
|
60
60
|
f"Reloading corpus classes to see if {ml_class} already exists"
|
|
61
61
|
)
|
|
62
62
|
self.load_corpus_classes()
|
|
63
|
-
assert (
|
|
64
|
-
ml_class
|
|
65
|
-
)
|
|
63
|
+
assert ml_class in self.classes, (
|
|
64
|
+
"Missing class {ml_class} even after reloading"
|
|
65
|
+
)
|
|
66
66
|
ml_class_id = self.classes[ml_class]
|
|
67
67
|
|
|
68
68
|
return ml_class_id
|
|
@@ -86,9 +86,9 @@ class ClassificationMixin:
|
|
|
86
86
|
),
|
|
87
87
|
None,
|
|
88
88
|
)
|
|
89
|
-
assert (
|
|
90
|
-
|
|
91
|
-
)
|
|
89
|
+
assert ml_class_name is not None, (
|
|
90
|
+
f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
|
|
91
|
+
)
|
|
92
92
|
return ml_class_name
|
|
93
93
|
|
|
94
94
|
def create_classification(
|
|
@@ -107,18 +107,18 @@ class ClassificationMixin:
|
|
|
107
107
|
:param high_confidence: Whether or not the classification is of high confidence.
|
|
108
108
|
:returns: The created classification, as returned by the ``CreateClassification`` API endpoint.
|
|
109
109
|
"""
|
|
110
|
-
assert element and isinstance(
|
|
111
|
-
element
|
|
112
|
-
)
|
|
113
|
-
assert ml_class and isinstance(
|
|
114
|
-
ml_class
|
|
115
|
-
)
|
|
116
|
-
assert (
|
|
117
|
-
|
|
118
|
-
)
|
|
119
|
-
assert isinstance(
|
|
120
|
-
high_confidence
|
|
121
|
-
)
|
|
110
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
111
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
112
|
+
)
|
|
113
|
+
assert ml_class and isinstance(ml_class, str), (
|
|
114
|
+
"ml_class shouldn't be null and should be of type str"
|
|
115
|
+
)
|
|
116
|
+
assert isinstance(confidence, float) and 0 <= confidence <= 1, (
|
|
117
|
+
"confidence shouldn't be null and should be a float in [0..1] range"
|
|
118
|
+
)
|
|
119
|
+
assert isinstance(high_confidence, bool), (
|
|
120
|
+
"high_confidence shouldn't be null and should be of type bool"
|
|
121
|
+
)
|
|
122
122
|
if self.is_read_only:
|
|
123
123
|
logger.warning(
|
|
124
124
|
"Cannot create classification as this worker is in read-only mode"
|
|
@@ -198,31 +198,33 @@ class ClassificationMixin:
|
|
|
198
198
|
:returns: List of created classifications, as returned in the ``classifications`` field by
|
|
199
199
|
the ``CreateClassifications`` API endpoint.
|
|
200
200
|
"""
|
|
201
|
-
assert element and isinstance(
|
|
202
|
-
element
|
|
203
|
-
)
|
|
204
|
-
assert classifications and isinstance(
|
|
205
|
-
classifications
|
|
206
|
-
)
|
|
201
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
202
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
203
|
+
)
|
|
204
|
+
assert classifications and isinstance(classifications, list), (
|
|
205
|
+
"classifications shouldn't be null and should be of type list"
|
|
206
|
+
)
|
|
207
207
|
|
|
208
208
|
for index, classification in enumerate(classifications):
|
|
209
209
|
ml_class = classification.get("ml_class")
|
|
210
|
-
assert (
|
|
211
|
-
ml_class and
|
|
212
|
-
)
|
|
210
|
+
assert ml_class and isinstance(ml_class, str), (
|
|
211
|
+
f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
|
|
212
|
+
)
|
|
213
213
|
|
|
214
214
|
confidence = classification.get("confidence")
|
|
215
215
|
assert (
|
|
216
216
|
confidence is not None
|
|
217
217
|
and isinstance(confidence, float)
|
|
218
218
|
and 0 <= confidence <= 1
|
|
219
|
-
),
|
|
219
|
+
), (
|
|
220
|
+
f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
|
|
221
|
+
)
|
|
220
222
|
|
|
221
223
|
high_confidence = classification.get("high_confidence")
|
|
222
224
|
if high_confidence is not None:
|
|
223
|
-
assert isinstance(
|
|
224
|
-
high_confidence
|
|
225
|
-
)
|
|
225
|
+
assert isinstance(high_confidence, bool), (
|
|
226
|
+
f"Classification at index {index} in classifications: high_confidence should be of type bool"
|
|
227
|
+
)
|
|
226
228
|
|
|
227
229
|
if self.is_read_only:
|
|
228
230
|
logger.warning(
|
{arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/corpus.py
RENAMED
|
@@ -76,9 +76,9 @@ class CorpusMixin:
|
|
|
76
76
|
key=itemgetter("updated"),
|
|
77
77
|
reverse=True,
|
|
78
78
|
)
|
|
79
|
-
assert (
|
|
80
|
-
|
|
81
|
-
)
|
|
79
|
+
assert len(exports) > 0, (
|
|
80
|
+
f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
|
|
81
|
+
)
|
|
82
82
|
|
|
83
83
|
# Download latest export
|
|
84
84
|
export_id: str = exports[0]["id"]
|
{arkindex_base_worker-0.4.0rc6 → arkindex_base_worker-0.5.0a2}/arkindex_worker/worker/dataset.py
RENAMED
|
@@ -113,9 +113,9 @@ class DatasetMixin:
|
|
|
113
113
|
:param dataset_set: Set to find elements in.
|
|
114
114
|
:returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
|
|
115
115
|
"""
|
|
116
|
-
assert dataset_set and isinstance(
|
|
117
|
-
dataset_set
|
|
118
|
-
)
|
|
116
|
+
assert dataset_set and isinstance(dataset_set, Set), (
|
|
117
|
+
"dataset_set shouldn't be null and should be a Set"
|
|
118
|
+
)
|
|
119
119
|
|
|
120
120
|
results = self.api_client.paginate(
|
|
121
121
|
"ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
|
|
@@ -152,12 +152,12 @@ class DatasetMixin:
|
|
|
152
152
|
:param state: State of the dataset.
|
|
153
153
|
:returns: The updated ``Dataset`` object from the ``PartialUpdateDataset`` API endpoint.
|
|
154
154
|
"""
|
|
155
|
-
assert dataset and isinstance(
|
|
156
|
-
dataset
|
|
157
|
-
)
|
|
158
|
-
assert state and isinstance(
|
|
159
|
-
state
|
|
160
|
-
)
|
|
155
|
+
assert dataset and isinstance(dataset, Dataset), (
|
|
156
|
+
"dataset shouldn't be null and should be a Dataset"
|
|
157
|
+
)
|
|
158
|
+
assert state and isinstance(state, DatasetState), (
|
|
159
|
+
"state shouldn't be null and should be a str from DatasetState"
|
|
160
|
+
)
|
|
161
161
|
|
|
162
162
|
if self.is_read_only:
|
|
163
163
|
logger.warning("Cannot update dataset as this worker is in read-only mode")
|