arkindex-base-worker 0.3.7rc4__py3-none-any.whl → 0.5.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/METADATA +18 -19
  2. arkindex_base_worker-0.5.0a1.dist-info/RECORD +61 -0
  3. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/top_level.txt +2 -0
  5. arkindex_worker/cache.py +1 -1
  6. arkindex_worker/image.py +167 -2
  7. arkindex_worker/models.py +18 -0
  8. arkindex_worker/utils.py +98 -4
  9. arkindex_worker/worker/__init__.py +117 -218
  10. arkindex_worker/worker/base.py +39 -46
  11. arkindex_worker/worker/classification.py +45 -29
  12. arkindex_worker/worker/corpus.py +86 -0
  13. arkindex_worker/worker/dataset.py +89 -26
  14. arkindex_worker/worker/element.py +352 -91
  15. arkindex_worker/worker/entity.py +13 -11
  16. arkindex_worker/worker/image.py +21 -0
  17. arkindex_worker/worker/metadata.py +26 -16
  18. arkindex_worker/worker/process.py +92 -0
  19. arkindex_worker/worker/task.py +5 -4
  20. arkindex_worker/worker/training.py +25 -10
  21. arkindex_worker/worker/transcription.py +89 -68
  22. arkindex_worker/worker/version.py +3 -1
  23. hooks/pre_gen_project.py +3 -0
  24. tests/__init__.py +8 -0
  25. tests/conftest.py +47 -58
  26. tests/test_base_worker.py +212 -12
  27. tests/test_dataset_worker.py +294 -437
  28. tests/test_elements_worker/{test_classifications.py → test_classification.py} +313 -200
  29. tests/test_elements_worker/test_cli.py +3 -11
  30. tests/test_elements_worker/test_corpus.py +168 -0
  31. tests/test_elements_worker/test_dataset.py +106 -157
  32. tests/test_elements_worker/test_element.py +427 -0
  33. tests/test_elements_worker/test_element_create_multiple.py +715 -0
  34. tests/test_elements_worker/test_element_create_single.py +528 -0
  35. tests/test_elements_worker/test_element_list_children.py +969 -0
  36. tests/test_elements_worker/test_element_list_parents.py +530 -0
  37. tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
  38. tests/test_elements_worker/test_entity_list_and_check.py +160 -0
  39. tests/test_elements_worker/test_image.py +66 -0
  40. tests/test_elements_worker/test_metadata.py +252 -161
  41. tests/test_elements_worker/test_process.py +89 -0
  42. tests/test_elements_worker/test_task.py +8 -18
  43. tests/test_elements_worker/test_training.py +17 -8
  44. tests/test_elements_worker/test_transcription_create.py +873 -0
  45. tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
  46. tests/test_elements_worker/test_transcription_list.py +450 -0
  47. tests/test_elements_worker/test_version.py +60 -0
  48. tests/test_elements_worker/test_worker.py +578 -293
  49. tests/test_image.py +542 -209
  50. tests/test_merge.py +1 -2
  51. tests/test_utils.py +89 -4
  52. worker-demo/tests/__init__.py +0 -0
  53. worker-demo/tests/conftest.py +32 -0
  54. worker-demo/tests/test_worker.py +12 -0
  55. worker-demo/worker_demo/__init__.py +6 -0
  56. worker-demo/worker_demo/worker.py +19 -0
  57. arkindex_base_worker-0.3.7rc4.dist-info/RECORD +0 -41
  58. tests/test_elements_worker/test_elements.py +0 -2713
  59. tests/test_elements_worker/test_transcriptions.py +0 -2119
  60. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.7rc4
3
+ Version: 0.5.0a1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -37,26 +37,25 @@ Classifier: License :: OSI Approved :: MIT License
37
37
  Classifier: Programming Language :: Python :: 3 :: Only
38
38
  Classifier: Programming Language :: Python :: 3.10
39
39
  Classifier: Programming Language :: Python :: 3.11
40
- Classifier: Topic :: Text Processing :: Linguistic
40
+ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: arkindex-client ==1.0.14
45
- Requires-Dist: peewee ==3.17.0
46
- Requires-Dist: Pillow ==10.2.0
47
- Requires-Dist: pymdown-extensions ==10.7
48
- Requires-Dist: python-gnupg ==0.5.2
49
- Requires-Dist: shapely ==2.0.3
50
- Requires-Dist: tenacity ==8.2.3
51
- Requires-Dist: zstandard ==0.22.0
44
+ Requires-Dist: humanize==4.11.0
45
+ Requires-Dist: peewee~=3.17
46
+ Requires-Dist: Pillow==11.0.0
47
+ Requires-Dist: python-gnupg==0.5.3
48
+ Requires-Dist: shapely==2.0.6
49
+ Requires-Dist: teklia-toolbox==0.1.7
50
+ Requires-Dist: zstandard==0.23.0
52
51
  Provides-Extra: docs
53
- Requires-Dist: black ==24.2.0 ; extra == 'docs'
54
- Requires-Dist: doc8 ==1.1.1 ; extra == 'docs'
55
- Requires-Dist: mkdocs ==1.5.3 ; extra == 'docs'
56
- Requires-Dist: mkdocs-material ==9.5.10 ; extra == 'docs'
57
- Requires-Dist: mkdocstrings ==0.24.0 ; extra == 'docs'
58
- Requires-Dist: mkdocstrings-python ==1.8.0 ; extra == 'docs'
59
- Requires-Dist: recommonmark ==0.7.1 ; extra == 'docs'
52
+ Requires-Dist: black==24.10.0; extra == "docs"
53
+ Requires-Dist: mkdocs-material==9.5.48; extra == "docs"
54
+ Requires-Dist: mkdocstrings-python==1.12.2; extra == "docs"
55
+ Provides-Extra: tests
56
+ Requires-Dist: pytest==8.3.4; extra == "tests"
57
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
58
+ Requires-Dist: pytest-responses==0.5.1; extra == "tests"
60
59
 
61
60
  # Arkindex base Worker
62
61
 
@@ -70,7 +69,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
70
69
 
71
70
  ## Create a new worker using our template
72
71
 
73
- ```
72
+ ```shell
74
73
  pip install --user cookiecutter
75
74
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
76
75
  ```
@@ -0,0 +1,61 @@
1
+ arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
2
+ arkindex_worker/cache.py,sha256=qTblc_zKdYC47Wip6_O9Jf5qBkQW2ozQQrg-nsx1WuY,11221
3
+ arkindex_worker/image.py,sha256=D4CdTZKbzFULdRNy-flsilAdfNPP2WSV01dkxQnfGeA,20770
4
+ arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
5
+ arkindex_worker/utils.py,sha256=MbbJT8oh8DMHHR-vidFeXdUH0TSXGWm7ZDGWzrRXoEY,9933
6
+ arkindex_worker/worker/__init__.py,sha256=0_YHeOe31KR_8ynbnYMIMwnSQTVbKkkeLGmnlTMhFx0,16234
7
+ arkindex_worker/worker/base.py,sha256=7Pmw-UQSxV-xkW8NO5cXsxJ8W8szzyppMaNjq_az81A,19844
8
+ arkindex_worker/worker/classification.py,sha256=zECSNzGCZFzoPoDVZN4kuGYRNLzMQLBaRt3q1jnBSaA,10952
9
+ arkindex_worker/worker/corpus.py,sha256=0TQFOwZ6Te-CZi6lgkZY1wzyJ5wO9LAmcVQtqHvZpPk,2291
10
+ arkindex_worker/worker/dataset.py,sha256=LwzKwNFX4FqfLxh29LSvJydPwRw3VHaB1wjuFhUshsE,5267
11
+ arkindex_worker/worker/element.py,sha256=Qvvq9kJnAHNATHW7zi96eIY1x-0MsR-T5rrSJg6e9Y4,45309
12
+ arkindex_worker/worker/entity.py,sha256=DG8oVAdy-r18fliTjnzGI1j6l7SOFmyIBmE6JlE6A8g,14799
13
+ arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
14
+ arkindex_worker/worker/metadata.py,sha256=VRajtd2kaBvar9GercX4knvR6l1WFYjoCdJWU9ccKgk,7291
15
+ arkindex_worker/worker/process.py,sha256=9TEHpMcBax1wc6PrWMMrdXe2uNfqyVj7n_dAYZRBGnY,1854
16
+ arkindex_worker/worker/task.py,sha256=r1j7_qbdNu2Z8H8HbGzO3P3qdx-2N1pBbUPFDca0rqg,1519
17
+ arkindex_worker/worker/training.py,sha256=H8FmCdzGcDW-WMMwcgvmZPlN5tPHwGo0BXn12qmzj8g,10875
18
+ arkindex_worker/worker/transcription.py,sha256=52RY9kYsiR1sz9FxOigyo12Ker3VDbQ4U42gK9DpR3g,21146
19
+ arkindex_worker/worker/version.py,sha256=JIT7OI3Mo7RPkNrjOB9hfqrsG-FYygz_zi4l8PbkuAo,1960
20
+ hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
21
+ tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
22
+ tests/conftest.py,sha256=Z9amrKmVtFltzTUUm07fGDrT4m540biaTpjedmplyzc,21536
23
+ tests/test_base_worker.py,sha256=2EIYcd_3f9O0zB5WiGIQV0Cn9wndLvnEnSfcAE1qWWU,30607
24
+ tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
25
+ tests/test_dataset_worker.py,sha256=z8ydliUlwW2j-irgLAotJMacgJXkVvF5TgsWLyCn1Jo,22087
26
+ tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
27
+ tests/test_image.py,sha256=mYyRfDXGLLzcQQtmaM7GR3jt7ScxsLLog16pUVHrH3M,27824
28
+ tests/test_merge.py,sha256=TuOeUS0UCz66DPOQFFhc4NQBxIjZL9f5czi4XnvGrr4,8270
29
+ tests/test_utils.py,sha256=nYL1s2ViZoLoMiNpLGDaWwxf8dJ1D8aT522AO-PVaEQ,3607
30
+ tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
31
+ tests/test_elements_worker/test_classification.py,sha256=nya7veSPR_O9G41Enodp2-o6AifMBcaSTWJP2vXSSJ4,30133
32
+ tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
33
+ tests/test_elements_worker/test_corpus.py,sha256=kscJyM8k1njYJJFGuvliVzn89lWh41mEyDCCawnp3W8,5483
34
+ tests/test_elements_worker/test_dataset.py,sha256=00IlOZv9YFlZ23rGXyR-HLbKLQxGelZ1Bf9lEZYA0IY,11412
35
+ tests/test_elements_worker/test_element.py,sha256=lb5tLjl0jsixX0OWVhBAaKLE9GKkBw79kFHDNGommaQ,12535
36
+ tests/test_elements_worker/test_element_create_multiple.py,sha256=arYFGmxc0517ZUii6k__G_UQQatuNIASTC8MXvUrSwk,21887
37
+ tests/test_elements_worker/test_element_create_single.py,sha256=Fa9zm12J2rQ3VrUe3yIlHAc7Vty_eQYb_YGnNPQB3IE,16697
38
+ tests/test_elements_worker/test_element_list_children.py,sha256=2zH4h663w3EduqpzQr-7bf9zIDzO1x2WxdUYYHsIHkI,31358
39
+ tests/test_elements_worker/test_element_list_parents.py,sha256=TXeGW-a3W-7GmB2QrhJH9mMnvxuybeAwQ4tL3iIxwXo,16734
40
+ tests/test_elements_worker/test_entity_create.py,sha256=9Tjr9KA2yo44VFV283q_cs6XbbVguUMDNfCj-DILSJg,29353
41
+ tests/test_elements_worker/test_entity_list_and_check.py,sha256=ENBLaqbXlRUDbHRvQla3080a0HJltrWAPYWNohUA9NU,4992
42
+ tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
43
+ tests/test_elements_worker/test_metadata.py,sha256=Xfggy-vxw5DZ3hFKx3sB7OYb2d1tu1RiNK8fvKJIaBs,22294
44
+ tests/test_elements_worker/test_process.py,sha256=y4RoVhPfyHzR795fw7-_FXElBcKo3fy4Ew_HI-kxJic,3088
45
+ tests/test_elements_worker/test_task.py,sha256=wTUWqN9UhfKmJn3IcFY75EW4I1ulRhisflmY1kmP47s,5574
46
+ tests/test_elements_worker/test_training.py,sha256=3W2LzpqxekvRiX42m_PvWcVel7ynQJmzO8gKcLmCMQI,8717
47
+ tests/test_elements_worker/test_transcription_create.py,sha256=yznO9B_BVsOR0Z_VY5ZL8gJp0ZPCz_4sPUs5dXtixAg,29281
48
+ tests/test_elements_worker/test_transcription_create_with_elements.py,sha256=tmcyglgssEqMnt1Mdy_u6X1m2wgLWTo_HdWst3GrK2k,33056
49
+ tests/test_elements_worker/test_transcription_list.py,sha256=ikz7HYPCoQWTdTRCd382SB-y-T2BbigPLlIcx5Eow-I,15324
50
+ tests/test_elements_worker/test_version.py,sha256=xqCgcgukTFJzkMgYfQG-8mTbu0o2fdYjWC07FktThfw,2125
51
+ tests/test_elements_worker/test_worker.py,sha256=HDw_UQdiMUzlBd4-jRvC-B3pNrZmmpps4sfZ9a87JVY,25378
52
+ worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
+ worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
54
+ worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
55
+ worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
56
+ worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
57
+ arkindex_base_worker-0.5.0a1.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
58
+ arkindex_base_worker-0.5.0a1.dist-info/METADATA,sha256=eP4wgAkBFUHBWvNVcASdUXsxxz_0AMtQTgjJPuBlCCQ,3336
59
+ arkindex_base_worker-0.5.0a1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
60
+ arkindex_base_worker-0.5.0a1.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
61
+ arkindex_base_worker-0.5.0a1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,2 +1,4 @@
1
1
  arkindex_worker
2
+ hooks
2
3
  tests
4
+ worker-demo
arkindex_worker/cache.py CHANGED
@@ -380,7 +380,7 @@ def unsupported_cache(func):
380
380
  def wrapper(self, *args, **kwargs):
381
381
  results = func(self, *args, **kwargs)
382
382
 
383
- if not (self.is_read_only or self.use_cache):
383
+ if self.use_cache:
384
384
  logger.warning(
385
385
  f"This API helper `{func.__name__}` did not update the cache database"
386
386
  )
arkindex_worker/image.py CHANGED
@@ -2,13 +2,20 @@
2
2
  Helper methods to download and open IIIF images, and manage polygons.
3
3
  """
4
4
 
5
+ import base64
6
+ import functools
7
+ import os
5
8
  import re
9
+ import tempfile
6
10
  from collections import namedtuple
11
+ from collections.abc import Generator, Iterator
7
12
  from io import BytesIO
8
13
  from math import ceil
9
14
  from pathlib import Path
10
15
  from typing import TYPE_CHECKING
11
16
 
17
+ import humanize
18
+ import numpy as np
12
19
  import requests
13
20
  from PIL import Image
14
21
  from shapely.affinity import rotate, scale, translate
@@ -21,6 +28,8 @@ from tenacity import (
21
28
  )
22
29
 
23
30
  from arkindex_worker import logger
31
+ from arkindex_worker.utils import pluralize
32
+ from teklia_toolbox.requests import should_verify_cert
24
33
 
25
34
  # Avoid circular imports error when type checking
26
35
  if TYPE_CHECKING:
@@ -38,8 +47,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
38
47
  IIIF_FULL = "full"
39
48
  # Maximum size available
40
49
  IIIF_MAX = "max"
50
+ # Ratios to resize images: 1.0, 0.95, [...], 0.1, 0.05
51
+ IMAGE_RATIOS = np.arange(1, 0, -0.05).round(2).tolist()
41
52
 
42
53
 
54
def update_pillow_image_size_limit(func):
    """
    Decorator overriding the Pillow Image size limit while `func` is running.

    The decorated function accepts an extra `max_image_pixels` keyword argument,
    which is consumed by the wrapper and never forwarded to `func`.

    :param func: The function to run with a custom Pillow Image size limit.
    :returns: The wrapped function.
    """

    @functools.wraps(func)
    def wrapper(
        *args,
        # NOTE: this default is evaluated once, when the decorator is applied,
        # so later changes to the environment variable are not picked up.
        max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
        **kwargs,
    ):
        """
        Wrapper to update Pillow Image size limit and restore it at the end of the function.

        :param *args: Positional arguments passed to the function.
        :param max_image_pixels: Pillow Image size limit to use. `None` keeps the current limit,
           `0` disables the limit entirely.
        :param **kwargs: Keyword arguments passed to the function.
        """
        initial_limit = Image.MAX_IMAGE_PIXELS

        # Override Pillow Image size limit
        if max_image_pixels is not None:
            # The value may come from the environment as a string
            max_image_pixels = int(max_image_pixels)
            # Override Pillow limit for detecting decompression bombs, disabled if set to 0
            if max_image_pixels == 0:
                logger.warning(
                    "Pillow Image size limit is completely disabled, make sure you trust the image source."
                )
                Image.MAX_IMAGE_PIXELS = None
            else:
                Image.MAX_IMAGE_PIXELS = max_image_pixels

        try:
            return func(*args, **kwargs)
        finally:
            # Restore initial Pillow Image size limit, whether `func` succeeded or raised
            Image.MAX_IMAGE_PIXELS = initial_limit

    return wrapper
98
+
99
+
100
+ @update_pillow_image_size_limit
43
101
  def open_image(
44
102
  path: str,
45
103
  mode: str | None = "RGB",
@@ -147,6 +205,111 @@ def upload_image(image: Image, url: str) -> requests.Response:
147
205
  return resp
148
206
 
149
207
 
208
def resized_images(
    *args,
    element: "Element",
    max_pixels_short: int | None = None,
    max_pixels_long: int | None = None,
    max_bytes: int | None = None,
    use_base64: bool = False,
    **kwargs,
) -> Iterator[tempfile._TemporaryFileWrapper | str]:
    """
    Build resized images according to pixel and byte limits.

    Candidate images are produced from the largest to the smallest size; candidates
    heavier than `max_bytes` are skipped, every other candidate is yielded.

    :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :param element: Element whose image needs to be resized.
    :param max_pixels_short: Maximum pixel size of the resized images' short side.
    :param max_pixels_long: Maximum pixel size of the resized images' long side.
    :param max_bytes: Maximum byte size of the resized images.
    :param use_base64: Whether or not to encode resized images in base64 before calculating their size.
    :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :returns: An iterator of temporary files for resized images OR an iterator of base64-encoded strings if `use_base64` is set.
    """
    _, _, element_width, element_height = polygon_bounding_box(element.polygon)
    logger.info(
        f"This element's image dimensions are ({element_width} x {element_height})."
    )

    # Map the short/long side limits onto width/height depending on the orientation
    portrait_format = element_width <= element_height
    max_pixels_width, max_pixels_height = (
        (max_pixels_short, max_pixels_long)
        if portrait_format
        else (max_pixels_long, max_pixels_short)
    )

    # The image dimension is already within the pixel limitation, no need to resize the image
    if max_pixels_width and max_pixels_width >= element_width:
        max_pixels_width = None
    if max_pixels_height and max_pixels_height >= element_height:
        max_pixels_height = None

    # Any limit still set at this point is necessarily smaller than the element's dimension
    if max_pixels_width or max_pixels_height:
        logger.warning(
            f"Maximum image dimensions supported are ({max_pixels_width or element_width} x {max_pixels_height or element_height})."
        )
        logger.warning("The image will be resized.")

    # No limitations provided, we keep the image initial dimensions
    if max_pixels_width is None and max_pixels_height is None:
        open_image_param, max_value = (
            ("max_height", element_height)
            if portrait_format
            else ("max_width", element_width)
        )
    # A limitation is only given for the height, we resize it
    elif max_pixels_width is None:
        open_image_param, max_value = ("max_height", max_pixels_height)
    # A limitation is only given for the width, we resize it
    elif max_pixels_height is None:
        open_image_param, max_value = ("max_width", max_pixels_width)
    # Limitations are provided for both sides:
    # - we resize only the one with the biggest scale factor
    # - the remaining one will automatically fall within the other limitation
    else:
        width_rescaling_factor = element_width / max_pixels_width
        height_rescaling_factor = element_height / max_pixels_height
        open_image_param, max_value = (
            ("max_height", max_pixels_height)
            if height_rescaling_factor > width_rescaling_factor
            else ("max_width", max_pixels_width)
        )

    # Deduplicate candidate sizes: several ratios may round to the same pixel value
    resized_pixels = {
        min(round(ratio * max_value), max_value) for ratio in IMAGE_RATIOS
    }
    for resized_pixel in sorted(resized_pixels, reverse=True):
        with element.open_image_tempfile(
            *args, **{**kwargs, open_image_param: resized_pixel}
        ) as image:
            # Only the dimensions are needed, release the Pillow handle right away
            with Image.open(image) as pillow_image:
                resized_width = pillow_image.width
                resized_height = pillow_image.height

            if resized_width != element_width or resized_height != element_height:
                logger.warning(
                    f"The image was resized to ({resized_width} x {resized_height})."
                )

            image_size = Path(image.name).stat().st_size
            if use_base64:
                # The size limit then applies to the encoded payload, not to the file on disk
                image = base64.b64encode(Path(image.name).read_bytes()).decode("utf-8")
                image_size = len(image)

            # The image is still too heavy
            if max_bytes and image_size > max_bytes:
                logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
                logger.warning(
                    f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
                )
                logger.warning("The image will be resized.")
                continue

            yield image
311
+
312
+
150
313
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
151
314
  """
152
315
  Compute the rectangle bounding box of a polygon.
@@ -163,7 +326,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
163
326
  def _retry_log(retry_state, *args, **kwargs):
164
327
  logger.warning(
165
328
  f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
166
- f"retrying in {retry_state.idle_for} seconds"
329
+ f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
167
330
  )
168
331
 
169
332
 
@@ -175,7 +338,9 @@ def _retry_log(retry_state, *args, **kwargs):
175
338
  reraise=True,
176
339
  )
177
340
  def _retried_request(url, *args, method=requests.get, **kwargs):
178
- resp = method(url, *args, timeout=DOWNLOAD_TIMEOUT, **kwargs)
341
+ resp = method(
342
+ url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
343
+ )
179
344
  resp.raise_for_status()
180
345
  return resp
181
346
 
arkindex_worker/models.py CHANGED
@@ -20,6 +20,8 @@ class MagicDict(dict):
20
20
  Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
21
21
  Allows for nested access: foo.bar.baz
22
22
  """
23
+ if isinstance(item, Dataset):
24
+ return item
23
25
  if isinstance(item, list):
24
26
  return list(map(self._magify, item))
25
27
  if isinstance(item, dict):
@@ -259,6 +261,12 @@ class Transcription(ArkindexModel):
259
261
  """
260
262
 
261
263
 
264
class Image(ArkindexModel):
    """
    Describes an Arkindex image.

    This subclass of `ArkindexModel` defines no attribute or method of its own.
    """
268
+
269
+
262
270
  class Dataset(ArkindexModel):
263
271
  """
264
272
  Describes an Arkindex dataset.
@@ -272,6 +280,16 @@ class Dataset(ArkindexModel):
272
280
  return f"{self.id}.tar.zst"
273
281
 
274
282
 
283
class Set(MagicDict):
    """
    Describes a set belonging to an Arkindex dataset.
    """

    def __str__(self):
        # The Set ID is not retrieved, so ArkindexModel.__str__ is not used here
        model_name = type(self).__name__
        return f"{model_name} ({self.name}) from {self.dataset}"
291
+
292
+
275
293
  class Artifact(ArkindexModel):
276
294
  """
277
295
  Describes an Arkindex artifact.
arkindex_worker/utils.py CHANGED
@@ -1,15 +1,54 @@
1
1
  import hashlib
2
+ import inspect
2
3
  import logging
3
4
  import os
4
5
  import tarfile
5
6
  import tempfile
7
+ from collections.abc import Callable, Generator
8
+ from itertools import islice
6
9
  from pathlib import Path
10
+ from typing import Any
7
11
 
8
- import zstandard
9
12
  import zstandard as zstd
10
13
 
11
14
  logger = logging.getLogger(__name__)
12
15
 
16
+
17
def pluralize(singular: str, count: int) -> str:
    """Return the form of a noun matching a count, using simplified English rules.

    A count of exactly one keeps the singular form. Any other count uses a short
    list of known irregular plurals, falling back to appending an ``s``.

    :param str singular: A singular noun describing an object
    :param int count: The object count, to determine whether to pluralize or not
    :return str: The noun in its singular or plural form
    """
    irregular_plurals = {
        "child": "children",
        "class": "classes",
        "entity": "entities",
        "metadata": "metadata",
    }

    if count == 1:
        return singular
    return irregular_plurals.get(singular, singular + "s")
+
38
+
39
+ MANUAL_SOURCE = "manual"
40
+
41
+
42
+ def parse_source_id(value: str) -> bool | str | None:
43
+ """
44
+ Parse a UUID argument (Worker Version, Worker Run, ...) to use it directly in the API.
45
+ Arkindex API filters generally expect `False` to filter manual sources.
46
+ """
47
+ if value == MANUAL_SOURCE:
48
+ return False
49
+ return value or None
50
+
51
+
13
52
  CHUNK_SIZE = 1024
14
53
  """Chunk Size used for ZSTD compression"""
15
54
 
@@ -25,7 +64,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
25
64
  :param compressed_archive: Path to the target ZST-compressed archive
26
65
  :return: File descriptor and path to the uncompressed tar archive
27
66
  """
28
- dctx = zstandard.ZstdDecompressor()
67
+ dctx = zstd.ZstdDecompressor()
29
68
  archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
30
69
  archive_path = Path(archive_path)
31
70
 
@@ -37,7 +76,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
37
76
  ):
38
77
  dctx.copy_stream(compressed, decompressed)
39
78
  logger.debug(f"Successfully uncompressed archive {compressed_archive}")
40
- except zstandard.ZstdError as e:
79
+ except zstd.ZstdError as e:
41
80
  raise Exception(f"Couldn't uncompressed archive: {e}") from e
42
81
 
43
82
  return archive_fd, archive_path
@@ -116,7 +155,7 @@ def zstd_compress(
116
155
  archive_hasher.update(compressed_chunk)
117
156
  archive_file.write(compressed_chunk)
118
157
  logger.debug(f"Successfully compressed {source}")
119
- except zstandard.ZstdError as e:
158
+ except zstd.ZstdError as e:
120
159
  raise Exception(f"Couldn't compress archive: {e}") from e
121
160
  return file_d, destination, archive_hasher.hexdigest()
122
161
 
@@ -184,3 +223,58 @@ def create_tar_zst_archive(
184
223
  close_delete_file(tar_fd, tar_archive)
185
224
 
186
225
  return zst_fd, zst_archive, zst_hash, tar_hash
226
+
227
+
228
+ DEFAULT_BATCH_SIZE = 50
229
+ """Batch size used for bulk publication to Arkindex"""
230
+
231
+
232
def batch_publication(func: Callable) -> Callable:
    """
    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.

    The check is applied to the effective value of ``batch_size``, i.e. the default
    declared by ``func`` is validated as well when the caller does not provide one.

    :param func: The function to wrap with the ``batch_size`` check
    :return: The function passing the ``batch_size`` check
    :raises AssertionError: When calling the wrapped function with an invalid ``batch_size``
    """
    # Imported locally as `functools` is not part of this module's imports
    from functools import wraps

    signature = inspect.signature(func)

    # `wraps` keeps the name, docstring and signature of the decorated function
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        bound_func = signature.bind(self, *args, **kwargs)
        bound_func.apply_defaults()
        batch_size = bound_func.arguments.get("batch_size")
        # Raised explicitly instead of using `assert`, which is stripped under `python -O`
        if not (isinstance(batch_size, int) and batch_size > 0):
            raise AssertionError(
                "batch_size shouldn't be null and should be a strictly positive integer"
            )

        return func(self, *args, **kwargs)

    return wrapper
253
+
254
+
255
def make_batches(
    objects: list, singular_name: str, batch_size: int
) -> Generator[list[Any]]:
    """Yield successive chunks of ``objects``, each holding at most ``batch_size`` items.

    The total number of objects is logged once up front, then the number and size
    of each batch is logged right before that batch is yielded.

    :param objects: The object list to divide in batches of ``batch_size`` size
    :param singular_name: The singular form of the noun associated with the object list
    :param batch_size: The maximum size of each batch to split the object list
    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
    """
    total = len(objects)
    logger.info(
        f"Creating batches of size {batch_size} to process {total} {pluralize(singular_name, total)}"
    )

    remaining = iter(objects)
    batch_number = 0
    while True:
        batch = list(islice(remaining, batch_size))
        # An empty batch means every object has been consumed
        if not batch:
            return

        batch_number += 1
        size = len(batch)
        logger.info(
            f"Processing batch {batch_number} containing {size} {pluralize(singular_name, size)}..."
        )

        yield batch