arkindex-base-worker 0.3.7rc9__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/PKG-INFO +15 -19
  2. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/README.md +1 -1
  3. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_base_worker.egg-info/PKG-INFO +15 -19
  4. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_base_worker.egg-info/SOURCES.txt +18 -4
  5. arkindex_base_worker-0.4.0/arkindex_base_worker.egg-info/requires.txt +17 -0
  6. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/cache.py +1 -1
  7. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/image.py +120 -1
  8. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/models.py +6 -0
  9. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/utils.py +85 -4
  10. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/__init__.py +68 -162
  11. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/base.py +39 -34
  12. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/classification.py +34 -18
  13. arkindex_base_worker-0.4.0/arkindex_worker/worker/corpus.py +86 -0
  14. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/dataset.py +71 -1
  15. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/element.py +352 -91
  16. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/entity.py +11 -11
  17. arkindex_base_worker-0.4.0/arkindex_worker/worker/image.py +21 -0
  18. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/metadata.py +19 -9
  19. arkindex_base_worker-0.4.0/arkindex_worker/worker/process.py +92 -0
  20. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/task.py +5 -4
  21. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/training.py +25 -10
  22. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/transcription.py +89 -68
  23. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/worker/version.py +3 -1
  24. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/pyproject.toml +14 -24
  25. arkindex_base_worker-0.4.0/tests/__init__.py +8 -0
  26. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/conftest.py +36 -52
  27. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_base_worker.py +212 -12
  28. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_dataset_worker.py +21 -45
  29. arkindex-base-worker-0.3.7rc9/tests/test_elements_worker/test_classifications.py → arkindex_base_worker-0.4.0/tests/test_elements_worker/test_classification.py +216 -100
  30. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_elements_worker/test_cli.py +3 -11
  31. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_corpus.py +168 -0
  32. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_elements_worker/test_dataset.py +7 -12
  33. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_element.py +427 -0
  34. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_element_create_multiple.py +715 -0
  35. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_element_create_single.py +528 -0
  36. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_element_list_children.py +969 -0
  37. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_element_list_parents.py +530 -0
  38. arkindex-base-worker-0.3.7rc9/tests/test_elements_worker/test_entities.py → arkindex_base_worker-0.4.0/tests/test_elements_worker/test_entity_create.py +37 -195
  39. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_entity_list_and_check.py +160 -0
  40. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_image.py +66 -0
  41. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_elements_worker/test_metadata.py +230 -139
  42. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_process.py +89 -0
  43. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_elements_worker/test_task.py +8 -18
  44. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_elements_worker/test_training.py +17 -8
  45. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_transcription_create.py +873 -0
  46. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
  47. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_transcription_list.py +450 -0
  48. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_version.py +60 -0
  49. arkindex_base_worker-0.4.0/tests/test_elements_worker/test_worker.py +797 -0
  50. arkindex_base_worker-0.4.0/tests/test_image.py +809 -0
  51. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_merge.py +1 -2
  52. arkindex_base_worker-0.4.0/tests/test_utils.py +120 -0
  53. arkindex-base-worker-0.3.7rc9/arkindex_base_worker.egg-info/requires.txt +0 -21
  54. arkindex-base-worker-0.3.7rc9/tests/test_elements_worker/test_elements.py +0 -2713
  55. arkindex-base-worker-0.3.7rc9/tests/test_elements_worker/test_transcriptions.py +0 -2119
  56. arkindex-base-worker-0.3.7rc9/tests/test_elements_worker/test_worker.py +0 -513
  57. arkindex-base-worker-0.3.7rc9/tests/test_image.py +0 -586
  58. arkindex-base-worker-0.3.7rc9/tests/test_utils.py +0 -57
  59. arkindex-base-worker-0.3.7rc9/worker-demo/tests/__init__.py +0 -0
  60. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/LICENSE +0 -0
  61. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  62. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  63. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/arkindex_worker/__init__.py +0 -0
  64. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/hooks/pre_gen_project.py +0 -0
  65. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/setup.cfg +0 -0
  66. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_cache.py +0 -0
  67. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_element.py +0 -0
  68. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/tests/test_elements_worker/__init__.py +0 -0
  69. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0/worker-demo}/tests/__init__.py +0 -0
  70. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/worker-demo/tests/conftest.py +0 -0
  71. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/worker-demo/tests/test_worker.py +0 -0
  72. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/worker-demo/worker_demo/__init__.py +0 -0
  73. {arkindex-base-worker-0.3.7rc9 → arkindex_base_worker-0.4.0}/worker-demo/worker_demo/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.7rc9
3
+ Version: 0.4.0
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -37,28 +37,24 @@ Classifier: License :: OSI Approved :: MIT License
37
37
  Classifier: Programming Language :: Python :: 3 :: Only
38
38
  Classifier: Programming Language :: Python :: 3.10
39
39
  Classifier: Programming Language :: Python :: 3.11
40
- Classifier: Topic :: Text Processing :: Linguistic
40
+ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: peewee==3.17.0
45
- Requires-Dist: Pillow==10.2.0
46
- Requires-Dist: pymdown-extensions==10.7
47
- Requires-Dist: python-gnupg==0.5.2
48
- Requires-Dist: shapely==2.0.3
49
- Requires-Dist: teklia-toolbox==0.1.4rc3
50
- Requires-Dist: zstandard==0.22.0
44
+ Requires-Dist: humanize==4.11.0
45
+ Requires-Dist: peewee~=3.17
46
+ Requires-Dist: Pillow==11.0.0
47
+ Requires-Dist: python-gnupg==0.5.3
48
+ Requires-Dist: shapely==2.0.6
49
+ Requires-Dist: teklia-toolbox==0.1.7
50
+ Requires-Dist: zstandard==0.23.0
51
51
  Provides-Extra: docs
52
- Requires-Dist: black==24.2.0; extra == "docs"
53
- Requires-Dist: doc8==1.1.1; extra == "docs"
54
- Requires-Dist: mkdocs==1.5.3; extra == "docs"
55
- Requires-Dist: mkdocs-material==9.5.10; extra == "docs"
56
- Requires-Dist: mkdocstrings==0.24.0; extra == "docs"
57
- Requires-Dist: mkdocstrings-python==1.8.0; extra == "docs"
58
- Requires-Dist: recommonmark==0.7.1; extra == "docs"
52
+ Requires-Dist: black==24.10.0; extra == "docs"
53
+ Requires-Dist: mkdocs-material==9.5.48; extra == "docs"
54
+ Requires-Dist: mkdocstrings-python==1.12.2; extra == "docs"
59
55
  Provides-Extra: tests
60
- Requires-Dist: pytest==8.0.1; extra == "tests"
61
- Requires-Dist: pytest-mock==3.12.0; extra == "tests"
56
+ Requires-Dist: pytest==8.3.4; extra == "tests"
57
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
62
58
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
63
59
 
64
60
  # Arkindex base Worker
@@ -73,7 +69,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
73
69
 
74
70
  ## Create a new worker using our template
75
71
 
76
- ```
72
+ ```shell
77
73
  pip install --user cookiecutter
78
74
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
79
75
  ```
@@ -10,7 +10,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
10
10
 
11
11
  ## Create a new worker using our template
12
12
 
13
- ```
13
+ ```shell
14
14
  pip install --user cookiecutter
15
15
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
16
16
  ```
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.7rc9
3
+ Version: 0.4.0
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -37,28 +37,24 @@ Classifier: License :: OSI Approved :: MIT License
37
37
  Classifier: Programming Language :: Python :: 3 :: Only
38
38
  Classifier: Programming Language :: Python :: 3.10
39
39
  Classifier: Programming Language :: Python :: 3.11
40
- Classifier: Topic :: Text Processing :: Linguistic
40
+ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: peewee==3.17.0
45
- Requires-Dist: Pillow==10.2.0
46
- Requires-Dist: pymdown-extensions==10.7
47
- Requires-Dist: python-gnupg==0.5.2
48
- Requires-Dist: shapely==2.0.3
49
- Requires-Dist: teklia-toolbox==0.1.4rc3
50
- Requires-Dist: zstandard==0.22.0
44
+ Requires-Dist: humanize==4.11.0
45
+ Requires-Dist: peewee~=3.17
46
+ Requires-Dist: Pillow==11.0.0
47
+ Requires-Dist: python-gnupg==0.5.3
48
+ Requires-Dist: shapely==2.0.6
49
+ Requires-Dist: teklia-toolbox==0.1.7
50
+ Requires-Dist: zstandard==0.23.0
51
51
  Provides-Extra: docs
52
- Requires-Dist: black==24.2.0; extra == "docs"
53
- Requires-Dist: doc8==1.1.1; extra == "docs"
54
- Requires-Dist: mkdocs==1.5.3; extra == "docs"
55
- Requires-Dist: mkdocs-material==9.5.10; extra == "docs"
56
- Requires-Dist: mkdocstrings==0.24.0; extra == "docs"
57
- Requires-Dist: mkdocstrings-python==1.8.0; extra == "docs"
58
- Requires-Dist: recommonmark==0.7.1; extra == "docs"
52
+ Requires-Dist: black==24.10.0; extra == "docs"
53
+ Requires-Dist: mkdocs-material==9.5.48; extra == "docs"
54
+ Requires-Dist: mkdocstrings-python==1.12.2; extra == "docs"
59
55
  Provides-Extra: tests
60
- Requires-Dist: pytest==8.0.1; extra == "tests"
61
- Requires-Dist: pytest-mock==3.12.0; extra == "tests"
56
+ Requires-Dist: pytest==8.3.4; extra == "tests"
57
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
62
58
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
63
59
 
64
60
  # Arkindex base Worker
@@ -73,7 +69,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
73
69
 
74
70
  ## Create a new worker using our template
75
71
 
76
- ```
72
+ ```shell
77
73
  pip install --user cookiecutter
78
74
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
79
75
  ```
@@ -14,10 +14,13 @@ arkindex_worker/utils.py
14
14
  arkindex_worker/worker/__init__.py
15
15
  arkindex_worker/worker/base.py
16
16
  arkindex_worker/worker/classification.py
17
+ arkindex_worker/worker/corpus.py
17
18
  arkindex_worker/worker/dataset.py
18
19
  arkindex_worker/worker/element.py
19
20
  arkindex_worker/worker/entity.py
21
+ arkindex_worker/worker/image.py
20
22
  arkindex_worker/worker/metadata.py
23
+ arkindex_worker/worker/process.py
21
24
  arkindex_worker/worker/task.py
22
25
  arkindex_worker/worker/training.py
23
26
  arkindex_worker/worker/transcription.py
@@ -33,15 +36,26 @@ tests/test_image.py
33
36
  tests/test_merge.py
34
37
  tests/test_utils.py
35
38
  tests/test_elements_worker/__init__.py
36
- tests/test_elements_worker/test_classifications.py
39
+ tests/test_elements_worker/test_classification.py
37
40
  tests/test_elements_worker/test_cli.py
41
+ tests/test_elements_worker/test_corpus.py
38
42
  tests/test_elements_worker/test_dataset.py
39
- tests/test_elements_worker/test_elements.py
40
- tests/test_elements_worker/test_entities.py
43
+ tests/test_elements_worker/test_element.py
44
+ tests/test_elements_worker/test_element_create_multiple.py
45
+ tests/test_elements_worker/test_element_create_single.py
46
+ tests/test_elements_worker/test_element_list_children.py
47
+ tests/test_elements_worker/test_element_list_parents.py
48
+ tests/test_elements_worker/test_entity_create.py
49
+ tests/test_elements_worker/test_entity_list_and_check.py
50
+ tests/test_elements_worker/test_image.py
41
51
  tests/test_elements_worker/test_metadata.py
52
+ tests/test_elements_worker/test_process.py
42
53
  tests/test_elements_worker/test_task.py
43
54
  tests/test_elements_worker/test_training.py
44
- tests/test_elements_worker/test_transcriptions.py
55
+ tests/test_elements_worker/test_transcription_create.py
56
+ tests/test_elements_worker/test_transcription_create_with_elements.py
57
+ tests/test_elements_worker/test_transcription_list.py
58
+ tests/test_elements_worker/test_version.py
45
59
  tests/test_elements_worker/test_worker.py
46
60
  worker-demo/tests/__init__.py
47
61
  worker-demo/tests/conftest.py
@@ -0,0 +1,17 @@
1
+ humanize==4.11.0
2
+ peewee~=3.17
3
+ Pillow==11.0.0
4
+ python-gnupg==0.5.3
5
+ shapely==2.0.6
6
+ teklia-toolbox==0.1.7
7
+ zstandard==0.23.0
8
+
9
+ [docs]
10
+ black==24.10.0
11
+ mkdocs-material==9.5.48
12
+ mkdocstrings-python==1.12.2
13
+
14
+ [tests]
15
+ pytest==8.3.4
16
+ pytest-mock==3.14.0
17
+ pytest-responses==0.5.1
@@ -380,7 +380,7 @@ def unsupported_cache(func):
380
380
  def wrapper(self, *args, **kwargs):
381
381
  results = func(self, *args, **kwargs)
382
382
 
383
- if not (self.is_read_only or self.use_cache):
383
+ if self.use_cache:
384
384
  logger.warning(
385
385
  f"This API helper `{func.__name__}` did not update the cache database"
386
386
  )
@@ -2,13 +2,18 @@
2
2
  Helper methods to download and open IIIF images, and manage polygons.
3
3
  """
4
4
 
5
+ import functools
6
+ import os
5
7
  import re
8
+ import tempfile
6
9
  from collections import namedtuple
10
+ from collections.abc import Generator, Iterator
7
11
  from io import BytesIO
8
12
  from math import ceil
9
13
  from pathlib import Path
10
14
  from typing import TYPE_CHECKING
11
15
 
16
+ import humanize
12
17
  import requests
13
18
  from PIL import Image
14
19
  from shapely.affinity import rotate, scale, translate
@@ -21,6 +26,7 @@ from tenacity import (
21
26
  )
22
27
 
23
28
  from arkindex_worker import logger
29
+ from arkindex_worker.utils import pluralize
24
30
  from teklia_toolbox.requests import should_verify_cert
25
31
 
26
32
  # Avoid circular imports error when type checking
@@ -39,8 +45,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
39
45
  IIIF_FULL = "full"
40
46
  # Maximum size available
41
47
  IIIF_MAX = "max"
48
+ # Ratio to resize image
49
+ IMAGE_RATIO = [1, 0.9, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50, 0.40, 0.30]
42
50
 
43
51
 
52
+ def update_pillow_image_size_limit(func):
53
+ """
54
+ Update Pillow Image size limit
55
+ """
56
+
57
+ @functools.wraps(func)
58
+ def wrapper(
59
+ *args,
60
+ max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
61
+ **kwargs,
62
+ ):
63
+ """
64
+ Wrapper to update Pillow Image size limit and restore it at the end of the function.
65
+
66
+ :param *args: Positional arguments passed to the function.
67
+ :param max_image_pixels: Pillow Image size limit to use.
68
+ :param **kwargs: Keyword arguments passed to the function.
69
+ """
70
+ MAX_IMAGE_PIXELS = Image.MAX_IMAGE_PIXELS
71
+
72
+ # Override Pillow Image size limit
73
+ if max_image_pixels is not None:
74
+ max_image_pixels = int(max_image_pixels)
75
+ # Override Pillow limit for detecting decompression bombs, disabled if set to 0
76
+ if max_image_pixels == 0:
77
+ logger.warning(
78
+ "Pillow Image size limit is completely disabled, make sure you trust the image source."
79
+ )
80
+ Image.MAX_IMAGE_PIXELS = None
81
+ else:
82
+ Image.MAX_IMAGE_PIXELS = max_image_pixels
83
+
84
+ try:
85
+ results = func(*args, **kwargs)
86
+ except:
87
+ # Restore initial Pillow Image size limit
88
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
89
+ raise
90
+
91
+ # Restore initial Pillow Image size limit
92
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
93
+ return results
94
+
95
+ return wrapper
96
+
97
+
98
+ @update_pillow_image_size_limit
44
99
  def open_image(
45
100
  path: str,
46
101
  mode: str | None = "RGB",
@@ -148,6 +203,70 @@ def upload_image(image: Image, url: str) -> requests.Response:
148
203
  return resp
149
204
 
150
205
 
206
def resized_images(
    *args,
    element: "Element",
    max_pixels: int | None = None,
    max_bytes: int | None = None,
    **kwargs,
) -> Iterator[Generator[tempfile.NamedTemporaryFile, None, None]]:
    """
    Yield increasingly smaller versions of an element's image that fit the given limits.

    Candidate sizes are derived from ``IMAGE_RATIO`` applied to the longest side of the
    element's bounding box, capped by ``max_pixels``, and tried from largest to smallest.
    Candidates whose file size exceeds ``max_bytes`` are skipped.

    :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :param element: Element whose image needs to be resized.
    :param max_pixels: Maximum pixel size of the resized images.
    :param max_bytes: Maximum byte size of the resized images.
    :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :returns: An iterator of the temporary file of the resized image.
    """
    # NOTE(review): the return annotation nests Generator inside Iterator although
    # temporary files are yielded directly; kept as-is, to be confirmed.
    _, _, element_width, element_height = polygon_bounding_box(element.polygon)

    logger.info(f"This element's image sizes are ({element_width} x {element_height}).")
    if max_pixels and max(element_width, element_height) > max_pixels:
        logger.warning(
            f"Maximum image input size supported is ({max_pixels} x {max_pixels})."
        )
        logger.warning("The image will be resized.")

    # The constraint is applied on the longest side of the element
    if element_width > element_height:
        element_pixel, param = element_width, "max_width"
    else:
        element_pixel, param = element_height, "max_height"

    # Deduplicated candidate sizes, largest first
    pixel_cap = max_pixels or element_pixel
    candidate_sizes = sorted(
        {min(round(ratio * element_pixel), pixel_cap) for ratio in IMAGE_RATIO},
        reverse=True,
    )

    for resized_pixel in candidate_sizes:
        open_kwargs = {**kwargs, param: resized_pixel}
        with element.open_image_tempfile(*args, **open_kwargs) as image:
            pillow_image = Image.open(image)
            same_size = (
                pillow_image.width == element_width
                and pillow_image.height == element_height
            )
            if not same_size:
                logger.warning(
                    f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
                )

            # The image is still too large: move on to the next, smaller, candidate
            image_size = Path(image.name).stat().st_size
            if max_bytes and image_size > max_bytes:
                logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
                logger.warning(
                    f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
                )
                logger.warning("The image will be resized.")
                continue

            yield image
268
+
269
+
151
270
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
152
271
  """
153
272
  Compute the rectangle bounding box of a polygon.
@@ -164,7 +283,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
164
283
  def _retry_log(retry_state, *args, **kwargs):
165
284
  logger.warning(
166
285
  f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
167
- f"retrying in {retry_state.idle_for} seconds"
286
+ f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
168
287
  )
169
288
 
170
289
 
@@ -261,6 +261,12 @@ class Transcription(ArkindexModel):
261
261
  """
262
262
 
263
263
 
264
class Image(ArkindexModel):
    """
    Represents an Arkindex image.
    """
268
+
269
+
264
270
  class Dataset(ArkindexModel):
265
271
  """
266
272
  Describes an Arkindex dataset.
@@ -1,15 +1,41 @@
1
1
  import hashlib
2
+ import inspect
2
3
  import logging
3
4
  import os
4
5
  import tarfile
5
6
  import tempfile
7
+ from collections.abc import Callable, Generator
8
+ from itertools import islice
6
9
  from pathlib import Path
10
+ from typing import Any
7
11
 
8
- import zstandard
9
12
  import zstandard as zstd
10
13
 
11
14
  logger = logging.getLogger(__name__)
12
15
 
16
+
17
+ def pluralize(singular: str, count: int) -> str:
18
+ """Pluralize a noun, if necessary, using simplified rules of English pluralization and a list of exceptions.
19
+
20
+ :param str singular: A singular noun describing an object
21
+ :param int count: The object count, to determine whether to pluralize or not
22
+ :return str: The noun in its singular or plural form
23
+ """
24
+ if count == 1:
25
+ return singular
26
+
27
+ some_exceptions = {
28
+ "child": "children",
29
+ "class": "classes",
30
+ "entity": "entities",
31
+ "metadata": "metadata",
32
+ }
33
+ if singular in some_exceptions:
34
+ return some_exceptions[singular]
35
+
36
+ return singular + "s"
37
+
38
+
13
39
  MANUAL_SOURCE = "manual"
14
40
 
15
41
 
@@ -38,7 +64,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
38
64
  :param compressed_archive: Path to the target ZST-compressed archive
39
65
  :return: File descriptor and path to the uncompressed tar archive
40
66
  """
41
- dctx = zstandard.ZstdDecompressor()
67
+ dctx = zstd.ZstdDecompressor()
42
68
  archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
43
69
  archive_path = Path(archive_path)
44
70
 
@@ -50,7 +76,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
50
76
  ):
51
77
  dctx.copy_stream(compressed, decompressed)
52
78
  logger.debug(f"Successfully uncompressed archive {compressed_archive}")
53
- except zstandard.ZstdError as e:
79
+ except zstd.ZstdError as e:
54
80
  raise Exception(f"Couldn't uncompressed archive: {e}") from e
55
81
 
56
82
  return archive_fd, archive_path
@@ -129,7 +155,7 @@ def zstd_compress(
129
155
  archive_hasher.update(compressed_chunk)
130
156
  archive_file.write(compressed_chunk)
131
157
  logger.debug(f"Successfully compressed {source}")
132
- except zstandard.ZstdError as e:
158
+ except zstd.ZstdError as e:
133
159
  raise Exception(f"Couldn't compress archive: {e}") from e
134
160
  return file_d, destination, archive_hasher.hexdigest()
135
161
 
@@ -197,3 +223,58 @@ def create_tar_zst_archive(
197
223
  close_delete_file(tar_fd, tar_archive)
198
224
 
199
225
  return zst_fd, zst_archive, zst_hash, tar_hash
226
+
227
+
228
+ DEFAULT_BATCH_SIZE = 50
229
+ """Batch size used for bulk publication to Arkindex"""
230
+
231
+
232
def batch_publication(func: Callable) -> Callable:
    """
    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.

    The check is applied on the bound arguments, so it covers ``batch_size`` passed
    positionally, by keyword, or left to its default value.

    :param func: The function to wrap with the ``batch_size`` check
    :return: The function passing the ``batch_size`` check
    :raises AssertionError: At call time, when ``batch_size`` is invalid
    """
    # Local import: only this decorator needs it
    import functools

    signature = inspect.signature(func)

    # functools.wraps keeps the name, docstring and signature of the wrapped function
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        bound_func = signature.bind(self, *args, **kwargs)
        bound_func.apply_defaults()
        batch_size = bound_func.arguments.get("batch_size")

        # bool is a subclass of int but is never a meaningful batch size
        is_valid = (
            isinstance(batch_size, int)
            and not isinstance(batch_size, bool)
            and batch_size > 0
        )
        if not is_valid:
            # Raised explicitly (instead of an `assert` statement) so that the check
            # is not stripped when Python runs with -O; the exception type is kept.
            raise AssertionError(
                "batch_size shouldn't be null and should be a strictly positive integer"
            )

        return func(self, *args, **kwargs)

    return wrapper
253
+
254
+
255
def make_batches(
    objects: list, singular_name: str, batch_size: int
) -> Generator[list[Any], None, None]:
    """Split an object list in successive batches of maximum size ``batch_size``.

    :param objects: The object list to divide in batches of ``batch_size`` size
    :param singular_name: The singular form of the noun associated with the object list
    :param batch_size: The maximum size of each batch to split the object list
    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
    :raises ValueError: When iteration starts, if ``batch_size`` is lower than 1
    """
    # A size of 0 would otherwise end the iteration immediately and silently
    # drop every object
    if batch_size < 1:
        raise ValueError("batch_size should be a strictly positive integer")

    count = len(objects)
    logger.info(
        f"Creating batches of size {batch_size} to process {count} {pluralize(singular_name, count)}"
    )

    iterator = iter(objects)
    # Two-argument iter(): keep taking slices until an empty batch is returned
    batches = iter(lambda: list(islice(iterator, batch_size)), [])
    for index, batch in enumerate(batches, start=1):
        count = len(batch)
        logger.info(
            f"Processing batch {index} containing {count} {pluralize(singular_name, count)}..."
        )

        yield batch