arkindex-base-worker 0.4.0b3__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.4.0b3
3
+ Version: 0.4.0rc1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -40,6 +40,7 @@ Classifier: Programming Language :: Python :: 3.11
40
40
  Requires-Python: >=3.10
41
41
  Description-Content-Type: text/markdown
42
42
  License-File: LICENSE
43
+ Requires-Dist: humanize ==4.9.0
43
44
  Requires-Dist: peewee ~=3.17
44
45
  Requires-Dist: Pillow ==10.4.0
45
46
  Requires-Dist: python-gnupg ==0.5.2
@@ -49,7 +50,7 @@ Requires-Dist: zstandard ==0.22.0
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: black ==24.4.2 ; extra == 'docs'
51
52
  Requires-Dist: mkdocs-material ==9.5.31 ; extra == 'docs'
52
- Requires-Dist: mkdocstrings-python ==1.10.7 ; extra == 'docs'
53
+ Requires-Dist: mkdocstrings-python ==1.10.8 ; extra == 'docs'
53
54
  Provides-Extra: tests
54
55
  Requires-Dist: pytest ==8.3.2 ; extra == 'tests'
55
56
  Requires-Dist: pytest-mock ==3.14.0 ; extra == 'tests'
@@ -1,51 +1,51 @@
1
1
  arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
2
2
  arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
3
- arkindex_worker/image.py,sha256=8Y0PYMbTEsFUv8lCNLBu7UaDy6um5YfHCefyXL2jpnE,14347
3
+ arkindex_worker/image.py,sha256=oEgVCrSHiGh3D5-UXfM6PvT17TttSxC0115irpvB3Dw,18581
4
4
  arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
5
5
  arkindex_worker/utils.py,sha256=q1EeLdC6ebYIH-C0LOAqw2cNpjCjVoP-Vbr-39mF4w0,9884
6
- arkindex_worker/worker/__init__.py,sha256=w1VlDzERabXIp625kkHnojyu5ctCM11WLw4ARh1ja3k,19818
7
- arkindex_worker/worker/base.py,sha256=JStHpwSP3bis9LLvV2C2n6GTWtLUVIDA9JPgPJEt17o,18717
6
+ arkindex_worker/worker/__init__.py,sha256=Xzn20bD4THFcnDfPjZeE-uU41m_whs_3yA0WjZb9uqk,18195
7
+ arkindex_worker/worker/base.py,sha256=wyEJB5_zcy4cTvqSXMhX8DLaWQVgvIKO77-uovcprq4,19539
8
8
  arkindex_worker/worker/classification.py,sha256=ECm1cnQPOj_9m-CoO0e182ElSySAUOoyddHrORbShhc,10951
9
9
  arkindex_worker/worker/corpus.py,sha256=s9bCxOszJMwRq1WWAmKjWq888mjDfbaJ18Wo7h-rNOw,1827
10
10
  arkindex_worker/worker/dataset.py,sha256=UXElhhARca9m7Himp-yxD5dAqWbdxDKWOUJUGgeCZXI,2934
11
- arkindex_worker/worker/element.py,sha256=yz7q-emuCIY6MI438QXQk1Cgq991QjYoLewNyUVE4ic,36411
11
+ arkindex_worker/worker/element.py,sha256=1qTnz9Y4nbTSxn274-sRmM2stzT5wJrsbshxXHlBoPw,44789
12
12
  arkindex_worker/worker/entity.py,sha256=qGjQvOVXfP84rER0Dkui6q-rb9nTWerHVG0Z5voB8pU,15229
13
13
  arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
14
14
  arkindex_worker/worker/metadata.py,sha256=VRajtd2kaBvar9GercX4knvR6l1WFYjoCdJWU9ccKgk,7291
15
15
  arkindex_worker/worker/task.py,sha256=1O9zrWXxe3na3TOcoHX5Pxn1875v7EU08BSsCPnb62g,1519
16
16
  arkindex_worker/worker/training.py,sha256=qnBFEk11JOWWPLTbjF-lZ9iFBdTPpQzZAzQ9a03J1j4,10874
17
- arkindex_worker/worker/transcription.py,sha256=8ho-8zmF9LgP86oS59ZZLv5I7tfnZ1yNO2A3pY_9GQ8,21353
17
+ arkindex_worker/worker/transcription.py,sha256=52RY9kYsiR1sz9FxOigyo12Ker3VDbQ4U42gK9DpR3g,21146
18
18
  arkindex_worker/worker/version.py,sha256=JIT7OI3Mo7RPkNrjOB9hfqrsG-FYygz_zi4l8PbkuAo,1960
19
19
  hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
20
20
  tests/__init__.py,sha256=6aeTMHf4q_dKY4jIZWg1KT70VKaLvVlzCxh-Uu_cWiQ,241
21
- tests/conftest.py,sha256=-ZQTV4rg7TgW84-5Ioqndqv8byNILfDOpyUt8wecEiI,21967
22
- tests/test_base_worker.py,sha256=LdFV0LFdNU2IOyEKlX59MB1kuyxHCuhy4Tm7eE_iPiU,24281
21
+ tests/conftest.py,sha256=KNBZ0xMC9xX2pKQXp_4XwVU07JGeTSFeM4rN2RpipfY,21522
22
+ tests/test_base_worker.py,sha256=2EIYcd_3f9O0zB5WiGIQV0Cn9wndLvnEnSfcAE1qWWU,30607
23
23
  tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
24
24
  tests/test_dataset_worker.py,sha256=d9HG36qnO5HXu9vQ0UTBvdTSRR21FVq1FNoXM-vZbPk,22105
25
25
  tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
26
- tests/test_image.py,sha256=Fs9vKYgQ7mEFylbzI4YIO_JyOLeAcs-WxUXpzewxCd8,16188
27
- tests/test_merge.py,sha256=FMdpsm_ncHNmIvOrJ1vcwlyn8o9-SPcpFTcbAsXwK-w,8320
28
- tests/test_utils.py,sha256=zbJC24NyTc3slz3Ed3gJDswjRChjkR5oHEgDoQMOBiE,2588
26
+ tests/test_image.py,sha256=J3jqB5OhcdCpB6n0UnwivxrMlne8YjFLXhq1gBMANrs,26711
27
+ tests/test_merge.py,sha256=TuOeUS0UCz66DPOQFFhc4NQBxIjZL9f5czi4XnvGrr4,8270
28
+ tests/test_utils.py,sha256=_WJUPnt-pM_TQ0er4yjPZy-u_LePrHq1lxwk_teky7M,2544
29
29
  tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
30
30
  tests/test_elements_worker/test_classifications.py,sha256=fXZ8cSzIWwZ6LHsY7tKsy9-Pp9fKyKUStIXS4ViBcek,27779
31
31
  tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
32
32
  tests/test_elements_worker/test_corpus.py,sha256=c_LUHvkJIYgk_wXF06VQPNOoWfiZ06XpjOXrJ7MRiBc,4479
33
33
  tests/test_elements_worker/test_dataset.py,sha256=lSXqubhg1EEq2Y2goE8Y2RYaqIpM9Iejq6fGNW2BczU,11411
34
- tests/test_elements_worker/test_elements.py,sha256=v5MUD-a4gcmuaqG5UHu9AlzSEoRA2dudkht7cEVED_s,93227
34
+ tests/test_elements_worker/test_elements.py,sha256=PBVRIQB8yTCCa22A0VJKIsJSa4gvagDVZVtZT8mlZF0,107199
35
35
  tests/test_elements_worker/test_entities.py,sha256=oav2dtvWWavQe1l3Drbxw1Ta2ocUJEVxJfDQ_r6-rYQ,36181
36
36
  tests/test_elements_worker/test_image.py,sha256=_E3UGdDOwTo1MW5KMS81PrdeSPBPWinWYoQPNy2F9Ro,2077
37
37
  tests/test_elements_worker/test_metadata.py,sha256=cm2NNaXxBYmYMkPexSPVTAqb2skDTB4mliwQCLz8Y98,22293
38
38
  tests/test_elements_worker/test_task.py,sha256=7Sr3fbjdgWUXJUhJEiC9CwnbhQIQX3rCInmHMIrmA38,5573
39
39
  tests/test_elements_worker/test_training.py,sha256=Qxi9EzGr_uKcn2Fh5aE6jNrq1K8QKLiOiSew4upASPs,8721
40
40
  tests/test_elements_worker/test_transcriptions.py,sha256=FNY6E26iTKqe7LP9LO72By4oV4g9hBIZYTU9BAc_w7I,77060
41
- tests/test_elements_worker/test_worker.py,sha256=AwdP8uSXNQ_SJavXxJV2s3_J3OiCafShVjMV1dgt4xo,17162
41
+ tests/test_elements_worker/test_worker.py,sha256=AuFDyqncIusT-rMMY4sEay9MqGvoNuSuZQq-5rHN02U,10803
42
42
  worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
43
  worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
44
44
  worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
45
45
  worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
46
46
  worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
47
- arkindex_base_worker-0.4.0b3.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
48
- arkindex_base_worker-0.4.0b3.dist-info/METADATA,sha256=KpYeTvNM7sruTB38VaQk_TephTtArTv1I6hrMI9iloM,3270
49
- arkindex_base_worker-0.4.0b3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
50
- arkindex_base_worker-0.4.0b3.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
51
- arkindex_base_worker-0.4.0b3.dist-info/RECORD,,
47
+ arkindex_base_worker-0.4.0rc1.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
48
+ arkindex_base_worker-0.4.0rc1.dist-info/METADATA,sha256=22DYiI2CtAzJ9d0P21Y2ZlAoBFX_Ks-yRQMoYlMO5KM,3303
49
+ arkindex_base_worker-0.4.0rc1.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
50
+ arkindex_base_worker-0.4.0rc1.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
51
+ arkindex_base_worker-0.4.0rc1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.1.0)
2
+ Generator: setuptools (73.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
arkindex_worker/image.py CHANGED
@@ -2,13 +2,18 @@
2
2
  Helper methods to download and open IIIF images, and manage polygons.
3
3
  """
4
4
 
5
+ import functools
6
+ import os
5
7
  import re
8
+ import tempfile
6
9
  from collections import namedtuple
10
+ from collections.abc import Generator, Iterator
7
11
  from io import BytesIO
8
12
  from math import ceil
9
13
  from pathlib import Path
10
14
  from typing import TYPE_CHECKING
11
15
 
16
+ import humanize
12
17
  import requests
13
18
  from PIL import Image
14
19
  from shapely.affinity import rotate, scale, translate
@@ -40,8 +45,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
40
45
  IIIF_FULL = "full"
41
46
  # Maximum size available
42
47
  IIIF_MAX = "max"
48
+ # Ratio to resize image
49
+ IMAGE_RATIO = [1, 0.9, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50, 0.40, 0.30]
43
50
 
44
51
 
52
+ def update_pillow_image_size_limit(func):
53
+ """
54
+ Update Pillow Image size limit
55
+ """
56
+
57
+ @functools.wraps(func)
58
+ def wrapper(
59
+ *args,
60
+ max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
61
+ **kwargs,
62
+ ):
63
+ """
64
+ Wrapper to update Pillow Image size limit and restore it at the end of the function.
65
+
66
+ :param *args: Positional arguments passed to the function.
67
+ :param max_image_pixels: Pillow Image size limit to use.
68
+ :param **kwargs: Keyword arguments passed to the function.
69
+ """
70
+ MAX_IMAGE_PIXELS = Image.MAX_IMAGE_PIXELS
71
+
72
+ # Override Pillow Image size limit
73
+ if max_image_pixels is not None:
74
+ max_image_pixels = int(max_image_pixels)
75
+ # Override Pillow limit for detecting decompression bombs, disabled if set to 0
76
+ if max_image_pixels == 0:
77
+ logger.warning(
78
+ "Pillow Image size limit is completely disabled, make sure you trust the image source."
79
+ )
80
+ Image.MAX_IMAGE_PIXELS = None
81
+ else:
82
+ Image.MAX_IMAGE_PIXELS = max_image_pixels
83
+
84
+ try:
85
+ results = func(*args, **kwargs)
86
+ except:
87
+ # Restore initial Pillow Image size limit
88
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
89
+ raise
90
+
91
+ # Restore initial Pillow Image size limit
92
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
93
+ return results
94
+
95
+ return wrapper
96
+
97
+
98
+ @update_pillow_image_size_limit
45
99
  def open_image(
46
100
  path: str,
47
101
  mode: str | None = "RGB",
@@ -149,6 +203,70 @@ def upload_image(image: Image, url: str) -> requests.Response:
149
203
  return resp
150
204
 
151
205
 
206
def resized_images(
    *args,
    element: "Element",
    max_pixels: int | None = None,
    max_bytes: int | None = None,
    **kwargs,
) -> Iterator[Generator[tempfile.NamedTemporaryFile, None, None]]:
    """
    Yield temporary files holding the element's image, resized to fit the given limits.

    Candidate sizes are computed by applying each ratio of ``IMAGE_RATIO`` to the longest
    side of the element's bounding box, capped to ``max_pixels`` when it is set. They are
    tried from the largest to the smallest, and any candidate whose file is heavier than
    ``max_bytes`` is skipped instead of being yielded.

    :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :param element: Element whose image needs to be resized.
    :param max_pixels: Maximum pixel size of the resized images.
    :param max_bytes: Maximum byte size of the resized images.
    :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :returns: An iterator of the temporary file of the resized image.
    """
    _, _, element_width, element_height = polygon_bounding_box(element.polygon)
    logger.info(f"This element's image sizes are ({element_width} x {element_height}).")

    over_pixel_limit = max_pixels and max(element_width, element_height) > max_pixels
    if over_pixel_limit:
        logger.warning(
            f"Maximum image input size supported is ({max_pixels} x {max_pixels})."
        )
        logger.warning("The image will be resized.")

    # The longest side drives the resize, through the matching size keyword argument
    if element_width > element_height:
        longest_side, size_param = element_width, "max_width"
    else:
        longest_side, size_param = element_height, "max_height"

    # Without a pixel limit, the longest side itself is the upper bound
    pixel_cap = max_pixels or longest_side
    candidate_sizes = {
        min(round(ratio * longest_side), pixel_cap) for ratio in IMAGE_RATIO
    }

    for target_size in sorted(candidate_sizes, reverse=True):
        open_kwargs = {**kwargs, size_param: target_size}
        with element.open_image_tempfile(*args, **open_kwargs) as image:
            pillow_image = Image.open(image)
            size_unchanged = (
                pillow_image.width == element_width
                and pillow_image.height == element_height
            )
            if not size_unchanged:
                logger.warning(
                    f"The image was resized to ({pillow_image.width} x {pillow_image.height})."
                )

            image_size = Path(image.name).stat().st_size
            if max_bytes and image_size > max_bytes:
                # The image is still too large, move on to the next smaller candidate
                logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
                logger.warning(
                    f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
                )
                logger.warning("The image will be resized.")
            else:
                yield image
268
+
269
+
152
270
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
153
271
  """
154
272
  Compute the rectangle bounding box of a polygon.
@@ -111,7 +111,7 @@ class ElementsWorker(
111
111
  help="One or more Arkindex element ID",
112
112
  )
113
113
 
114
- def list_elements(self) -> Iterable[CachedElement] | list[str]:
114
+ def get_elements(self) -> Iterable[CachedElement] | list[str]:
115
115
  """
116
116
  List the elements to be processed, either from the CLI arguments or
117
117
  the cache database when enabled.
@@ -173,33 +173,10 @@ class ElementsWorker(
173
173
  ), "Worker must be configured to access its process activity state"
174
174
  return self.process_information.get("activity_state") == "ready"
175
175
 
176
- def configure(self):
177
- """
178
- Setup the worker using CLI arguments and environment variables.
179
- """
180
- # CLI args are stored on the instance so that implementations can access them
181
- self.args = self.parser.parse_args()
182
-
183
- if self.is_read_only:
184
- super().configure_for_developers()
185
- else:
186
- super().configure()
187
- super().configure_cache()
188
-
189
- # Retrieve the model configuration
190
- if self.model_configuration:
191
- self.config.update(self.model_configuration)
192
- logger.info("Model version configuration retrieved")
193
-
194
- # Retrieve the user configuration
195
- if self.user_configuration:
196
- self.config.update(self.user_configuration)
197
- logger.info("User configuration retrieved")
198
-
199
176
  def run(self):
200
177
  """
201
178
  Implements an Arkindex worker that goes through each element returned by
202
- [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
179
+ [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
203
180
  It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
204
181
  catching exceptions, and handles saving WorkerActivity updates when enabled.
205
182
  """
@@ -207,7 +184,7 @@ class ElementsWorker(
207
184
 
208
185
  # List all elements either from JSON file
209
186
  # or direct list of elements on CLI
210
- elements = self.list_elements()
187
+ elements = self.get_elements()
211
188
  if not elements:
212
189
  logger.warning("No elements to process, stopping.")
213
190
  sys.exit(1)
@@ -224,7 +201,7 @@ class ElementsWorker(
224
201
  element = None
225
202
  try:
226
203
  if self.use_cache:
227
- # Just use the result of list_elements as the element
204
+ # Just use the result of get_elements as the element
228
205
  element = item
229
206
  else:
230
207
  # Load element using the Arkindex API
@@ -397,29 +374,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
397
374
  default=[],
398
375
  )
399
376
 
400
- def configure(self):
401
- """
402
- Setup the worker using CLI arguments and environment variables.
403
- """
404
- # CLI args are stored on the instance so that implementations can access them
405
- self.args = self.parser.parse_args()
406
-
407
- if self.is_read_only:
408
- super().configure_for_developers()
409
- else:
410
- super().configure()
411
- super().configure_cache()
412
-
413
- # Retrieve the model configuration
414
- if self.model_configuration:
415
- self.config.update(self.model_configuration)
416
- logger.info("Model version configuration retrieved")
417
-
418
- # Retrieve the user configuration
419
- if self.user_configuration:
420
- self.config.update(self.user_configuration)
421
- logger.info("User configuration retrieved")
422
-
423
377
  def cleanup_downloaded_artifact(self) -> None:
424
378
  """
425
379
  Cleanup the downloaded dataset artifact if any
@@ -219,7 +219,7 @@ class BaseWorker:
219
219
  # Load all required secrets
220
220
  self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
221
221
 
222
- def configure(self):
222
+ def configure_worker_run(self):
223
223
  """
224
224
  Setup the necessary configuration needed using CLI args and environment variables.
225
225
  This is the method called when running a worker on Arkindex.
@@ -320,6 +320,29 @@ class BaseWorker:
320
320
  else:
321
321
  logger.debug("Cache is disabled")
322
322
 
323
def configure(self):
    """
    Setup the worker using CLI arguments and environment variables.

    Parses the CLI arguments, runs the configuration matching the read-only state
    of the worker, then merges the model configuration and the user configuration
    into ``self.config``. The user configuration is merged last.
    """
    # Keep the parsed CLI arguments on the instance so that implementations can access them
    self.args = self.parser.parse_args()

    # NOTE(review): nesting reconstructed from a flattened diff; the cache setup is
    # assumed to belong to the non read-only branch. Confirm against the real file.
    if not self.is_read_only:
        self.configure_worker_run()
        self.configure_cache()
    else:
        self.configure_for_developers()

    # Merge the model configuration, when there is one
    model_configuration = self.model_configuration
    if model_configuration:
        self.config.update(model_configuration)
        logger.info("Model version configuration retrieved")

    # Merge the user configuration, when there is one
    user_configuration = self.user_configuration
    if user_configuration:
        self.config.update(user_configuration)
        logger.info("User configuration retrieved")
345
+
323
346
  def load_secret(self, name: Path):
324
347
  """
325
348
  Load a Ponos secret by name.