arkindex-base-worker 0.4.0b1__py3-none-any.whl → 0.4.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: arkindex-base-worker
- Version: 0.4.0b1
+ Version: 0.4.0b3
  Summary: Base Worker to easily build Arkindex ML workflows
  Author-email: Teklia <contact@teklia.com>
  Maintainer-email: Teklia <contact@teklia.com>
@@ -1,20 +1,20 @@
  arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
  arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
- arkindex_worker/image.py,sha256=5ymIGaTm2D7Sp2YYQkbuheuGnx5VJo0_AzYAEIvNGhs,14267
+ arkindex_worker/image.py,sha256=8Y0PYMbTEsFUv8lCNLBu7UaDy6um5YfHCefyXL2jpnE,14347
  arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
- arkindex_worker/utils.py,sha256=KXWIACda7D3IpdToaAplLoAgnCK8bKWw7aWUyq-IWUA,7211
- arkindex_worker/worker/__init__.py,sha256=belqRtbs0raTdFJoQJoGBoDJkUOrEE3wyXv90f85bTs,19760
+ arkindex_worker/utils.py,sha256=q1EeLdC6ebYIH-C0LOAqw2cNpjCjVoP-Vbr-39mF4w0,9884
+ arkindex_worker/worker/__init__.py,sha256=w1VlDzERabXIp625kkHnojyu5ctCM11WLw4ARh1ja3k,19818
  arkindex_worker/worker/base.py,sha256=JStHpwSP3bis9LLvV2C2n6GTWtLUVIDA9JPgPJEt17o,18717
- arkindex_worker/worker/classification.py,sha256=4YAY4weF6kMSMsoYiz6oia3SN21PzRR1bAdhMJCGBbw,10361
+ arkindex_worker/worker/classification.py,sha256=ECm1cnQPOj_9m-CoO0e182ElSySAUOoyddHrORbShhc,10951
  arkindex_worker/worker/corpus.py,sha256=s9bCxOszJMwRq1WWAmKjWq888mjDfbaJ18Wo7h-rNOw,1827
  arkindex_worker/worker/dataset.py,sha256=UXElhhARca9m7Himp-yxD5dAqWbdxDKWOUJUGgeCZXI,2934
- arkindex_worker/worker/element.py,sha256=kMaJNXEfZbFBK4YYc3XLqyGvPyNvJs7mJG2T_a1c7D0,34294
- arkindex_worker/worker/entity.py,sha256=BbQp56kxTPmOQI482TUFZ8KOXISj7KtQAyHRT0CmedM,14744
+ arkindex_worker/worker/element.py,sha256=yz7q-emuCIY6MI438QXQk1Cgq991QjYoLewNyUVE4ic,36411
+ arkindex_worker/worker/entity.py,sha256=qGjQvOVXfP84rER0Dkui6q-rb9nTWerHVG0Z5voB8pU,15229
  arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
- arkindex_worker/worker/metadata.py,sha256=PFO0oJc8N91HIpj4yHLscwGW5UFRXtuyQYfEXW27-WQ,6724
+ arkindex_worker/worker/metadata.py,sha256=VRajtd2kaBvar9GercX4knvR6l1WFYjoCdJWU9ccKgk,7291
  arkindex_worker/worker/task.py,sha256=1O9zrWXxe3na3TOcoHX5Pxn1875v7EU08BSsCPnb62g,1519
  arkindex_worker/worker/training.py,sha256=qnBFEk11JOWWPLTbjF-lZ9iFBdTPpQzZAzQ9a03J1j4,10874
- arkindex_worker/worker/transcription.py,sha256=9TC3E6zu_CnQKWsaTAzI83TrSfMuzh3KSMOCLdbEG18,20497
+ arkindex_worker/worker/transcription.py,sha256=8ho-8zmF9LgP86oS59ZZLv5I7tfnZ1yNO2A3pY_9GQ8,21353
  arkindex_worker/worker/version.py,sha256=JIT7OI3Mo7RPkNrjOB9hfqrsG-FYygz_zi4l8PbkuAo,1960
  hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
  tests/__init__.py,sha256=6aeTMHf4q_dKY4jIZWg1KT70VKaLvVlzCxh-Uu_cWiQ,241
@@ -25,27 +25,27 @@ tests/test_dataset_worker.py,sha256=d9HG36qnO5HXu9vQ0UTBvdTSRR21FVq1FNoXM-vZbPk,
  tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
  tests/test_image.py,sha256=Fs9vKYgQ7mEFylbzI4YIO_JyOLeAcs-WxUXpzewxCd8,16188
  tests/test_merge.py,sha256=FMdpsm_ncHNmIvOrJ1vcwlyn8o9-SPcpFTcbAsXwK-w,8320
- tests/test_utils.py,sha256=vpeHMeL7bJQonv5ZEbJmlJikqVKn5VWlVEbvmYFzDYA,1650
+ tests/test_utils.py,sha256=zbJC24NyTc3slz3Ed3gJDswjRChjkR5oHEgDoQMOBiE,2588
  tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
- tests/test_elements_worker/test_classifications.py,sha256=DYRKhPpplFp144GCXKyFG1hz4Ra9vk5FiAN6dhfMP6k,25511
+ tests/test_elements_worker/test_classifications.py,sha256=fXZ8cSzIWwZ6LHsY7tKsy9-Pp9fKyKUStIXS4ViBcek,27779
  tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
  tests/test_elements_worker/test_corpus.py,sha256=c_LUHvkJIYgk_wXF06VQPNOoWfiZ06XpjOXrJ7MRiBc,4479
  tests/test_elements_worker/test_dataset.py,sha256=lSXqubhg1EEq2Y2goE8Y2RYaqIpM9Iejq6fGNW2BczU,11411
- tests/test_elements_worker/test_elements.py,sha256=HH8jUU4xHp5gXcrGJLQlo4kLFh7oYfMxO3QQEYo2itg,84885
- tests/test_elements_worker/test_entities.py,sha256=jirb_IKAMqMhwxeDgjO-rsr1fTP9GdXwuyhncUjCJFM,33494
+ tests/test_elements_worker/test_elements.py,sha256=v5MUD-a4gcmuaqG5UHu9AlzSEoRA2dudkht7cEVED_s,93227
+ tests/test_elements_worker/test_entities.py,sha256=oav2dtvWWavQe1l3Drbxw1Ta2ocUJEVxJfDQ_r6-rYQ,36181
  tests/test_elements_worker/test_image.py,sha256=_E3UGdDOwTo1MW5KMS81PrdeSPBPWinWYoQPNy2F9Ro,2077
- tests/test_elements_worker/test_metadata.py,sha256=-cZhlVAh4o2uRnHz8fPf_thfavRnJrtJYN_p4BmHISU,17566
+ tests/test_elements_worker/test_metadata.py,sha256=cm2NNaXxBYmYMkPexSPVTAqb2skDTB4mliwQCLz8Y98,22293
  tests/test_elements_worker/test_task.py,sha256=7Sr3fbjdgWUXJUhJEiC9CwnbhQIQX3rCInmHMIrmA38,5573
  tests/test_elements_worker/test_training.py,sha256=Qxi9EzGr_uKcn2Fh5aE6jNrq1K8QKLiOiSew4upASPs,8721
- tests/test_elements_worker/test_transcriptions.py,sha256=7HDkIW8IDK7pKAfpSdAPB7YOyKyeBJTn2_alvVK46SA,72411
+ tests/test_elements_worker/test_transcriptions.py,sha256=FNY6E26iTKqe7LP9LO72By4oV4g9hBIZYTU9BAc_w7I,77060
  tests/test_elements_worker/test_worker.py,sha256=AwdP8uSXNQ_SJavXxJV2s3_J3OiCafShVjMV1dgt4xo,17162
  worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
  worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
  worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
  worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
- arkindex_base_worker-0.4.0b1.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
- arkindex_base_worker-0.4.0b1.dist-info/METADATA,sha256=02rPRlcFlghY1Trb-_trpdCCMME1A9FmPzrY8wzzLDg,3270
- arkindex_base_worker-0.4.0b1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- arkindex_base_worker-0.4.0b1.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
- arkindex_base_worker-0.4.0b1.dist-info/RECORD,,
+ arkindex_base_worker-0.4.0b3.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+ arkindex_base_worker-0.4.0b3.dist-info/METADATA,sha256=KpYeTvNM7sruTB38VaQk_TephTtArTv1I6hrMI9iloM,3270
+ arkindex_base_worker-0.4.0b3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ arkindex_base_worker-0.4.0b3.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
+ arkindex_base_worker-0.4.0b3.dist-info/RECORD,,
arkindex_worker/image.py CHANGED
@@ -21,6 +21,7 @@ from tenacity import (
  )
 
  from arkindex_worker import logger
+ from arkindex_worker.utils import pluralize
  from teklia_toolbox.requests import should_verify_cert
 
  # Avoid circular imports error when type checking
@@ -164,7 +165,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
  def _retry_log(retry_state, *args, **kwargs):
      logger.warning(
          f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
-         f"retrying in {retry_state.idle_for} seconds"
+         f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
      )
 
 
arkindex_worker/utils.py CHANGED
@@ -1,14 +1,41 @@
  import hashlib
+ import inspect
  import logging
  import os
  import tarfile
  import tempfile
+ from collections.abc import Callable, Generator
+ from itertools import islice
  from pathlib import Path
+ from typing import Any
 
  import zstandard as zstd
 
  logger = logging.getLogger(__name__)
 
+
+ def pluralize(singular: str, count: int) -> str:
+     """Pluralize a noun, if necessary, using simplified rules of English pluralization and a list of exceptions.
+
+     :param str singular: A singular noun describing an object
+     :param int count: The object count, to determine whether to pluralize or not
+     :return str: The noun in its singular or plural form
+     """
+     if count == 1:
+         return singular
+
+     some_exceptions = {
+         "child": "children",
+         "class": "classes",
+         "entity": "entities",
+         "metadata": "metadata",
+     }
+     if singular in some_exceptions:
+         return some_exceptions[singular]
+
+     return singular + "s"
+
+
  MANUAL_SOURCE = "manual"
 
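The helper is deliberately naive. As a quick sanity check (an illustrative sketch, not taken from the package's test suite), the new `pluralize` behaves as follows:

    from arkindex_worker.utils import pluralize

    assert pluralize("element", 1) == "element"    # an exact count of 1 stays singular
    assert pluralize("element", 2) == "elements"   # default rule: append "s"
    assert pluralize("entity", 2) == "entities"    # covered by the exception table
    assert pluralize("metadata", 5) == "metadata"  # treated as uncountable
    assert pluralize("child", 0) == "children"     # any count other than 1 pluralizes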
@@ -196,3 +223,57 @@ def create_tar_zst_archive(
      close_delete_file(tar_fd, tar_archive)
 
      return zst_fd, zst_archive, zst_hash, tar_hash
+
+
+ DEFAULT_BATCH_SIZE = 50
+ """Batch size used for bulk publication to Arkindex"""
+
+
+ def batch_publication(func: Callable) -> Callable:
+     """
+     Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.
+
+     :param func: The function to wrap with the ``batch_size`` check
+     :return: The function passing the ``batch_size`` check
+     """
+     signature = inspect.signature(func)
+
+     def wrapper(self, *args, **kwargs):
+         bound_func = signature.bind(self, *args, **kwargs)
+         bound_func.apply_defaults()
+         batch_size = bound_func.arguments.get("batch_size")
+         assert (
+             batch_size and isinstance(batch_size, int) and batch_size > 0
+         ), "batch_size shouldn't be null and should be a strictly positive integer"
+
+         return func(self, *args, **kwargs)
+
+     return wrapper
+
+
+ def make_batches(
+     objects: list, singular_name: str, batch_size: int
+ ) -> Generator[list[Any]]:
+     """Split an object list in successive batches of maximum size ``batch_size``.
+
+     :param objects: The object list to divide in batches of ``batch_size`` size
+     :param singular_name: The singular form of the noun associated with the object list
+     :param batch_size: The maximum size of each batch to split the object list
+     :return: A generator of successive batches containing ``batch_size`` items from ``objects``
+     """
+     count = len(objects)
+     logger.info(
+         f"Creating batches of size {batch_size} to process {count} {pluralize(singular_name, count)}"
+     )
+
+     index = 1
+     iterator = iter(objects)
+     while batch := list(islice(iterator, batch_size)):
+         count = len(batch)
+         logger.info(
+             f"Processing batch {index} containing {count} {pluralize(singular_name, count)}..."
+         )
+
+         yield batch
+
+         index += 1
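Taken together, the new utilities give every bulk endpoint the same shape: `batch_publication` validates the `batch_size` argument and `make_batches` chunks the payload. A minimal sketch of how they compose, using a hypothetical `Publisher` class (the real consumers are the mixins below):

    from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches

    class Publisher:
        @batch_publication
        def publish(self, items: list, batch_size: int = DEFAULT_BATCH_SIZE) -> int:
            # make_batches yields lists of at most batch_size items, logging progress as it goes
            return sum(len(batch) for batch in make_batches(items, "item", batch_size))

    publisher = Publisher()
    assert publisher.publish(list(range(125))) == 125  # three batches: 50, 50, 25
    publisher.publish([1, 2, 3], batch_size=0)         # AssertionError from the decorator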
arkindex_worker/worker/__init__.py CHANGED
@@ -17,6 +17,7 @@ from apistar.exceptions import ErrorResponse
  from arkindex_worker import logger
  from arkindex_worker.cache import CachedElement
  from arkindex_worker.models import Dataset, Element, Set
+ from arkindex_worker.utils import pluralize
  from arkindex_worker.worker.base import BaseWorker
  from arkindex_worker.worker.classification import ClassificationMixin
  from arkindex_worker.worker.corpus import CorpusMixin
@@ -267,7 +268,7 @@ class ElementsWorker(
                  with contextlib.suppress(Exception):
                      self.update_activity(element.id, ActivityState.Error)
 
-         message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+         message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
          if failed:
              logger.error(message)
          if failed >= count:  # Everything failed!
@@ -529,7 +530,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
          # Cleanup the latest downloaded dataset artifact
          self.cleanup_downloaded_artifact()
 
-         message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+         message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
          if failed:
              logger.error(message)
          if failed >= count:  # Everything failed!
arkindex_worker/worker/classification.py CHANGED
@@ -8,6 +8,12 @@ from peewee import IntegrityError
  from arkindex_worker import logger
  from arkindex_worker.cache import CachedClassification, CachedElement
  from arkindex_worker.models import Element
+ from arkindex_worker.utils import (
+     DEFAULT_BATCH_SIZE,
+     batch_publication,
+     make_batches,
+     pluralize,
+ )
 
 
  class ClassificationMixin:
@@ -21,7 +27,7 @@ class ClassificationMixin:
          )
          self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
          logger.info(
-             f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
+             f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
          )
 
      def get_ml_class_id(self, ml_class: str) -> str:
@@ -167,10 +173,12 @@
 
          return created
 
+     @batch_publication
      def create_classifications(
          self,
          element: Element | CachedElement,
          classifications: list[dict[str, str | float | bool]],
+         batch_size: int = DEFAULT_BATCH_SIZE,
      ) -> list[dict[str, str | float | bool]]:
          """
          Create multiple classifications at once on the given element through the API.
@@ -185,6 +193,8 @@
              high_confidence (bool)
                  Optional. Whether or not the classification is of high confidence.
 
+         :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
          :returns: List of created classifications, as returned in the ``classifications`` field by
              the ``CreateClassifications`` API endpoint.
          """
@@ -220,20 +230,26 @@
              )
              return
 
-         created_cls = self.api_client.request(
-             "CreateClassifications",
-             body={
-                 "parent": str(element.id),
-                 "worker_run_id": self.worker_run_id,
-                 "classifications": [
-                     {
-                         **classification,
-                         "ml_class": self.get_ml_class_id(classification["ml_class"]),
-                     }
-                     for classification in classifications
-                 ],
-             },
-         )["classifications"]
+         created_cls = [
+             created_cl
+             for batch in make_batches(classifications, "classification", batch_size)
+             for created_cl in self.api_client.request(
+                 "CreateClassifications",
+                 body={
+                     "parent": str(element.id),
+                     "worker_run_id": self.worker_run_id,
+                     "classifications": [
+                         {
+                             **classification,
+                             "ml_class": self.get_ml_class_id(
+                                 classification["ml_class"]
+                             ),
+                         }
+                         for classification in batch
+                     ],
+                 },
+             )["classifications"]
+         ]
 
          for created_cl in created_cls:
              created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
arkindex_worker/worker/element.py CHANGED
@@ -3,6 +3,7 @@ ElementsWorker methods for elements and element types.
  """
 
  from collections.abc import Iterable
+ from operator import attrgetter
  from typing import NamedTuple
  from uuid import UUID
  from warnings import warn
@@ -12,6 +13,12 @@ from peewee import IntegrityError
  from arkindex_worker import logger
  from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
  from arkindex_worker.models import Element
+ from arkindex_worker.utils import (
+     DEFAULT_BATCH_SIZE,
+     batch_publication,
+     make_batches,
+     pluralize,
+ )
 
 
  class ElementType(NamedTuple):
@@ -43,7 +50,7 @@ class ElementMixin:
          }
          count = len(self.corpus_types)
          logger.info(
-             f'Loaded {count} element type{"s"[:count>1]} in corpus ({self.corpus_id}).'
+             f'Loaded {count} element {pluralize("type", count)} in corpus ({self.corpus_id}).'
          )
 
      @unsupported_cache
@@ -94,7 +101,7 @@ class ElementMixin:
              )
          else:
              raise MissingTypeError(
-                 f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
+                 f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
              )
 
          return True
@@ -176,10 +183,12 @@
 
          return sub_element["id"] if slim_output else sub_element
 
+     @batch_publication
      def create_elements(
          self,
          parent: Element | CachedElement,
          elements: list[dict[str, str | list[list[int | float]] | float | None]],
+         batch_size: int = DEFAULT_BATCH_SIZE,
      ) -> list[dict[str, str]]:
          """
          Create child elements on the given element in a single API request.
@@ -200,6 +209,8 @@
              confidence (float or None)
                  Optional confidence score, between 0.0 and 1.0.
 
+         :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
          :return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
          """
          if isinstance(parent, Element):
@@ -258,14 +269,18 @@
              logger.warning("Cannot create elements as this worker is in read-only mode")
              return
 
-         created_ids = self.api_client.request(
-             "CreateElements",
-             id=parent.id,
-             body={
-                 "worker_run_id": self.worker_run_id,
-                 "elements": elements,
-             },
-         )
+         created_ids = [
+             created_id
+             for batch in make_batches(elements, "element", batch_size)
+             for created_id in self.api_client.request(
+                 "CreateElements",
+                 id=parent.id,
+                 body={
+                     "worker_run_id": self.worker_run_id,
+                     "elements": batch,
+                 },
+             )
+         ]
 
          if self.use_cache:
              # Create the image as needed and handle both an Element and a CachedElement
@@ -332,6 +347,52 @@
              child=child.id,
          )
 
+     @unsupported_cache
+     @batch_publication
+     def create_element_children(
+         self,
+         parent: Element,
+         children: list[Element],
+         batch_size: int = DEFAULT_BATCH_SIZE,
+     ) -> list[str]:
+         """
+         Link multiple elements to a single parent through the API.
+
+         :param parent: Parent element.
+         :param children: A list of child elements.
+         :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
+         :returns: A list containing the string UUID of each child linked to the parent.
+         """
+         assert parent and isinstance(
+             parent, Element
+         ), "parent shouldn't be null and should be of type Element"
+
+         assert children and isinstance(
+             children, list
+         ), "children shouldn't be null and should be of type list"
+
+         for index, child in enumerate(children):
+             assert isinstance(
+                 child, Element
+             ), f"Child at index {index} in children: Should be of type Element"
+
+         if self.is_read_only:
+             logger.warning("Cannot link elements as this worker is in read-only mode")
+             return
+
+         return [
+             child_id
+             for batch in make_batches(children, "child", batch_size)
+             for child_id in self.api_client.request(
+                 "CreateElementChildren",
+                 id=parent.id,
+                 body={
+                     "children": list(map(attrgetter("id"), batch)),
+                 },
+             )["children"]
+         ]
+
      def partial_update_element(
          self, element: Element | CachedElement, **kwargs
      ) -> dict:
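A usage sketch for the new linking helper, assuming `self` is a worker using `ElementMixin` and that `parent` and `children` are `Element` instances obtained elsewhere (hypothetical setup):

    child_ids = self.create_element_children(
        parent=parent,
        children=children,  # list of existing Element objects to attach to parent
        batch_size=200,     # optional; one CreateElementChildren call per batch of at most 200
    )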
arkindex_worker/worker/entity.py CHANGED
@@ -15,6 +15,12 @@ from arkindex_worker.cache import (
      unsupported_cache,
  )
  from arkindex_worker.models import Element, Transcription
+ from arkindex_worker.utils import (
+     DEFAULT_BATCH_SIZE,
+     batch_publication,
+     make_batches,
+     pluralize,
+ )
 
 
  class Entity(TypedDict):
@@ -213,10 +219,12 @@
          return transcription_ent
 
      @unsupported_cache
+     @batch_publication
      def create_transcription_entities(
          self,
          transcription: Transcription,
          entities: list[Entity],
+         batch_size: int = DEFAULT_BATCH_SIZE,
      ) -> list[dict[str, str]]:
          """
          Create multiple entities attached to a transcription in a single API request.
@@ -239,6 +247,8 @@
              confidence (float or None)
                  Optional confidence score, between 0.0 and 1.0.
 
+         :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
          :return: List of dicts, with each dict having a two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
          """
          assert transcription and isinstance(
@@ -290,16 +300,20 @@
              )
              return
 
-         created_ids = self.api_client.request(
-             "CreateTranscriptionEntities",
-             id=transcription.id,
-             body={
-                 "worker_run_id": self.worker_run_id,
-                 "entities": entities,
-             },
-         )
+         created_entities = [
+             created_entity
+             for batch in make_batches(entities, "entities", batch_size)
+             for created_entity in self.api_client.request(
+                 "CreateTranscriptionEntities",
+                 id=transcription.id,
+                 body={
+                     "worker_run_id": self.worker_run_id,
+                     "entities": batch,
+                 },
+             )["entities"]
+         ]
 
-         return created_ids["entities"]
+         return created_entities
 
      def list_transcription_entities(
          self,
@@ -383,7 +397,7 @@
          }
          count = len(self.entities)
          logger.info(
-             f'Loaded {count} entit{"ies" if count > 1 else "y"} in corpus ({self.corpus_id})'
+             f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
          )
 
      def list_corpus_entity_types(self):
@@ -398,5 +412,5 @@
          }
          count = len(self.entity_types)
          logger.info(
-             f'Loaded {count} entity type{"s"[:count>1]} in corpus ({self.corpus_id}).'
+             f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
          )
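A hedged sketch of the batched entity publication, assuming `self` is a worker using `EntityMixin`, `transcription` is a `Transcription`, and `type_id` is a known entity type UUID (all hypothetical here):

    ids = self.create_transcription_entities(
        transcription,
        entities=[
            {"name": "Teklia", "type_id": type_id, "offset": 0, "length": 6},
        ],
        batch_size=25,  # optional; entities are published 25 at a time
    )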
arkindex_worker/worker/metadata.py CHANGED
@@ -7,6 +7,7 @@ from enum import Enum
  from arkindex_worker import logger
  from arkindex_worker.cache import CachedElement, unsupported_cache
  from arkindex_worker.models import Element
+ from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 
 
  class MetaType(Enum):
@@ -108,10 +109,12 @@ class MetaDataMixin:
          return metadata["id"]
 
      @unsupported_cache
+     @batch_publication
      def create_metadata_bulk(
          self,
          element: Element | CachedElement,
          metadata_list: list[dict[str, MetaType | str | int | float | None]],
+         batch_size: int = DEFAULT_BATCH_SIZE,
      ) -> list[dict[str, str]]:
          """
          Create multiple metadata on an existing element.
@@ -123,6 +126,9 @@
          - name: str
          - value: str | int | float
          - entity_id: str | None
+         :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
+         :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
          """
          assert element and isinstance(
              element, Element | CachedElement
@@ -168,14 +174,18 @@
              logger.warning("Cannot create metadata as this worker is in read-only mode")
              return
 
-         created_metadata_list = self.api_client.request(
-             "CreateMetaDataBulk",
-             id=element.id,
-             body={
-                 "worker_run_id": self.worker_run_id,
-                 "metadata_list": metas,
-             },
-         )["metadata_list"]
+         created_metadata_list = [
+             created_metadata
+             for batch in make_batches(metas, "metadata", batch_size)
+             for created_metadata in self.api_client.request(
+                 "CreateMetaDataBulk",
+                 id=element.id,
+                 body={
+                     "worker_run_id": self.worker_run_id,
+                     "metadata_list": batch,
+                 },
+             )["metadata_list"]
+         ]
 
          return created_metadata_list
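Finally, a usage sketch for the batched metadata publication, assuming `self` is a worker using `MetaDataMixin` and `element` is the element being processed (illustrative names):

    from arkindex_worker.worker.metadata import MetaType

    created = self.create_metadata_bulk(
        element,
        metadata_list=[
            {"type": MetaType.Text, "name": "language", "value": "fra"},
            {"type": MetaType.Numeric, "name": "page_number", "value": 3},
        ],
        batch_size=10,  # optional; one CreateMetaDataBulk call per batch of at most 10
    )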