arkindex-base-worker 0.4.0b1__tar.gz → 0.4.0b2__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (56)
  1. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/PKG-INFO +1 -1
  2. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/PKG-INFO +1 -1
  3. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/image.py +2 -1
  4. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/utils.py +76 -0
  5. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/__init__.py +3 -2
  6. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/classification.py +31 -15
  7. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/element.py +24 -10
  8. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/entity.py +25 -11
  9. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/metadata.py +18 -8
  10. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/transcription.py +38 -17
  11. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/pyproject.toml +1 -1
  12. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_classifications.py +107 -60
  13. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_elements.py +185 -49
  14. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_entities.py +102 -33
  15. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_metadata.py +223 -98
  16. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_transcriptions.py +293 -143
  17. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_utils.py +28 -0
  18. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/LICENSE +0 -0
  19. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/README.md +0 -0
  20. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
  21. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  22. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/requires.txt +0 -0
  23. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  24. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/__init__.py +0 -0
  25. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/cache.py +0 -0
  26. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/models.py +0 -0
  27. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/base.py +0 -0
  28. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/corpus.py +0 -0
  29. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/dataset.py +0 -0
  30. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/image.py +0 -0
  31. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/task.py +0 -0
  32. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/training.py +0 -0
  33. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/version.py +0 -0
  34. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/hooks/pre_gen_project.py +0 -0
  35. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/setup.cfg +0 -0
  36. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/__init__.py +0 -0
  37. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/conftest.py +0 -0
  38. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_base_worker.py +0 -0
  39. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_cache.py +0 -0
  40. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_dataset_worker.py +0 -0
  41. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_element.py +0 -0
  42. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/__init__.py +0 -0
  43. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_cli.py +0 -0
  44. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_corpus.py +0 -0
  45. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_dataset.py +0 -0
  46. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_image.py +0 -0
  47. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_task.py +0 -0
  48. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_training.py +0 -0
  49. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_worker.py +0 -0
  50. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_image.py +0 -0
  51. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_merge.py +0 -0
  52. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/__init__.py +0 -0
  53. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/conftest.py +0 -0
  54. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/test_worker.py +0 -0
  55. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/worker_demo/__init__.py +0 -0
  56. {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/worker_demo/worker.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0b1
+Version: 0.4.0b2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
arkindex_base_worker.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0b1
+Version: 0.4.0b2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
arkindex_worker/image.py
@@ -21,6 +21,7 @@ from tenacity import (
 )
 
 from arkindex_worker import logger
+from arkindex_worker.utils import pluralize
 from teklia_toolbox.requests import should_verify_cert
 
 # Avoid circular imports error when type checking
@@ -164,7 +165,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
 def _retry_log(retry_state, *args, **kwargs):
     logger.warning(
         f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
-        f"retrying in {retry_state.idle_for} seconds"
+        f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
     )
 
 
arkindex_worker/utils.py
@@ -1,14 +1,36 @@
 import hashlib
+import inspect
 import logging
 import os
 import tarfile
 import tempfile
+from collections.abc import Callable, Generator
+from itertools import islice
 from pathlib import Path
+from typing import Any
 
 import zstandard as zstd
 
 logger = logging.getLogger(__name__)
 
+
+def pluralize(singular: str, count: int) -> str:
+    """Pluralize a noun, if necessary, using simplified rules of English pluralization and a list of exceptions.
+
+    :param str singular: A singular noun describing an object
+    :param int count: The object count, to determine whether to pluralize or not
+    :return str: The noun in its singular or plural form
+    """
+    if count == 1:
+        return singular
+
+    some_exceptions = {"entity": "entities", "metadata": "metadata", "class": "classes"}
+    if singular in some_exceptions:
+        return some_exceptions[singular]
+
+    return singular + "s"
+
+
 MANUAL_SOURCE = "manual"
 
 
@@ -196,3 +218,57 @@ def create_tar_zst_archive(
     close_delete_file(tar_fd, tar_archive)
 
     return zst_fd, zst_archive, zst_hash, tar_hash
+
+
+DEFAULT_BATCH_SIZE = 50
+"""Batch size used for bulk publication to Arkindex"""
+
+
+def batch_publication(func: Callable) -> Callable:
+    """
+    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.
+
+    :param func: The function to wrap with the ``batch_size`` check
+    :return: The function passing the ``batch_size`` check
+    """
+    signature = inspect.signature(func)
+
+    def wrapper(self, *args, **kwargs):
+        bound_func = signature.bind(self, *args, **kwargs)
+        bound_func.apply_defaults()
+        batch_size = bound_func.arguments.get("batch_size")
+        assert (
+            batch_size and isinstance(batch_size, int) and batch_size > 0
+        ), "batch_size shouldn't be null and should be a strictly positive integer"
+
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+def make_batches(
+    objects: list, singular_name: str, batch_size: int
+) -> Generator[list[Any]]:
+    """Split an object list in successive batches of maximum size ``batch_size``.
+
+    :param objects: The object list to divide in batches of ``batch_size`` size
+    :param singular_name: The singular form of the noun associated with the object list
+    :param batch_size: The maximum size of each batch to split the object list
+    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
+    """
+    count = len(objects)
+    logger.info(
+        f"Creating batches of size {batch_size} to process {count} {pluralize(singular_name, count)}"
+    )
+
+    index = 1
+    iterator = iter(objects)
+    while batch := list(islice(iterator, batch_size)):
+        count = len(batch)
+        logger.info(
+            f"Processing batch {index} containing {count} {pluralize(singular_name, count)}..."
+        )
+
+        yield batch
+
+        index += 1
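
For context, a minimal sketch (not part of the released diff) of how these new helpers behave, based only on the code shown above:

    from arkindex_worker.utils import make_batches, pluralize

    # pluralize applies the simplified English rules plus the exception list
    assert pluralize("element", 1) == "element"
    assert pluralize("element", 2) == "elements"
    assert pluralize("entity", 2) == "entities"    # exception list
    assert pluralize("metadata", 5) == "metadata"  # invariant noun

    # make_batches yields successive slices of at most batch_size items
    assert list(make_batches(list(range(5)), "item", batch_size=2)) == [[0, 1], [2, 3], [4]]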
arkindex_worker/worker/__init__.py
@@ -17,6 +17,7 @@ from apistar.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
+from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
@@ -267,7 +268,7 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)
 
-        message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
@@ -529,7 +530,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
             # Cleanup the latest downloaded dataset artifact
             self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
arkindex_worker/worker/classification.py
@@ -8,6 +8,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class ClassificationMixin:
@@ -21,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
+            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -167,10 +173,12 @@ class ClassificationMixin:
 
         return created
 
+    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.
@@ -185,6 +193,8 @@ class ClassificationMixin:
            high_confidence (bool)
               Optional. Whether or not the classification is of high confidence.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: List of created classifications, as returned in the ``classifications`` field by
            the ``CreateClassifications`` API endpoint.
         """
@@ -220,20 +230,26 @@ class ClassificationMixin:
             )
             return
 
-        created_cls = self.api_client.request(
-            "CreateClassifications",
-            body={
-                "parent": str(element.id),
-                "worker_run_id": self.worker_run_id,
-                "classifications": [
-                    {
-                        **classification,
-                        "ml_class": self.get_ml_class_id(classification["ml_class"]),
-                    }
-                    for classification in classifications
-                ],
-            },
-        )["classifications"]
+        created_cls = [
+            created_cl
+            for batch in make_batches(classifications, "classification", batch_size)
+            for created_cl in self.api_client.request(
+                "CreateClassifications",
+                body={
+                    "parent": str(element.id),
+                    "worker_run_id": self.worker_run_id,
+                    "classifications": [
+                        {
+                            **classification,
+                            "ml_class": self.get_ml_class_id(
+                                classification["ml_class"]
+                            ),
+                        }
+                        for classification in batch
+                    ],
+                },
+            )["classifications"]
+        ]
 
         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
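
As an illustration of the new parameter (hypothetical worker and element, not taken from the diff), a large publication is now split into successive API calls:

    # Inside an ElementsWorker subclass, e.g. in process_element(element)
    self.create_classifications(
        element,
        classifications=[
            {"ml_class": "paragraph", "confidence": 0.9, "high_confidence": True},
            # ... potentially hundreds more ...
        ],
        batch_size=100,  # optional; defaults to DEFAULT_BATCH_SIZE (50)
    )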
arkindex_worker/worker/element.py
@@ -12,6 +12,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class ElementType(NamedTuple):
@@ -43,7 +49,7 @@ class ElementMixin:
         }
         count = len(self.corpus_types)
         logger.info(
-            f'Loaded {count} element type{"s"[:count>1]} in corpus ({self.corpus_id}).'
+            f'Loaded {count} element {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )
 
     @unsupported_cache
@@ -94,7 +100,7 @@ class ElementMixin:
             )
         else:
             raise MissingTypeError(
-                f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
+                f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
             )
 
         return True
@@ -176,10 +182,12 @@ class ElementMixin:
 
         return sub_element["id"] if slim_output else sub_element
 
+    @batch_publication
     def create_elements(
         self,
         parent: Element | CachedElement,
         elements: list[dict[str, str | list[list[int | float]] | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create child elements on the given element in a single API request.
@@ -200,6 +208,8 @@ class ElementMixin:
            confidence (float or None)
               Optional confidence score, between 0.0 and 1.0.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
         """
         if isinstance(parent, Element):
@@ -258,14 +268,18 @@ class ElementMixin:
             logger.warning("Cannot create elements as this worker is in read-only mode")
             return
 
-        created_ids = self.api_client.request(
-            "CreateElements",
-            id=parent.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "elements": elements,
-            },
-        )
+        created_ids = [
+            created_id
+            for batch in make_batches(elements, "element", batch_size)
+            for created_id in self.api_client.request(
+                "CreateElements",
+                id=parent.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "elements": batch,
+                },
+            )
+        ]
 
         if self.use_cache:
             # Create the image as needed and handle both an Element and a CachedElement
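
Because create_elements is now wrapped in @batch_publication, an invalid batch_size fails fast, before any API call is made (illustrative call, assuming a worker instance with a parent element and an elements list):

    # Raises AssertionError: batch_size shouldn't be null and should be a
    # strictly positive integer
    self.create_elements(parent, elements, batch_size=0)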
arkindex_worker/worker/entity.py
@@ -15,6 +15,12 @@ from arkindex_worker.cache import (
     unsupported_cache,
 )
 from arkindex_worker.models import Element, Transcription
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)
 
 
 class Entity(TypedDict):
@@ -213,10 +219,12 @@ class EntityMixin:
         return transcription_ent
 
     @unsupported_cache
+    @batch_publication
    def create_transcription_entities(
         self,
         transcription: Transcription,
         entities: list[Entity],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple entities attached to a transcription in a single API request.
  Create multiple entities attached to a transcription in a single API request.
@@ -239,6 +247,8 @@ class EntityMixin:
239
247
  confidence (float or None)
240
248
  Optional confidence score, between 0.0 and 1.0.
241
249
 
250
+ :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
251
+
242
252
  :return: List of dicts, with each dict having a two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
243
253
  """
244
254
  assert transcription and isinstance(
@@ -290,16 +300,20 @@ class EntityMixin:
             )
             return
 
-        created_ids = self.api_client.request(
-            "CreateTranscriptionEntities",
-            id=transcription.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "entities": entities,
-            },
-        )
+        created_entities = [
+            created_entity
+            for batch in make_batches(entities, "entities", batch_size)
+            for created_entity in self.api_client.request(
+                "CreateTranscriptionEntities",
+                id=transcription.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "entities": batch,
+                },
+            )["entities"]
+        ]
 
-        return created_ids["entities"]
+        return created_entities
 
     def list_transcription_entities(
         self,
@@ -383,7 +397,7 @@ class EntityMixin:
         }
         count = len(self.entities)
         logger.info(
-            f'Loaded {count} entit{"ies" if count > 1 else "y"} in corpus ({self.corpus_id})'
+            f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
         )
 
     def list_corpus_entity_types(self):
@@ -398,5 +412,5 @@ class EntityMixin:
         }
         count = len(self.entity_types)
         logger.info(
-            f'Loaded {count} entity type{"s"[:count>1]} in corpus ({self.corpus_id}).'
+            f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )
arkindex_worker/worker/metadata.py
@@ -7,6 +7,7 @@ from enum import Enum
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 
 
 class MetaType(Enum):
@@ -108,10 +109,12 @@ class MetaDataMixin:
         return metadata["id"]
 
     @unsupported_cache
+    @batch_publication
     def create_metadata_bulk(
         self,
         element: Element | CachedElement,
         metadata_list: list[dict[str, MetaType | str | int | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.
@@ -123,6 +126,9 @@ class MetaDataMixin:
            - name: str
            - value: str | int | float
            - entity_id: str | None
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
+        :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
         """
         assert element and isinstance(
             element, Element | CachedElement
@@ -168,14 +174,18 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return
 
-        created_metadata_list = self.api_client.request(
-            "CreateMetaDataBulk",
-            id=element.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "metadata_list": metas,
-            },
-        )["metadata_list"]
+        created_metadata_list = [
+            created_metadata
+            for batch in make_batches(metas, "metadata", batch_size)
+            for created_metadata in self.api_client.request(
+                "CreateMetaDataBulk",
+                id=element.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "metadata_list": batch,
+                },
+            )["metadata_list"]
+        ]
 
         return created_metadata_list
 
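A hedged sketch of bulk metadata publication with the new parameter (the element and the "type" key layout are assumed from the docstring above, not shown in full in this diff):

    from arkindex_worker.worker.metadata import MetaType

    # Inside a worker; `element` is a hypothetical Element or CachedElement
    self.create_metadata_bulk(
        element,
        metadata_list=[
            {"type": MetaType.Text, "name": "page", "value": "1", "entity_id": None},
        ],
        batch_size=10,  # optional; defaults to DEFAULT_BATCH_SIZE (50)
    )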
arkindex_worker/worker/transcription.py
@@ -11,6 +11,7 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches
 
 
 class TextOrientation(Enum):
@@ -109,9 +110,11 @@ class TranscriptionMixin:
 
         return created
 
+    @batch_publication
     def create_transcriptions(
         self,
         transcriptions: list[dict[str, str | float | TextOrientation | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float]]:
         """
         Create multiple transcriptions at once on existing elements through the API,
@@ -128,6 +131,8 @@ class TranscriptionMixin:
            orientation (TextOrientation)
               Optional. Orientation of the transcription's text.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """
 
@@ -171,13 +176,19 @@ class TranscriptionMixin:
             )
             return
 
-        created_trs = self.api_client.request(
-            "CreateTranscriptions",
-            body={
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-            },
-        )["transcriptions"]
+        created_trs = [
+            created_tr
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for created_tr in self.api_client.request(
+                "CreateTranscriptions",
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                },
+            )["transcriptions"]
+        ]
 
         if self.use_cache:
             # Store transcriptions in local cache
@@ -201,11 +212,13 @@ class TranscriptionMixin:
 
         return created_trs
 
+    @batch_publication
     def create_element_transcriptions(
         self,
         element: Element | CachedElement,
         sub_element_type: str,
         transcriptions: list[dict[str, str | float]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> dict[str, str | bool]:
         """
         Create multiple elements and transcriptions at once on a single parent element through the API.
@@ -225,6 +238,8 @@ class TranscriptionMixin:
            element_confidence (float)
               Optional. Confidence score of the element between 0 and 1.
 
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
         assert element and isinstance(
@@ -291,16 +306,22 @@ class TranscriptionMixin:
             )
             return
 
-        annotations = self.api_client.request(
-            "CreateElementTranscriptions",
-            id=element.id,
-            body={
-                "element_type": sub_element_type,
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-                "return_elements": True,
-            },
-        )
+        annotations = [
+            annotation
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for annotation in self.api_client.request(
+                "CreateElementTranscriptions",
+                id=element.id,
+                body={
+                    "element_type": sub_element_type,
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                    "return_elements": True,
+                },
+            )
+        ]
 
         for annotation in annotations:
             if annotation["created"]:
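
The same batching applies here; a sketch with illustrative values (the parent element, polygon, and transcription keys are assumptions drawn from the docstring, not shown in full in this diff):

    annotations = self.create_element_transcriptions(
        element,
        sub_element_type="text_line",
        transcriptions=[
            {
                "polygon": [[0, 0], [0, 100], [500, 100], [500, 0]],
                "text": "A line of text",
                "confidence": 0.85,
            },
        ],
        batch_size=25,  # optional; defaults to DEFAULT_BATCH_SIZE (50)
    )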
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "arkindex-base-worker"
-version = "0.4.0b1"
+version = "0.4.0b2"
 description = "Base Worker to easily build Arkindex ML workflows"
 license = { file = "LICENSE" }
 dependencies = [