arkindex-base-worker 0.4.0b1__tar.gz → 0.4.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/PKG-INFO +1 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/PKG-INFO +1 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/image.py +2 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/utils.py +76 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/__init__.py +3 -2
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/classification.py +31 -15
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/element.py +24 -10
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/entity.py +25 -11
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/metadata.py +18 -8
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/transcription.py +38 -17
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/pyproject.toml +1 -1
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_classifications.py +107 -60
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_elements.py +185 -49
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_entities.py +102 -33
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_metadata.py +223 -98
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_transcriptions.py +293 -143
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_utils.py +28 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/README.md +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/requires.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/cache.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/models.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/base.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/corpus.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/dataset.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/image.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/task.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/training.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/version.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/setup.cfg +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_base_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_cache.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_dataset_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_element.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_training.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_elements_worker/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_image.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/tests/test_merge.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/worker-demo/worker_demo/worker.py +0 -0
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/image.py
RENAMED

@@ -21,6 +21,7 @@ from tenacity import (
 )

 from arkindex_worker import logger
+from arkindex_worker.utils import pluralize
 from teklia_toolbox.requests import should_verify_cert

 # Avoid circular imports error when type checking
@@ -164,7 +165,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
 def _retry_log(retry_state, *args, **kwargs):
     logger.warning(
         f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
-        f"retrying in {retry_state.idle_for} seconds"
+        f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
     )
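To see where the new `pluralize` call lands at runtime: `_retry_log` is a tenacity callback, and a retried helper can hook it in through `before_sleep`. The wiring below (the `fetch` function and the stop/wait policies) is an illustrative assumption; only the warning format comes from the diff:

```python
import logging

import requests
from tenacity import retry, stop_after_attempt, wait_exponential

from arkindex_worker.utils import pluralize  # helper added in this release

logger = logging.getLogger(__name__)


def _retry_log(retry_state, *args, **kwargs):
    # tenacity hands us a RetryCallState; idle_for is the upcoming sleep in
    # seconds, so the message reads "retrying in 1 second" or "retrying in 2 seconds"
    logger.warning(
        f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
        f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
    )


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(),
    before_sleep=_retry_log,  # assumed wiring; the diff only shows the log format
    reraise=True,
)
def fetch(url: str) -> bytes:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
```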
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/utils.py
RENAMED

@@ -1,14 +1,36 @@
 import hashlib
+import inspect
 import logging
 import os
 import tarfile
 import tempfile
+from collections.abc import Callable, Generator
+from itertools import islice
 from pathlib import Path
+from typing import Any

 import zstandard as zstd

 logger = logging.getLogger(__name__)

+
+def pluralize(singular: str, count: int) -> str:
+    """Pluralize a noun, if necessary, using simplified rules of English pluralization and a list of exceptions.
+
+    :param str singular: A singular noun describing an object
+    :param int count: The object count, to determine whether to pluralize or not
+    :return str: The noun in its singular or plural form
+    """
+    if count == 1:
+        return singular
+
+    some_exceptions = {"entity": "entities", "metadata": "metadata", "class": "classes"}
+    if singular in some_exceptions:
+        return some_exceptions[singular]
+
+    return singular + "s"
+
+
 MANUAL_SOURCE = "manual"
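The helper's behaviour is easiest to see with a few concrete values; a quick doctest-style check against the function above:

```python
from arkindex_worker.utils import pluralize

assert pluralize("element", 1) == "element"
assert pluralize("element", 3) == "elements"
# Exceptions bypass the naive "+s" rule:
assert pluralize("entity", 2) == "entities"
assert pluralize("class", 5) == "classes"
assert pluralize("metadata", 4) == "metadata"  # invariant noun
```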
@@ -196,3 +218,57 @@ def create_tar_zst_archive(
     close_delete_file(tar_fd, tar_archive)

     return zst_fd, zst_archive, zst_hash, tar_hash
+
+
+DEFAULT_BATCH_SIZE = 50
+"""Batch size used for bulk publication to Arkindex"""
+
+
+def batch_publication(func: Callable) -> Callable:
+    """
+    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.
+
+    :param func: The function to wrap with the ``batch_size`` check
+    :return: The function passing the ``batch_size`` check
+    """
+    signature = inspect.signature(func)
+
+    def wrapper(self, *args, **kwargs):
+        bound_func = signature.bind(self, *args, **kwargs)
+        bound_func.apply_defaults()
+        batch_size = bound_func.arguments.get("batch_size")
+        assert (
+            batch_size and isinstance(batch_size, int) and batch_size > 0
+        ), "batch_size shouldn't be null and should be a strictly positive integer"
+
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+def make_batches(
+    objects: list, singular_name: str, batch_size: int
+) -> Generator[list[Any]]:
+    """Split an object list in successive batches of maximum size ``batch_size``.
+
+    :param objects: The object list to divide in batches of ``batch_size`` size
+    :param singular_name: The singular form of the noun associated with the object list
+    :param batch_size: The maximum size of each batch to split the object list
+    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
+    """
+    count = len(objects)
+    logger.info(
+        f"Creating batches of size {batch_size} to process {count} {pluralize(singular_name, count)}"
+    )
+
+    index = 1
+    iterator = iter(objects)
+    while batch := list(islice(iterator, batch_size)):
+        count = len(batch)
+        logger.info(
+            f"Processing batch {index} containing {count} {pluralize(singular_name, count)}..."
+        )
+
+        yield batch
+
+        index += 1
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/__init__.py
RENAMED

@@ -17,6 +17,7 @@ from apistar.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
+from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin

@@ -267,7 +268,7 @@ class ElementsWorker(
                     with contextlib.suppress(Exception):
                         self.update_activity(element.id, ActivityState.Error)

-        message = f'Ran on {count} elements: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!

@@ -529,7 +530,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
             # Cleanup the latest downloaded dataset artifact
             self.cleanup_downloaded_artifact()

-        message = f'Ran on {count} sets: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/classification.py
RENAMED

@@ -8,6 +8,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)


 class ClassificationMixin:

@@ -21,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
+            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
         )

     def get_ml_class_id(self, ml_class: str) -> str:

@@ -167,10 +173,12 @@ class ClassificationMixin:

         return created

+    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.

@@ -185,6 +193,8 @@ class ClassificationMixin:
             high_confidence (bool)
                 Optional. Whether or not the classification is of high confidence.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """

@@ -220,20 +230,26 @@ class ClassificationMixin:
             )
             return

-        created_cls = self.api_client.request(
-            "CreateClassifications",
-            body={
-                "parent": str(element.id),
-                "worker_run_id": self.worker_run_id,
-                "classifications": [
-                    {
-                        **classification,
-                        "ml_class": self.get_ml_class_id(classification["ml_class"]),
-                    }
-                    for classification in classifications
-                ],
-            },
-        )["classifications"]
+        created_cls = [
+            created_cl
+            for batch in make_batches(classifications, "classification", batch_size)
+            for created_cl in self.api_client.request(
+                "CreateClassifications",
+                body={
+                    "parent": str(element.id),
+                    "worker_run_id": self.worker_run_id,
+                    "classifications": [
+                        {
+                            **classification,
+                            "ml_class": self.get_ml_class_id(
+                                classification["ml_class"]
+                            ),
+                        }
+                        for classification in batch
+                    ],
+                },
+            )["classifications"]
+        ]

         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
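In practice this means a worker can hand `create_classifications` an arbitrarily long list and pick its own batch size; a hedged usage sketch (the `worker` and `element` objects and the 120-item payload are hypothetical):

```python
# Hypothetical usage inside an ElementsWorker.process_element(); with 120
# classifications and batch_size=50, CreateClassifications is called 3 times
# (50 + 50 + 20) and the per-batch responses are flattened into one list.
classifications = [
    {"ml_class": "handwritten", "confidence": 0.9, "high_confidence": True}
    for _ in range(120)
]
created = worker.create_classifications(
    element,
    classifications,
    batch_size=50,  # must be a strictly positive int, enforced by @batch_publication
)
assert len(created) == 120
```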
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/element.py
RENAMED

@@ -12,6 +12,12 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)


 class ElementType(NamedTuple):

@@ -43,7 +49,7 @@ class ElementMixin:
         }
         count = len(self.corpus_types)
         logger.info(
-            f'Loaded {count} element types in corpus ({self.corpus_id}).'
+            f'Loaded {count} element {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )

     @unsupported_cache

@@ -94,7 +100,7 @@ class ElementMixin:
             )
         else:
             raise MissingTypeError(
-                f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
+                f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
             )

         return True

@@ -176,10 +182,12 @@ class ElementMixin:

         return sub_element["id"] if slim_output else sub_element

+    @batch_publication
     def create_elements(
         self,
         parent: Element | CachedElement,
         elements: list[dict[str, str | list[list[int | float]] | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create child elements on the given element in a single API request.

@@ -200,6 +208,8 @@ class ElementMixin:
             confidence (float or None)
                 Optional confidence score, between 0.0 and 1.0.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
         """
         if isinstance(parent, Element):

@@ -258,14 +268,18 @@ class ElementMixin:
             logger.warning("Cannot create elements as this worker is in read-only mode")
             return

-        created_ids = self.api_client.request(
-            "CreateElements",
-            id=parent.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "elements": elements,
-            },
-        )
+        created_ids = [
+            created_id
+            for batch in make_batches(elements, "element", batch_size)
+            for created_id in self.api_client.request(
+                "CreateElements",
+                id=parent.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "elements": batch,
+                },
+            )
+        ]

         if self.use_cache:
             # Create the image as needed and handle both an Element and a CachedElement
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/entity.py
RENAMED

@@ -15,6 +15,12 @@ from arkindex_worker.cache import (
     unsupported_cache,
 )
 from arkindex_worker.models import Element, Transcription
+from arkindex_worker.utils import (
+    DEFAULT_BATCH_SIZE,
+    batch_publication,
+    make_batches,
+    pluralize,
+)


 class Entity(TypedDict):

@@ -213,10 +219,12 @@ class EntityMixin:
         return transcription_ent

     @unsupported_cache
+    @batch_publication
     def create_transcription_entities(
         self,
         transcription: Transcription,
         entities: list[Entity],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple entities attached to a transcription in a single API request.

@@ -239,6 +247,8 @@ class EntityMixin:
             confidence (float or None)
                 Optional confidence score, between 0.0 and 1.0.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :return: List of dicts, with each dict having a two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
         """
         assert transcription and isinstance(

@@ -290,16 +300,20 @@ class EntityMixin:
             )
             return

-        created_entities = self.api_client.request(
-            "CreateTranscriptionEntities",
-            id=transcription.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "entities": entities,
-            },
-        )
+        created_entities = [
+            created_entity
+            for batch in make_batches(entities, "entities", batch_size)
+            for created_entity in self.api_client.request(
+                "CreateTranscriptionEntities",
+                id=transcription.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "entities": batch,
+                },
+            )["entities"]
+        ]

-        return created_entities["entities"]
+        return created_entities

     def list_transcription_entities(
         self,

@@ -383,7 +397,7 @@ class EntityMixin:
         }
         count = len(self.entities)
         logger.info(
-            f'Loaded {count} entities in corpus ({self.corpus_id})'
+            f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
         )

     def list_corpus_entity_types(self):

@@ -398,5 +412,5 @@ class EntityMixin:
         }
         count = len(self.entity_types)
         logger.info(
-            f'Loaded {count} entity types in corpus ({self.corpus_id}).'
+            f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
         )
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/metadata.py
RENAMED

@@ -7,6 +7,7 @@ from enum import Enum
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, unsupported_cache
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches


 class MetaType(Enum):

@@ -108,10 +109,12 @@ class MetaDataMixin:
         return metadata["id"]

     @unsupported_cache
+    @batch_publication
     def create_metadata_bulk(
         self,
         element: Element | CachedElement,
         metadata_list: list[dict[str, MetaType | str | int | float | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.

@@ -123,6 +126,9 @@ class MetaDataMixin:
         - name: str
         - value: str | int | float
         - entity_id: str | None
+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
+        :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
         """
         assert element and isinstance(
             element, Element | CachedElement

@@ -168,14 +174,18 @@ class MetaDataMixin:
             logger.warning("Cannot create metadata as this worker is in read-only mode")
             return

-        created_metadata_list = self.api_client.request(
-            "CreateMetaDataBulk",
-            id=element.id,
-            body={
-                "worker_run_id": self.worker_run_id,
-                "metadata_list": metas,
-            },
-        )["metadata_list"]
+        created_metadata_list = [
+            created_metadata
+            for batch in make_batches(metas, "metadata", batch_size)
+            for created_metadata in self.api_client.request(
+                "CreateMetaDataBulk",
+                id=element.id,
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "metadata_list": batch,
+                },
+            )["metadata_list"]
+        ]

         return created_metadata_list
{arkindex_base_worker-0.4.0b1 → arkindex_base_worker-0.4.0b2}/arkindex_worker/worker/transcription.py
RENAMED

@@ -11,6 +11,7 @@ from peewee import IntegrityError
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
+from arkindex_worker.utils import DEFAULT_BATCH_SIZE, batch_publication, make_batches


 class TextOrientation(Enum):

@@ -109,9 +110,11 @@ class TranscriptionMixin:

         return created

+    @batch_publication
     def create_transcriptions(
         self,
         transcriptions: list[dict[str, str | float | TextOrientation | None]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float]]:
         """
         Create multiple transcriptions at once on existing elements through the API,

@@ -128,6 +131,8 @@ class TranscriptionMixin:
             orientation (TextOrientation)
                 Optional. Orientation of the transcription's text.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
         """

@@ -171,13 +176,19 @@ class TranscriptionMixin:
             )
             return

-        created_trs = self.api_client.request(
-            "CreateTranscriptions",
-            body={
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-            },
-        )["transcriptions"]
+        created_trs = [
+            created_tr
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for created_tr in self.api_client.request(
+                "CreateTranscriptions",
+                body={
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                },
+            )["transcriptions"]
+        ]

         if self.use_cache:
             # Store transcriptions in local cache

@@ -201,11 +212,13 @@ class TranscriptionMixin:

         return created_trs

+    @batch_publication
     def create_element_transcriptions(
         self,
         element: Element | CachedElement,
         sub_element_type: str,
         transcriptions: list[dict[str, str | float]],
+        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> dict[str, str | bool]:
         """
         Create multiple elements and transcriptions at once on a single parent element through the API.

@@ -225,6 +238,8 @@ class TranscriptionMixin:
             element_confidence (float)
                 Optional. Confidence score of the element between 0 and 1.

+        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
+
         :returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
         """
         assert element and isinstance(

@@ -291,16 +306,22 @@ class TranscriptionMixin:
             )
             return

-        annotations = self.api_client.request(
-            "CreateElementTranscriptions",
-            id=element.id,
-            body={
-                "element_type": sub_element_type,
-                "worker_run_id": self.worker_run_id,
-                "transcriptions": transcriptions_payload,
-                "return_elements": True,
-            },
-        )
+        annotations = [
+            annotation
+            for batch in make_batches(
+                transcriptions_payload, "transcription", batch_size
+            )
+            for annotation in self.api_client.request(
+                "CreateElementTranscriptions",
+                id=element.id,
+                body={
+                    "element_type": sub_element_type,
+                    "worker_run_id": self.worker_run_id,
+                    "transcriptions": batch,
+                    "return_elements": True,
+                },
+            )
+        ]

         for annotation in annotations:
             if annotation["created"]:
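As a back-of-the-envelope check on the batching arithmetic all of these methods now share (plain Python, hypothetical numbers):

```python
import math

# With the default DEFAULT_BATCH_SIZE of 50, publishing 293 transcriptions
# now issues ceil(293 / 50) = 6 API calls instead of one oversized request.
total, batch_size = 293, 50
calls = math.ceil(total / batch_size)
assert calls == 6
# Batch sizes: five full batches of 50, then a final batch of 43
sizes = [min(batch_size, total - i * batch_size) for i in range(calls)]
assert sizes == [50, 50, 50, 50, 50, 43]
assert sum(sizes) == total
```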