arkindex-base-worker 0.3.7rc4__py3-none-any.whl → 0.5.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/METADATA +18 -19
- arkindex_base_worker-0.5.0a1.dist-info/RECORD +61 -0
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/top_level.txt +2 -0
- arkindex_worker/cache.py +1 -1
- arkindex_worker/image.py +167 -2
- arkindex_worker/models.py +18 -0
- arkindex_worker/utils.py +98 -4
- arkindex_worker/worker/__init__.py +117 -218
- arkindex_worker/worker/base.py +39 -46
- arkindex_worker/worker/classification.py +45 -29
- arkindex_worker/worker/corpus.py +86 -0
- arkindex_worker/worker/dataset.py +89 -26
- arkindex_worker/worker/element.py +352 -91
- arkindex_worker/worker/entity.py +13 -11
- arkindex_worker/worker/image.py +21 -0
- arkindex_worker/worker/metadata.py +26 -16
- arkindex_worker/worker/process.py +92 -0
- arkindex_worker/worker/task.py +5 -4
- arkindex_worker/worker/training.py +25 -10
- arkindex_worker/worker/transcription.py +89 -68
- arkindex_worker/worker/version.py +3 -1
- hooks/pre_gen_project.py +3 -0
- tests/__init__.py +8 -0
- tests/conftest.py +47 -58
- tests/test_base_worker.py +212 -12
- tests/test_dataset_worker.py +294 -437
- tests/test_elements_worker/{test_classifications.py → test_classification.py} +313 -200
- tests/test_elements_worker/test_cli.py +3 -11
- tests/test_elements_worker/test_corpus.py +168 -0
- tests/test_elements_worker/test_dataset.py +106 -157
- tests/test_elements_worker/test_element.py +427 -0
- tests/test_elements_worker/test_element_create_multiple.py +715 -0
- tests/test_elements_worker/test_element_create_single.py +528 -0
- tests/test_elements_worker/test_element_list_children.py +969 -0
- tests/test_elements_worker/test_element_list_parents.py +530 -0
- tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
- tests/test_elements_worker/test_entity_list_and_check.py +160 -0
- tests/test_elements_worker/test_image.py +66 -0
- tests/test_elements_worker/test_metadata.py +252 -161
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_task.py +8 -18
- tests/test_elements_worker/test_training.py +17 -8
- tests/test_elements_worker/test_transcription_create.py +873 -0
- tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
- tests/test_elements_worker/test_transcription_list.py +450 -0
- tests/test_elements_worker/test_version.py +60 -0
- tests/test_elements_worker/test_worker.py +578 -293
- tests/test_image.py +542 -209
- tests/test_merge.py +1 -2
- tests/test_utils.py +89 -4
- worker-demo/tests/__init__.py +0 -0
- worker-demo/tests/conftest.py +32 -0
- worker-demo/tests/test_worker.py +12 -0
- worker-demo/worker_demo/__init__.py +6 -0
- worker-demo/worker_demo/worker.py +19 -0
- arkindex_base_worker-0.3.7rc4.dist-info/RECORD +0 -41
- tests/test_elements_worker/test_elements.py +0 -2713
- tests/test_elements_worker/test_transcriptions.py +0 -2119
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/LICENSE +0 -0
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
ElementsWorker methods for elements and element types.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import os
|
|
5
6
|
from collections.abc import Iterable
|
|
7
|
+
from operator import attrgetter
|
|
6
8
|
from typing import NamedTuple
|
|
7
9
|
from uuid import UUID
|
|
8
10
|
from warnings import warn
|
|
@@ -12,6 +14,12 @@ from peewee import IntegrityError
|
|
|
12
14
|
from arkindex_worker import logger
|
|
13
15
|
from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
|
|
14
16
|
from arkindex_worker.models import Element
|
|
17
|
+
from arkindex_worker.utils import (
|
|
18
|
+
DEFAULT_BATCH_SIZE,
|
|
19
|
+
batch_publication,
|
|
20
|
+
make_batches,
|
|
21
|
+
pluralize,
|
|
22
|
+
)
|
|
15
23
|
|
|
16
24
|
|
|
17
25
|
class ElementType(NamedTuple):
|
|
@@ -31,6 +39,37 @@ class MissingTypeError(Exception):
|
|
|
31
39
|
|
|
32
40
|
|
|
33
41
|
class ElementMixin:
|
|
42
|
+
def add_arguments(self):
|
|
43
|
+
"""Define specific ``argparse`` arguments for the worker using this mixin"""
|
|
44
|
+
self.parser.add_argument(
|
|
45
|
+
"--elements-list",
|
|
46
|
+
help="JSON elements list to use",
|
|
47
|
+
type=open,
|
|
48
|
+
default=os.environ.get("TASK_ELEMENTS"),
|
|
49
|
+
)
|
|
50
|
+
self.parser.add_argument(
|
|
51
|
+
"--element",
|
|
52
|
+
type=str,
|
|
53
|
+
nargs="+",
|
|
54
|
+
help="One or more Arkindex element ID",
|
|
55
|
+
)
|
|
56
|
+
super().add_arguments()
|
|
57
|
+
|
|
58
|
+
def list_corpus_types(self):
|
|
59
|
+
"""
|
|
60
|
+
Loads available element types in corpus.
|
|
61
|
+
"""
|
|
62
|
+
self.corpus_types = {
|
|
63
|
+
element_type["slug"]: element_type
|
|
64
|
+
for element_type in self.api_client.request(
|
|
65
|
+
"RetrieveCorpus", id=self.corpus_id
|
|
66
|
+
)["types"]
|
|
67
|
+
}
|
|
68
|
+
count = len(self.corpus_types)
|
|
69
|
+
logger.info(
|
|
70
|
+
f'Loaded {count} element {pluralize("type", count)} in corpus ({self.corpus_id}).'
|
|
71
|
+
)
|
|
72
|
+
|
|
34
73
|
@unsupported_cache
|
|
35
74
|
def create_required_types(self, element_types: list[ElementType]):
|
|
36
75
|
"""Creates given element types in the corpus.
|
|
@@ -38,7 +77,7 @@ class ElementMixin:
|
|
|
38
77
|
:param element_types: The missing element types to create.
|
|
39
78
|
"""
|
|
40
79
|
for element_type in element_types:
|
|
41
|
-
self.request(
|
|
80
|
+
self.api_client.request(
|
|
42
81
|
"CreateElementType",
|
|
43
82
|
body={
|
|
44
83
|
"slug": element_type.slug,
|
|
@@ -66,10 +105,10 @@ class ElementMixin:
|
|
|
66
105
|
isinstance(slug, str) for slug in type_slugs
|
|
67
106
|
), "Element type slugs must be strings."
|
|
68
107
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
missing_slugs = set(type_slugs) - available_slugs
|
|
108
|
+
if not self.corpus_types:
|
|
109
|
+
self.list_corpus_types()
|
|
72
110
|
|
|
111
|
+
missing_slugs = set(type_slugs) - set(self.corpus_types)
|
|
73
112
|
if missing_slugs:
|
|
74
113
|
if create_missing:
|
|
75
114
|
self.create_required_types(
|
|
@@ -79,7 +118,7 @@ class ElementMixin:
|
|
|
79
118
|
)
|
|
80
119
|
else:
|
|
81
120
|
raise MissingTypeError(
|
|
82
|
-
f'Element type(
|
|
121
|
+
f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
|
|
83
122
|
)
|
|
84
123
|
|
|
85
124
|
return True
|
|
@@ -145,7 +184,7 @@ class ElementMixin:
|
|
|
145
184
|
logger.warning("Cannot create element as this worker is in read-only mode")
|
|
146
185
|
return
|
|
147
186
|
|
|
148
|
-
sub_element = self.request(
|
|
187
|
+
sub_element = self.api_client.request(
|
|
149
188
|
"CreateElement",
|
|
150
189
|
body={
|
|
151
190
|
"type": type,
|
|
@@ -161,10 +200,12 @@ class ElementMixin:
|
|
|
161
200
|
|
|
162
201
|
return sub_element["id"] if slim_output else sub_element
|
|
163
202
|
|
|
203
|
+
@batch_publication
|
|
164
204
|
def create_elements(
|
|
165
205
|
self,
|
|
166
206
|
parent: Element | CachedElement,
|
|
167
207
|
elements: list[dict[str, str | list[list[int | float]] | float | None]],
|
|
208
|
+
batch_size: int = DEFAULT_BATCH_SIZE,
|
|
168
209
|
) -> list[dict[str, str]]:
|
|
169
210
|
"""
|
|
170
211
|
Create child elements on the given element in a single API request.
|
|
@@ -185,6 +226,8 @@ class ElementMixin:
|
|
|
185
226
|
confidence (float or None)
|
|
186
227
|
Optional confidence score, between 0.0 and 1.0.
|
|
187
228
|
|
|
229
|
+
:param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
|
|
230
|
+
|
|
188
231
|
:return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
|
|
189
232
|
"""
|
|
190
233
|
if isinstance(parent, Element):
|
|
@@ -243,14 +286,18 @@ class ElementMixin:
|
|
|
243
286
|
logger.warning("Cannot create elements as this worker is in read-only mode")
|
|
244
287
|
return
|
|
245
288
|
|
|
246
|
-
created_ids =
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
"
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
289
|
+
created_ids = [
|
|
290
|
+
created_id
|
|
291
|
+
for batch in make_batches(elements, "element", batch_size)
|
|
292
|
+
for created_id in self.api_client.request(
|
|
293
|
+
"CreateElements",
|
|
294
|
+
id=parent.id,
|
|
295
|
+
body={
|
|
296
|
+
"worker_run_id": self.worker_run_id,
|
|
297
|
+
"elements": batch,
|
|
298
|
+
},
|
|
299
|
+
)
|
|
300
|
+
]
|
|
254
301
|
|
|
255
302
|
if self.use_cache:
|
|
256
303
|
# Create the image as needed and handle both an Element and a CachedElement
|
|
@@ -311,12 +358,58 @@ class ElementMixin:
|
|
|
311
358
|
logger.warning("Cannot link elements as this worker is in read-only mode")
|
|
312
359
|
return
|
|
313
360
|
|
|
314
|
-
return self.request(
|
|
361
|
+
return self.api_client.request(
|
|
315
362
|
"CreateElementParent",
|
|
316
363
|
parent=parent.id,
|
|
317
364
|
child=child.id,
|
|
318
365
|
)
|
|
319
366
|
|
|
367
|
+
@unsupported_cache
|
|
368
|
+
@batch_publication
|
|
369
|
+
def create_element_children(
|
|
370
|
+
self,
|
|
371
|
+
parent: Element,
|
|
372
|
+
children: list[Element],
|
|
373
|
+
batch_size: int = DEFAULT_BATCH_SIZE,
|
|
374
|
+
) -> list[str]:
|
|
375
|
+
"""
|
|
376
|
+
Link multiple elements to a single parent through the API.
|
|
377
|
+
|
|
378
|
+
:param parent: Parent element.
|
|
379
|
+
:param children: A list of child elements.
|
|
380
|
+
:param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
|
|
381
|
+
|
|
382
|
+
:returns: A list containing the string UUID of each child linked to the parent.
|
|
383
|
+
"""
|
|
384
|
+
assert parent and isinstance(
|
|
385
|
+
parent, Element
|
|
386
|
+
), "parent shouldn't be null and should be of type Element"
|
|
387
|
+
|
|
388
|
+
assert children and isinstance(
|
|
389
|
+
children, list
|
|
390
|
+
), "children shouldn't be null and should be of type list"
|
|
391
|
+
|
|
392
|
+
for index, child in enumerate(children):
|
|
393
|
+
assert isinstance(
|
|
394
|
+
child, Element
|
|
395
|
+
), f"Child at index {index} in children: Should be of type Element"
|
|
396
|
+
|
|
397
|
+
if self.is_read_only:
|
|
398
|
+
logger.warning("Cannot link elements as this worker is in read-only mode")
|
|
399
|
+
return
|
|
400
|
+
|
|
401
|
+
return [
|
|
402
|
+
child_id
|
|
403
|
+
for batch in make_batches(children, "child", batch_size)
|
|
404
|
+
for child_id in self.api_client.request(
|
|
405
|
+
"CreateElementChildren",
|
|
406
|
+
id=parent.id,
|
|
407
|
+
body={
|
|
408
|
+
"children": list(map(attrgetter("id"), batch)),
|
|
409
|
+
},
|
|
410
|
+
)["children"]
|
|
411
|
+
]
|
|
412
|
+
|
|
320
413
|
def partial_update_element(
|
|
321
414
|
self, element: Element | CachedElement, **kwargs
|
|
322
415
|
) -> dict:
|
|
@@ -383,7 +476,7 @@ class ElementMixin:
|
|
|
383
476
|
logger.warning("Cannot update element as this worker is in read-only mode")
|
|
384
477
|
return
|
|
385
478
|
|
|
386
|
-
updated_element = self.request(
|
|
479
|
+
updated_element = self.api_client.request(
|
|
387
480
|
"PartialUpdateElement",
|
|
388
481
|
id=element.id,
|
|
389
482
|
body=kwargs,
|
|
@@ -407,6 +500,178 @@ class ElementMixin:
|
|
|
407
500
|
|
|
408
501
|
return updated_element
|
|
409
502
|
|
|
503
|
+
def list_elements(
|
|
504
|
+
self,
|
|
505
|
+
folder: bool | None = None,
|
|
506
|
+
name: str | None = None,
|
|
507
|
+
top_level: bool | None = None,
|
|
508
|
+
transcription_worker_version: str | bool | None = None,
|
|
509
|
+
transcription_worker_run: str | bool | None = None,
|
|
510
|
+
type: str | None = None,
|
|
511
|
+
with_classes: bool | None = None,
|
|
512
|
+
with_corpus: bool | None = None,
|
|
513
|
+
with_metadata: bool | None = None,
|
|
514
|
+
with_has_children: bool | None = None,
|
|
515
|
+
with_zone: bool | None = None,
|
|
516
|
+
worker_version: str | bool | None = None,
|
|
517
|
+
worker_run: str | bool | None = None,
|
|
518
|
+
) -> Iterable[dict] | Iterable[CachedElement]:
|
|
519
|
+
"""
|
|
520
|
+
List element in a corpus.
|
|
521
|
+
|
|
522
|
+
Warns:
|
|
523
|
+
----
|
|
524
|
+
The following parameters are **deprecated**:
|
|
525
|
+
|
|
526
|
+
- `transcription_worker_version` in favor of `transcription_worker_run`
|
|
527
|
+
- `worker_version` in favor of `worker_run`
|
|
528
|
+
|
|
529
|
+
:param folder: Restrict to or exclude elements with folder types.
|
|
530
|
+
This parameter is not supported when caching is enabled.
|
|
531
|
+
:param name: Restrict to elements whose name contain a substring (case-insensitive).
|
|
532
|
+
This parameter is not supported when caching is enabled.
|
|
533
|
+
:param top_level: Restrict to or exclude folder elements without parent elements (top-level elements).
|
|
534
|
+
This parameter is not supported when caching is enabled.
|
|
535
|
+
:param transcription_worker_version: **Deprecated** Restrict to elements that have a transcription created by a worker version with this UUID. Set to False to look for elements that have a manual transcription.
|
|
536
|
+
This parameter is not supported when caching is enabled.
|
|
537
|
+
:param transcription_worker_run: Restrict to elements that have a transcription created by a worker run with this UUID. Set to False to look for elements that have a manual transcription.
|
|
538
|
+
This parameter is not supported when caching is enabled.
|
|
539
|
+
:param type: Restrict to elements with a specific type slug
|
|
540
|
+
This parameter is not supported when caching is enabled.
|
|
541
|
+
:param with_classes: Include each element's classifications in the response.
|
|
542
|
+
This parameter is not supported when caching is enabled.
|
|
543
|
+
:param with_corpus: Include each element's corpus in the response.
|
|
544
|
+
This parameter is not supported when caching is enabled.
|
|
545
|
+
:param with_has_children: Include the ``has_children`` attribute in the response,
|
|
546
|
+
indicating if this element has child elements of its own.
|
|
547
|
+
This parameter is not supported when caching is enabled.
|
|
548
|
+
:param with_metadata: Include each element's metadata in the response.
|
|
549
|
+
This parameter is not supported when caching is enabled.
|
|
550
|
+
:param with_zone: Include the ``zone`` attribute in the response,
|
|
551
|
+
holding the element's image and polygon.
|
|
552
|
+
This parameter is not supported when caching is enabled.
|
|
553
|
+
:param worker_version: **Deprecated** Restrict to elements created by a worker version with this UUID.
|
|
554
|
+
:param worker_run: Restrict to elements created by a worker run with this UUID.
|
|
555
|
+
:return: An iterable of dicts from the ``ListElementChildren`` API endpoint,
|
|
556
|
+
or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
|
|
557
|
+
"""
|
|
558
|
+
query_params = {}
|
|
559
|
+
if folder is not None:
|
|
560
|
+
assert isinstance(folder, bool), "folder should be of type bool"
|
|
561
|
+
query_params["folder"] = folder
|
|
562
|
+
if name:
|
|
563
|
+
assert isinstance(name, str), "name should be of type str"
|
|
564
|
+
query_params["name"] = name
|
|
565
|
+
if top_level is not None:
|
|
566
|
+
assert isinstance(top_level, bool), "top_level should be of type bool"
|
|
567
|
+
query_params["top_level"] = top_level
|
|
568
|
+
if transcription_worker_version is not None:
|
|
569
|
+
warn(
|
|
570
|
+
"`transcription_worker_version` usage is deprecated. Consider using `transcription_worker_run` instead.",
|
|
571
|
+
DeprecationWarning,
|
|
572
|
+
stacklevel=1,
|
|
573
|
+
)
|
|
574
|
+
assert isinstance(
|
|
575
|
+
transcription_worker_version, str | bool
|
|
576
|
+
), "transcription_worker_version should be of type str or bool"
|
|
577
|
+
if isinstance(transcription_worker_version, bool):
|
|
578
|
+
assert (
|
|
579
|
+
transcription_worker_version is False
|
|
580
|
+
), "if of type bool, transcription_worker_version can only be set to False"
|
|
581
|
+
query_params["transcription_worker_version"] = transcription_worker_version
|
|
582
|
+
if transcription_worker_run is not None:
|
|
583
|
+
assert isinstance(
|
|
584
|
+
transcription_worker_run, str | bool
|
|
585
|
+
), "transcription_worker_run should be of type str or bool"
|
|
586
|
+
if isinstance(transcription_worker_run, bool):
|
|
587
|
+
assert (
|
|
588
|
+
transcription_worker_run is False
|
|
589
|
+
), "if of type bool, transcription_worker_run can only be set to False"
|
|
590
|
+
query_params["transcription_worker_run"] = transcription_worker_run
|
|
591
|
+
if type:
|
|
592
|
+
assert isinstance(type, str), "type should be of type str"
|
|
593
|
+
query_params["type"] = type
|
|
594
|
+
if with_classes is not None:
|
|
595
|
+
assert isinstance(with_classes, bool), "with_classes should be of type bool"
|
|
596
|
+
query_params["with_classes"] = with_classes
|
|
597
|
+
if with_corpus is not None:
|
|
598
|
+
assert isinstance(with_corpus, bool), "with_corpus should be of type bool"
|
|
599
|
+
query_params["with_corpus"] = with_corpus
|
|
600
|
+
if with_has_children is not None:
|
|
601
|
+
assert isinstance(
|
|
602
|
+
with_has_children, bool
|
|
603
|
+
), "with_has_children should be of type bool"
|
|
604
|
+
query_params["with_has_children"] = with_has_children
|
|
605
|
+
if with_metadata is not None:
|
|
606
|
+
assert isinstance(
|
|
607
|
+
with_metadata, bool
|
|
608
|
+
), "with_metadata should be of type bool"
|
|
609
|
+
query_params["with_metadata"] = with_metadata
|
|
610
|
+
if with_zone is not None:
|
|
611
|
+
assert isinstance(with_zone, bool), "with_zone should be of type bool"
|
|
612
|
+
query_params["with_zone"] = with_zone
|
|
613
|
+
if worker_version is not None:
|
|
614
|
+
warn(
|
|
615
|
+
"`worker_version` usage is deprecated. Consider using `worker_run` instead.",
|
|
616
|
+
DeprecationWarning,
|
|
617
|
+
stacklevel=1,
|
|
618
|
+
)
|
|
619
|
+
assert isinstance(
|
|
620
|
+
worker_version, str | bool
|
|
621
|
+
), "worker_version should be of type str or bool"
|
|
622
|
+
if isinstance(worker_version, bool):
|
|
623
|
+
assert (
|
|
624
|
+
worker_version is False
|
|
625
|
+
), "if of type bool, worker_version can only be set to False"
|
|
626
|
+
query_params["worker_version"] = worker_version
|
|
627
|
+
if worker_run is not None:
|
|
628
|
+
assert isinstance(
|
|
629
|
+
worker_run, str | bool
|
|
630
|
+
), "worker_run should be of type str or bool"
|
|
631
|
+
if isinstance(worker_run, bool):
|
|
632
|
+
assert (
|
|
633
|
+
worker_run is False
|
|
634
|
+
), "if of type bool, worker_run can only be set to False"
|
|
635
|
+
query_params["worker_run"] = worker_run
|
|
636
|
+
|
|
637
|
+
if not self.use_cache:
|
|
638
|
+
return self.api_client.paginate(
|
|
639
|
+
"ListElements", corpus=self.corpus_id, **query_params
|
|
640
|
+
)
|
|
641
|
+
|
|
642
|
+
# Checking that we only received query_params handled by the cache
|
|
643
|
+
assert (
|
|
644
|
+
set(query_params.keys())
|
|
645
|
+
<= {
|
|
646
|
+
"type",
|
|
647
|
+
"worker_version",
|
|
648
|
+
"worker_run",
|
|
649
|
+
}
|
|
650
|
+
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
651
|
+
|
|
652
|
+
query = CachedElement.select()
|
|
653
|
+
if type:
|
|
654
|
+
query = query.where(CachedElement.type == type)
|
|
655
|
+
if worker_version is not None:
|
|
656
|
+
# If worker_version=False, filter by manual worker_version e.g. None
|
|
657
|
+
worker_version_id = worker_version or None
|
|
658
|
+
if worker_version_id:
|
|
659
|
+
query = query.where(
|
|
660
|
+
CachedElement.worker_version_id == worker_version_id
|
|
661
|
+
)
|
|
662
|
+
else:
|
|
663
|
+
query = query.where(CachedElement.worker_version_id.is_null())
|
|
664
|
+
|
|
665
|
+
if worker_run is not None:
|
|
666
|
+
# If worker_run=False, filter by manual worker_run e.g. None
|
|
667
|
+
worker_run_id = worker_run or None
|
|
668
|
+
if worker_run_id:
|
|
669
|
+
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
670
|
+
else:
|
|
671
|
+
query = query.where(CachedElement.worker_run_id.is_null())
|
|
672
|
+
|
|
673
|
+
return query
|
|
674
|
+
|
|
410
675
|
def list_element_children(
|
|
411
676
|
self,
|
|
412
677
|
element: Element | CachedElement,
|
|
@@ -546,45 +811,43 @@ class ElementMixin:
|
|
|
546
811
|
), "if of type bool, worker_run can only be set to False"
|
|
547
812
|
query_params["worker_run"] = worker_run
|
|
548
813
|
|
|
549
|
-
if self.use_cache:
|
|
550
|
-
|
|
551
|
-
assert (
|
|
552
|
-
set(query_params.keys())
|
|
553
|
-
<= {
|
|
554
|
-
"type",
|
|
555
|
-
"worker_version",
|
|
556
|
-
"worker_run",
|
|
557
|
-
}
|
|
558
|
-
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
559
|
-
|
|
560
|
-
query = CachedElement.select().where(CachedElement.parent_id == element.id)
|
|
561
|
-
if type:
|
|
562
|
-
query = query.where(CachedElement.type == type)
|
|
563
|
-
if worker_version is not None:
|
|
564
|
-
# If worker_version=False, filter by manual worker_version e.g. None
|
|
565
|
-
worker_version_id = worker_version or None
|
|
566
|
-
if worker_version_id:
|
|
567
|
-
query = query.where(
|
|
568
|
-
CachedElement.worker_version_id == worker_version_id
|
|
569
|
-
)
|
|
570
|
-
else:
|
|
571
|
-
query = query.where(CachedElement.worker_version_id.is_null())
|
|
572
|
-
|
|
573
|
-
if worker_run is not None:
|
|
574
|
-
# If worker_run=False, filter by manual worker_run e.g. None
|
|
575
|
-
worker_run_id = worker_run or None
|
|
576
|
-
if worker_run_id:
|
|
577
|
-
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
578
|
-
else:
|
|
579
|
-
query = query.where(CachedElement.worker_run_id.is_null())
|
|
580
|
-
|
|
581
|
-
return query
|
|
582
|
-
else:
|
|
583
|
-
children = self.api_client.paginate(
|
|
814
|
+
if not self.use_cache:
|
|
815
|
+
return self.api_client.paginate(
|
|
584
816
|
"ListElementChildren", id=element.id, **query_params
|
|
585
817
|
)
|
|
586
818
|
|
|
587
|
-
|
|
819
|
+
# Checking that we only received query_params handled by the cache
|
|
820
|
+
assert (
|
|
821
|
+
set(query_params.keys())
|
|
822
|
+
<= {
|
|
823
|
+
"type",
|
|
824
|
+
"worker_version",
|
|
825
|
+
"worker_run",
|
|
826
|
+
}
|
|
827
|
+
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
828
|
+
|
|
829
|
+
query = CachedElement.select().where(CachedElement.parent_id == element.id)
|
|
830
|
+
if type:
|
|
831
|
+
query = query.where(CachedElement.type == type)
|
|
832
|
+
if worker_version is not None:
|
|
833
|
+
# If worker_version=False, filter by manual worker_version e.g. None
|
|
834
|
+
worker_version_id = worker_version or None
|
|
835
|
+
if worker_version_id:
|
|
836
|
+
query = query.where(
|
|
837
|
+
CachedElement.worker_version_id == worker_version_id
|
|
838
|
+
)
|
|
839
|
+
else:
|
|
840
|
+
query = query.where(CachedElement.worker_version_id.is_null())
|
|
841
|
+
|
|
842
|
+
if worker_run is not None:
|
|
843
|
+
# If worker_run=False, filter by manual worker_run e.g. None
|
|
844
|
+
worker_run_id = worker_run or None
|
|
845
|
+
if worker_run_id:
|
|
846
|
+
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
847
|
+
else:
|
|
848
|
+
query = query.where(CachedElement.worker_run_id.is_null())
|
|
849
|
+
|
|
850
|
+
return query
|
|
588
851
|
|
|
589
852
|
def list_element_parents(
|
|
590
853
|
self,
|
|
@@ -725,45 +988,43 @@ class ElementMixin:
|
|
|
725
988
|
), "if of type bool, worker_run can only be set to False"
|
|
726
989
|
query_params["worker_run"] = worker_run
|
|
727
990
|
|
|
728
|
-
if self.use_cache:
|
|
729
|
-
|
|
730
|
-
assert (
|
|
731
|
-
set(query_params.keys())
|
|
732
|
-
<= {
|
|
733
|
-
"type",
|
|
734
|
-
"worker_version",
|
|
735
|
-
"worker_run",
|
|
736
|
-
}
|
|
737
|
-
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
738
|
-
|
|
739
|
-
parent_ids = CachedElement.select(CachedElement.parent_id).where(
|
|
740
|
-
CachedElement.id == element.id
|
|
741
|
-
)
|
|
742
|
-
query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
|
|
743
|
-
if type:
|
|
744
|
-
query = query.where(CachedElement.type == type)
|
|
745
|
-
if worker_version is not None:
|
|
746
|
-
# If worker_version=False, filter by manual worker_version e.g. None
|
|
747
|
-
worker_version_id = worker_version or None
|
|
748
|
-
if worker_version_id:
|
|
749
|
-
query = query.where(
|
|
750
|
-
CachedElement.worker_version_id == worker_version_id
|
|
751
|
-
)
|
|
752
|
-
else:
|
|
753
|
-
query = query.where(CachedElement.worker_version_id.is_null())
|
|
754
|
-
|
|
755
|
-
if worker_run is not None:
|
|
756
|
-
# If worker_run=False, filter by manual worker_run e.g. None
|
|
757
|
-
worker_run_id = worker_run or None
|
|
758
|
-
if worker_run_id:
|
|
759
|
-
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
760
|
-
else:
|
|
761
|
-
query = query.where(CachedElement.worker_run_id.is_null())
|
|
762
|
-
|
|
763
|
-
return query
|
|
764
|
-
else:
|
|
765
|
-
parents = self.api_client.paginate(
|
|
991
|
+
if not self.use_cache:
|
|
992
|
+
return self.api_client.paginate(
|
|
766
993
|
"ListElementParents", id=element.id, **query_params
|
|
767
994
|
)
|
|
768
995
|
|
|
769
|
-
|
|
996
|
+
# Checking that we only received query_params handled by the cache
|
|
997
|
+
assert (
|
|
998
|
+
set(query_params.keys())
|
|
999
|
+
<= {
|
|
1000
|
+
"type",
|
|
1001
|
+
"worker_version",
|
|
1002
|
+
"worker_run",
|
|
1003
|
+
}
|
|
1004
|
+
), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
|
|
1005
|
+
|
|
1006
|
+
parent_ids = CachedElement.select(CachedElement.parent_id).where(
|
|
1007
|
+
CachedElement.id == element.id
|
|
1008
|
+
)
|
|
1009
|
+
query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
|
|
1010
|
+
if type:
|
|
1011
|
+
query = query.where(CachedElement.type == type)
|
|
1012
|
+
if worker_version is not None:
|
|
1013
|
+
# If worker_version=False, filter by manual worker_version e.g. None
|
|
1014
|
+
worker_version_id = worker_version or None
|
|
1015
|
+
if worker_version_id:
|
|
1016
|
+
query = query.where(
|
|
1017
|
+
CachedElement.worker_version_id == worker_version_id
|
|
1018
|
+
)
|
|
1019
|
+
else:
|
|
1020
|
+
query = query.where(CachedElement.worker_version_id.is_null())
|
|
1021
|
+
|
|
1022
|
+
if worker_run is not None:
|
|
1023
|
+
# If worker_run=False, filter by manual worker_run e.g. None
|
|
1024
|
+
worker_run_id = worker_run or None
|
|
1025
|
+
if worker_run_id:
|
|
1026
|
+
query = query.where(CachedElement.worker_run_id == worker_run_id)
|
|
1027
|
+
else:
|
|
1028
|
+
query = query.where(CachedElement.worker_run_id.is_null())
|
|
1029
|
+
|
|
1030
|
+
return query
|
arkindex_worker/worker/entity.py
CHANGED
|
@@ -15,6 +15,7 @@ from arkindex_worker.cache import (
|
|
|
15
15
|
unsupported_cache,
|
|
16
16
|
)
|
|
17
17
|
from arkindex_worker.models import Element, Transcription
|
|
18
|
+
from arkindex_worker.utils import pluralize
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
class Entity(TypedDict):
|
|
@@ -48,6 +49,7 @@ class EntityMixin:
|
|
|
48
49
|
if not self.entity_types:
|
|
49
50
|
# Load entity_types of corpus
|
|
50
51
|
self.list_corpus_entity_types()
|
|
52
|
+
|
|
51
53
|
for entity_type in entity_types:
|
|
52
54
|
# Do nothing if type already exists
|
|
53
55
|
if entity_type in self.entity_types:
|
|
@@ -60,7 +62,7 @@ class EntityMixin:
|
|
|
60
62
|
)
|
|
61
63
|
|
|
62
64
|
# Create type if non-existent
|
|
63
|
-
self.entity_types[entity_type] = self.request(
|
|
65
|
+
self.entity_types[entity_type] = self.api_client.request(
|
|
64
66
|
"CreateEntityType",
|
|
65
67
|
body={
|
|
66
68
|
"name": entity_type,
|
|
@@ -106,7 +108,7 @@ class EntityMixin:
|
|
|
106
108
|
entity_type_id = self.entity_types.get(type)
|
|
107
109
|
assert entity_type_id, f"Entity type `{type}` not found in the corpus."
|
|
108
110
|
|
|
109
|
-
entity = self.request(
|
|
111
|
+
entity = self.api_client.request(
|
|
110
112
|
"CreateEntity",
|
|
111
113
|
body={
|
|
112
114
|
"name": name,
|
|
@@ -188,7 +190,7 @@ class EntityMixin:
|
|
|
188
190
|
if confidence is not None:
|
|
189
191
|
body["confidence"] = confidence
|
|
190
192
|
|
|
191
|
-
transcription_ent = self.request(
|
|
193
|
+
transcription_ent = self.api_client.request(
|
|
192
194
|
"CreateTranscriptionEntity",
|
|
193
195
|
id=transcription.id,
|
|
194
196
|
body=body,
|
|
@@ -289,16 +291,16 @@ class EntityMixin:
|
|
|
289
291
|
)
|
|
290
292
|
return
|
|
291
293
|
|
|
292
|
-
|
|
294
|
+
created_entities = self.api_client.request(
|
|
293
295
|
"CreateTranscriptionEntities",
|
|
294
296
|
id=transcription.id,
|
|
295
297
|
body={
|
|
296
298
|
"worker_run_id": self.worker_run_id,
|
|
297
299
|
"entities": entities,
|
|
298
300
|
},
|
|
299
|
-
)
|
|
301
|
+
)["entities"]
|
|
300
302
|
|
|
301
|
-
return
|
|
303
|
+
return created_entities
|
|
302
304
|
|
|
303
305
|
def list_transcription_entities(
|
|
304
306
|
self,
|
|
@@ -380,13 +382,12 @@ class EntityMixin:
|
|
|
380
382
|
"ListCorpusEntities", id=self.corpus_id, **query_params
|
|
381
383
|
)
|
|
382
384
|
}
|
|
385
|
+
count = len(self.entities)
|
|
383
386
|
logger.info(
|
|
384
|
-
f
|
|
387
|
+
f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
|
|
385
388
|
)
|
|
386
389
|
|
|
387
|
-
def list_corpus_entity_types(
|
|
388
|
-
self,
|
|
389
|
-
):
|
|
390
|
+
def list_corpus_entity_types(self):
|
|
390
391
|
"""
|
|
391
392
|
Loads available entity types in corpus.
|
|
392
393
|
"""
|
|
@@ -396,6 +397,7 @@ class EntityMixin:
|
|
|
396
397
|
"ListCorpusEntityTypes", id=self.corpus_id
|
|
397
398
|
)
|
|
398
399
|
}
|
|
400
|
+
count = len(self.entity_types)
|
|
399
401
|
logger.info(
|
|
400
|
-
f
|
|
402
|
+
f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
|
|
401
403
|
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ElementsWorker methods for images.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from arkindex_worker.models import Image
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ImageMixin:
|
|
9
|
+
def create_iiif_url(self, url: str) -> Image:
|
|
10
|
+
"""
|
|
11
|
+
Create an image from an existing IIIF image by URL.
|
|
12
|
+
The URL should be of the image's identifier, not of its Image Information request (`/info.json`).
|
|
13
|
+
|
|
14
|
+
:param url: URL of the image.
|
|
15
|
+
:returns: The created image.
|
|
16
|
+
"""
|
|
17
|
+
assert url and isinstance(
|
|
18
|
+
url, str
|
|
19
|
+
), "url shouldn't be null and should be of type str"
|
|
20
|
+
|
|
21
|
+
return Image(self.api_client.request("CreateIIIFURL", body={"url": url}))
|