arkindex-base-worker 0.3.7rc4__py3-none-any.whl → 0.5.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/METADATA +18 -19
  2. arkindex_base_worker-0.5.0a1.dist-info/RECORD +61 -0
  3. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/top_level.txt +2 -0
  5. arkindex_worker/cache.py +1 -1
  6. arkindex_worker/image.py +167 -2
  7. arkindex_worker/models.py +18 -0
  8. arkindex_worker/utils.py +98 -4
  9. arkindex_worker/worker/__init__.py +117 -218
  10. arkindex_worker/worker/base.py +39 -46
  11. arkindex_worker/worker/classification.py +45 -29
  12. arkindex_worker/worker/corpus.py +86 -0
  13. arkindex_worker/worker/dataset.py +89 -26
  14. arkindex_worker/worker/element.py +352 -91
  15. arkindex_worker/worker/entity.py +13 -11
  16. arkindex_worker/worker/image.py +21 -0
  17. arkindex_worker/worker/metadata.py +26 -16
  18. arkindex_worker/worker/process.py +92 -0
  19. arkindex_worker/worker/task.py +5 -4
  20. arkindex_worker/worker/training.py +25 -10
  21. arkindex_worker/worker/transcription.py +89 -68
  22. arkindex_worker/worker/version.py +3 -1
  23. hooks/pre_gen_project.py +3 -0
  24. tests/__init__.py +8 -0
  25. tests/conftest.py +47 -58
  26. tests/test_base_worker.py +212 -12
  27. tests/test_dataset_worker.py +294 -437
  28. tests/test_elements_worker/{test_classifications.py → test_classification.py} +313 -200
  29. tests/test_elements_worker/test_cli.py +3 -11
  30. tests/test_elements_worker/test_corpus.py +168 -0
  31. tests/test_elements_worker/test_dataset.py +106 -157
  32. tests/test_elements_worker/test_element.py +427 -0
  33. tests/test_elements_worker/test_element_create_multiple.py +715 -0
  34. tests/test_elements_worker/test_element_create_single.py +528 -0
  35. tests/test_elements_worker/test_element_list_children.py +969 -0
  36. tests/test_elements_worker/test_element_list_parents.py +530 -0
  37. tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
  38. tests/test_elements_worker/test_entity_list_and_check.py +160 -0
  39. tests/test_elements_worker/test_image.py +66 -0
  40. tests/test_elements_worker/test_metadata.py +252 -161
  41. tests/test_elements_worker/test_process.py +89 -0
  42. tests/test_elements_worker/test_task.py +8 -18
  43. tests/test_elements_worker/test_training.py +17 -8
  44. tests/test_elements_worker/test_transcription_create.py +873 -0
  45. tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
  46. tests/test_elements_worker/test_transcription_list.py +450 -0
  47. tests/test_elements_worker/test_version.py +60 -0
  48. tests/test_elements_worker/test_worker.py +578 -293
  49. tests/test_image.py +542 -209
  50. tests/test_merge.py +1 -2
  51. tests/test_utils.py +89 -4
  52. worker-demo/tests/__init__.py +0 -0
  53. worker-demo/tests/conftest.py +32 -0
  54. worker-demo/tests/test_worker.py +12 -0
  55. worker-demo/worker_demo/__init__.py +6 -0
  56. worker-demo/worker_demo/worker.py +19 -0
  57. arkindex_base_worker-0.3.7rc4.dist-info/RECORD +0 -41
  58. tests/test_elements_worker/test_elements.py +0 -2713
  59. tests/test_elements_worker/test_transcriptions.py +0 -2119
  60. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/LICENSE +0 -0
@@ -2,7 +2,9 @@
2
2
  ElementsWorker methods for elements and element types.
3
3
  """
4
4
 
5
+ import os
5
6
  from collections.abc import Iterable
7
+ from operator import attrgetter
6
8
  from typing import NamedTuple
7
9
  from uuid import UUID
8
10
  from warnings import warn
@@ -12,6 +14,12 @@ from peewee import IntegrityError
12
14
  from arkindex_worker import logger
13
15
  from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
14
16
  from arkindex_worker.models import Element
17
+ from arkindex_worker.utils import (
18
+ DEFAULT_BATCH_SIZE,
19
+ batch_publication,
20
+ make_batches,
21
+ pluralize,
22
+ )
15
23
 
16
24
 
17
25
  class ElementType(NamedTuple):
@@ -31,6 +39,37 @@ class MissingTypeError(Exception):
31
39
 
32
40
 
33
41
  class ElementMixin:
42
def add_arguments(self):
    """Register the ``argparse`` options used to select the elements to process.

    Two options are added to ``self.parser``:

    - ``--elements-list``: a JSON elements list; the value is converted with ``open``
      and defaults to the ``TASK_ELEMENTS`` environment variable when it is set.
    - ``--element``: one or more element IDs given directly on the command line.

    The call is then forwarded to the next ``add_arguments`` in the MRO.
    """
    # (flag, keyword arguments forwarded to ``add_argument``), in declaration order
    options = (
        (
            "--elements-list",
            {
                "help": "JSON elements list to use",
                "type": open,
                "default": os.environ.get("TASK_ELEMENTS"),
            },
        ),
        (
            "--element",
            {
                "type": str,
                "nargs": "+",
                "help": "One or more Arkindex element ID",
            },
        ),
    )
    for flag, settings in options:
        self.parser.add_argument(flag, **settings)

    # Let the other classes of the hierarchy declare their own arguments
    super().add_arguments()
57
+
58
def list_corpus_types(self):
    """
    Load the element types available in the corpus.

    The ``types`` field of the ``RetrieveCorpus`` response is indexed by slug and
    stored on ``self.corpus_types``; the number of loaded types is then logged.
    """
    corpus = self.api_client.request("RetrieveCorpus", id=self.corpus_id)

    # Index each type payload by its slug
    types_by_slug = {}
    for corpus_type in corpus["types"]:
        types_by_slug[corpus_type["slug"]] = corpus_type
    self.corpus_types = types_by_slug

    total = len(types_by_slug)
    logger.info(
        f'Loaded {total} element {pluralize("type", total)} in corpus ({self.corpus_id}).'
    )
72
+
34
73
  @unsupported_cache
35
74
  def create_required_types(self, element_types: list[ElementType]):
36
75
  """Creates given element types in the corpus.
@@ -38,7 +77,7 @@ class ElementMixin:
38
77
  :param element_types: The missing element types to create.
39
78
  """
40
79
  for element_type in element_types:
41
- self.request(
80
+ self.api_client.request(
42
81
  "CreateElementType",
43
82
  body={
44
83
  "slug": element_type.slug,
@@ -66,10 +105,10 @@ class ElementMixin:
66
105
  isinstance(slug, str) for slug in type_slugs
67
106
  ), "Element type slugs must be strings."
68
107
 
69
- corpus = self.request("RetrieveCorpus", id=self.corpus_id)
70
- available_slugs = {element_type["slug"] for element_type in corpus["types"]}
71
- missing_slugs = set(type_slugs) - available_slugs
108
+ if not self.corpus_types:
109
+ self.list_corpus_types()
72
110
 
111
+ missing_slugs = set(type_slugs) - set(self.corpus_types)
73
112
  if missing_slugs:
74
113
  if create_missing:
75
114
  self.create_required_types(
@@ -79,7 +118,7 @@ class ElementMixin:
79
118
  )
80
119
  else:
81
120
  raise MissingTypeError(
82
- f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in the {corpus["name"]} corpus ({corpus["id"]}).'
121
+ f'Element {pluralize("type", len(missing_slugs))} {", ".join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id}).'
83
122
  )
84
123
 
85
124
  return True
@@ -145,7 +184,7 @@ class ElementMixin:
145
184
  logger.warning("Cannot create element as this worker is in read-only mode")
146
185
  return
147
186
 
148
- sub_element = self.request(
187
+ sub_element = self.api_client.request(
149
188
  "CreateElement",
150
189
  body={
151
190
  "type": type,
@@ -161,10 +200,12 @@ class ElementMixin:
161
200
 
162
201
  return sub_element["id"] if slim_output else sub_element
163
202
 
203
+ @batch_publication
164
204
  def create_elements(
165
205
  self,
166
206
  parent: Element | CachedElement,
167
207
  elements: list[dict[str, str | list[list[int | float]] | float | None]],
208
+ batch_size: int = DEFAULT_BATCH_SIZE,
168
209
  ) -> list[dict[str, str]]:
169
210
  """
170
211
  Create child elements on the given element in a single API request.
@@ -185,6 +226,8 @@ class ElementMixin:
185
226
  confidence (float or None)
186
227
  Optional confidence score, between 0.0 and 1.0.
187
228
 
229
+ :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
230
+
188
231
  :return: List of dicts, with each dict having a single key, ``id``, holding the UUID of each created element.
189
232
  """
190
233
  if isinstance(parent, Element):
@@ -243,14 +286,18 @@ class ElementMixin:
243
286
  logger.warning("Cannot create elements as this worker is in read-only mode")
244
287
  return
245
288
 
246
- created_ids = self.request(
247
- "CreateElements",
248
- id=parent.id,
249
- body={
250
- "worker_run_id": self.worker_run_id,
251
- "elements": elements,
252
- },
253
- )
289
+ created_ids = [
290
+ created_id
291
+ for batch in make_batches(elements, "element", batch_size)
292
+ for created_id in self.api_client.request(
293
+ "CreateElements",
294
+ id=parent.id,
295
+ body={
296
+ "worker_run_id": self.worker_run_id,
297
+ "elements": batch,
298
+ },
299
+ )
300
+ ]
254
301
 
255
302
  if self.use_cache:
256
303
  # Create the image as needed and handle both an Element and a CachedElement
@@ -311,12 +358,58 @@ class ElementMixin:
311
358
  logger.warning("Cannot link elements as this worker is in read-only mode")
312
359
  return
313
360
 
314
- return self.request(
361
+ return self.api_client.request(
315
362
  "CreateElementParent",
316
363
  parent=parent.id,
317
364
  child=child.id,
318
365
  )
319
366
 
367
@unsupported_cache
@batch_publication
def create_element_children(
    self,
    parent: Element,
    children: list[Element],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> list[str]:
    """
    Attach several existing elements to one parent element through the API.

    The children are published in batches, one ``CreateElementChildren`` request per batch.

    :param parent: Parent element.
    :param children: A non-empty list of child elements.
    :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.

    :returns: A list containing the string UUID of each child linked to the parent.
        Nothing is returned when the worker is in read-only mode.
    """
    assert parent and isinstance(
        parent, Element
    ), "parent shouldn't be null and should be of type Element"

    assert children and isinstance(
        children, list
    ), "children shouldn't be null and should be of type list"

    for position, candidate in enumerate(children):
        assert isinstance(
            candidate, Element
        ), f"Child at index {position} in children: Should be of type Element"

    if self.is_read_only:
        logger.warning("Cannot link elements as this worker is in read-only mode")
        return

    # One request per batch; the linked IDs of every response are concatenated in order
    linked_ids = []
    for batch in make_batches(children, "child", batch_size):
        response = self.api_client.request(
            "CreateElementChildren",
            id=parent.id,
            body={
                "children": [child.id for child in batch],
            },
        )
        linked_ids.extend(response["children"])
    return linked_ids
412
+
320
413
  def partial_update_element(
321
414
  self, element: Element | CachedElement, **kwargs
322
415
  ) -> dict:
@@ -383,7 +476,7 @@ class ElementMixin:
383
476
  logger.warning("Cannot update element as this worker is in read-only mode")
384
477
  return
385
478
 
386
- updated_element = self.request(
479
+ updated_element = self.api_client.request(
387
480
  "PartialUpdateElement",
388
481
  id=element.id,
389
482
  body=kwargs,
@@ -407,6 +500,178 @@ class ElementMixin:
407
500
 
408
501
  return updated_element
409
502
 
503
+ def list_elements(
504
+ self,
505
+ folder: bool | None = None,
506
+ name: str | None = None,
507
+ top_level: bool | None = None,
508
+ transcription_worker_version: str | bool | None = None,
509
+ transcription_worker_run: str | bool | None = None,
510
+ type: str | None = None,
511
+ with_classes: bool | None = None,
512
+ with_corpus: bool | None = None,
513
+ with_metadata: bool | None = None,
514
+ with_has_children: bool | None = None,
515
+ with_zone: bool | None = None,
516
+ worker_version: str | bool | None = None,
517
+ worker_run: str | bool | None = None,
518
+ ) -> Iterable[dict] | Iterable[CachedElement]:
519
+ """
520
+ List element in a corpus.
521
+
522
+ Warns:
523
+ ----
524
+ The following parameters are **deprecated**:
525
+
526
+ - `transcription_worker_version` in favor of `transcription_worker_run`
527
+ - `worker_version` in favor of `worker_run`
528
+
529
+ :param folder: Restrict to or exclude elements with folder types.
530
+ This parameter is not supported when caching is enabled.
531
+ :param name: Restrict to elements whose name contain a substring (case-insensitive).
532
+ This parameter is not supported when caching is enabled.
533
+ :param top_level: Restrict to or exclude folder elements without parent elements (top-level elements).
534
+ This parameter is not supported when caching is enabled.
535
+ :param transcription_worker_version: **Deprecated** Restrict to elements that have a transcription created by a worker version with this UUID. Set to False to look for elements that have a manual transcription.
536
+ This parameter is not supported when caching is enabled.
537
+ :param transcription_worker_run: Restrict to elements that have a transcription created by a worker run with this UUID. Set to False to look for elements that have a manual transcription.
538
+ This parameter is not supported when caching is enabled.
539
+ :param type: Restrict to elements with a specific type slug
540
+ This parameter is not supported when caching is enabled.
541
+ :param with_classes: Include each element's classifications in the response.
542
+ This parameter is not supported when caching is enabled.
543
+ :param with_corpus: Include each element's corpus in the response.
544
+ This parameter is not supported when caching is enabled.
545
+ :param with_has_children: Include the ``has_children`` attribute in the response,
546
+ indicating if this element has child elements of its own.
547
+ This parameter is not supported when caching is enabled.
548
+ :param with_metadata: Include each element's metadata in the response.
549
+ This parameter is not supported when caching is enabled.
550
+ :param with_zone: Include the ``zone`` attribute in the response,
551
+ holding the element's image and polygon.
552
+ This parameter is not supported when caching is enabled.
553
+ :param worker_version: **Deprecated** Restrict to elements created by a worker version with this UUID.
554
+ :param worker_run: Restrict to elements created by a worker run with this UUID.
555
+ :return: An iterable of dicts from the ``ListElementChildren`` API endpoint,
556
+ or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
557
+ """
558
+ query_params = {}
559
+ if folder is not None:
560
+ assert isinstance(folder, bool), "folder should be of type bool"
561
+ query_params["folder"] = folder
562
+ if name:
563
+ assert isinstance(name, str), "name should be of type str"
564
+ query_params["name"] = name
565
+ if top_level is not None:
566
+ assert isinstance(top_level, bool), "top_level should be of type bool"
567
+ query_params["top_level"] = top_level
568
+ if transcription_worker_version is not None:
569
+ warn(
570
+ "`transcription_worker_version` usage is deprecated. Consider using `transcription_worker_run` instead.",
571
+ DeprecationWarning,
572
+ stacklevel=1,
573
+ )
574
+ assert isinstance(
575
+ transcription_worker_version, str | bool
576
+ ), "transcription_worker_version should be of type str or bool"
577
+ if isinstance(transcription_worker_version, bool):
578
+ assert (
579
+ transcription_worker_version is False
580
+ ), "if of type bool, transcription_worker_version can only be set to False"
581
+ query_params["transcription_worker_version"] = transcription_worker_version
582
+ if transcription_worker_run is not None:
583
+ assert isinstance(
584
+ transcription_worker_run, str | bool
585
+ ), "transcription_worker_run should be of type str or bool"
586
+ if isinstance(transcription_worker_run, bool):
587
+ assert (
588
+ transcription_worker_run is False
589
+ ), "if of type bool, transcription_worker_run can only be set to False"
590
+ query_params["transcription_worker_run"] = transcription_worker_run
591
+ if type:
592
+ assert isinstance(type, str), "type should be of type str"
593
+ query_params["type"] = type
594
+ if with_classes is not None:
595
+ assert isinstance(with_classes, bool), "with_classes should be of type bool"
596
+ query_params["with_classes"] = with_classes
597
+ if with_corpus is not None:
598
+ assert isinstance(with_corpus, bool), "with_corpus should be of type bool"
599
+ query_params["with_corpus"] = with_corpus
600
+ if with_has_children is not None:
601
+ assert isinstance(
602
+ with_has_children, bool
603
+ ), "with_has_children should be of type bool"
604
+ query_params["with_has_children"] = with_has_children
605
+ if with_metadata is not None:
606
+ assert isinstance(
607
+ with_metadata, bool
608
+ ), "with_metadata should be of type bool"
609
+ query_params["with_metadata"] = with_metadata
610
+ if with_zone is not None:
611
+ assert isinstance(with_zone, bool), "with_zone should be of type bool"
612
+ query_params["with_zone"] = with_zone
613
+ if worker_version is not None:
614
+ warn(
615
+ "`worker_version` usage is deprecated. Consider using `worker_run` instead.",
616
+ DeprecationWarning,
617
+ stacklevel=1,
618
+ )
619
+ assert isinstance(
620
+ worker_version, str | bool
621
+ ), "worker_version should be of type str or bool"
622
+ if isinstance(worker_version, bool):
623
+ assert (
624
+ worker_version is False
625
+ ), "if of type bool, worker_version can only be set to False"
626
+ query_params["worker_version"] = worker_version
627
+ if worker_run is not None:
628
+ assert isinstance(
629
+ worker_run, str | bool
630
+ ), "worker_run should be of type str or bool"
631
+ if isinstance(worker_run, bool):
632
+ assert (
633
+ worker_run is False
634
+ ), "if of type bool, worker_run can only be set to False"
635
+ query_params["worker_run"] = worker_run
636
+
637
+ if not self.use_cache:
638
+ return self.api_client.paginate(
639
+ "ListElements", corpus=self.corpus_id, **query_params
640
+ )
641
+
642
+ # Checking that we only received query_params handled by the cache
643
+ assert (
644
+ set(query_params.keys())
645
+ <= {
646
+ "type",
647
+ "worker_version",
648
+ "worker_run",
649
+ }
650
+ ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
651
+
652
+ query = CachedElement.select()
653
+ if type:
654
+ query = query.where(CachedElement.type == type)
655
+ if worker_version is not None:
656
+ # If worker_version=False, filter by manual worker_version e.g. None
657
+ worker_version_id = worker_version or None
658
+ if worker_version_id:
659
+ query = query.where(
660
+ CachedElement.worker_version_id == worker_version_id
661
+ )
662
+ else:
663
+ query = query.where(CachedElement.worker_version_id.is_null())
664
+
665
+ if worker_run is not None:
666
+ # If worker_run=False, filter by manual worker_run e.g. None
667
+ worker_run_id = worker_run or None
668
+ if worker_run_id:
669
+ query = query.where(CachedElement.worker_run_id == worker_run_id)
670
+ else:
671
+ query = query.where(CachedElement.worker_run_id.is_null())
672
+
673
+ return query
674
+
410
675
  def list_element_children(
411
676
  self,
412
677
  element: Element | CachedElement,
@@ -546,45 +811,43 @@ class ElementMixin:
546
811
  ), "if of type bool, worker_run can only be set to False"
547
812
  query_params["worker_run"] = worker_run
548
813
 
549
- if self.use_cache:
550
- # Checking that we only received query_params handled by the cache
551
- assert (
552
- set(query_params.keys())
553
- <= {
554
- "type",
555
- "worker_version",
556
- "worker_run",
557
- }
558
- ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
559
-
560
- query = CachedElement.select().where(CachedElement.parent_id == element.id)
561
- if type:
562
- query = query.where(CachedElement.type == type)
563
- if worker_version is not None:
564
- # If worker_version=False, filter by manual worker_version e.g. None
565
- worker_version_id = worker_version or None
566
- if worker_version_id:
567
- query = query.where(
568
- CachedElement.worker_version_id == worker_version_id
569
- )
570
- else:
571
- query = query.where(CachedElement.worker_version_id.is_null())
572
-
573
- if worker_run is not None:
574
- # If worker_run=False, filter by manual worker_run e.g. None
575
- worker_run_id = worker_run or None
576
- if worker_run_id:
577
- query = query.where(CachedElement.worker_run_id == worker_run_id)
578
- else:
579
- query = query.where(CachedElement.worker_run_id.is_null())
580
-
581
- return query
582
- else:
583
- children = self.api_client.paginate(
814
+ if not self.use_cache:
815
+ return self.api_client.paginate(
584
816
  "ListElementChildren", id=element.id, **query_params
585
817
  )
586
818
 
587
- return children
819
+ # Checking that we only received query_params handled by the cache
820
+ assert (
821
+ set(query_params.keys())
822
+ <= {
823
+ "type",
824
+ "worker_version",
825
+ "worker_run",
826
+ }
827
+ ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
828
+
829
+ query = CachedElement.select().where(CachedElement.parent_id == element.id)
830
+ if type:
831
+ query = query.where(CachedElement.type == type)
832
+ if worker_version is not None:
833
+ # If worker_version=False, filter by manual worker_version e.g. None
834
+ worker_version_id = worker_version or None
835
+ if worker_version_id:
836
+ query = query.where(
837
+ CachedElement.worker_version_id == worker_version_id
838
+ )
839
+ else:
840
+ query = query.where(CachedElement.worker_version_id.is_null())
841
+
842
+ if worker_run is not None:
843
+ # If worker_run=False, filter by manual worker_run e.g. None
844
+ worker_run_id = worker_run or None
845
+ if worker_run_id:
846
+ query = query.where(CachedElement.worker_run_id == worker_run_id)
847
+ else:
848
+ query = query.where(CachedElement.worker_run_id.is_null())
849
+
850
+ return query
588
851
 
589
852
  def list_element_parents(
590
853
  self,
@@ -725,45 +988,43 @@ class ElementMixin:
725
988
  ), "if of type bool, worker_run can only be set to False"
726
989
  query_params["worker_run"] = worker_run
727
990
 
728
- if self.use_cache:
729
- # Checking that we only received query_params handled by the cache
730
- assert (
731
- set(query_params.keys())
732
- <= {
733
- "type",
734
- "worker_version",
735
- "worker_run",
736
- }
737
- ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
738
-
739
- parent_ids = CachedElement.select(CachedElement.parent_id).where(
740
- CachedElement.id == element.id
741
- )
742
- query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
743
- if type:
744
- query = query.where(CachedElement.type == type)
745
- if worker_version is not None:
746
- # If worker_version=False, filter by manual worker_version e.g. None
747
- worker_version_id = worker_version or None
748
- if worker_version_id:
749
- query = query.where(
750
- CachedElement.worker_version_id == worker_version_id
751
- )
752
- else:
753
- query = query.where(CachedElement.worker_version_id.is_null())
754
-
755
- if worker_run is not None:
756
- # If worker_run=False, filter by manual worker_run e.g. None
757
- worker_run_id = worker_run or None
758
- if worker_run_id:
759
- query = query.where(CachedElement.worker_run_id == worker_run_id)
760
- else:
761
- query = query.where(CachedElement.worker_run_id.is_null())
762
-
763
- return query
764
- else:
765
- parents = self.api_client.paginate(
991
+ if not self.use_cache:
992
+ return self.api_client.paginate(
766
993
  "ListElementParents", id=element.id, **query_params
767
994
  )
768
995
 
769
- return parents
996
+ # Checking that we only received query_params handled by the cache
997
+ assert (
998
+ set(query_params.keys())
999
+ <= {
1000
+ "type",
1001
+ "worker_version",
1002
+ "worker_run",
1003
+ }
1004
+ ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
1005
+
1006
+ parent_ids = CachedElement.select(CachedElement.parent_id).where(
1007
+ CachedElement.id == element.id
1008
+ )
1009
+ query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
1010
+ if type:
1011
+ query = query.where(CachedElement.type == type)
1012
+ if worker_version is not None:
1013
+ # If worker_version=False, filter by manual worker_version e.g. None
1014
+ worker_version_id = worker_version or None
1015
+ if worker_version_id:
1016
+ query = query.where(
1017
+ CachedElement.worker_version_id == worker_version_id
1018
+ )
1019
+ else:
1020
+ query = query.where(CachedElement.worker_version_id.is_null())
1021
+
1022
+ if worker_run is not None:
1023
+ # If worker_run=False, filter by manual worker_run e.g. None
1024
+ worker_run_id = worker_run or None
1025
+ if worker_run_id:
1026
+ query = query.where(CachedElement.worker_run_id == worker_run_id)
1027
+ else:
1028
+ query = query.where(CachedElement.worker_run_id.is_null())
1029
+
1030
+ return query
@@ -15,6 +15,7 @@ from arkindex_worker.cache import (
15
15
  unsupported_cache,
16
16
  )
17
17
  from arkindex_worker.models import Element, Transcription
18
+ from arkindex_worker.utils import pluralize
18
19
 
19
20
 
20
21
  class Entity(TypedDict):
@@ -48,6 +49,7 @@ class EntityMixin:
48
49
  if not self.entity_types:
49
50
  # Load entity_types of corpus
50
51
  self.list_corpus_entity_types()
52
+
51
53
  for entity_type in entity_types:
52
54
  # Do nothing if type already exists
53
55
  if entity_type in self.entity_types:
@@ -60,7 +62,7 @@ class EntityMixin:
60
62
  )
61
63
 
62
64
  # Create type if non-existent
63
- self.entity_types[entity_type] = self.request(
65
+ self.entity_types[entity_type] = self.api_client.request(
64
66
  "CreateEntityType",
65
67
  body={
66
68
  "name": entity_type,
@@ -106,7 +108,7 @@ class EntityMixin:
106
108
  entity_type_id = self.entity_types.get(type)
107
109
  assert entity_type_id, f"Entity type `{type}` not found in the corpus."
108
110
 
109
- entity = self.request(
111
+ entity = self.api_client.request(
110
112
  "CreateEntity",
111
113
  body={
112
114
  "name": name,
@@ -188,7 +190,7 @@ class EntityMixin:
188
190
  if confidence is not None:
189
191
  body["confidence"] = confidence
190
192
 
191
- transcription_ent = self.request(
193
+ transcription_ent = self.api_client.request(
192
194
  "CreateTranscriptionEntity",
193
195
  id=transcription.id,
194
196
  body=body,
@@ -289,16 +291,16 @@ class EntityMixin:
289
291
  )
290
292
  return
291
293
 
292
- created_ids = self.request(
294
+ created_entities = self.api_client.request(
293
295
  "CreateTranscriptionEntities",
294
296
  id=transcription.id,
295
297
  body={
296
298
  "worker_run_id": self.worker_run_id,
297
299
  "entities": entities,
298
300
  },
299
- )
301
+ )["entities"]
300
302
 
301
- return created_ids["entities"]
303
+ return created_entities
302
304
 
303
305
  def list_transcription_entities(
304
306
  self,
@@ -380,13 +382,12 @@ class EntityMixin:
380
382
  "ListCorpusEntities", id=self.corpus_id, **query_params
381
383
  )
382
384
  }
385
+ count = len(self.entities)
383
386
  logger.info(
384
- f"Loaded {len(self.entities)} entities in corpus ({self.corpus_id})"
387
+ f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
385
388
  )
386
389
 
387
- def list_corpus_entity_types(
388
- self,
389
- ):
390
+ def list_corpus_entity_types(self):
390
391
  """
391
392
  Loads available entity types in corpus.
392
393
  """
@@ -396,6 +397,7 @@ class EntityMixin:
396
397
  "ListCorpusEntityTypes", id=self.corpus_id
397
398
  )
398
399
  }
400
+ count = len(self.entity_types)
399
401
  logger.info(
400
- f"Loaded {len(self.entity_types)} entity types in corpus ({self.corpus_id})."
402
+ f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
401
403
  )
@@ -0,0 +1,21 @@
1
+ """
2
+ ElementsWorker methods for images.
3
+ """
4
+
5
+ from arkindex_worker.models import Image
6
+
7
+
8
class ImageMixin:
    """Worker mixin grouping the API helpers that deal with images."""

    def create_iiif_url(self, url: str) -> Image:
        """
        Create an image from the URL of an existing IIIF image.
        The URL must be the image's identifier, not its Image Information request (`/info.json`).

        :param url: URL of the image; must be a non-empty string.
        :returns: The created image, built from the ``CreateIIIFURL`` response.
        """
        assert url and isinstance(
            url, str
        ), "url shouldn't be null and should be of type str"

        payload = {"url": url}
        response = self.api_client.request("CreateIIIFURL", body=payload)
        return Image(response)