arkindex-base-worker 0.4.0b3__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff compares the contents of two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@
2
2
  BaseWorker methods for datasets.
3
3
  """
4
4
 
5
+ import uuid
6
+ from argparse import ArgumentTypeError
5
7
  from collections.abc import Iterator
6
8
  from enum import Enum
7
9
 
@@ -36,7 +38,55 @@ class DatasetState(Enum):
36
38
  """
37
39
 
38
40
 
41
+ class MissingDatasetArchive(Exception):
42
+ """
43
+ Exception raised when the compressed archive associated to
44
+ a dataset isn't found in its task artifacts.
45
+ """
46
+
47
+
48
+ def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
49
+ """The `--set` argument should have the following format:
50
+ <dataset_id>:<set_name>
51
+
52
+ Args:
53
+ value (str): Provided argument.
54
+
55
+ Raises:
56
+ ArgumentTypeError: When the value is invalid.
57
+
58
+ Returns:
59
+ tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
60
+ """
61
+ values = value.split(":")
62
+ if len(values) != 2:
63
+ raise ArgumentTypeError(
64
+ f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
65
+ )
66
+
67
+ dataset_id, set_name = values
68
+ try:
69
+ dataset_id = uuid.UUID(dataset_id)
70
+ return (dataset_id, set_name)
71
+ except (TypeError, ValueError) as e:
72
+ raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
73
+
74
+
39
75
  class DatasetMixin:
76
+ def add_arguments(self) -> None:
77
+ """Define specific ``argparse`` arguments for the worker using this mixin"""
78
+ self.parser.add_argument(
79
+ "--set",
80
+ type=check_dataset_set,
81
+ nargs="+",
82
+ help="""
83
+ One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
84
+ (e.g.: "12341234-1234-1234-1234-123412341234:train")
85
+ """,
86
+ default=[],
87
+ )
88
+ super().add_arguments()
89
+
40
90
  def list_process_sets(self) -> Iterator[Set]:
41
91
  """
42
92
  List dataset sets associated to the worker's process. This helper is not available in developer mode.
@@ -73,6 +123,26 @@ class DatasetMixin:
73
123
 
74
124
  return map(lambda result: Element(**result["element"]), results)
75
125
 
126
+ def list_sets(self) -> Iterator[Set]:
127
+ """
128
+ List the sets to be processed, either from the CLI arguments or using the
129
+ [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
130
+
131
+ :returns: An iterator of ``Set`` objects.
132
+ """
133
+ if not self.is_read_only:
134
+ yield from self.list_process_sets()
135
+
136
+ datasets: dict[uuid.UUID, Dataset] = {}
137
+ for dataset_id, set_name in self.args.set:
138
+ # Retrieving dataset information if not already cached
139
+ if dataset_id not in datasets:
140
+ datasets[dataset_id] = Dataset(
141
+ **self.api_client.request("RetrieveDataset", id=dataset_id)
142
+ )
143
+
144
+ yield Set(name=set_name, dataset=datasets[dataset_id])
145
+
76
146
  @unsupported_cache
77
147
  def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
78
148
  """
@@ -2,6 +2,7 @@
2
2
  ElementsWorker methods for elements and element types.
3
3
  """
4
4
 
5
+ import os
5
6
  from collections.abc import Iterable
6
7
  from operator import attrgetter
7
8
  from typing import NamedTuple
@@ -38,6 +39,22 @@ class MissingTypeError(Exception):
38
39
 
39
40
 
40
41
  class ElementMixin:
42
+ def add_arguments(self):
43
+ """Define specific ``argparse`` arguments for the worker using this mixin"""
44
+ self.parser.add_argument(
45
+ "--elements-list",
46
+ help="JSON elements list to use",
47
+ type=open,
48
+ default=os.environ.get("TASK_ELEMENTS"),
49
+ )
50
+ self.parser.add_argument(
51
+ "--element",
52
+ type=str,
53
+ nargs="+",
54
+ help="One or more Arkindex element ID",
55
+ )
56
+ super().add_arguments()
57
+
41
58
  def list_corpus_types(self):
42
59
  """
43
60
  Loads available element types in corpus.
@@ -483,6 +500,178 @@ class ElementMixin:
483
500
 
484
501
  return updated_element
485
502
 
503
+ def list_elements(
504
+ self,
505
+ folder: bool | None = None,
506
+ name: str | None = None,
507
+ top_level: bool | None = None,
508
+ transcription_worker_version: str | bool | None = None,
509
+ transcription_worker_run: str | bool | None = None,
510
+ type: str | None = None,
511
+ with_classes: bool | None = None,
512
+ with_corpus: bool | None = None,
513
+ with_metadata: bool | None = None,
514
+ with_has_children: bool | None = None,
515
+ with_zone: bool | None = None,
516
+ worker_version: str | bool | None = None,
517
+ worker_run: str | bool | None = None,
518
+ ) -> Iterable[dict] | Iterable[CachedElement]:
519
+ """
520
+ List element in a corpus.
521
+
522
+ Warns:
523
+ ----
524
+ The following parameters are **deprecated**:
525
+
526
+ - `transcription_worker_version` in favor of `transcription_worker_run`
527
+ - `worker_version` in favor of `worker_run`
528
+
529
+ :param folder: Restrict to or exclude elements with folder types.
530
+ This parameter is not supported when caching is enabled.
531
+ :param name: Restrict to elements whose name contain a substring (case-insensitive).
532
+ This parameter is not supported when caching is enabled.
533
+ :param top_level: Restrict to or exclude folder elements without parent elements (top-level elements).
534
+ This parameter is not supported when caching is enabled.
535
+ :param transcription_worker_version: **Deprecated** Restrict to elements that have a transcription created by a worker version with this UUID. Set to False to look for elements that have a manual transcription.
536
+ This parameter is not supported when caching is enabled.
537
+ :param transcription_worker_run: Restrict to elements that have a transcription created by a worker run with this UUID. Set to False to look for elements that have a manual transcription.
538
+ This parameter is not supported when caching is enabled.
539
+ :param type: Restrict to elements with a specific type slug
540
+ This parameter is not supported when caching is enabled.
541
+ :param with_classes: Include each element's classifications in the response.
542
+ This parameter is not supported when caching is enabled.
543
+ :param with_corpus: Include each element's corpus in the response.
544
+ This parameter is not supported when caching is enabled.
545
+ :param with_has_children: Include the ``has_children`` attribute in the response,
546
+ indicating if this element has child elements of its own.
547
+ This parameter is not supported when caching is enabled.
548
+ :param with_metadata: Include each element's metadata in the response.
549
+ This parameter is not supported when caching is enabled.
550
+ :param with_zone: Include the ``zone`` attribute in the response,
551
+ holding the element's image and polygon.
552
+ This parameter is not supported when caching is enabled.
553
+ :param worker_version: **Deprecated** Restrict to elements created by a worker version with this UUID.
554
+ :param worker_run: Restrict to elements created by a worker run with this UUID.
555
+ :return: An iterable of dicts from the ``ListElementChildren`` API endpoint,
556
+ or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
557
+ """
558
+ query_params = {}
559
+ if folder is not None:
560
+ assert isinstance(folder, bool), "folder should be of type bool"
561
+ query_params["folder"] = folder
562
+ if name:
563
+ assert isinstance(name, str), "name should be of type str"
564
+ query_params["name"] = name
565
+ if top_level is not None:
566
+ assert isinstance(top_level, bool), "top_level should be of type bool"
567
+ query_params["top_level"] = top_level
568
+ if transcription_worker_version is not None:
569
+ warn(
570
+ "`transcription_worker_version` usage is deprecated. Consider using `transcription_worker_run` instead.",
571
+ DeprecationWarning,
572
+ stacklevel=1,
573
+ )
574
+ assert isinstance(
575
+ transcription_worker_version, str | bool
576
+ ), "transcription_worker_version should be of type str or bool"
577
+ if isinstance(transcription_worker_version, bool):
578
+ assert (
579
+ transcription_worker_version is False
580
+ ), "if of type bool, transcription_worker_version can only be set to False"
581
+ query_params["transcription_worker_version"] = transcription_worker_version
582
+ if transcription_worker_run is not None:
583
+ assert isinstance(
584
+ transcription_worker_run, str | bool
585
+ ), "transcription_worker_run should be of type str or bool"
586
+ if isinstance(transcription_worker_run, bool):
587
+ assert (
588
+ transcription_worker_run is False
589
+ ), "if of type bool, transcription_worker_run can only be set to False"
590
+ query_params["transcription_worker_run"] = transcription_worker_run
591
+ if type:
592
+ assert isinstance(type, str), "type should be of type str"
593
+ query_params["type"] = type
594
+ if with_classes is not None:
595
+ assert isinstance(with_classes, bool), "with_classes should be of type bool"
596
+ query_params["with_classes"] = with_classes
597
+ if with_corpus is not None:
598
+ assert isinstance(with_corpus, bool), "with_corpus should be of type bool"
599
+ query_params["with_corpus"] = with_corpus
600
+ if with_has_children is not None:
601
+ assert isinstance(
602
+ with_has_children, bool
603
+ ), "with_has_children should be of type bool"
604
+ query_params["with_has_children"] = with_has_children
605
+ if with_metadata is not None:
606
+ assert isinstance(
607
+ with_metadata, bool
608
+ ), "with_metadata should be of type bool"
609
+ query_params["with_metadata"] = with_metadata
610
+ if with_zone is not None:
611
+ assert isinstance(with_zone, bool), "with_zone should be of type bool"
612
+ query_params["with_zone"] = with_zone
613
+ if worker_version is not None:
614
+ warn(
615
+ "`worker_version` usage is deprecated. Consider using `worker_run` instead.",
616
+ DeprecationWarning,
617
+ stacklevel=1,
618
+ )
619
+ assert isinstance(
620
+ worker_version, str | bool
621
+ ), "worker_version should be of type str or bool"
622
+ if isinstance(worker_version, bool):
623
+ assert (
624
+ worker_version is False
625
+ ), "if of type bool, worker_version can only be set to False"
626
+ query_params["worker_version"] = worker_version
627
+ if worker_run is not None:
628
+ assert isinstance(
629
+ worker_run, str | bool
630
+ ), "worker_run should be of type str or bool"
631
+ if isinstance(worker_run, bool):
632
+ assert (
633
+ worker_run is False
634
+ ), "if of type bool, worker_run can only be set to False"
635
+ query_params["worker_run"] = worker_run
636
+
637
+ if not self.use_cache:
638
+ return self.api_client.paginate(
639
+ "ListElements", corpus=self.corpus_id, **query_params
640
+ )
641
+
642
+ # Checking that we only received query_params handled by the cache
643
+ assert (
644
+ set(query_params.keys())
645
+ <= {
646
+ "type",
647
+ "worker_version",
648
+ "worker_run",
649
+ }
650
+ ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
651
+
652
+ query = CachedElement.select()
653
+ if type:
654
+ query = query.where(CachedElement.type == type)
655
+ if worker_version is not None:
656
+ # If worker_version=False, filter by manual worker_version e.g. None
657
+ worker_version_id = worker_version or None
658
+ if worker_version_id:
659
+ query = query.where(
660
+ CachedElement.worker_version_id == worker_version_id
661
+ )
662
+ else:
663
+ query = query.where(CachedElement.worker_version_id.is_null())
664
+
665
+ if worker_run is not None:
666
+ # If worker_run=False, filter by manual worker_run e.g. None
667
+ worker_run_id = worker_run or None
668
+ if worker_run_id:
669
+ query = query.where(CachedElement.worker_run_id == worker_run_id)
670
+ else:
671
+ query = query.where(CachedElement.worker_run_id.is_null())
672
+
673
+ return query
674
+
486
675
  def list_element_children(
487
676
  self,
488
677
  element: Element | CachedElement,
@@ -622,45 +811,43 @@ class ElementMixin:
622
811
  ), "if of type bool, worker_run can only be set to False"
623
812
  query_params["worker_run"] = worker_run
624
813
 
625
- if self.use_cache:
626
- # Checking that we only received query_params handled by the cache
627
- assert (
628
- set(query_params.keys())
629
- <= {
630
- "type",
631
- "worker_version",
632
- "worker_run",
633
- }
634
- ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
635
-
636
- query = CachedElement.select().where(CachedElement.parent_id == element.id)
637
- if type:
638
- query = query.where(CachedElement.type == type)
639
- if worker_version is not None:
640
- # If worker_version=False, filter by manual worker_version e.g. None
641
- worker_version_id = worker_version or None
642
- if worker_version_id:
643
- query = query.where(
644
- CachedElement.worker_version_id == worker_version_id
645
- )
646
- else:
647
- query = query.where(CachedElement.worker_version_id.is_null())
648
-
649
- if worker_run is not None:
650
- # If worker_run=False, filter by manual worker_run e.g. None
651
- worker_run_id = worker_run or None
652
- if worker_run_id:
653
- query = query.where(CachedElement.worker_run_id == worker_run_id)
654
- else:
655
- query = query.where(CachedElement.worker_run_id.is_null())
656
-
657
- return query
658
- else:
659
- children = self.api_client.paginate(
814
+ if not self.use_cache:
815
+ return self.api_client.paginate(
660
816
  "ListElementChildren", id=element.id, **query_params
661
817
  )
662
818
 
663
- return children
819
+ # Checking that we only received query_params handled by the cache
820
+ assert (
821
+ set(query_params.keys())
822
+ <= {
823
+ "type",
824
+ "worker_version",
825
+ "worker_run",
826
+ }
827
+ ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
828
+
829
+ query = CachedElement.select().where(CachedElement.parent_id == element.id)
830
+ if type:
831
+ query = query.where(CachedElement.type == type)
832
+ if worker_version is not None:
833
+ # If worker_version=False, filter by manual worker_version e.g. None
834
+ worker_version_id = worker_version or None
835
+ if worker_version_id:
836
+ query = query.where(
837
+ CachedElement.worker_version_id == worker_version_id
838
+ )
839
+ else:
840
+ query = query.where(CachedElement.worker_version_id.is_null())
841
+
842
+ if worker_run is not None:
843
+ # If worker_run=False, filter by manual worker_run e.g. None
844
+ worker_run_id = worker_run or None
845
+ if worker_run_id:
846
+ query = query.where(CachedElement.worker_run_id == worker_run_id)
847
+ else:
848
+ query = query.where(CachedElement.worker_run_id.is_null())
849
+
850
+ return query
664
851
 
665
852
  def list_element_parents(
666
853
  self,
@@ -801,45 +988,43 @@ class ElementMixin:
801
988
  ), "if of type bool, worker_run can only be set to False"
802
989
  query_params["worker_run"] = worker_run
803
990
 
804
- if self.use_cache:
805
- # Checking that we only received query_params handled by the cache
806
- assert (
807
- set(query_params.keys())
808
- <= {
809
- "type",
810
- "worker_version",
811
- "worker_run",
812
- }
813
- ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
814
-
815
- parent_ids = CachedElement.select(CachedElement.parent_id).where(
816
- CachedElement.id == element.id
817
- )
818
- query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
819
- if type:
820
- query = query.where(CachedElement.type == type)
821
- if worker_version is not None:
822
- # If worker_version=False, filter by manual worker_version e.g. None
823
- worker_version_id = worker_version or None
824
- if worker_version_id:
825
- query = query.where(
826
- CachedElement.worker_version_id == worker_version_id
827
- )
828
- else:
829
- query = query.where(CachedElement.worker_version_id.is_null())
830
-
831
- if worker_run is not None:
832
- # If worker_run=False, filter by manual worker_run e.g. None
833
- worker_run_id = worker_run or None
834
- if worker_run_id:
835
- query = query.where(CachedElement.worker_run_id == worker_run_id)
836
- else:
837
- query = query.where(CachedElement.worker_run_id.is_null())
838
-
839
- return query
840
- else:
841
- parents = self.api_client.paginate(
991
+ if not self.use_cache:
992
+ return self.api_client.paginate(
842
993
  "ListElementParents", id=element.id, **query_params
843
994
  )
844
995
 
845
- return parents
996
+ # Checking that we only received query_params handled by the cache
997
+ assert (
998
+ set(query_params.keys())
999
+ <= {
1000
+ "type",
1001
+ "worker_version",
1002
+ "worker_run",
1003
+ }
1004
+ ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
1005
+
1006
+ parent_ids = CachedElement.select(CachedElement.parent_id).where(
1007
+ CachedElement.id == element.id
1008
+ )
1009
+ query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
1010
+ if type:
1011
+ query = query.where(CachedElement.type == type)
1012
+ if worker_version is not None:
1013
+ # If worker_version=False, filter by manual worker_version e.g. None
1014
+ worker_version_id = worker_version or None
1015
+ if worker_version_id:
1016
+ query = query.where(
1017
+ CachedElement.worker_version_id == worker_version_id
1018
+ )
1019
+ else:
1020
+ query = query.where(CachedElement.worker_version_id.is_null())
1021
+
1022
+ if worker_run is not None:
1023
+ # If worker_run=False, filter by manual worker_run e.g. None
1024
+ worker_run_id = worker_run or None
1025
+ if worker_run_id:
1026
+ query = query.where(CachedElement.worker_run_id == worker_run_id)
1027
+ else:
1028
+ query = query.where(CachedElement.worker_run_id.is_null())
1029
+
1030
+ return query
@@ -0,0 +1,63 @@
1
+ from enum import Enum
2
+
3
+
4
+ class ActivityState(Enum):
5
+ """
6
+ Processing state of an element.
7
+ """
8
+
9
+ Queued = "queued"
10
+ """
11
+ The element has not yet been processed by a worker.
12
+ """
13
+
14
+ Started = "started"
15
+ """
16
+ The element is being processed by a worker.
17
+ """
18
+
19
+ Processed = "processed"
20
+ """
21
+ The element has been successfully processed by a worker.
22
+ """
23
+
24
+ Error = "error"
25
+ """
26
+ An error occurred while processing this element.
27
+ """
28
+
29
+
30
+ class ProcessMode(Enum):
31
+ """
32
+ Mode of the process of the worker.
33
+ """
34
+
35
+ Files = "files"
36
+ """
37
+ Processes of files (images, PDFs, IIIF, ...) imports.
38
+ """
39
+
40
+ Workers = "workers"
41
+ """
42
+ Processes of worker executions.
43
+ """
44
+
45
+ Template = "template"
46
+ """
47
+ Process templates.
48
+ """
49
+
50
+ S3 = "s3"
51
+ """
52
+ Processes of imports from an S3-compatible storage.
53
+ """
54
+
55
+ Local = "local"
56
+ """
57
+ Local processes.
58
+ """
59
+
60
+ Dataset = "dataset"
61
+ """
62
+ Dataset processes.
63
+ """
@@ -441,60 +441,60 @@ class TranscriptionMixin:
441
441
  ), "if of type bool, worker_run can only be set to False"
442
442
  query_params["worker_run"] = worker_run
443
443
 
444
- if self.use_cache:
445
- if not recursive:
446
- # In this case we don't have to return anything, it's easier to use an
447
- # impossible condition (False) rather than filtering by type for nothing
448
- if element_type and element_type != element.type:
449
- return CachedTranscription.select().where(False)
450
- transcriptions = CachedTranscription.select().where(
451
- CachedTranscription.element_id == element.id
444
+ if not self.use_cache:
445
+ return self.api_client.paginate(
446
+ "ListTranscriptions", id=element.id, **query_params
447
+ )
448
+
449
+ if not recursive:
450
+ # In this case we don't have to return anything, it's easier to use an
451
+ # impossible condition (False) rather than filtering by type for nothing
452
+ if element_type and element_type != element.type:
453
+ return CachedTranscription.select().where(False)
454
+ transcriptions = CachedTranscription.select().where(
455
+ CachedTranscription.element_id == element.id
456
+ )
457
+ else:
458
+ base_case = (
459
+ CachedElement.select()
460
+ .where(CachedElement.id == element.id)
461
+ .cte("base", recursive=True)
462
+ )
463
+ recursive = CachedElement.select().join(
464
+ base_case, on=(CachedElement.parent_id == base_case.c.id)
465
+ )
466
+ cte = base_case.union_all(recursive)
467
+ transcriptions = (
468
+ CachedTranscription.select()
469
+ .join(cte, on=(CachedTranscription.element_id == cte.c.id))
470
+ .with_cte(cte)
471
+ )
472
+
473
+ if element_type:
474
+ transcriptions = transcriptions.where(cte.c.type == element_type)
475
+
476
+ if worker_version is not None:
477
+ # If worker_version=False, filter by manual worker_version e.g. None
478
+ worker_version_id = worker_version or None
479
+ if worker_version_id:
480
+ transcriptions = transcriptions.where(
481
+ CachedTranscription.worker_version_id == worker_version_id
452
482
  )
453
483
  else:
454
- base_case = (
455
- CachedElement.select()
456
- .where(CachedElement.id == element.id)
457
- .cte("base", recursive=True)
484
+ transcriptions = transcriptions.where(
485
+ CachedTranscription.worker_version_id.is_null()
458
486
  )
459
- recursive = CachedElement.select().join(
460
- base_case, on=(CachedElement.parent_id == base_case.c.id)
487
+
488
+ if worker_run is not None:
489
+ # If worker_run=False, filter by manual worker_run e.g. None
490
+ worker_run_id = worker_run or None
491
+ if worker_run_id:
492
+ transcriptions = transcriptions.where(
493
+ CachedTranscription.worker_run_id == worker_run_id
461
494
  )
462
- cte = base_case.union_all(recursive)
463
- transcriptions = (
464
- CachedTranscription.select()
465
- .join(cte, on=(CachedTranscription.element_id == cte.c.id))
466
- .with_cte(cte)
495
+ else:
496
+ transcriptions = transcriptions.where(
497
+ CachedTranscription.worker_run_id.is_null()
467
498
  )
468
499
 
469
- if element_type:
470
- transcriptions = transcriptions.where(cte.c.type == element_type)
471
-
472
- if worker_version is not None:
473
- # If worker_version=False, filter by manual worker_version e.g. None
474
- worker_version_id = worker_version or None
475
- if worker_version_id:
476
- transcriptions = transcriptions.where(
477
- CachedTranscription.worker_version_id == worker_version_id
478
- )
479
- else:
480
- transcriptions = transcriptions.where(
481
- CachedTranscription.worker_version_id.is_null()
482
- )
483
-
484
- if worker_run is not None:
485
- # If worker_run=False, filter by manual worker_run e.g. None
486
- worker_run_id = worker_run or None
487
- if worker_run_id:
488
- transcriptions = transcriptions.where(
489
- CachedTranscription.worker_run_id == worker_run_id
490
- )
491
- else:
492
- transcriptions = transcriptions.where(
493
- CachedTranscription.worker_run_id.is_null()
494
- )
495
- else:
496
- transcriptions = self.api_client.paginate(
497
- "ListTranscriptions", id=element.id, **query_params
498
- )
499
-
500
500
  return transcriptions
tests/__init__.py CHANGED
@@ -5,4 +5,4 @@ FIXTURES_DIR = BASE_DIR / "data"
5
5
  SAMPLES_DIR = BASE_DIR / "samples"
6
6
 
7
7
  CORPUS_ID = "11111111-1111-1111-1111-111111111111"
8
- PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"
8
+ PROCESS_ID = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff"