arkindex-base-worker 0.3.7rc10__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0.dist-info}/METADATA +16 -20
- arkindex_base_worker-0.4.0.dist-info/RECORD +61 -0
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0.dist-info}/WHEEL +1 -1
- arkindex_worker/cache.py +1 -1
- arkindex_worker/image.py +120 -1
- arkindex_worker/models.py +6 -0
- arkindex_worker/utils.py +85 -4
- arkindex_worker/worker/__init__.py +68 -162
- arkindex_worker/worker/base.py +39 -34
- arkindex_worker/worker/classification.py +34 -18
- arkindex_worker/worker/corpus.py +86 -0
- arkindex_worker/worker/dataset.py +71 -1
- arkindex_worker/worker/element.py +352 -91
- arkindex_worker/worker/entity.py +11 -11
- arkindex_worker/worker/image.py +21 -0
- arkindex_worker/worker/metadata.py +19 -9
- arkindex_worker/worker/process.py +92 -0
- arkindex_worker/worker/task.py +5 -4
- arkindex_worker/worker/training.py +25 -10
- arkindex_worker/worker/transcription.py +89 -68
- arkindex_worker/worker/version.py +3 -1
- tests/__init__.py +8 -0
- tests/conftest.py +36 -52
- tests/test_base_worker.py +212 -12
- tests/test_dataset_worker.py +21 -45
- tests/test_elements_worker/{test_classifications.py → test_classification.py} +216 -100
- tests/test_elements_worker/test_cli.py +3 -11
- tests/test_elements_worker/test_corpus.py +168 -0
- tests/test_elements_worker/test_dataset.py +7 -12
- tests/test_elements_worker/test_element.py +427 -0
- tests/test_elements_worker/test_element_create_multiple.py +715 -0
- tests/test_elements_worker/test_element_create_single.py +528 -0
- tests/test_elements_worker/test_element_list_children.py +969 -0
- tests/test_elements_worker/test_element_list_parents.py +530 -0
- tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
- tests/test_elements_worker/test_entity_list_and_check.py +160 -0
- tests/test_elements_worker/test_image.py +66 -0
- tests/test_elements_worker/test_metadata.py +230 -139
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_task.py +8 -18
- tests/test_elements_worker/test_training.py +17 -8
- tests/test_elements_worker/test_transcription_create.py +873 -0
- tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
- tests/test_elements_worker/test_transcription_list.py +450 -0
- tests/test_elements_worker/test_version.py +60 -0
- tests/test_elements_worker/test_worker.py +563 -279
- tests/test_image.py +432 -209
- tests/test_merge.py +1 -2
- tests/test_utils.py +66 -3
- arkindex_base_worker-0.3.7rc10.dist-info/RECORD +0 -47
- tests/test_elements_worker/test_elements.py +0 -2713
- tests/test_elements_worker/test_transcriptions.py +0 -2119
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0.dist-info}/LICENSE +0 -0
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0.dist-info}/top_level.txt +0 -0
arkindex_worker/worker/corpus.py (new file)

@@ -0,0 +1,86 @@
+"""
+BaseWorker methods for corpora.
+"""
+
+from enum import Enum
+from operator import itemgetter
+from tempfile import _TemporaryFileWrapper
+from uuid import UUID
+
+from arkindex_worker import logger
+
+
+class CorpusExportState(Enum):
+    """
+    State of a corpus export.
+    """
+
+    Created = "created"
+    """
+    The corpus export is created, awaiting its processing.
+    """
+
+    Running = "running"
+    """
+    The corpus export is being built.
+    """
+
+    Failed = "failed"
+    """
+    The corpus export failed.
+    """
+
+    Done = "done"
+    """
+    The corpus export ended in success.
+    """
+
+
+class CorpusMixin:
+    def download_export(self, export_id: str) -> _TemporaryFileWrapper:
+        """
+        Download an export.
+
+        :param export_id: UUID of the export to download
+        :returns: The downloaded export stored in a temporary file.
+        """
+        try:
+            UUID(export_id)
+        except ValueError as e:
+            raise ValueError("export_id is not a valid uuid.") from e
+
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.api_client.request(
+            "DownloadExport", id=export_id
+        )
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
+        return export
+
+    def download_latest_export(self) -> _TemporaryFileWrapper:
+        """
+        Download the latest export in `done` state of the current corpus.
+
+        :returns: The downloaded export stored in a temporary file.
+        """
+        # List all exports on the corpus
+        exports = self.api_client.paginate("ListExports", id=self.corpus_id)
+
+        # Find the latest that is in "done" state
+        exports: list[dict] = sorted(
+            list(
+                filter(
+                    lambda export: export["state"] == CorpusExportState.Done.value,
+                    exports,
+                )
+            ),
+            key=itemgetter("updated"),
+            reverse=True,
+        )
+        assert (
+            len(exports) > 0
+        ), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
+
+        # Download latest export
+        export_id: str = exports[0]["id"]
+
+        return self.download_export(export_id)
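The new `CorpusMixin` gives workers direct access to corpus exports. A minimal usage sketch, assuming the mixin is composed into the worker class you subclass (the worker name and `process_element` body are illustrative; the exact composition in `arkindex_worker/worker/__init__.py` is not shown in this diff):

```python
from arkindex_worker.worker import ElementsWorker


class ExportAwareWorker(ElementsWorker):
    """Hypothetical worker relying on the CorpusMixin helpers added in 0.4.0."""

    def process_element(self, element):
        # Latest export of the worker's corpus that reached the "done" state;
        # raises AssertionError when no such export exists.
        export = self.download_latest_export()
        print(f"Export downloaded to {export.name}")

        # A specific export can also be fetched by UUID; a malformed UUID
        # raises ValueError before any API call is made.
        # export = self.download_export("12341234-1234-1234-1234-123412341234")
```

The returned object is the `_TemporaryFileWrapper` produced by the API client, so `export.name` points at the downloaded file on local disk.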
arkindex_worker/worker/dataset.py

@@ -2,6 +2,8 @@
 BaseWorker methods for datasets.
 """
 
+import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterator
 from enum import Enum
 
@@ -36,7 +38,55 @@ class DatasetState(Enum):
     """
 
 
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    """The `--set` argument should have the following format:
+    <dataset_id>:<set_name>
+
+    Args:
+        value (str): Provided argument.
+
+    Raises:
+        ArgumentTypeError: When the value is invalid.
+
+    Returns:
+        tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
+    """
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class DatasetMixin:
+    def add_arguments(self) -> None:
+        """Define specific ``argparse`` arguments for the worker using this mixin"""
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+        super().add_arguments()
+
     def list_process_sets(self) -> Iterator[Set]:
         """
         List dataset sets associated to the worker's process. This helper is not available in developer mode.
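The parsing rules above can be exercised directly; a small sketch using only what this hunk defines (the UUID value is a placeholder taken from the help text):

```python
from argparse import ArgumentTypeError
from uuid import UUID

from arkindex_worker.worker.dataset import check_dataset_set

# Well-formed value: returns the dataset UUID and the set name.
dataset_id, set_name = check_dataset_set("12341234-1234-1234-1234-123412341234:train")
assert dataset_id == UUID("12341234-1234-1234-1234-123412341234")
assert set_name == "train"

# A missing separator or an invalid UUID is rejected with ArgumentTypeError,
# which argparse turns into a usage error for the --set flag.
try:
    check_dataset_set("not-a-uuid:train")
except ArgumentTypeError as error:
    print(error)  # 'not-a-uuid' should be a valid UUID
```

On the command line this maps to `--set <dataset_uuid>:train [<dataset_uuid>:validation ...]`, since `nargs="+"` accepts several values after the flag.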
@@ -73,6 +123,26 @@ class DatasetMixin:
 
         return map(lambda result: Element(**result["element"]), results)
 
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.api_client.request("RetrieveDataset", id=dataset_id)
+                )
+
+            yield Set(name=set_name, dataset=datasets[dataset_id])
+
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
         """
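A hedged sketch of how those sets typically reach the worker when developing locally; the snippet runs inside any method of a worker that includes `DatasetMixin` (the script name, UUID, and invocation line are illustrative):

```python
# Illustrative local run against an explicit dataset set:
#   python worker.py --dev --set 12341234-1234-1234-1234-123412341234:train
#
# list_sets() then resolves each (dataset_id, set_name) pair from --set,
# fetching every distinct dataset only once through "RetrieveDataset".
for dataset_set in self.list_sets():
    print(dataset_set.name, dataset_set.dataset.id)
```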
@@ -93,7 +163,7 @@ class DatasetMixin:
             logger.warning("Cannot update dataset as this worker is in read-only mode")
             return
 
-        updated_dataset = self.request(
+        updated_dataset = self.api_client.request(
             "PartialUpdateDataset",
             id=dataset.id,
             body={"state": state.value},
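The one-line change routes the call through `self.api_client` rather than the worker's `self.request` wrapper. A hedged usage sketch, run from inside a worker that includes `DatasetMixin`; the `Complete` member name is assumed from the `DatasetState` enum defined earlier in this module, which the diff does not show:

```python
from arkindex_worker.worker.dataset import DatasetState

# Hypothetical: mark the dataset as fully built once processing succeeded.
# In read-only mode the method only logs a warning and returns None.
updated = self.update_dataset_state(dataset, DatasetState.Complete)
```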