arkindex-base-worker 0.3.7rc4__py3-none-any.whl → 0.5.0a1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/METADATA +18 -19
- arkindex_base_worker-0.5.0a1.dist-info/RECORD +61 -0
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/top_level.txt +2 -0
- arkindex_worker/cache.py +1 -1
- arkindex_worker/image.py +167 -2
- arkindex_worker/models.py +18 -0
- arkindex_worker/utils.py +98 -4
- arkindex_worker/worker/__init__.py +117 -218
- arkindex_worker/worker/base.py +39 -46
- arkindex_worker/worker/classification.py +45 -29
- arkindex_worker/worker/corpus.py +86 -0
- arkindex_worker/worker/dataset.py +89 -26
- arkindex_worker/worker/element.py +352 -91
- arkindex_worker/worker/entity.py +13 -11
- arkindex_worker/worker/image.py +21 -0
- arkindex_worker/worker/metadata.py +26 -16
- arkindex_worker/worker/process.py +92 -0
- arkindex_worker/worker/task.py +5 -4
- arkindex_worker/worker/training.py +25 -10
- arkindex_worker/worker/transcription.py +89 -68
- arkindex_worker/worker/version.py +3 -1
- hooks/pre_gen_project.py +3 -0
- tests/__init__.py +8 -0
- tests/conftest.py +47 -58
- tests/test_base_worker.py +212 -12
- tests/test_dataset_worker.py +294 -437
- tests/test_elements_worker/{test_classifications.py → test_classification.py} +313 -200
- tests/test_elements_worker/test_cli.py +3 -11
- tests/test_elements_worker/test_corpus.py +168 -0
- tests/test_elements_worker/test_dataset.py +106 -157
- tests/test_elements_worker/test_element.py +427 -0
- tests/test_elements_worker/test_element_create_multiple.py +715 -0
- tests/test_elements_worker/test_element_create_single.py +528 -0
- tests/test_elements_worker/test_element_list_children.py +969 -0
- tests/test_elements_worker/test_element_list_parents.py +530 -0
- tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
- tests/test_elements_worker/test_entity_list_and_check.py +160 -0
- tests/test_elements_worker/test_image.py +66 -0
- tests/test_elements_worker/test_metadata.py +252 -161
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_task.py +8 -18
- tests/test_elements_worker/test_training.py +17 -8
- tests/test_elements_worker/test_transcription_create.py +873 -0
- tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
- tests/test_elements_worker/test_transcription_list.py +450 -0
- tests/test_elements_worker/test_version.py +60 -0
- tests/test_elements_worker/test_worker.py +578 -293
- tests/test_image.py +542 -209
- tests/test_merge.py +1 -2
- tests/test_utils.py +89 -4
- worker-demo/tests/__init__.py +0 -0
- worker-demo/tests/conftest.py +32 -0
- worker-demo/tests/test_worker.py +12 -0
- worker-demo/worker_demo/__init__.py +6 -0
- worker-demo/worker_demo/worker.py +19 -0
- arkindex_base_worker-0.3.7rc4.dist-info/RECORD +0 -41
- tests/test_elements_worker/test_elements.py +0 -2713
- tests/test_elements_worker/test_transcriptions.py +0 -2119
- {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/LICENSE +0 -0
arkindex_worker/worker/__init__.py CHANGED

@@ -4,65 +4,47 @@ Base classes to implement Arkindex workers.
 
 import contextlib
 import json
-import os
 import sys
 import uuid
-from collections.abc import Iterable, Iterator
-from enum import Enum
-from itertools import groupby
-from operator import itemgetter
+from collections.abc import Iterable
+from itertools import chain
 from pathlib import Path
 
-from apistar.exceptions import ErrorResponse
-
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
+from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
-from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
+from arkindex_worker.worker.corpus import CorpusMixin
+from arkindex_worker.worker.dataset import (
+    DatasetMixin,
+    DatasetState,
+    MissingDatasetArchive,
+)
 from arkindex_worker.worker.element import ElementMixin
-from arkindex_worker.worker.entity import EntityMixin
+from arkindex_worker.worker.entity import EntityMixin
+from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
+from arkindex_worker.worker.process import ActivityState, ProcessMixin, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
-from arkindex_worker.worker.version import WorkerVersionMixin
-
-
-class ActivityState(Enum):
-    """
-    Processing state of an element.
-    """
-
-    Queued = "queued"
-    """
-    The element has not yet been processed by a worker.
-    """
-
-    Started = "started"
-    """
-    The element is being processed by a worker.
-    """
-
-    Processed = "processed"
-    """
-    The element has been successfully processed by a worker.
-    """
-
-    Error = "error"
-    """
-    An error occurred while processing this element.
-    """
+from arkindex_worker.worker.version import WorkerVersionMixin
 
 
 class ElementsWorker(
+    ElementMixin,
+    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
-    ElementMixin,
+    CorpusMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
    EntityMixin,
     MetaDataMixin,
+    ImageMixin,
+    ProcessMixin,
 ):
     """
     Base class for ML workers that operate on Arkindex elements.
@@ -80,39 +62,41 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
 
-        # Add mandatory argument to process elements
-        self.parser.add_argument(
-            "--elements-list",
-            help="JSON elements list to use",
-            type=open,
-            default=os.environ.get("TASK_ELEMENTS"),
-        )
-        self.parser.add_argument(
-            "--element",
-            type=uuid.UUID,
-            nargs="+",
-            help="One or more Arkindex element ID",
-        )
-
         self.classes = {}
 
         self.entity_types = {}
         """Known and available entity types in processed corpus
         """
 
+        self.corpus_types = {}
+        """Known and available element types in processed corpus
+        """
+
         self._worker_version_cache = {}
 
-    def list_elements(self) -> Iterable[CachedElement] | list[str]:
+    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
 
         :return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
-           and a list of strings representing element IDs otherwise.
+           or a list of strings representing element IDs otherwise.
         """
         assert not (
             self.args.elements_list and self.args.element
         ), "elements-list and element CLI args shouldn't be both set"
+
+        def invalid_element_id(value: str) -> bool:
+            """
+            Return whether the ID of an element is a valid UUID or not
+            """
+            try:
+                uuid.UUID(value)
+            except Exception:
+                return True
+
+            return False
+
         out = []
 
         # Load from the cache when available
@@ -122,15 +106,28 @@ class ElementsWorker(
         )
         if self.use_cache and cache_query.exists():
             return cache_query
-        # Process elements from JSON file
         elif self.args.elements_list:
+            # Process elements from JSON file
             data = json.load(self.args.elements_list)
             assert isinstance(data, list), "Elements list must be a list"
             assert len(data), "No elements in elements list"
             out += list(filter(None, [element.get("id") for element in data]))
-        # Add any extra element from CLI
         elif self.args.element:
+            # Add any extra element from CLI
             out += self.args.element
+        elif self.process_mode == ProcessMode.Dataset or self.args.set:
+            # Elements from datasets
+            return list(
+                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
+            )
+        elif self.process_mode == ProcessMode.Export:
+            # For export mode processes, use list_process_elements and return element IDs
+            return {item["id"] for item in self.list_process_elements()}
+
+        invalid_element_ids = list(filter(invalid_element_id, out))
+        assert (
+            not invalid_element_ids
+        ), f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
 
         return out
 
@@ -140,30 +137,22 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only:
+        if self.is_read_only or self.process_mode in [
+            ProcessMode.Dataset,
+            ProcessMode.Export,
+        ]:
+            # Worker activities are also disabled when running an ElementsWorker in a Dataset process
+            # and when running export processes.
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"
 
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-        super().configure_cache()
-
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
+        [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """
@@ -171,7 +160,7 @@ class ElementsWorker(
 
         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.list_elements()
+        elements = self.get_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)
@@ -187,12 +176,14 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if isinstance(item, CachedElement):
-                    # Just use the result of list_elements as the element
+                if isinstance(item, CachedElement | Element):
+                    # Just use the result of get_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
-                    element = Element(**self.request("RetrieveElement", id=item))
+                    element = Element(
+                        **self.api_client.request("RetrieveElement", id=item)
+                    )
 
                 logger.info(f"Processing {element} ({i}/{count})")
 
@@ -230,7 +221,7 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)
 
-        message = f'Ran on {count} elements: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
        if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
@@ -271,7 +262,7 @@ class ElementsWorker(
         assert isinstance(state, ActivityState), "state should be an ActivityState"
 
         try:
-            self.request(
+            self.api_client.request(
                 "UpdateWorkerActivity",
                 id=self.worker_run_id,
                 body={
@@ -301,16 +292,9 @@ class ElementsWorker(
             return True
 
 
-class MissingDatasetArchive(Exception):
-    """
-    Exception raised when the compressed archive associated to
-    a dataset isn't found in its task artifacts.
-    """
-
-
-class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
+class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
     """
-    Base class for ML workers that operate on Arkindex datasets.
+    Base class for ML workers that operate on Arkindex dataset sets.
 
     This class inherits from numerous mixin classes found in other modules of
     ``arkindex.worker``, which provide helpers to read and write to the Arkindex API.
@@ -320,193 +304,108 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         self,
         description: str = "Arkindex Dataset Worker",
         support_cache: bool = False,
-        generator: bool = False,
     ):
         """
         :param description: The worker's description.
         :param support_cache: Whether the worker supports cache.
-        :param generator: Whether the worker generates the dataset archive artifact.
         """
         super().__init__(description, support_cache)
 
-        self.parser.add_argument(
-            "--dataset",
-            type=uuid.UUID,
-            nargs="+",
-            help="One or more Arkindex dataset ID",
-        )
-
-        self.generator = generator
+        # Path to the dataset compressed archive (containing images and a SQLite database)
+        # Set as an instance variable as dataset workers might use it to easily extract its content
+        self.downloaded_dataset_artifact: Path | None = None
 
-    def configure(self):
+    def cleanup_downloaded_artifact(self) -> None:
         """
-        Setup the worker using CLI arguments and environment variables.
+        Cleanup the downloaded dataset artifact if any
         """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
+        if not self.downloaded_dataset_artifact:
+            return
 
-        if self.is_read_only:
-            super().configure_for_developers()
-        else:
-            super().configure()
-        super().configure_cache()
+        self.downloaded_dataset_artifact.unlink(missing_ok=True)
 
-    def download_dataset_artifact(self, dataset: Dataset) -> Path:
+    def download_dataset_artifact(self, dataset: Dataset) -> None:
         """
         Find and download the compressed archive artifact describing a dataset using
         the [list_artifacts][arkindex_worker.worker.task.TaskMixin.list_artifacts] and
         [download_artifact][arkindex_worker.worker.task.TaskMixin.download_artifact] methods.
 
         :param dataset: The dataset to retrieve the compressed archive artifact for.
-        :returns: A path to the downloaded artifact.
         :raises MissingDatasetArchive: When the dataset artifact is not found.
         """
+        extra_dir = self.find_extras_directory()
+        archive = extra_dir / dataset.filepath
+        if archive.exists():
+            return
 
-        task_id = uuid.UUID(dataset.task_id)
+        # Cleanup the dataset artifact that was downloaded previously
+        self.cleanup_downloaded_artifact()
 
+        logger.info(f"Downloading artifact for {dataset}")
+        task_id = uuid.UUID(dataset.task_id)
         for artifact in self.list_artifacts(task_id):
             if artifact.path != dataset.filepath:
                 continue
 
-            extra_dir = self.find_extras_directory()
-            archive = extra_dir / dataset.filepath
             archive.write_bytes(self.download_artifact(task_id, artifact).read())
-            return archive
+            self.downloaded_dataset_artifact = archive
+            return
 
         raise MissingDatasetArchive(
             "The dataset compressed archive artifact was not found."
         )
 
-    def list_dataset_elements_per_split(
-        self, dataset: Dataset
-    ) -> Iterator[tuple[str, list[Element]]]:
-        """
-        List the elements in the dataset, grouped by split, using the
-        [list_dataset_elements][arkindex_worker.worker.dataset.DatasetMixin.list_dataset_elements] method.
-
-        :param dataset: The dataset to retrieve elements from.
-        :returns: An iterator of tuples containing the split name and the list of its elements.
-        """
-
-        def format_split(
-            split: tuple[str, Iterator[tuple[str, Element]]],
-        ) -> tuple[str, list[Element]]:
-            return (split[0], list(map(itemgetter(1), list(split[1]))))
-
-        return map(
-            format_split,
-            groupby(
-                sorted(self.list_dataset_elements(dataset), key=itemgetter(0)),
-                key=itemgetter(0),
-            ),
-        )
-
-    def process_dataset(self, dataset: Dataset):
-        """
-        Override this method to implement your worker and process a single Arkindex dataset at once.
-
-        :param dataset: The dataset to process.
+    def process_set(self, set: Set):
         """
+        Override this method to implement your worker and process a single Arkindex dataset set at once.
 
-    def list_datasets(self) -> Iterator[Dataset] | Iterator[str]:
+        :param set: The set to process.
         """
-        List the datasets to be processed, either from the CLI arguments or using the
-        [list_process_datasets][arkindex_worker.worker.dataset.DatasetMixin.list_process_datasets] method.
-
-        :returns: An iterator of strings if the worker is in read-only mode,
-        else an iterator of ``Dataset`` objects.
-        """
-        if self.is_read_only:
-            return map(str, self.args.dataset)
-
-        return self.list_process_datasets()
 
     def run(self):
         """
-        Implements an Arkindex worker that goes through each dataset returned by
-        [list_datasets][arkindex_worker.worker.DatasetWorker.list_datasets].
+        Implements an Arkindex worker that goes through each dataset set returned by
+        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
 
-        It calls [process_dataset][arkindex_worker.worker.DatasetWorker.process_dataset],
-        catching exceptions, and handles updating the dataset state
-        when the worker is a generator.
+        It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
+        catching exceptions.
         """
         self.configure()
 
-        datasets: list[Dataset] | list[str] = list(self.list_datasets())
-        if not datasets:
-            logger.warning("No datasets to process, stopping.")
+        dataset_sets: list[Set] = list(self.list_sets())
+        if not dataset_sets:
+            logger.warning("No sets to process, stopping.")
             sys.exit(1)
 
-        # Process every dataset
-        count = len(datasets)
+        # Process every set
+        count = len(dataset_sets)
         failed = 0
-        for i, item in enumerate(datasets, start=1):
-            dataset = None
-            dataset_artifact = None
-
+        for i, dataset_set in enumerate(dataset_sets, start=1):
            try:
-                if isinstance(item, Dataset):
-                    # Just use the result of list_datasets as the dataset
-                    dataset = item
-                else:
-                    # Load dataset using the Arkindex API
-                    dataset = Dataset(**self.request("RetrieveDataset", id=item))
-
-                if self.generator:
-                    assert (
-                        dataset.state
-                        in [DatasetState.Open.value, DatasetState.Error.value]
-                    ), "When generating a new dataset, its state should be Open or Error."
-                else:
-                    assert (
-                        dataset.state == DatasetState.Complete.value
-                    ), "When processing an existing dataset, its state should be Complete."
-
-                logger.info(f"Processing {dataset} ({i}/{count})")
-
-                if self.generator:
-                    # Update the dataset state to Building
-                    logger.info(f"Building {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Building)
-                else:
-                    logger.info(f"Downloading data for {dataset} ({i}/{count})")
-                    dataset_artifact = self.download_dataset_artifact(dataset)
+                assert (
+                    dataset_set.dataset.state == DatasetState.Complete.value
+                ), "When processing a set, its dataset state should be Complete."
 
-                # Process the dataset
-                self.process_dataset(dataset)
+                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                self.download_dataset_artifact(dataset_set.dataset)
 
-                if self.generator:
-                    # Update the dataset state to Complete
-                    logger.info(f"Completed {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Complete)
+                logger.info(f"Processing {dataset_set} ({i}/{count})")
+                self.process_set(dataset_set)
             except Exception as e:
-                # Handle errors occurring while retrieving, processing or patching the state for this dataset
+                # Handle errors occurring while retrieving or processing this dataset set
                failed += 1
 
-                # Handle the case where we failed retrieving the dataset
-                dataset_id = dataset.id if dataset else item
-
                 if isinstance(e, ErrorResponse):
-                    message = f"An API error occurred while processing dataset {dataset_id}: {e.title} - {e.content}"
+                    message = f"An API error occurred while processing {dataset_set}: {e.title} - {e.content}"
                 else:
-                    message = (
-                        f"Failed running worker on dataset {dataset_id}: {repr(e)}"
-                    )
+                    message = f"Failed running worker on {dataset_set}: {repr(e)}"
 
-                logger.warning(
-                    message,
-                    exc_info=e if self.args.verbose else None,
-                )
-                if dataset and self.generator:
-                    # Try to update the state to Error regardless of the response
-                    with contextlib.suppress(Exception):
-                        self.update_dataset_state(dataset, DatasetState.Error)
-            finally:
-                # Cleanup the dataset artifact if it was downloaded, no matter what
-                if dataset_artifact:
-                    dataset_artifact.unlink(missing_ok=True)
+                logger.warning(message, exc_info=e if self.args.verbose else None)
+
+        # Cleanup the latest downloaded dataset artifact
+        self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count} datasets: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
arkindex_worker/worker/base.py CHANGED

@@ -12,16 +12,9 @@ from tempfile import mkdtemp
 
 import gnupg
 import yaml
-from apistar.exceptions import ErrorResponse
-from tenacity import (
-    before_sleep_log,
-    retry,
-    retry_if_exception,
-    stop_after_attempt,
-    wait_exponential,
-)
 
-from arkindex import ArkindexClient, options_from_env
+from arkindex import options_from_env
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -31,18 +24,8 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-
-
-def _is_500_error(exc: Exception) -> bool:
-    """
-    Check if an Arkindex API error has a HTTP 5xx error code.
-    Used to retry most API calls in [BaseWorker][arkindex_worker.worker.base.BaseWorker].
-    :param exc: Exception to check
-    """
-    if not isinstance(exc, ErrorResponse):
-        return False
-
-    return 500 <= exc.status_code < 600
+from arkindex_worker.worker.process import ProcessMode
+from teklia_toolbox.requests import get_arkindex_client
 
 
 class ExtrasDirNotFoundError(Exception):
@@ -174,6 +157,13 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
 
+    @property
+    def process_mode(self) -> ProcessMode | None:
+        """Mode of the process being run. Returns None when read-only."""
+        if self.is_read_only:
+            return
+        return ProcessMode(self.process_information["mode"])
+
     @property
     def is_read_only(self) -> bool:
         """
@@ -197,7 +187,7 @@ class BaseWorker:
         Create an ArkindexClient to make API requests towards Arkindex instances.
         """
         # Build Arkindex API client from environment variables
-        self.api_client = ArkindexClient(**options_from_env())
+        self.api_client = get_arkindex_client(**options_from_env())
         logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
 
     def configure_for_developers(self):
@@ -237,7 +227,7 @@ class BaseWorker:
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
 
-    def configure(self):
+    def configure_worker_run(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.
@@ -249,7 +239,7 @@ class BaseWorker:
             logger.debug("Debug output enabled")
 
         # Load worker run information
-        worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)
+        worker_run = self.api_client.request("RetrieveWorkerRun", id=self.worker_run_id)
 
         # Load process information
         self.process_information = worker_run["process"]
@@ -308,7 +298,7 @@ class BaseWorker:
         if self.support_cache and self.args.database is not None:
             self.use_cache = True
         elif self.support_cache and self.task_id:
-            task = self.request("RetrieveTaskFromAgent", id=self.task_id)
+            task = self.api_client.request("RetrieveTask", id=self.task_id)
             self.task_parents = task["parents"]
             paths = self.find_parents_file_paths(Path("db.sqlite"))
             self.use_cache = len(paths) > 0
@@ -338,6 +328,29 @@ class BaseWorker:
         else:
             logger.debug("Cache is disabled")
 
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            self.configure_for_developers()
+        else:
+            self.configure_worker_run()
+        self.configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def load_secret(self, name: Path):
         """
         Load a Ponos secret by name.
@@ -349,7 +362,7 @@ class BaseWorker:
 
         # Load from the backend
        try:
-            resp = self.request("RetrieveSecret", name=str(name))
+            resp = self.api_client.request("RetrieveSecret", name=str(name))
             secret = resp["content"]
             logging.info(f"Loaded API secret {name}")
         except ErrorResponse as e:
@@ -489,26 +502,6 @@ class BaseWorker:
         # Clean up
         shutil.rmtree(base_extracted_path)
 
-    @retry(
-        retry=retry_if_exception(_is_500_error),
-        wait=wait_exponential(multiplier=2, min=3),
-        reraise=True,
-        stop=stop_after_attempt(5),
-        before_sleep=before_sleep_log(logger, logging.INFO),
-    )
-    def request(self, *args, **kwargs):
-        """
-        Wrapper around the ``ArkindexClient.request`` method.
-
-        The API call will be retried up to 5 times in case of HTTP 5xx errors,
-        with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
-        If the 5th call still causes an HTTP 5xx error, the exception is re-raised
-        and the caller should catch it.
-
-        Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
-        """
-        return self.api_client.request(*args, **kwargs)
-
     def add_arguments(self):
         """Override this method to add ``argparse`` arguments to this worker"""
 
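With `BaseWorker.request` and its tenacity retry policy removed, call sites use `self.api_client` directly; the client is now built by `teklia_toolbox.requests.get_arkindex_client`, which is presumably where retry behaviour lives from now on. Below is a hedged sketch of what a call site inside a worker looks like after this diff; the subclass, its logic and the `"count"` response key are illustrative assumptions, not part of the package:

    from arkindex_worker.worker import ElementsWorker


    class ChildCounter(ElementsWorker):
        def process_element(self, element):
            # Before this diff: self.request("ListElementChildren", id=element.id),
            # retried up to 5 times on HTTP 5xx errors by the tenacity decorator.
            # After it, the same call goes straight through the API client.
            children = self.api_client.request("ListElementChildren", id=element.id)
            print(element.id, children.get("count"))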