arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/METADATA +13 -15
- arkindex_base_worker-0.4.0a1.dist-info/RECORD +51 -0
- {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/WHEEL +1 -1
- arkindex_worker/cache.py +1 -1
- arkindex_worker/image.py +1 -120
- arkindex_worker/utils.py +0 -82
- arkindex_worker/worker/__init__.py +161 -46
- arkindex_worker/worker/base.py +11 -36
- arkindex_worker/worker/classification.py +18 -34
- arkindex_worker/worker/corpus.py +4 -21
- arkindex_worker/worker/dataset.py +1 -71
- arkindex_worker/worker/element.py +91 -352
- arkindex_worker/worker/entity.py +11 -11
- arkindex_worker/worker/metadata.py +9 -19
- arkindex_worker/worker/task.py +4 -5
- arkindex_worker/worker/training.py +18 -21
- arkindex_worker/worker/transcription.py +68 -89
- arkindex_worker/worker/version.py +1 -3
- tests/__init__.py +1 -1
- tests/conftest.py +45 -33
- tests/test_base_worker.py +3 -204
- tests/test_dataset_worker.py +4 -7
- tests/test_elements_worker/{test_classification.py → test_classifications.py} +61 -194
- tests/test_elements_worker/test_corpus.py +1 -32
- tests/test_elements_worker/test_dataset.py +1 -1
- tests/test_elements_worker/test_elements.py +2734 -0
- tests/test_elements_worker/{test_entity_create.py → test_entities.py} +160 -26
- tests/test_elements_worker/test_image.py +1 -2
- tests/test_elements_worker/test_metadata.py +99 -224
- tests/test_elements_worker/test_task.py +1 -1
- tests/test_elements_worker/test_training.py +43 -17
- tests/test_elements_worker/test_transcriptions.py +2102 -0
- tests/test_elements_worker/test_worker.py +280 -563
- tests/test_image.py +204 -429
- tests/test_merge.py +2 -1
- tests/test_utils.py +3 -66
- arkindex_base_worker-0.4.0.dist-info/RECORD +0 -61
- arkindex_worker/worker/process.py +0 -92
- tests/test_elements_worker/test_element.py +0 -427
- tests/test_elements_worker/test_element_create_multiple.py +0 -715
- tests/test_elements_worker/test_element_create_single.py +0 -528
- tests/test_elements_worker/test_element_list_children.py +0 -969
- tests/test_elements_worker/test_element_list_parents.py +0 -530
- tests/test_elements_worker/test_entity_list_and_check.py +0 -160
- tests/test_elements_worker/test_process.py +0 -89
- tests/test_elements_worker/test_transcription_create.py +0 -873
- tests/test_elements_worker/test_transcription_create_with_elements.py +0 -951
- tests/test_elements_worker/test_transcription_list.py +0 -450
- tests/test_elements_worker/test_version.py +0 -60
- {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/top_level.txt +0 -0
arkindex_worker/worker/__init__.py
CHANGED

@@ -4,47 +4,68 @@ Base classes to implement Arkindex workers.

 import contextlib
 import json
+import os
 import sys
 import uuid
-from …
-from …
+from argparse import ArgumentTypeError
+from collections.abc import Iterable, Iterator
+from enum import Enum
 from pathlib import Path

-from …
+from apistar.exceptions import ErrorResponse
+
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
-from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
-from arkindex_worker.worker.dataset import (
-    DatasetMixin,
-    DatasetState,
-    MissingDatasetArchive,
-)
+from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
-from arkindex_worker.worker.process import ActivityState, ProcessMixin, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin


+class ActivityState(Enum):
+    """
+    Processing state of an element.
+    """
+
+    Queued = "queued"
+    """
+    The element has not yet been processed by a worker.
+    """
+
+    Started = "started"
+    """
+    The element is being processed by a worker.
+    """
+
+    Processed = "processed"
+    """
+    The element has been successfully processed by a worker.
+    """
+
+    Error = "error"
+    """
+    An error occurred while processing this element.
+    """
+
+
 class ElementsWorker(
-    ElementMixin,
-    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
     CorpusMixin,
+    ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
     MetaDataMixin,
     ImageMixin,
-    ProcessMixin,
 ):
     """
     Base class for ML workers that operate on Arkindex elements.
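Note: in the rewritten class definition above, BaseWorker moves to the front of ElementsWorker's bases. This lines up with the new configure() override added further down, whose super().configure() call must resolve to BaseWorker.configure along the MRO. A minimal, self-contained sketch of that interaction (class bodies simplified to illustrate resolution order only):

    class BaseWorker:
        def configure(self):
            return "BaseWorker.configure"

    class ClassificationMixin:
        pass

    # BaseWorker first, mirroring the new ElementsWorker bases
    class ElementsWorker(BaseWorker, ClassificationMixin):
        def configure(self):
            # super() walks the MRO: ElementsWorker -> BaseWorker -> ...
            return f"ElementsWorker wraps {super().configure()}"

    assert ElementsWorker().configure() == "ElementsWorker wraps BaseWorker.configure"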
@@ -62,19 +83,29 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)

+        # Add mandatory argument to process elements
+        self.parser.add_argument(
+            "--elements-list",
+            help="JSON elements list to use",
+            type=open,
+            default=os.environ.get("TASK_ELEMENTS"),
+        )
+        self.parser.add_argument(
+            "--element",
+            type=str,
+            nargs="+",
+            help="One or more Arkindex element ID",
+        )
+
         self.classes = {}

         self.entity_types = {}
         """Known and available entity types in processed corpus
         """

-        self.corpus_types = {}
-        """Known and available element types in processed corpus
-        """
-
         self._worker_version_cache = {}

-    def …
+    def list_elements(self) -> Iterable[CachedElement] | list[str]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
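Note: the new --elements-list argument uses type=open, so argparse itself opens the given path and stores a ready-to-read file object, which list_elements can hand straight to json.load. A standalone sketch of the same pattern (the file name is hypothetical):

    import argparse
    import json

    parser = argparse.ArgumentParser()
    # argparse calls open() on the raw string and keeps the file object
    parser.add_argument("--elements-list", type=open)

    args = parser.parse_args(["--elements-list", "elements.json"])
    data = json.load(args.elements_list)  # already an open file handle
    args.elements_list.close()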
@@ -106,23 +137,15 @@ class ElementsWorker(
         )
         if self.use_cache and cache_query.exists():
             return cache_query
+        # Process elements from JSON file
         elif self.args.elements_list:
-            # Process elements from JSON file
             data = json.load(self.args.elements_list)
             assert isinstance(data, list), "Elements list must be a list"
             assert len(data), "No elements in elements list"
             out += list(filter(None, [element.get("id") for element in data]))
+        # Add any extra element from CLI
         elif self.args.element:
-            # Add any extra element from CLI
             out += self.args.element
-        elif self.process_mode == ProcessMode.Dataset or self.args.set:
-            # Elements from datasets
-            return list(
-                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
-            )
-        elif self.process_mode == ProcessMode.Export:
-            # For export mode processes, use list_process_elements and return element IDs
-            return {item["id"] for item in self.list_process_elements()}

         invalid_element_ids = list(filter(invalid_element_id, out))
         assert (

@@ -137,22 +160,40 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only …
-            ProcessMode.Dataset,
-            ProcessMode.Export,
-        ]:
-            # Worker activities are also disabled when running an ElementsWorker in a Dataset process
-            # and when running export processes.
+        if self.is_read_only:
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"

+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            super().configure_for_developers()
+        else:
+            super().configure()
+            super().configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        […
+        [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """

@@ -160,7 +201,7 @@ class ElementsWorker(

         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.…
+        elements = self.list_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)

@@ -176,14 +217,12 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if …
-                    # Just use the result of …
+                if self.use_cache:
+                    # Just use the result of list_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
-                    element = Element(
-                        **self.api_client.request("RetrieveElement", id=item)
-                    )
+                    element = Element(**self.request("RetrieveElement", id=item))

                 logger.info(f"Processing {element} ({i}/{count})")

@@ -221,7 +260,7 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)

-        message = f'Ran on {count} {…
+        message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
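Note: the rebuilt summary message replaces the removed pluralize helper with the slicing idiom "s"[:count > 1]: the boolean acts as an int, so the slice is "s"[:1] (i.e. "s") when count > 1 and "s"[:0] (i.e. "") otherwise. A quick demonstration:

    def summary(count: int, failed: int) -> str:
        # "s"[:False] == "" and "s"[:True] == "s"
        return f'Ran on {count} element{"s"[:count > 1]}: {count - failed} completed, {failed} failed'

    assert summary(1, 0) == "Ran on 1 element: 1 completed, 0 failed"
    assert summary(3, 1) == "Ran on 3 elements: 2 completed, 1 failed"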
@@ -262,7 +301,7 @@ class ElementsWorker(
         assert isinstance(state, ActivityState), "state should be an ActivityState"

         try:
-            self.…
+            self.request(
                 "UpdateWorkerActivity",
                 id=self.worker_run_id,
                 body={

@@ -292,7 +331,29 @@ class ElementsWorker(
         return True


+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
-class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
+class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.

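Note: check_dataset_set is written as an argparse type callable, so malformed --set values are rejected at parse time as CLI usage errors instead of failing later. A runnable sketch wiring it up the same way DatasetWorker does below:

    import argparse
    import uuid
    from argparse import ArgumentTypeError

    def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
        values = value.split(":")
        if len(values) != 2:
            raise ArgumentTypeError(
                f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
            )
        dataset_id, set_name = values
        try:
            return (uuid.UUID(dataset_id), set_name)
        except (TypeError, ValueError) as e:
            raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e

    parser = argparse.ArgumentParser()
    parser.add_argument("--set", type=check_dataset_set, nargs="+", default=[])

    args = parser.parse_args(["--set", "12341234-1234-1234-1234-123412341234:train"])
    assert args.set == [(uuid.UUID("12341234-1234-1234-1234-123412341234"), "train")]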
@@ -315,6 +376,40 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None

+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            super().configure_for_developers()
+        else:
+            super().configure()
+            super().configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any

@@ -362,10 +457,30 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         :param set: The set to process.
         """

+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information is not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.request("RetrieveDataset", id=dataset_id)
+                )
+
+            yield Set(name=set_name, dataset=datasets[dataset_id])
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.…
+        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].

         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.
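Note: list_sets caches RetrieveDataset responses in a local dict, so several --set values naming the same dataset cost a single API call, and every Set yielded for that dataset shares one Dataset instance. The same memoization pattern in isolation (names hypothetical):

    import uuid

    def list_sets(set_args, retrieve_dataset):
        """Yield (set_name, dataset) pairs, fetching each dataset only once."""
        datasets = {}
        for dataset_id, set_name in set_args:
            if dataset_id not in datasets:
                # First time this dataset is seen: call the loader
                datasets[dataset_id] = retrieve_dataset(dataset_id)
            yield (set_name, datasets[dataset_id])

    calls = []
    def fake_retrieve(dataset_id):
        calls.append(dataset_id)
        return {"id": str(dataset_id)}

    ds = uuid.UUID("12341234-1234-1234-1234-123412341234")
    sets = list(list_sets([(ds, "train"), (ds, "val")], fake_retrieve))
    assert len(sets) == 2 and len(calls) == 1  # two sets, one fetch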
@@ -405,7 +520,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         # Cleanup the latest downloaded dataset artifact
         self.cleanup_downloaded_artifact()

-        message = f'Ran on {count} {…
+        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
arkindex_worker/worker/base.py
CHANGED

@@ -12,9 +12,9 @@ from tempfile import mkdtemp

 import gnupg
 import yaml
+from apistar.exceptions import ErrorResponse

 from arkindex import options_from_env
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,

@@ -24,7 +24,6 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-from arkindex_worker.worker.process import ProcessMode
 from teklia_toolbox.requests import get_arkindex_client


@@ -157,13 +156,6 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id

-    @property
-    def process_mode(self) -> ProcessMode | None:
-        """Mode of the process being run. Returns None when read-only."""
-        if self.is_read_only:
-            return
-        return ProcessMode(self.process_information["mode"])
-
     @property
     def is_read_only(self) -> bool:
         """

@@ -227,7 +219,7 @@ class BaseWorker:
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}

-    def …
+    def configure(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.

@@ -239,7 +231,7 @@ class BaseWorker:
         logger.debug("Debug output enabled")

         # Load worker run information
-        worker_run = self.…
+        worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)

         # Load process information
         self.process_information = worker_run["process"]

@@ -298,7 +290,7 @@ class BaseWorker:
         if self.support_cache and self.args.database is not None:
             self.use_cache = True
         elif self.support_cache and self.task_id:
-            task = self.…
+            task = self.request("RetrieveTaskFromAgent", id=self.task_id)
             self.task_parents = task["parents"]
             paths = self.find_parents_file_paths(Path("db.sqlite"))
             self.use_cache = len(paths) > 0

@@ -328,29 +320,6 @@ class BaseWorker:
         else:
             logger.debug("Cache is disabled")

-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            self.configure_for_developers()
-        else:
-            self.configure_worker_run()
-            self.configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def load_secret(self, name: Path):
         """
         Load a Ponos secret by name.

@@ -362,7 +331,7 @@ class BaseWorker:

         # Load from the backend
         try:
-            resp = self.…
+            resp = self.request("RetrieveSecret", name=str(name))
             secret = resp["content"]
             logging.info(f"Loaded API secret {name}")
         except ErrorResponse as e:

@@ -502,6 +471,12 @@ class BaseWorker:
         # Clean up
         shutil.rmtree(base_extracted_path)

+    def request(self, *args, **kwargs):
+        """
+        Wrapper around the ``ArkindexClient.request`` method.
+        """
+        return self.api_client.request(*args, **kwargs)
+
     def add_arguments(self):
         """Override this method to add ``argparse`` arguments to this worker"""

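Note: the new BaseWorker.request wrapper explains most of the call-site churn above: every self.api_client.request(...) becomes self.request(...), funnelling all API traffic through one overridable method. A minimal sketch of the delegation pattern, with a hypothetical retry override:

    class BaseWorker:
        def __init__(self, api_client):
            self.api_client = api_client

        def request(self, *args, **kwargs):
            # Single choke point for every API call made by the mixins
            return self.api_client.request(*args, **kwargs)

    class RetryingWorker(BaseWorker):
        def request(self, *args, **kwargs):
            # Hypothetical override: retry once on any failure
            try:
                return super().request(*args, **kwargs)
            except Exception:
                return super().request(*args, **kwargs)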
arkindex_worker/worker/classification.py
CHANGED

@@ -2,18 +2,12 @@
 ElementsWorker methods for classifications and ML classes.
 """

+from apistar.exceptions import ErrorResponse
 from peewee import IntegrityError

-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
-from arkindex_worker.utils import (
-    DEFAULT_BATCH_SIZE,
-    batch_publication,
-    make_batches,
-    pluralize,
-)


 class ClassificationMixin:

@@ -27,7 +21,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f…
+            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
         )

     def get_ml_class_id(self, ml_class: str) -> str:

@@ -45,7 +39,7 @@ class ClassificationMixin:
         if ml_class_id is None:
             logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
             try:
-                response = self.…
+                response = self.request(
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]

@@ -125,7 +119,7 @@ class ClassificationMixin:
             )
             return
         try:
-            created = self.…
+            created = self.request(
                 "CreateClassification",
                 body={
                     "element": str(element.id),

@@ -173,12 +167,10 @@ class ClassificationMixin:

         return created

-    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
-        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.

@@ -193,8 +185,6 @@ class ClassificationMixin:
             high_confidence (bool)
                 Optional. Whether or not the classification is of high confidence.

-        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
-
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """

@@ -230,26 +220,20 @@ class ClassificationMixin:
             )
             return

-        created_cls = …
-        …
-                    "…
-                    }
-                    for classification in batch
-                ],
-            },
-        )["classifications"]
-        ]
+        created_cls = self.request(
+            "CreateClassifications",
+            body={
+                "parent": str(element.id),
+                "worker_run_id": self.worker_run_id,
+                "classifications": [
+                    {
+                        **classification,
+                        "ml_class": self.get_ml_class_id(classification["ml_class"]),
+                    }
+                    for classification in classifications
+                ],
+            },
+        )["classifications"]

         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
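Note: with @batch_publication and batch_size removed, create_classifications now publishes everything in a single CreateClassifications call. The payload shape it builds, as a standalone sketch (IDs and the helper are illustrative):

    def build_classifications_body(element_id, worker_run_id, classifications, get_ml_class_id):
        # Mirrors the single-request body assembled above
        return {
            "parent": str(element_id),
            "worker_run_id": worker_run_id,
            "classifications": [
                {**c, "ml_class": get_ml_class_id(c["ml_class"])} for c in classifications
            ],
        }

    body = build_classifications_body(
        "11111111-1111-1111-1111-111111111111",
        "22222222-2222-2222-2222-222222222222",
        [{"ml_class": "handwritten", "confidence": 0.92, "high_confidence": True}],
        get_ml_class_id=lambda name: {"handwritten": "33333333"}[name],
    )
    assert body["classifications"][0]["ml_class"] == "33333333"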
arkindex_worker/worker/corpus.py
CHANGED

@@ -5,7 +5,6 @@ BaseWorker methods for corpora.
 from enum import Enum
 from operator import itemgetter
 from tempfile import _TemporaryFileWrapper
-from uuid import UUID

 from arkindex_worker import logger

@@ -37,25 +36,6 @@ class CorpusExportState(Enum):


 class CorpusMixin:
-    def download_export(self, export_id: str) -> _TemporaryFileWrapper:
-        """
-        Download an export.
-
-        :param export_id: UUID of the export to download
-        :returns: The downloaded export stored in a temporary file.
-        """
-        try:
-            UUID(export_id)
-        except ValueError as e:
-            raise ValueError("export_id is not a valid uuid.") from e
-
-        logger.info(f"Downloading export ({export_id})...")
-        export: _TemporaryFileWrapper = self.api_client.request(
-            "DownloadExport", id=export_id
-        )
-        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
-        return export
-
     def download_latest_export(self) -> _TemporaryFileWrapper:
         """
         Download the latest export in `done` state of the current corpus.

@@ -82,5 +62,8 @@ class CorpusMixin:

         # Download latest export
         export_id: str = exports[0]["id"]
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")

-        return …
+        return export