arkindex-base-worker 0.3.7rc4__py3-none-any.whl → 0.5.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/METADATA +18 -19
  2. arkindex_base_worker-0.5.0a1.dist-info/RECORD +61 -0
  3. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/top_level.txt +2 -0
  5. arkindex_worker/cache.py +1 -1
  6. arkindex_worker/image.py +167 -2
  7. arkindex_worker/models.py +18 -0
  8. arkindex_worker/utils.py +98 -4
  9. arkindex_worker/worker/__init__.py +117 -218
  10. arkindex_worker/worker/base.py +39 -46
  11. arkindex_worker/worker/classification.py +45 -29
  12. arkindex_worker/worker/corpus.py +86 -0
  13. arkindex_worker/worker/dataset.py +89 -26
  14. arkindex_worker/worker/element.py +352 -91
  15. arkindex_worker/worker/entity.py +13 -11
  16. arkindex_worker/worker/image.py +21 -0
  17. arkindex_worker/worker/metadata.py +26 -16
  18. arkindex_worker/worker/process.py +92 -0
  19. arkindex_worker/worker/task.py +5 -4
  20. arkindex_worker/worker/training.py +25 -10
  21. arkindex_worker/worker/transcription.py +89 -68
  22. arkindex_worker/worker/version.py +3 -1
  23. hooks/pre_gen_project.py +3 -0
  24. tests/__init__.py +8 -0
  25. tests/conftest.py +47 -58
  26. tests/test_base_worker.py +212 -12
  27. tests/test_dataset_worker.py +294 -437
  28. tests/test_elements_worker/{test_classifications.py → test_classification.py} +313 -200
  29. tests/test_elements_worker/test_cli.py +3 -11
  30. tests/test_elements_worker/test_corpus.py +168 -0
  31. tests/test_elements_worker/test_dataset.py +106 -157
  32. tests/test_elements_worker/test_element.py +427 -0
  33. tests/test_elements_worker/test_element_create_multiple.py +715 -0
  34. tests/test_elements_worker/test_element_create_single.py +528 -0
  35. tests/test_elements_worker/test_element_list_children.py +969 -0
  36. tests/test_elements_worker/test_element_list_parents.py +530 -0
  37. tests/test_elements_worker/{test_entities.py → test_entity_create.py} +37 -195
  38. tests/test_elements_worker/test_entity_list_and_check.py +160 -0
  39. tests/test_elements_worker/test_image.py +66 -0
  40. tests/test_elements_worker/test_metadata.py +252 -161
  41. tests/test_elements_worker/test_process.py +89 -0
  42. tests/test_elements_worker/test_task.py +8 -18
  43. tests/test_elements_worker/test_training.py +17 -8
  44. tests/test_elements_worker/test_transcription_create.py +873 -0
  45. tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
  46. tests/test_elements_worker/test_transcription_list.py +450 -0
  47. tests/test_elements_worker/test_version.py +60 -0
  48. tests/test_elements_worker/test_worker.py +578 -293
  49. tests/test_image.py +542 -209
  50. tests/test_merge.py +1 -2
  51. tests/test_utils.py +89 -4
  52. worker-demo/tests/__init__.py +0 -0
  53. worker-demo/tests/conftest.py +32 -0
  54. worker-demo/tests/test_worker.py +12 -0
  55. worker-demo/worker_demo/__init__.py +6 -0
  56. worker-demo/worker_demo/worker.py +19 -0
  57. arkindex_base_worker-0.3.7rc4.dist-info/RECORD +0 -41
  58. tests/test_elements_worker/test_elements.py +0 -2713
  59. tests/test_elements_worker/test_transcriptions.py +0 -2119
  60. {arkindex_base_worker-0.3.7rc4.dist-info → arkindex_base_worker-0.5.0a1.dist-info}/LICENSE +0 -0
arkindex_worker/worker/__init__.py

@@ -4,65 +4,47 @@ Base classes to implement Arkindex workers.

  import contextlib
  import json
- import os
  import sys
  import uuid
- from collections.abc import Iterable, Iterator
- from enum import Enum
- from itertools import groupby
- from operator import itemgetter
+ from collections.abc import Iterable
+ from itertools import chain
  from pathlib import Path

- from apistar.exceptions import ErrorResponse
-
+ from arkindex.exceptions import ErrorResponse
  from arkindex_worker import logger
  from arkindex_worker.cache import CachedElement
- from arkindex_worker.models import Dataset, Element
+ from arkindex_worker.models import Dataset, Element, Set
+ from arkindex_worker.utils import pluralize
  from arkindex_worker.worker.base import BaseWorker
  from arkindex_worker.worker.classification import ClassificationMixin
- from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
+ from arkindex_worker.worker.corpus import CorpusMixin
+ from arkindex_worker.worker.dataset import (
+     DatasetMixin,
+     DatasetState,
+     MissingDatasetArchive,
+ )
  from arkindex_worker.worker.element import ElementMixin
- from arkindex_worker.worker.entity import EntityMixin  # noqa: F401
+ from arkindex_worker.worker.entity import EntityMixin
+ from arkindex_worker.worker.image import ImageMixin
  from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
+ from arkindex_worker.worker.process import ActivityState, ProcessMixin, ProcessMode
  from arkindex_worker.worker.task import TaskMixin
  from arkindex_worker.worker.transcription import TranscriptionMixin
- from arkindex_worker.worker.version import WorkerVersionMixin  # noqa: F401
-
-
- class ActivityState(Enum):
-     """
-     Processing state of an element.
-     """
-
-     Queued = "queued"
-     """
-     The element has not yet been processed by a worker.
-     """
-
-     Started = "started"
-     """
-     The element is being processed by a worker.
-     """
-
-     Processed = "processed"
-     """
-     The element has been successfully processed by a worker.
-     """
-
-     Error = "error"
-     """
-     An error occurred while processing this element.
-     """
+ from arkindex_worker.worker.version import WorkerVersionMixin


  class ElementsWorker(
+     ElementMixin,
+     DatasetMixin,
      BaseWorker,
      ClassificationMixin,
-     ElementMixin,
+     CorpusMixin,
      TranscriptionMixin,
      WorkerVersionMixin,
      EntityMixin,
      MetaDataMixin,
+     ImageMixin,
+     ProcessMixin,
  ):
      """
      Base class for ML workers that operate on Arkindex elements.
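
The ActivityState enum moves from arkindex_worker.worker to the new arkindex_worker.worker.process module, but, as the import block in the hunk above shows, it is re-imported in worker/__init__.py, so the old import path keeps resolving. A minimal sketch of the migration:

    # Both import paths resolve to the same enum after this release,
    # thanks to the re-import shown in the hunk above.
    from arkindex_worker.worker import ActivityState as LegacyActivityState
    from arkindex_worker.worker.process import ActivityState

    assert LegacyActivityState is ActivityState
    assert ActivityState.Processed.value == "processed"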
@@ -80,39 +62,41 @@ class ElementsWorker(
          """
          super().__init__(description, support_cache)

-         # Add mandatory argument to process elements
-         self.parser.add_argument(
-             "--elements-list",
-             help="JSON elements list to use",
-             type=open,
-             default=os.environ.get("TASK_ELEMENTS"),
-         )
-         self.parser.add_argument(
-             "--element",
-             type=uuid.UUID,
-             nargs="+",
-             help="One or more Arkindex element ID",
-         )
-
          self.classes = {}

          self.entity_types = {}
          """Known and available entity types in processed corpus
          """

+         self.corpus_types = {}
+         """Known and available element types in processed corpus
+         """
+
          self._worker_version_cache = {}

-     def list_elements(self) -> Iterable[CachedElement] | list[str]:
+     def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
          """
          List the elements to be processed, either from the CLI arguments or
          the cache database when enabled.

          :return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
-         and a list of strings representing element IDs otherwise.
+         or a list of strings representing element IDs otherwise.
          """
          assert not (
              self.args.elements_list and self.args.element
          ), "elements-list and element CLI args shouldn't be both set"
+
+         def invalid_element_id(value: str) -> bool:
+             """
+             Return whether the ID of an element is a valid UUID or not
+             """
+             try:
+                 uuid.UUID(value)
+             except Exception:
+                 return True
+
+             return False
+
          out = []

          # Load from the cache when available
@@ -122,15 +106,28 @@ class ElementsWorker(
          )
          if self.use_cache and cache_query.exists():
              return cache_query
-         # Process elements from JSON file
          elif self.args.elements_list:
+             # Process elements from JSON file
              data = json.load(self.args.elements_list)
              assert isinstance(data, list), "Elements list must be a list"
              assert len(data), "No elements in elements list"
              out += list(filter(None, [element.get("id") for element in data]))
-         # Add any extra element from CLI
          elif self.args.element:
+             # Add any extra element from CLI
              out += self.args.element
+         elif self.process_mode == ProcessMode.Dataset or self.args.set:
+             # Elements from datasets
+             return list(
+                 chain.from_iterable(map(self.list_set_elements, self.list_sets()))
+             )
+         elif self.process_mode == ProcessMode.Export:
+             # For export mode processes, use list_process_elements and return element IDs
+             return {item["id"] for item in self.list_process_elements()}
+
+         invalid_element_ids = list(filter(invalid_element_id, out))
+         assert (
+             not invalid_element_ids
+         ), f"These element IDs are invalid: {', '.join(invalid_element_ids)}"

          return out

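get_elements() is now the single entry point for all four element sources (cache database, JSON list, CLI IDs, dataset or export processes); run() then feeds each returned item to process_element(). A minimal sketch of a concrete worker, mirroring the bundled worker-demo (the class name and description here are hypothetical):

    from arkindex_worker.models import Element
    from arkindex_worker.worker import ElementsWorker


    class DemoWorker(ElementsWorker):
        def process_element(self, element: Element):
            # `element` is a CachedElement, an Element coming from a dataset or
            # export process, or an Element fetched through RetrieveElement.
            print(f"Demo processing element ({element.id})")


    if __name__ == "__main__":
        DemoWorker(description="Demo worker").run()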
@@ -140,30 +137,22 @@
          Whether or not WorkerActivity support has been enabled on the DataImport
          used to run this worker.
          """
-         if self.is_read_only:
+         if self.is_read_only or self.process_mode in [
+             ProcessMode.Dataset,
+             ProcessMode.Export,
+         ]:
+             # Worker activities are also disabled when running an ElementsWorker in a Dataset process
+             # and when running export processes.
              return False
          assert (
              self.process_information
          ), "Worker must be configured to access its process activity state"
          return self.process_information.get("activity_state") == "ready"

-     def configure(self):
-         """
-         Setup the worker using CLI arguments and environment variables.
-         """
-         # CLI args are stored on the instance so that implementations can access them
-         self.args = self.parser.parse_args()
-
-         if self.is_read_only:
-             super().configure_for_developers()
-         else:
-             super().configure()
-         super().configure_cache()
-
      def run(self):
          """
          Implements an Arkindex worker that goes through each element returned by
-         [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
+         [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
          It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
          catching exceptions, and handles saving WorkerActivity updates when enabled.
          """
@@ -171,7 +160,7 @@

          # List all elements either from JSON file
          # or direct list of elements on CLI
-         elements = self.list_elements()
+         elements = self.get_elements()
          if not elements:
              logger.warning("No elements to process, stopping.")
              sys.exit(1)
@@ -187,12 +176,14 @@
          for i, item in enumerate(elements, start=1):
              element = None
              try:
-                 if self.use_cache:
-                     # Just use the result of list_elements as the element
+                 if isinstance(item, CachedElement | Element):
+                     # Just use the result of get_elements as the element
                      element = item
                  else:
                      # Load element using the Arkindex API
-                     element = Element(**self.request("RetrieveElement", id=item))
+                     element = Element(
+                         **self.api_client.request("RetrieveElement", id=item)
+                     )

                  logger.info(f"Processing {element} ({i}/{count})")

@@ -230,7 +221,7 @@
                  with contextlib.suppress(Exception):
                      self.update_activity(element.id, ActivityState.Error)

-         message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+         message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
          if failed:
              logger.error(message)
          if failed >= count:  # Everything failed!
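The run summary now goes through the new pluralize helper from arkindex_worker.utils instead of the terse `"s"[:count > 1]` slicing trick. The helper's body is not shown in this diff; a naive stand-in reproducing the old behavior would be:

    def pluralize(word: str, count: int) -> str:
        # Append an "s" when count is greater than 1, like the removed
        # `"s"[:count > 1]` slice (hypothetical reimplementation).
        return word + "s" * (count > 1)

    assert pluralize("element", 1) == "element"
    assert pluralize("element", 3) == "elements"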
@@ -271,7 +262,7 @@
          assert isinstance(state, ActivityState), "state should be an ActivityState"

          try:
-             self.request(
+             self.api_client.request(
                  "UpdateWorkerActivity",
                  id=self.worker_run_id,
                  body={
@@ -301,16 +292,9 @@
      return True


- class MissingDatasetArchive(Exception):
-     """
-     Exception raised when the compressed archive associated to
-     a dataset isn't found in its task artifacts.
-     """
-
-
- class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
+ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
      """
-     Base class for ML workers that operate on Arkindex datasets.
+     Base class for ML workers that operate on Arkindex dataset sets.

      This class inherits from numerous mixin classes found in other modules of
      ``arkindex.worker``, which provide helpers to read and write to the Arkindex API.
@@ -320,193 +304,108 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
          self,
          description: str = "Arkindex Dataset Worker",
          support_cache: bool = False,
-         generator: bool = False,
      ):
          """
          :param description: The worker's description.
          :param support_cache: Whether the worker supports cache.
-         :param generator: Whether the worker generates the dataset archive artifact.
          """
          super().__init__(description, support_cache)

-         self.parser.add_argument(
-             "--dataset",
-             type=uuid.UUID,
-             nargs="+",
-             help="One or more Arkindex dataset ID",
-         )
-
-         self.generator = generator
+         # Path to the dataset compressed archive (containing images and a SQLite database)
+         # Set as an instance variable as dataset workers might use it to easily extract its content
+         self.downloaded_dataset_artifact: Path | None = None

-     def configure(self):
+     def cleanup_downloaded_artifact(self) -> None:
          """
-         Setup the worker using CLI arguments and environment variables.
+         Cleanup the downloaded dataset artifact if any
          """
-         # CLI args are stored on the instance so that implementations can access them
-         self.args = self.parser.parse_args()
+         if not self.downloaded_dataset_artifact:
+             return

-         if self.is_read_only:
-             super().configure_for_developers()
-         else:
-             super().configure()
-         super().configure_cache()
+         self.downloaded_dataset_artifact.unlink(missing_ok=True)

-     def download_dataset_artifact(self, dataset: Dataset) -> Path:
+     def download_dataset_artifact(self, dataset: Dataset) -> None:
          """
          Find and download the compressed archive artifact describing a dataset using
          the [list_artifacts][arkindex_worker.worker.task.TaskMixin.list_artifacts] and
          [download_artifact][arkindex_worker.worker.task.TaskMixin.download_artifact] methods.

          :param dataset: The dataset to retrieve the compressed archive artifact for.
-         :returns: A path to the downloaded artifact.
          :raises MissingDatasetArchive: When the dataset artifact is not found.
          """
+         extra_dir = self.find_extras_directory()
+         archive = extra_dir / dataset.filepath
+         if archive.exists():
+             return

-         task_id = uuid.UUID(dataset.task_id)
+         # Cleanup the dataset artifact that was downloaded previously
+         self.cleanup_downloaded_artifact()

+         logger.info(f"Downloading artifact for {dataset}")
+         task_id = uuid.UUID(dataset.task_id)
          for artifact in self.list_artifacts(task_id):
              if artifact.path != dataset.filepath:
                  continue

-             extra_dir = self.find_extras_directory()
-             archive = extra_dir / dataset.filepath
              archive.write_bytes(self.download_artifact(task_id, artifact).read())
-             return archive
+             self.downloaded_dataset_artifact = archive
+             return

          raise MissingDatasetArchive(
              "The dataset compressed archive artifact was not found."
          )

-     def list_dataset_elements_per_split(
-         self, dataset: Dataset
-     ) -> Iterator[tuple[str, list[Element]]]:
-         """
-         List the elements in the dataset, grouped by split, using the
-         [list_dataset_elements][arkindex_worker.worker.dataset.DatasetMixin.list_dataset_elements] method.
-
-         :param dataset: The dataset to retrieve elements from.
-         :returns: An iterator of tuples containing the split name and the list of its elements.
-         """
-
-         def format_split(
-             split: tuple[str, Iterator[tuple[str, Element]]],
-         ) -> tuple[str, list[Element]]:
-             return (split[0], list(map(itemgetter(1), list(split[1]))))
-
-         return map(
-             format_split,
-             groupby(
-                 sorted(self.list_dataset_elements(dataset), key=itemgetter(0)),
-                 key=itemgetter(0),
-             ),
-         )
-
-     def process_dataset(self, dataset: Dataset):
-         """
-         Override this method to implement your worker and process a single Arkindex dataset at once.
-
-         :param dataset: The dataset to process.
+     def process_set(self, set: Set):
          """
+         Override this method to implement your worker and process a single Arkindex dataset set at once.

-     def list_datasets(self) -> Iterator[Dataset] | Iterator[str]:
+         :param set: The set to process.
          """
-         List the datasets to be processed, either from the CLI arguments or using the
-         [list_process_datasets][arkindex_worker.worker.dataset.DatasetMixin.list_process_datasets] method.
-
-         :returns: An iterator of strings if the worker is in read-only mode,
-         else an iterator of ``Dataset`` objects.
-         """
-         if self.is_read_only:
-             return map(str, self.args.dataset)
-
-         return self.list_process_datasets()

      def run(self):
          """
-         Implements an Arkindex worker that goes through each dataset returned by
-         [list_datasets][arkindex_worker.worker.DatasetWorker.list_datasets].
+         Implements an Arkindex worker that goes through each dataset set returned by
+         [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].

-         It calls [process_dataset][arkindex_worker.worker.DatasetWorker.process_dataset],
-         catching exceptions, and handles updating the [DatasetState][arkindex_worker.worker.dataset.DatasetState]
-         when the worker is a generator.
+         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
+         catching exceptions.
          """
          self.configure()

-         datasets: list[Dataset] | list[str] = list(self.list_datasets())
-         if not datasets:
-             logger.warning("No datasets to process, stopping.")
+         dataset_sets: list[Set] = list(self.list_sets())
+         if not dataset_sets:
+             logger.warning("No sets to process, stopping.")
              sys.exit(1)

-         # Process every dataset
-         count = len(datasets)
+         # Process every set
+         count = len(dataset_sets)
          failed = 0
-         for i, item in enumerate(datasets, start=1):
-             dataset = None
-             dataset_artifact = None
-
+         for i, dataset_set in enumerate(dataset_sets, start=1):
              try:
-                 if not self.is_read_only:
-                     # Just use the result of list_datasets as the dataset
-                     dataset = item
-                 else:
-                     # Load dataset using the Arkindex API
-                     dataset = Dataset(**self.request("RetrieveDataset", id=item))
-
-                 if self.generator:
-                     assert (
-                         dataset.state
-                         in [DatasetState.Open.value, DatasetState.Error.value]
-                     ), "When generating a new dataset, its state should be Open or Error."
-                 else:
-                     assert (
-                         dataset.state == DatasetState.Complete.value
-                     ), "When processing an existing dataset, its state should be Complete."
-
-                 logger.info(f"Processing {dataset} ({i}/{count})")
-
-                 if self.generator:
-                     # Update the dataset state to Building
-                     logger.info(f"Building {dataset} ({i}/{count})")
-                     self.update_dataset_state(dataset, DatasetState.Building)
-                 else:
-                     logger.info(f"Downloading data for {dataset} ({i}/{count})")
-                     dataset_artifact = self.download_dataset_artifact(dataset)
+                 assert (
+                     dataset_set.dataset.state == DatasetState.Complete.value
+                 ), "When processing a set, its dataset state should be Complete."

-                 # Process the dataset
-                 self.process_dataset(dataset)
+                 logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                 self.download_dataset_artifact(dataset_set.dataset)

-                 if self.generator:
-                     # Update the dataset state to Complete
-                     logger.info(f"Completed {dataset} ({i}/{count})")
-                     self.update_dataset_state(dataset, DatasetState.Complete)
+                 logger.info(f"Processing {dataset_set} ({i}/{count})")
+                 self.process_set(dataset_set)
              except Exception as e:
-                 # Handle errors occurring while retrieving, processing or patching the state for this dataset.
+                 # Handle errors occurring while retrieving or processing this dataset set
                  failed += 1

-                 # Handle the case where we failed retrieving the dataset
-                 dataset_id = dataset.id if dataset else item
-
                  if isinstance(e, ErrorResponse):
-                     message = f"An API error occurred while processing dataset {dataset_id}: {e.title} - {e.content}"
+                     message = f"An API error occurred while processing {dataset_set}: {e.title} - {e.content}"
                  else:
-                     message = (
-                         f"Failed running worker on dataset {dataset_id}: {repr(e)}"
-                     )
+                     message = f"Failed running worker on {dataset_set}: {repr(e)}"

-                 logger.warning(
-                     message,
-                     exc_info=e if self.args.verbose else None,
-                 )
-                 if dataset and self.generator:
-                     # Try to update the state to Error regardless of the response
-                     with contextlib.suppress(Exception):
-                         self.update_dataset_state(dataset, DatasetState.Error)
-             finally:
-                 # Cleanup the dataset artifact if it was downloaded, no matter what
-                 if dataset_artifact:
-                     dataset_artifact.unlink(missing_ok=True)
+                 logger.warning(message, exc_info=e if self.args.verbose else None)
+
+         # Cleanup the latest downloaded dataset artifact
+         self.cleanup_downloaded_artifact()

-         message = f'Ran on {count} dataset{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+         message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
          if failed:
              logger.error(message)
          if failed >= count:  # Everything failed!
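The generator workflow (building the archive and driving DatasetState transitions) is gone from DatasetWorker: the class now only consumes Complete datasets, one set at a time, downloading each dataset archive once and cleaning it up at the end of the run. A minimal sketch of the new contract (the class name is hypothetical):

    from arkindex_worker.models import Set
    from arkindex_worker.worker import DatasetWorker


    class SetConsumer(DatasetWorker):
        def process_set(self, set: Set):
            # run() has already downloaded the archive for set.dataset;
            # its path is exposed as self.downloaded_dataset_artifact.
            print(set, self.downloaded_dataset_artifact)


    if __name__ == "__main__":
        SetConsumer(description="Demo dataset worker").run()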
arkindex_worker/worker/base.py

@@ -12,16 +12,9 @@ from tempfile import mkdtemp

  import gnupg
  import yaml
- from apistar.exceptions import ErrorResponse
- from tenacity import (
-     before_sleep_log,
-     retry,
-     retry_if_exception,
-     stop_after_attempt,
-     wait_exponential,
- )

- from arkindex import ArkindexClient, options_from_env
+ from arkindex import options_from_env
+ from arkindex.exceptions import ErrorResponse
  from arkindex_worker import logger
  from arkindex_worker.cache import (
      check_version,
@@ -31,18 +24,8 @@ from arkindex_worker.cache import (
      merge_parents_cache,
  )
  from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-
-
- def _is_500_error(exc: Exception) -> bool:
-     """
-     Check if an Arkindex API error has a HTTP 5xx error code.
-     Used to retry most API calls in [BaseWorker][arkindex_worker.worker.base.BaseWorker].
-     :param exc: Exception to check
-     """
-     if not isinstance(exc, ErrorResponse):
-         return False
-
-     return 500 <= exc.status_code < 600
+ from arkindex_worker.worker.process import ProcessMode
+ from teklia_toolbox.requests import get_arkindex_client


  class ExtrasDirNotFoundError(Exception):
@@ -174,6 +157,13 @@ class BaseWorker:
              raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
          return self._corpus_id

+     @property
+     def process_mode(self) -> ProcessMode | None:
+         """Mode of the process being run. Returns None when read-only."""
+         if self.is_read_only:
+             return
+         return ProcessMode(self.process_information["mode"])
+
      @property
      def is_read_only(self) -> bool:
          """
@@ -197,7 +187,7 @@
          Create an ArkindexClient to make API requests towards Arkindex instances.
          """
          # Build Arkindex API client from environment variables
-         self.api_client = ArkindexClient(**options_from_env())
+         self.api_client = get_arkindex_client(**options_from_env())
          logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")

      def configure_for_developers(self):
@@ -237,7 +227,7 @@
          # Load all required secrets
          self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}

-     def configure(self):
+     def configure_worker_run(self):
          """
          Setup the necessary configuration needed using CLI args and environment variables.
          This is the method called when running a worker on Arkindex.
@@ -249,7 +239,7 @@
              logger.debug("Debug output enabled")

          # Load worker run information
-         worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)
+         worker_run = self.api_client.request("RetrieveWorkerRun", id=self.worker_run_id)

          # Load process information
          self.process_information = worker_run["process"]
@@ -308,7 +298,7 @@
          if self.support_cache and self.args.database is not None:
              self.use_cache = True
          elif self.support_cache and self.task_id:
-             task = self.request("RetrieveTaskFromAgent", id=self.task_id)
+             task = self.api_client.request("RetrieveTask", id=self.task_id)
              self.task_parents = task["parents"]
              paths = self.find_parents_file_paths(Path("db.sqlite"))
              self.use_cache = len(paths) > 0
@@ -338,6 +328,29 @@
          else:
              logger.debug("Cache is disabled")

+     def configure(self):
+         """
+         Setup the worker using CLI arguments and environment variables.
+         """
+         # CLI args are stored on the instance so that implementations can access them
+         self.args = self.parser.parse_args()
+
+         if self.is_read_only:
+             self.configure_for_developers()
+         else:
+             self.configure_worker_run()
+             self.configure_cache()
+
+         # Retrieve the model configuration
+         if self.model_configuration:
+             self.config.update(self.model_configuration)
+             logger.info("Model version configuration retrieved")
+
+         # Retrieve the user configuration
+         if self.user_configuration:
+             self.config.update(self.user_configuration)
+             logger.info("User configuration retrieved")
+
      def load_secret(self, name: Path):
          """
          Load a Ponos secret by name.
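configure() now owns the whole setup sequence previously duplicated in ElementsWorker and DatasetWorker, and merges configurations in a fixed order: the worker's base configuration is updated with the model configuration, then with the user configuration, so later sources win. A toy illustration of that dict.update precedence (keys and values are made up):

    config = {"lr": 1e-3, "batch_size": 8}   # base worker configuration
    config.update({"lr": 5e-4})              # model_configuration overrides
    config.update({"batch_size": 4})         # user_configuration overrides last
    assert config == {"lr": 5e-4, "batch_size": 4}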
@@ -349,7 +362,7 @@

          # Load from the backend
          try:
-             resp = self.request("RetrieveSecret", name=str(name))
+             resp = self.api_client.request("RetrieveSecret", name=str(name))
              secret = resp["content"]
              logging.info(f"Loaded API secret {name}")
          except ErrorResponse as e:
@@ -489,26 +502,6 @@
          # Clean up
          shutil.rmtree(base_extracted_path)

-     @retry(
-         retry=retry_if_exception(_is_500_error),
-         wait=wait_exponential(multiplier=2, min=3),
-         reraise=True,
-         stop=stop_after_attempt(5),
-         before_sleep=before_sleep_log(logger, logging.INFO),
-     )
-     def request(self, *args, **kwargs):
-         """
-         Wrapper around the ``ArkindexClient.request`` method.
-
-         The API call will be retried up to 5 times in case of HTTP 5xx errors,
-         with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
-         If the 5th call still causes an HTTP 5xx error, the exception is re-raised
-         and the caller should catch it.
-
-         Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
-         """
-         return self.api_client.request(*args, **kwargs)
-
      def add_arguments(self):
          """Override this method to add ``argparse`` arguments to this worker"""
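
With this removal, every call site uses self.api_client.request directly, and any retry behavior is expected to come from the client built by teklia_toolbox.requests.get_arkindex_client. A worker that still wants the old 5xx retry semantics could wrap calls itself; a sketch reusing the deleted logic (the wrapper function name is hypothetical):

    from arkindex.exceptions import ErrorResponse
    from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential


    def _is_500_error(exc: Exception) -> bool:
        # Same predicate the removed wrapper used
        return isinstance(exc, ErrorResponse) and 500 <= exc.status_code < 600


    @retry(
        retry=retry_if_exception(_is_500_error),
        wait=wait_exponential(multiplier=2, min=3),
        reraise=True,
        stop=stop_after_attempt(5),
    )
    def request_with_retry(api_client, *args, **kwargs):
        return api_client.request(*args, **kwargs)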