arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (51)
  1. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/METADATA +13 -15
  2. arkindex_base_worker-0.4.0a2.dist-info/RECORD +51 -0
  3. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/WHEEL +1 -1
  4. arkindex_worker/cache.py +1 -1
  5. arkindex_worker/image.py +1 -120
  6. arkindex_worker/utils.py +0 -82
  7. arkindex_worker/worker/__init__.py +161 -46
  8. arkindex_worker/worker/base.py +11 -36
  9. arkindex_worker/worker/classification.py +18 -34
  10. arkindex_worker/worker/corpus.py +4 -21
  11. arkindex_worker/worker/dataset.py +1 -71
  12. arkindex_worker/worker/element.py +91 -352
  13. arkindex_worker/worker/entity.py +11 -11
  14. arkindex_worker/worker/metadata.py +9 -19
  15. arkindex_worker/worker/task.py +4 -5
  16. arkindex_worker/worker/training.py +6 -6
  17. arkindex_worker/worker/transcription.py +68 -89
  18. arkindex_worker/worker/version.py +1 -3
  19. tests/__init__.py +1 -1
  20. tests/conftest.py +45 -33
  21. tests/test_base_worker.py +3 -204
  22. tests/test_dataset_worker.py +4 -7
  23. tests/test_elements_worker/{test_classification.py → test_classifications.py} +61 -194
  24. tests/test_elements_worker/test_corpus.py +1 -32
  25. tests/test_elements_worker/test_dataset.py +1 -1
  26. tests/test_elements_worker/test_elements.py +2734 -0
  27. tests/test_elements_worker/{test_entity_create.py → test_entities.py} +160 -26
  28. tests/test_elements_worker/test_image.py +1 -2
  29. tests/test_elements_worker/test_metadata.py +99 -224
  30. tests/test_elements_worker/test_task.py +1 -1
  31. tests/test_elements_worker/test_training.py +2 -2
  32. tests/test_elements_worker/test_transcriptions.py +2102 -0
  33. tests/test_elements_worker/test_worker.py +280 -563
  34. tests/test_image.py +204 -429
  35. tests/test_merge.py +2 -1
  36. tests/test_utils.py +3 -66
  37. arkindex_base_worker-0.4.0.dist-info/RECORD +0 -61
  38. arkindex_worker/worker/process.py +0 -92
  39. tests/test_elements_worker/test_element.py +0 -427
  40. tests/test_elements_worker/test_element_create_multiple.py +0 -715
  41. tests/test_elements_worker/test_element_create_single.py +0 -528
  42. tests/test_elements_worker/test_element_list_children.py +0 -969
  43. tests/test_elements_worker/test_element_list_parents.py +0 -530
  44. tests/test_elements_worker/test_entity_list_and_check.py +0 -160
  45. tests/test_elements_worker/test_process.py +0 -89
  46. tests/test_elements_worker/test_transcription_create.py +0 -873
  47. tests/test_elements_worker/test_transcription_create_with_elements.py +0 -951
  48. tests/test_elements_worker/test_transcription_list.py +0 -450
  49. tests/test_elements_worker/test_version.py +0 -60
  50. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/LICENSE +0 -0
  51. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/top_level.txt +0 -0
arkindex_worker/worker/__init__.py

@@ -4,47 +4,68 @@ Base classes to implement Arkindex workers.
 
 import contextlib
 import json
+import os
 import sys
 import uuid
-from collections.abc import Iterable
-from itertools import chain
+from argparse import ArgumentTypeError
+from collections.abc import Iterable, Iterator
+from enum import Enum
 from pathlib import Path
 
-from arkindex.exceptions import ErrorResponse
+from apistar.exceptions import ErrorResponse
+
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
-from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
-from arkindex_worker.worker.dataset import (
-    DatasetMixin,
-    DatasetState,
-    MissingDatasetArchive,
-)
+from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
-from arkindex_worker.worker.process import ActivityState, ProcessMixin, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin
 
 
+class ActivityState(Enum):
+    """
+    Processing state of an element.
+    """
+
+    Queued = "queued"
+    """
+    The element has not yet been processed by a worker.
+    """
+
+    Started = "started"
+    """
+    The element is being processed by a worker.
+    """
+
+    Processed = "processed"
+    """
+    The element has been successfully processed by a worker.
+    """
+
+    Error = "error"
+    """
+    An error occurred while processing this element.
+    """
+
+
 class ElementsWorker(
-    ElementMixin,
-    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
     CorpusMixin,
+    ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
     MetaDataMixin,
     ImageMixin,
-    ProcessMixin,
 ):
     """
     Base class for ML workers that operate on Arkindex elements.
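For orientation, a downstream worker built on this class only needs to implement `process_element`; CLI parsing, activity tracking and error counting come from `run()`. A minimal sketch (`DemoWorker` and the log message are illustrative, not part of the package):

```python
from arkindex_worker.worker import ElementsWorker


class DemoWorker(ElementsWorker):
    def process_element(self, element):
        # Called once per element returned by list_elements(); exceptions
        # raised here are caught by run() and counted as failures.
        print(f"Would process element {element.id}")


if __name__ == "__main__":
    DemoWorker(description="Demo worker").run()
```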
@@ -62,19 +83,29 @@ class ElementsWorker(
         """
         super().__init__(description, support_cache)
 
+        # Add mandatory argument to process elements
+        self.parser.add_argument(
+            "--elements-list",
+            help="JSON elements list to use",
+            type=open,
+            default=os.environ.get("TASK_ELEMENTS"),
+        )
+        self.parser.add_argument(
+            "--element",
+            type=str,
+            nargs="+",
+            help="One or more Arkindex element ID",
+        )
+
         self.classes = {}
 
         self.entity_types = {}
         """Known and available entity types in processed corpus
         """
 
-        self.corpus_types = {}
-        """Known and available element types in processed corpus
-        """
-
        self._worker_version_cache = {}
 
-    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
+    def list_elements(self) -> Iterable[CachedElement] | list[str]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -106,23 +137,15 @@ class ElementsWorker(
             )
             if self.use_cache and cache_query.exists():
                 return cache_query
+        # Process elements from JSON file
         elif self.args.elements_list:
-            # Process elements from JSON file
            data = json.load(self.args.elements_list)
            assert isinstance(data, list), "Elements list must be a list"
            assert len(data), "No elements in elements list"
            out += list(filter(None, [element.get("id") for element in data]))
+        # Add any extra element from CLI
         elif self.args.element:
-            # Add any extra element from CLI
            out += self.args.element
-        elif self.process_mode == ProcessMode.Dataset or self.args.set:
-            # Elements from datasets
-            return list(
-                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
-            )
-        elif self.process_mode == ProcessMode.Export:
-            # For export mode processes, use list_process_elements and return element IDs
-            return {item["id"] for item in self.list_process_elements()}
 
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert (
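The `--elements-list` file parsed here must be a non-empty JSON array of objects carrying an `id` key; entries without an `id` are filtered out. A plausible minimal input file, with made-up UUIDs:

```python
import json

# Shape accepted by list_elements() via --elements-list
# (or the file referenced by the TASK_ELEMENTS environment variable).
elements = [
    {"id": "11111111-1111-1111-1111-111111111111"},
    {"id": "22222222-2222-2222-2222-222222222222"},
]
with open("elements.json", "w") as f:
    json.dump(elements, f)
```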
@@ -137,22 +160,40 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only or self.process_mode in [
-            ProcessMode.Dataset,
-            ProcessMode.Export,
-        ]:
-            # Worker activities are also disabled when running an ElementsWorker in a Dataset process
-            # and when running export processes.
+        if self.is_read_only:
             return False
         assert (
             self.process_information
         ), "Worker must be configured to access its process activity state"
         return self.process_information.get("activity_state") == "ready"
 
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            super().configure_for_developers()
+        else:
+            super().configure()
+        super().configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
-        [get_elements][arkindex_worker.worker.ElementsWorker.get_elements].
+        [list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
         It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
         catching exceptions, and handles saving WorkerActivity updates when enabled.
         """
@@ -160,7 +201,7 @@ class ElementsWorker(
 
         # List all elements either from JSON file
         # or direct list of elements on CLI
-        elements = self.get_elements()
+        elements = self.list_elements()
         if not elements:
             logger.warning("No elements to process, stopping.")
             sys.exit(1)
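The `configure()` override added above calls `super().configure()` and `super().configure_for_developers()`, which resolve to `BaseWorker` methods because `BaseWorker` now comes first in the base list (it previously sat after `ElementMixin` and `DatasetMixin`). A quick illustrative check of the resolution order:

```python
from arkindex_worker.worker import ElementsWorker

# With BaseWorker listed first among the bases, it sits immediately after
# ElementsWorker in the MRO, so super().configure() inside
# ElementsWorker.configure() dispatches to BaseWorker.configure().
print([cls.__name__ for cls in ElementsWorker.__mro__[:3]])
# Expected with the 0.4.0a2 class layout:
# ['ElementsWorker', 'BaseWorker', 'ClassificationMixin']
```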
@@ -176,14 +217,12 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if isinstance(item, CachedElement | Element):
-                    # Just use the result of get_elements as the element
+                if self.use_cache:
+                    # Just use the result of list_elements as the element
                     element = item
                 else:
                     # Load element using the Arkindex API
-                    element = Element(
-                        **self.api_client.request("RetrieveElement", id=item)
-                    )
+                    element = Element(**self.request("RetrieveElement", id=item))
 
                 logger.info(f"Processing {element} ({i}/{count})")
 
@@ -221,7 +260,7 @@ class ElementsWorker(
                 with contextlib.suppress(Exception):
                     self.update_activity(element.id, ActivityState.Error)
 
-        message = f'Ran on {count} {pluralize("element", count)}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
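The removed `pluralize` helper is replaced inline by the `"s"[:count>1]` slicing idiom: `count > 1` is `False` (0) or `True` (1), so the slice yields `""` or `"s"`. Note that `count == 0` produces the singular form here. For instance:

```python
for count in (0, 1, 2):
    # "s"[:False] == "s"[:0] == "" and "s"[:True] == "s"[:1] == "s"
    print(f'Ran on {count} element{"s"[:count>1]}')
# Ran on 0 element
# Ran on 1 element
# Ran on 2 elements
```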
@@ -262,7 +301,7 @@ class ElementsWorker(
         assert isinstance(state, ActivityState), "state should be an ActivityState"
 
         try:
-            self.api_client.request(
+            self.request(
                 "UpdateWorkerActivity",
                 id=self.worker_run_id,
                 body={
@@ -292,7 +331,29 @@ class ElementsWorker(
             return True
 
 
-class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
+class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.
 
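`check_dataset_set` serves as the `argparse` type converter for the `--set` option added in the next hunk, so malformed values fail at parse time with `ArgumentTypeError` (either a value without exactly one `:`, or an invalid UUID). A small runnable demonstration (the UUID is made up):

```python
from arkindex_worker.worker import check_dataset_set

dataset_id, set_name = check_dataset_set(
    "12341234-1234-1234-1234-123412341234:train"
)
print(dataset_id, set_name)
# 12341234-1234-1234-1234-123412341234 train
```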
@@ -315,6 +376,40 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
 
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+
+    def configure(self):
+        """
+        Setup the worker using CLI arguments and environment variables.
+        """
+        # CLI args are stored on the instance so that implementations can access them
+        self.args = self.parser.parse_args()
+
+        if self.is_read_only:
+            super().configure_for_developers()
+        else:
+            super().configure()
+        super().configure_cache()
+
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any
@@ -362,10 +457,30 @@
         :param set: The set to process.
         """
 
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieve dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.request("RetrieveDataset", id=dataset_id)
+                )
+
+            yield Set(name=set_name, dataset=datasets[dataset_id])
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
+        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
 
         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.
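A concrete dataset worker then only has to implement `process_set`; `run()` drives artifact handling and error counting. A minimal sketch (`DemoDatasetWorker` is an illustrative name, not part of the package):

```python
from arkindex_worker.worker import DatasetWorker


class DemoDatasetWorker(DatasetWorker):
    def process_set(self, set):
        # Called once per Set yielded by list_sets(), coming from the
        # Arkindex process and/or --set <dataset_uuid>:<set_name> arguments.
        print(f"Would process set {set.name} of dataset {set.dataset}")


if __name__ == "__main__":
    # e.g. python demo.py --set 12341234-1234-1234-1234-123412341234:train
    DemoDatasetWorker(description="Demo dataset worker").run()
```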
@@ -405,7 +520,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
             # Cleanup the latest downloaded dataset artifact
             self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count} {pluralize("set", count)}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
arkindex_worker/worker/base.py

@@ -12,9 +12,9 @@ from tempfile import mkdtemp
 
 import gnupg
 import yaml
+from apistar.exceptions import ErrorResponse
 
 from arkindex import options_from_env
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -24,7 +24,6 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-from arkindex_worker.worker.process import ProcessMode
 from teklia_toolbox.requests import get_arkindex_client
 
 
@@ -157,13 +156,6 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
 
-    @property
-    def process_mode(self) -> ProcessMode | None:
-        """Mode of the process being run. Returns None when read-only."""
-        if self.is_read_only:
-            return
-        return ProcessMode(self.process_information["mode"])
-
     @property
     def is_read_only(self) -> bool:
         """
@@ -227,7 +219,7 @@
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
 
-    def configure_worker_run(self):
+    def configure(self):
         """
         Setup the necessary configuration needed using CLI args and environment variables.
         This is the method called when running a worker on Arkindex.
@@ -239,7 +231,7 @@
         logger.debug("Debug output enabled")
 
         # Load worker run information
-        worker_run = self.api_client.request("RetrieveWorkerRun", id=self.worker_run_id)
+        worker_run = self.request("RetrieveWorkerRun", id=self.worker_run_id)
 
         # Load process information
         self.process_information = worker_run["process"]
@@ -298,7 +290,7 @@
         if self.support_cache and self.args.database is not None:
             self.use_cache = True
         elif self.support_cache and self.task_id:
-            task = self.api_client.request("RetrieveTask", id=self.task_id)
+            task = self.request("RetrieveTaskFromAgent", id=self.task_id)
             self.task_parents = task["parents"]
             paths = self.find_parents_file_paths(Path("db.sqlite"))
             self.use_cache = len(paths) > 0
@@ -328,29 +320,6 @@
         else:
             logger.debug("Cache is disabled")
 
-    def configure(self):
-        """
-        Setup the worker using CLI arguments and environment variables.
-        """
-        # CLI args are stored on the instance so that implementations can access them
-        self.args = self.parser.parse_args()
-
-        if self.is_read_only:
-            self.configure_for_developers()
-        else:
-            self.configure_worker_run()
-        self.configure_cache()
-
-        # Retrieve the model configuration
-        if self.model_configuration:
-            self.config.update(self.model_configuration)
-            logger.info("Model version configuration retrieved")
-
-        # Retrieve the user configuration
-        if self.user_configuration:
-            self.config.update(self.user_configuration)
-            logger.info("User configuration retrieved")
-
     def load_secret(self, name: Path):
         """
         Load a Ponos secret by name.
@@ -362,7 +331,7 @@
 
         # Load from the backend
         try:
-            resp = self.api_client.request("RetrieveSecret", name=str(name))
+            resp = self.request("RetrieveSecret", name=str(name))
             secret = resp["content"]
             logging.info(f"Loaded API secret {name}")
         except ErrorResponse as e:
@@ -502,6 +471,12 @@
         # Clean up
         shutil.rmtree(base_extracted_path)
 
+    def request(self, *args, **kwargs):
+        """
+        Wrapper around the ``ArkindexClient.request`` method.
+        """
+        return self.api_client.request(*args, **kwargs)
+
     def add_arguments(self):
         """Override this method to add ``argparse`` arguments to this worker"""
 
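Every `self.api_client.request(...)` call site in the mixins was rewritten to go through this new `request` wrapper, which incidentally gives tests a single spot to stub out API traffic. A minimal sketch, assuming an already-constructed `worker` instance:

```python
from unittest.mock import patch

# Stub all API calls through the single wrapper method.
with patch.object(worker, "request", return_value={"id": "fake"}) as mock_request:
    element = worker.request("RetrieveElement", id="fake")
    mock_request.assert_called_once_with("RetrieveElement", id="fake")
```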
arkindex_worker/worker/classification.py

@@ -2,18 +2,12 @@
 ElementsWorker methods for classifications and ML classes.
 """
 
+from apistar.exceptions import ErrorResponse
 from peewee import IntegrityError
 
-from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
-from arkindex_worker.utils import (
-    DEFAULT_BATCH_SIZE,
-    batch_publication,
-    make_batches,
-    pluralize,
-)
 
 
 class ClassificationMixin:
@@ -27,7 +21,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
+            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -45,7 +39,7 @@ class ClassificationMixin:
         if ml_class_id is None:
             logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
             try:
-                response = self.api_client.request(
+                response = self.request(
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]
@@ -125,7 +119,7 @@ class ClassificationMixin:
             )
             return
         try:
-            created = self.api_client.request(
+            created = self.request(
                 "CreateClassification",
                 body={
                     "element": str(element.id),
@@ -173,12 +167,10 @@ class ClassificationMixin:
 
         return created
 
-    @batch_publication
     def create_classifications(
         self,
         element: Element | CachedElement,
         classifications: list[dict[str, str | float | bool]],
-        batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> list[dict[str, str | float | bool]]:
         """
         Create multiple classifications at once on the given element through the API.
@@ -193,8 +185,6 @@ class ClassificationMixin:
             high_confidence (bool)
                 Optional. Whether or not the classification is of high confidence.
 
-        :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
-
         :returns: List of created classifications, as returned in the ``classifications`` field by
             the ``CreateClassifications`` API endpoint.
         """
@@ -230,26 +220,20 @@ class ClassificationMixin:
             )
             return
 
-        created_cls = [
-            created_cl
-            for batch in make_batches(classifications, "classification", batch_size)
-            for created_cl in self.api_client.request(
-                "CreateClassifications",
-                body={
-                    "parent": str(element.id),
-                    "worker_run_id": self.worker_run_id,
-                    "classifications": [
-                        {
-                            **classification,
-                            "ml_class": self.get_ml_class_id(
-                                classification["ml_class"]
-                            ),
-                        }
-                        for classification in batch
-                    ],
-                },
-            )["classifications"]
-        ]
+        created_cls = self.request(
+            "CreateClassifications",
+            body={
+                "parent": str(element.id),
+                "worker_run_id": self.worker_run_id,
+                "classifications": [
+                    {
+                        **classification,
+                        "ml_class": self.get_ml_class_id(classification["ml_class"]),
+                    }
+                    for classification in classifications
+                ],
+            },
+        )["classifications"]
 
         for created_cl in created_cls:
             created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
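With the batching decorator removed, `create_classifications` publishes the entire list in a single `CreateClassifications` API call. An illustrative call from inside a worker (`handwritten`/`printed` and the confidences are made-up values):

```python
# Inside an ElementsWorker method, with `element` in scope:
created = self.create_classifications(
    element,
    classifications=[
        {"ml_class": "handwritten", "confidence": 0.92, "high_confidence": True},
        {"ml_class": "printed", "confidence": 0.08},
    ],
)
# Each returned dict also carries the resolved "class_name".
```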
arkindex_worker/worker/corpus.py

@@ -5,7 +5,6 @@ BaseWorker methods for corpora.
 from enum import Enum
 from operator import itemgetter
 from tempfile import _TemporaryFileWrapper
-from uuid import UUID
 
 from arkindex_worker import logger
 
@@ -37,25 +36,6 @@
 
 
 class CorpusMixin:
-    def download_export(self, export_id: str) -> _TemporaryFileWrapper:
-        """
-        Download an export.
-
-        :param export_id: UUID of the export to download
-        :returns: The downloaded export stored in a temporary file.
-        """
-        try:
-            UUID(export_id)
-        except ValueError as e:
-            raise ValueError("export_id is not a valid uuid.") from e
-
-        logger.info(f"Downloading export ({export_id})...")
-        export: _TemporaryFileWrapper = self.api_client.request(
-            "DownloadExport", id=export_id
-        )
-        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
-        return export
-
     def download_latest_export(self) -> _TemporaryFileWrapper:
         """
         Download the latest export in `done` state of the current corpus.
@@ -82,5 +62,8 @@
 
         # Download latest export
         export_id: str = exports[0]["id"]
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
 
-        return self.download_export(export_id)
+        return export
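With the standalone `download_export` helper removed, callers go through `download_latest_export`, which returns the export as a named temporary file. A sketch of typical use from inside a worker, assuming the usual Arkindex export format (an SQLite database with an `element` table):

```python
import sqlite3

# Inside a worker using CorpusMixin (e.g. an ElementsWorker method):
export = self.download_latest_export()
with sqlite3.connect(export.name) as db:
    (element_count,) = db.execute("SELECT COUNT(*) FROM element").fetchone()
print(f"{element_count} elements in the latest export")
```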