arkindex-base-worker 0.3.7rc6__tar.gz → 0.3.7rc7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/PKG-INFO +2 -3
  2. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/PKG-INFO +2 -3
  3. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/SOURCES.txt +6 -2
  4. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/requires.txt +1 -2
  5. arkindex-base-worker-0.3.7rc7/arkindex_base_worker.egg-info/top_level.txt +6 -0
  6. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/image.py +4 -1
  7. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/models.py +12 -0
  8. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/__init__.py +112 -121
  9. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/base.py +2 -14
  10. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/dataset.py +19 -26
  11. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/entity.py +4 -2
  12. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/metadata.py +7 -7
  13. arkindex-base-worker-0.3.7rc7/hooks/pre_gen_project.py +3 -0
  14. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/pyproject.toml +5 -2
  15. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/requirements.txt +1 -2
  16. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/conftest.py +12 -7
  17. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_dataset_worker.py +279 -401
  18. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_dataset.py +99 -145
  19. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_metadata.py +21 -21
  20. arkindex-base-worker-0.3.7rc7/worker-demo/tests/conftest.py +32 -0
  21. arkindex-base-worker-0.3.7rc7/worker-demo/tests/test_worker.py +12 -0
  22. arkindex-base-worker-0.3.7rc7/worker-demo/worker_demo/__init__.py +6 -0
  23. arkindex-base-worker-0.3.7rc7/worker-demo/worker_demo/worker.py +19 -0
  24. arkindex-base-worker-0.3.7rc6/arkindex_base_worker.egg-info/top_level.txt +0 -2
  25. arkindex-base-worker-0.3.7rc6/setup.py +0 -4
  26. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/LICENSE +0 -0
  27. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/README.md +0 -0
  28. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  29. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/__init__.py +0 -0
  30. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/cache.py +0 -0
  31. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/utils.py +0 -0
  32. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/classification.py +0 -0
  33. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/element.py +0 -0
  34. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/task.py +0 -0
  35. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/training.py +0 -0
  36. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/transcription.py +0 -0
  37. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/version.py +0 -0
  38. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/docs-requirements.txt +0 -0
  39. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/setup.cfg +0 -0
  40. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/__init__.py +0 -0
  41. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_base_worker.py +0 -0
  42. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_cache.py +0 -0
  43. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_element.py +0 -0
  44. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/__init__.py +0 -0
  45. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_classifications.py +0 -0
  46. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_cli.py +0 -0
  47. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_elements.py +0 -0
  48. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_entities.py +0 -0
  49. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_task.py +0 -0
  50. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_training.py +0 -0
  51. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_transcriptions.py +0 -0
  52. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_worker.py +0 -0
  53. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_image.py +0 -0
  54. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_merge.py +0 -0
  55. {arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/tests/test_utils.py +0 -0
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.3.7rc6
+Version: 0.3.7rc7
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,13 +41,12 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: arkindex-client==1.0.15
 Requires-Dist: peewee==3.17.0
 Requires-Dist: Pillow==10.2.0
 Requires-Dist: pymdown-extensions==10.7
 Requires-Dist: python-gnupg==0.5.2
 Requires-Dist: shapely==2.0.3
-Requires-Dist: tenacity==8.2.3
+Requires-Dist: teklia-toolbox==0.1.4rc3
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.2.0; extra == "docs"
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.3.7rc6
+Version: 0.3.7rc7
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,13 +41,12 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: arkindex-client==1.0.15
 Requires-Dist: peewee==3.17.0
 Requires-Dist: Pillow==10.2.0
 Requires-Dist: pymdown-extensions==10.7
 Requires-Dist: python-gnupg==0.5.2
 Requires-Dist: shapely==2.0.3
-Requires-Dist: tenacity==8.2.3
+Requires-Dist: teklia-toolbox==0.1.4rc3
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.2.0; extra == "docs"
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/SOURCES.txt
@@ -3,7 +3,6 @@ README.md
 docs-requirements.txt
 pyproject.toml
 requirements.txt
-setup.py
 arkindex_base_worker.egg-info/PKG-INFO
 arkindex_base_worker.egg-info/SOURCES.txt
 arkindex_base_worker.egg-info/dependency_links.txt
@@ -25,6 +24,7 @@ arkindex_worker/worker/task.py
 arkindex_worker/worker/training.py
 arkindex_worker/worker/transcription.py
 arkindex_worker/worker/version.py
+hooks/pre_gen_project.py
 tests/__init__.py
 tests/conftest.py
 tests/test_base_worker.py
@@ -44,4 +44,8 @@ tests/test_elements_worker/test_metadata.py
 tests/test_elements_worker/test_task.py
 tests/test_elements_worker/test_training.py
 tests/test_elements_worker/test_transcriptions.py
-tests/test_elements_worker/test_worker.py
+tests/test_elements_worker/test_worker.py
+worker-demo/tests/conftest.py
+worker-demo/tests/test_worker.py
+worker-demo/worker_demo/__init__.py
+worker-demo/worker_demo/worker.py
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/requires.txt
@@ -1,10 +1,9 @@
-arkindex-client==1.0.15
 peewee==3.17.0
 Pillow==10.2.0
 pymdown-extensions==10.7
 python-gnupg==0.5.2
 shapely==2.0.3
-tenacity==8.2.3
+teklia-toolbox==0.1.4rc3
 zstandard==0.22.0
 
 [docs]
arkindex-base-worker-0.3.7rc7/arkindex_base_worker.egg-info/top_level.txt
@@ -0,0 +1,6 @@
+arkindex_worker
+dist
+docs
+hooks
+tests
+worker-demo
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/image.py
@@ -21,6 +21,7 @@ from tenacity import (
 )
 
 from arkindex_worker import logger
+from teklia_toolbox.requests import should_verify_cert
 
 # Avoid circular imports error when type checking
 if TYPE_CHECKING:
@@ -175,7 +176,9 @@ def _retry_log(retry_state, *args, **kwargs):
     reraise=True,
 )
 def _retried_request(url, *args, method=requests.get, **kwargs):
-    resp = method(url, *args, timeout=DOWNLOAD_TIMEOUT, **kwargs)
+    resp = method(
+        url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
+    )
     resp.raise_for_status()
     return resp
 
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/models.py
@@ -20,6 +20,8 @@ class MagicDict(dict):
         Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
         Allows for nested access: foo.bar.baz
         """
+        if isinstance(item, Dataset):
+            return item
         if isinstance(item, list):
             return list(map(self._magify, item))
         if isinstance(item, dict):
@@ -272,6 +274,16 @@ class Dataset(ArkindexModel):
         return f"{self.id}.tar.zst"
 
 
+class Set(MagicDict):
+    """
+    Describes an Arkindex dataset set.
+    """
+
+    def __str__(self):
+        # Not using ArkindexModel.__str__ as we do not retrieve the Set ID
+        return f"{self.__class__.__name__} ({self.name}) from {self.dataset}"
+
+
 class Artifact(ArkindexModel):
     """
     Describes an Arkindex artifact.
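
Since Set is a plain MagicDict subclass, its keys are reachable as attributes. A minimal sketch of what that looks like follows (illustrative only, with made-up identifiers; not part of the package diff):

from arkindex_worker.models import Dataset, Set

# Build a Set by hand; keyword arguments become attributes through MagicDict
dataset = Dataset(id="12341234-1234-1234-1234-123412341234", name="My dataset")
train = Set(name="train", dataset=dataset)
assert train.name == "train"
# Thanks to the new Dataset check added to _magify above, the nested Dataset is kept as-is
assert train.dataset.id == dataset.id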
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/__init__.py
@@ -7,26 +7,25 @@ import json
 import os
 import sys
 import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterable, Iterator
 from enum import Enum
-from itertools import groupby
-from operator import itemgetter
 from pathlib import Path
 
 from apistar.exceptions import ErrorResponse
 
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
-from arkindex_worker.worker.entity import EntityMixin  # noqa: F401
+from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
-from arkindex_worker.worker.version import WorkerVersionMixin  # noqa: F401
+from arkindex_worker.worker.version import WorkerVersionMixin
 
 
 class ActivityState(Enum):
@@ -160,6 +159,16 @@ class ElementsWorker(
         super().configure()
         super().configure_cache()
 
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
@@ -301,6 +310,21 @@ class ElementsWorker(
         return True
 
 
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class MissingDatasetArchive(Exception):
     """
     Exception raised when the compressed archive associated to
@@ -310,7 +334,7 @@ class MissingDatasetArchive(Exception):
 
 class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
     """
-    Base class for ML workers that operate on Arkindex datasets.
+    Base class for ML workers that operate on Arkindex dataset sets.
 
     This class inherits from numerous mixin classes found in other modules of
     ``arkindex.worker``, which provide helpers to read and write to the Arkindex API.
@@ -320,24 +344,26 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         self,
         description: str = "Arkindex Dataset Worker",
         support_cache: bool = False,
-        generator: bool = False,
     ):
         """
         :param description: The worker's description.
         :param support_cache: Whether the worker supports cache.
-        :param generator: Whether the worker generates the dataset archive artifact.
         """
         super().__init__(description, support_cache)
 
+        self.downloaded_artifact: Path | None = None
+
         self.parser.add_argument(
-            "--dataset",
-            type=uuid.UUID,
+            "--set",
+            type=check_dataset_set,
             nargs="+",
-            help="One or more Arkindex dataset ID",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
         )
 
-        self.generator = generator
-
     def configure(self):
         """
         Setup the worker using CLI arguments and environment variables.
@@ -351,162 +377,127 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         super().configure()
         super().configure_cache()
 
-    def download_dataset_artifact(self, dataset: Dataset) -> Path:
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
+    def cleanup_downloaded_artifact(self) -> None:
+        """
+        Cleanup the downloaded artifact if any
+        """
+        if not self.downloaded_artifact:
+            return
+
+        self.downloaded_artifact.unlink(missing_ok=True)
+
+    def download_dataset_artifact(self, dataset: Dataset) -> None:
         """
         Find and download the compressed archive artifact describing a dataset using
         the [list_artifacts][arkindex_worker.worker.task.TaskMixin.list_artifacts] and
         [download_artifact][arkindex_worker.worker.task.TaskMixin.download_artifact] methods.
 
         :param dataset: The dataset to retrieve the compressed archive artifact for.
-        :returns: A path to the downloaded artifact.
         :raises MissingDatasetArchive: When the dataset artifact is not found.
         """
+        extra_dir = self.find_extras_directory()
+        archive = extra_dir / dataset.filepath
+        if archive.exists():
+            return
 
-        task_id = uuid.UUID(dataset.task_id)
+        # Cleanup the dataset artifact that was downloaded previously
+        self.cleanup_downloaded_artifact()
 
+        logger.info(f"Downloading artifact for {dataset}")
+        task_id = uuid.UUID(dataset.task_id)
         for artifact in self.list_artifacts(task_id):
             if artifact.path != dataset.filepath:
                 continue
 
-            extra_dir = self.find_extras_directory()
-            archive = extra_dir / dataset.filepath
             archive.write_bytes(self.download_artifact(task_id, artifact).read())
-            return archive
+            self.downloaded_artifact = archive
+            return
 
         raise MissingDatasetArchive(
             "The dataset compressed archive artifact was not found."
         )
 
-    def list_dataset_elements_per_split(
-        self, dataset: Dataset
-    ) -> Iterator[tuple[str, list[Element]]]:
-        """
-        List the elements in the dataset, grouped by split, using the
-        [list_dataset_elements][arkindex_worker.worker.dataset.DatasetMixin.list_dataset_elements] method.
-
-        :param dataset: The dataset to retrieve elements from.
-        :returns: An iterator of tuples containing the split name and the list of its elements.
-        """
-
-        def format_split(
-            split: tuple[str, Iterator[tuple[str, Element]]],
-        ) -> tuple[str, list[Element]]:
-            return (split[0], list(map(itemgetter(1), list(split[1]))))
-
-        return map(
-            format_split,
-            groupby(
-                sorted(self.list_dataset_elements(dataset), key=itemgetter(0)),
-                key=itemgetter(0),
-            ),
-        )
-
-    def process_dataset(self, dataset: Dataset):
+    def process_set(self, set: Set):
         """
-        Override this method to implement your worker and process a single Arkindex dataset at once.
+        Override this method to implement your worker and process a single Arkindex dataset set at once.
 
-        :param dataset: The dataset to process.
+        :param set: The set to process.
         """
 
-    def list_datasets(self) -> Iterator[Dataset] | Iterator[str]:
+    def list_sets(self) -> Iterator[Set]:
         """
-        List the datasets to be processed, either from the CLI arguments or using the
-        [list_process_datasets][arkindex_worker.worker.dataset.DatasetMixin.list_process_datasets] method.
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
 
-        :returns: An iterator of strings if the worker is in read-only mode,
-            else an iterator of ``Dataset`` objects.
+        :returns: An iterator of ``Set`` objects.
         """
-        if self.is_read_only:
-            return map(str, self.args.dataset)
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information is not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.request("RetrieveDataset", id=dataset_id)
+                )
 
-        return self.list_process_datasets()
+            yield Set(name=set_name, dataset=datasets[dataset_id])
 
     def run(self):
         """
-        Implements an Arkindex worker that goes through each dataset returned by
-        [list_datasets][arkindex_worker.worker.DatasetWorker.list_datasets].
+        Implements an Arkindex worker that goes through each dataset set returned by
+        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
 
-        It calls [process_dataset][arkindex_worker.worker.DatasetWorker.process_dataset],
-        catching exceptions, and handles updating the [DatasetState][arkindex_worker.worker.dataset.DatasetState]
-        when the worker is a generator.
+        It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
+        catching exceptions.
         """
         self.configure()
 
-        datasets: list[Dataset] | list[str] = list(self.list_datasets())
-        if not datasets:
-            logger.warning("No datasets to process, stopping.")
+        dataset_sets: list[Set] = list(self.list_sets())
+        if not dataset_sets:
+            logger.warning("No sets to process, stopping.")
             sys.exit(1)
 
-        # Process every dataset
-        count = len(datasets)
+        # Process every set
+        count = len(dataset_sets)
         failed = 0
-        for i, item in enumerate(datasets, start=1):
-            dataset = None
-            dataset_artifact = None
-
+        for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-                if not self.is_read_only:
-                    # Just use the result of list_datasets as the dataset
-                    dataset = item
-                else:
-                    # Load dataset using the Arkindex API
-                    dataset = Dataset(**self.request("RetrieveDataset", id=item))
-
-                if self.generator:
-                    assert (
-                        dataset.state
-                        in [DatasetState.Open.value, DatasetState.Error.value]
-                    ), "When generating a new dataset, its state should be Open or Error."
-                else:
-                    assert (
-                        dataset.state == DatasetState.Complete.value
-                    ), "When processing an existing dataset, its state should be Complete."
-
-                logger.info(f"Processing {dataset} ({i}/{count})")
-
-                if self.generator:
-                    # Update the dataset state to Building
-                    logger.info(f"Building {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Building)
-                else:
-                    logger.info(f"Downloading data for {dataset} ({i}/{count})")
-                    dataset_artifact = self.download_dataset_artifact(dataset)
+                assert (
+                    dataset_set.dataset.state == DatasetState.Complete.value
+                ), "When processing a set, its dataset state should be Complete."
 
-                # Process the dataset
-                self.process_dataset(dataset)
+                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                self.download_dataset_artifact(dataset_set.dataset)
 
-                if self.generator:
-                    # Update the dataset state to Complete
-                    logger.info(f"Completed {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Complete)
+                logger.info(f"Processing {dataset_set} ({i}/{count})")
+                self.process_set(dataset_set)
             except Exception as e:
-                # Handle errors occurring while retrieving, processing or patching the state for this dataset.
+                # Handle errors occurring while retrieving or processing this dataset set
                 failed += 1
 
-                # Handle the case where we failed retrieving the dataset
-                dataset_id = dataset.id if dataset else item
-
                 if isinstance(e, ErrorResponse):
-                    message = f"An API error occurred while processing dataset {dataset_id}: {e.title} - {e.content}"
+                    message = f"An API error occurred while processing {dataset_set}: {e.title} - {e.content}"
                 else:
-                    message = (
-                        f"Failed running worker on dataset {dataset_id}: {repr(e)}"
-                    )
+                    message = f"Failed running worker on {dataset_set}: {repr(e)}"
 
-                logger.warning(
-                    message,
-                    exc_info=e if self.args.verbose else None,
-                )
-                if dataset and self.generator:
-                    # Try to update the state to Error regardless of the response
-                    with contextlib.suppress(Exception):
-                        self.update_dataset_state(dataset, DatasetState.Error)
-            finally:
-                # Cleanup the dataset artifact if it was downloaded, no matter what
-                if dataset_artifact:
-                    dataset_artifact.unlink(missing_ok=True)
+                logger.warning(message, exc_info=e if self.args.verbose else None)
+
+        # Cleanup the latest downloaded dataset artifact
+        self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count} dataset{"s"[:count>1]}: {count - failed} completed, {failed} failed'
+        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
             if failed >= count:  # Everything failed!
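
Putting the DatasetWorker changes above together, a minimal worker written against the new set-based API might look like the following sketch; the class name and the body of process_set are placeholders for illustration, not code from the package:

from arkindex_worker.models import Set
from arkindex_worker.worker import DatasetWorker


class DemoDatasetWorker(DatasetWorker):
    def process_set(self, set: Set):
        # list_set_elements is provided by DatasetMixin (see dataset.py below)
        for element in self.list_set_elements(set):
            print(element.id)  # placeholder processing logic


if __name__ == "__main__":
    # Sets can be selected on the command line, e.g.
    # --set 12341234-1234-1234-1234-123412341234:train
    DemoDatasetWorker(description="Demo dataset worker").run()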
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/base.py
@@ -21,7 +21,6 @@ from tenacity import (
     wait_exponential,
 )
 
-from arkindex import ArkindexClient, options_from_env
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -31,18 +30,7 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-
-
-def _is_500_error(exc: Exception) -> bool:
-    """
-    Check if an Arkindex API error has a HTTP 5xx error code.
-    Used to retry most API calls in [BaseWorker][arkindex_worker.worker.base.BaseWorker].
-    :param exc: Exception to check
-    """
-    if not isinstance(exc, ErrorResponse):
-        return False
-
-    return 500 <= exc.status_code < 600
+from teklia_toolbox.requests import _get_arkindex_client, _is_500_error
 
 
 class ExtrasDirNotFoundError(Exception):
@@ -197,7 +185,7 @@ class BaseWorker:
         Create an ArkindexClient to make API requests towards Arkindex instances.
         """
         # Build Arkindex API client from environment variables
-        self.api_client = ArkindexClient(**options_from_env())
+        self.api_client = _get_arkindex_client()
         logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
 
     def configure_for_developers(self):
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/dataset.py
@@ -7,7 +7,7 @@ from enum import Enum
 
 from arkindex_worker import logger
 from arkindex_worker.cache import unsupported_cache
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
 
 
 class DatasetState(Enum):
@@ -37,49 +37,42 @@
 
 
 class DatasetMixin:
-    def list_process_datasets(self) -> Iterator[Dataset]:
+    def list_process_sets(self) -> Iterator[Set]:
         """
-        List datasets associated to the worker's process. This helper is not available in developer mode.
+        List dataset sets associated to the worker's process. This helper is not available in developer mode.
 
-        :returns: An iterator of ``Dataset`` objects built from the ``ListProcessDatasets`` API endpoint.
+        :returns: An iterator of ``Set`` objects built from the ``ListProcessSets`` API endpoint.
         """
         assert not self.is_read_only, "This helper is not available in read-only mode."
 
         results = self.api_client.paginate(
-            "ListProcessDatasets", id=self.process_information["id"]
+            "ListProcessSets", id=self.process_information["id"]
        )
 
         return map(
-            lambda result: Dataset(**result["dataset"], selected_sets=result["sets"]),
+            lambda result: Set(
+                name=result["set_name"], dataset=Dataset(**result["dataset"])
+            ),
             results,
         )
 
-    def list_dataset_elements(self, dataset: Dataset) -> Iterator[tuple[str, Element]]:
+    def list_set_elements(self, dataset_set: Set) -> Iterator[Element]:
         """
-        List elements in a dataset.
+        List elements in a dataset set.
 
-        :param dataset: Dataset to find elements in.
-        :returns: An iterator of tuples built from the ``ListDatasetElements`` API endpoint.
+        :param dataset_set: Set to find elements in.
+        :returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
         """
-        assert dataset and isinstance(
-            dataset, Dataset
-        ), "dataset shouldn't be null and should be a Dataset"
-
-        if dataset.sets == dataset.selected_sets:
-            results = self.api_client.paginate("ListDatasetElements", id=dataset.id)
-        else:
-            results = iter(
-                element
-                for selected_set in dataset.selected_sets
-                for element in self.api_client.paginate(
-                    "ListDatasetElements", id=dataset.id, set=selected_set
-                )
-            )
+        assert dataset_set and isinstance(
+            dataset_set, Set
+        ), "dataset_set shouldn't be null and should be a Set"
 
-        return map(
-            lambda result: (result["set"], Element(**result["element"])), results
+        results = self.api_client.paginate(
+            "ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
         )
 
+        return map(lambda result: Element(**result["element"]), results)
+
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
         """
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/entity.py
@@ -380,8 +380,9 @@ class EntityMixin:
                 "ListCorpusEntities", id=self.corpus_id, **query_params
             )
         }
+        count = len(self.entities)
         logger.info(
-            f"Loaded {len(self.entities)} entities in corpus ({self.corpus_id})"
+            f'Loaded {count} entit{"ies" if count > 1 else "y"} in corpus ({self.corpus_id})'
         )
 
     def list_corpus_entity_types(
@@ -396,6 +397,7 @@
                 "ListCorpusEntityTypes", id=self.corpus_id
             )
         }
+        count = len(self.entity_types)
         logger.info(
-            f"Loaded {len(self.entity_types)} entity types in corpus ({self.corpus_id})."
+            f'Loaded {count} entity type{"s"[:count>1]} in corpus ({self.corpus_id}).'
         )
{arkindex-base-worker-0.3.7rc6 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/metadata.py
@@ -108,17 +108,17 @@ class MetaDataMixin:
         return metadata["id"]
 
     @unsupported_cache
-    def create_metadatas(
+    def create_metadata_bulk(
         self,
         element: Element | CachedElement,
-        metadatas: list[dict[str, MetaType | str | int | float | None]],
+        metadata_list: list[dict[str, MetaType | str | int | float | None]],
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.
         This method does not support cache.
 
         :param element: The element to create multiple metadata on.
-        :param metadatas: The list of dict whose keys are the following:
+        :param metadata_list: The list of dict whose keys are the following:
             - type: MetaType
             - name: str
             - value: str | int | float
@@ -128,13 +128,13 @@
             element, Element | CachedElement
         ), "element shouldn't be null and should be of type Element or CachedElement"
 
-        assert metadatas and isinstance(
-            metadatas, list
-        ), "type shouldn't be null and should be of type list of Dict"
+        assert metadata_list and isinstance(
+            metadata_list, list
+        ), "metadata_list shouldn't be null and should be of type list of dict"
 
         # Make a copy to avoid modifying the metadata_list argument
         metas = []
-        for index, metadata in enumerate(metadatas):
+        for index, metadata in enumerate(metadata_list):
             assert isinstance(
                 metadata, dict
             ), f"Element at index {index} in metadata_list: Should be of type dict"
arkindex-base-worker-0.3.7rc7/hooks/pre_gen_project.py
@@ -0,0 +1,3 @@
+# Normalize the slug to generate __package and __module private variables
+{{cookiecutter.update({"__package": cookiecutter.slug.lower().replace("_", "-")})}}  # noqa: F821
+{{cookiecutter.update({"__module": cookiecutter.slug.lower().replace("-", "_")})}}  # noqa: F821