arkindex-base-worker 0.3.7rc10__tar.gz → 0.4.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/PKG-INFO +10 -16
  2. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/README.md +1 -1
  3. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/PKG-INFO +10 -16
  4. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/SOURCES.txt +4 -0
  5. arkindex_base_worker-0.4.0a1/arkindex_base_worker.egg-info/requires.txt +16 -0
  6. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/models.py +6 -0
  7. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/utils.py +3 -4
  8. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/__init__.py +23 -2
  9. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/base.py +3 -23
  10. arkindex_base_worker-0.4.0a1/arkindex_worker/worker/corpus.py +69 -0
  11. arkindex_base_worker-0.4.0a1/arkindex_worker/worker/image.py +21 -0
  12. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/training.py +12 -0
  13. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/pyproject.toml +9 -20
  14. arkindex_base_worker-0.4.0a1/tests/__init__.py +8 -0
  15. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/conftest.py +4 -8
  16. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_base_worker.py +8 -9
  17. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_dataset_worker.py +14 -41
  18. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_classifications.py +22 -39
  19. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_cli.py +3 -11
  20. arkindex_base_worker-0.4.0a1/tests/test_elements_worker/test_corpus.py +137 -0
  21. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_dataset.py +6 -11
  22. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_elements.py +106 -85
  23. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_entities.py +15 -39
  24. arkindex_base_worker-0.4.0a1/tests/test_elements_worker/test_image.py +65 -0
  25. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_metadata.py +6 -40
  26. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_task.py +7 -17
  27. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_training.py +35 -0
  28. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_transcriptions.py +10 -27
  29. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_worker.py +2 -1
  30. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_image.py +3 -5
  31. arkindex-base-worker-0.3.7rc10/arkindex_base_worker.egg-info/requires.txt +0 -21
  32. arkindex-base-worker-0.3.7rc10/worker-demo/tests/__init__.py +0 -0
  33. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/LICENSE +0 -0
  34. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  35. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  36. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/__init__.py +0 -0
  37. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/cache.py +0 -0
  38. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/image.py +0 -0
  39. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/classification.py +0 -0
  40. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/dataset.py +0 -0
  41. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/element.py +0 -0
  42. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/entity.py +0 -0
  43. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/metadata.py +0 -0
  44. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/task.py +0 -0
  45. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/transcription.py +0 -0
  46. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/version.py +0 -0
  47. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/hooks/pre_gen_project.py +0 -0
  48. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/setup.cfg +0 -0
  49. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_cache.py +0 -0
  50. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_element.py +0 -0
  51. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/__init__.py +0 -0
  52. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_merge.py +0 -0
  53. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_utils.py +0 -0
  54. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1/worker-demo}/tests/__init__.py +0 -0
  55. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/tests/conftest.py +0 -0
  56. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/tests/test_worker.py +0 -0
  57. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/worker_demo/__init__.py +0 -0
  58. {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/worker_demo/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.7rc10
3
+ Version: 0.4.0a1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -37,28 +37,22 @@ Classifier: License :: OSI Approved :: MIT License
37
37
  Classifier: Programming Language :: Python :: 3 :: Only
38
38
  Classifier: Programming Language :: Python :: 3.10
39
39
  Classifier: Programming Language :: Python :: 3.11
40
- Classifier: Topic :: Text Processing :: Linguistic
41
40
  Requires-Python: >=3.10
42
41
  Description-Content-Type: text/markdown
43
42
  License-File: LICENSE
44
- Requires-Dist: peewee==3.17.0
45
- Requires-Dist: Pillow==10.2.0
46
- Requires-Dist: pymdown-extensions==10.7
43
+ Requires-Dist: peewee~=3.17
44
+ Requires-Dist: Pillow==10.3.0
47
45
  Requires-Dist: python-gnupg==0.5.2
48
46
  Requires-Dist: shapely==2.0.3
49
- Requires-Dist: teklia-toolbox==0.1.4
47
+ Requires-Dist: teklia-toolbox==0.1.5
50
48
  Requires-Dist: zstandard==0.22.0
51
49
  Provides-Extra: docs
52
- Requires-Dist: black==24.2.0; extra == "docs"
53
- Requires-Dist: doc8==1.1.1; extra == "docs"
54
- Requires-Dist: mkdocs==1.5.3; extra == "docs"
55
- Requires-Dist: mkdocs-material==9.5.10; extra == "docs"
56
- Requires-Dist: mkdocstrings==0.24.0; extra == "docs"
57
- Requires-Dist: mkdocstrings-python==1.8.0; extra == "docs"
58
- Requires-Dist: recommonmark==0.7.1; extra == "docs"
50
+ Requires-Dist: black==24.4.0; extra == "docs"
51
+ Requires-Dist: mkdocs-material==9.5.17; extra == "docs"
52
+ Requires-Dist: mkdocstrings-python==1.9.2; extra == "docs"
59
53
  Provides-Extra: tests
60
- Requires-Dist: pytest==8.0.1; extra == "tests"
61
- Requires-Dist: pytest-mock==3.12.0; extra == "tests"
54
+ Requires-Dist: pytest==8.1.1; extra == "tests"
55
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
62
56
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
63
57
 
64
58
  # Arkindex base Worker
@@ -73,7 +67,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
73
67
 
74
68
  ## Create a new worker using our template
75
69
 
76
- ```
70
+ ```shell
77
71
  pip install --user cookiecutter
78
72
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
79
73
  ```
@@ -10,7 +10,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
10
10
 
11
11
  ## Create a new worker using our template
12
12
 
13
- ```
13
+ ```shell
14
14
  pip install --user cookiecutter
15
15
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
16
16
  ```
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.7rc10
3
+ Version: 0.4.0a1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -37,28 +37,22 @@ Classifier: License :: OSI Approved :: MIT License
37
37
  Classifier: Programming Language :: Python :: 3 :: Only
38
38
  Classifier: Programming Language :: Python :: 3.10
39
39
  Classifier: Programming Language :: Python :: 3.11
40
- Classifier: Topic :: Text Processing :: Linguistic
41
40
  Requires-Python: >=3.10
42
41
  Description-Content-Type: text/markdown
43
42
  License-File: LICENSE
44
- Requires-Dist: peewee==3.17.0
45
- Requires-Dist: Pillow==10.2.0
46
- Requires-Dist: pymdown-extensions==10.7
43
+ Requires-Dist: peewee~=3.17
44
+ Requires-Dist: Pillow==10.3.0
47
45
  Requires-Dist: python-gnupg==0.5.2
48
46
  Requires-Dist: shapely==2.0.3
49
- Requires-Dist: teklia-toolbox==0.1.4
47
+ Requires-Dist: teklia-toolbox==0.1.5
50
48
  Requires-Dist: zstandard==0.22.0
51
49
  Provides-Extra: docs
52
- Requires-Dist: black==24.2.0; extra == "docs"
53
- Requires-Dist: doc8==1.1.1; extra == "docs"
54
- Requires-Dist: mkdocs==1.5.3; extra == "docs"
55
- Requires-Dist: mkdocs-material==9.5.10; extra == "docs"
56
- Requires-Dist: mkdocstrings==0.24.0; extra == "docs"
57
- Requires-Dist: mkdocstrings-python==1.8.0; extra == "docs"
58
- Requires-Dist: recommonmark==0.7.1; extra == "docs"
50
+ Requires-Dist: black==24.4.0; extra == "docs"
51
+ Requires-Dist: mkdocs-material==9.5.17; extra == "docs"
52
+ Requires-Dist: mkdocstrings-python==1.9.2; extra == "docs"
59
53
  Provides-Extra: tests
60
- Requires-Dist: pytest==8.0.1; extra == "tests"
61
- Requires-Dist: pytest-mock==3.12.0; extra == "tests"
54
+ Requires-Dist: pytest==8.1.1; extra == "tests"
55
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
62
56
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
63
57
 
64
58
  # Arkindex base Worker
@@ -73,7 +67,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
73
67
 
74
68
  ## Create a new worker using our template
75
69
 
76
- ```
70
+ ```shell
77
71
  pip install --user cookiecutter
78
72
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
79
73
  ```
@@ -14,9 +14,11 @@ arkindex_worker/utils.py
14
14
  arkindex_worker/worker/__init__.py
15
15
  arkindex_worker/worker/base.py
16
16
  arkindex_worker/worker/classification.py
17
+ arkindex_worker/worker/corpus.py
17
18
  arkindex_worker/worker/dataset.py
18
19
  arkindex_worker/worker/element.py
19
20
  arkindex_worker/worker/entity.py
21
+ arkindex_worker/worker/image.py
20
22
  arkindex_worker/worker/metadata.py
21
23
  arkindex_worker/worker/task.py
22
24
  arkindex_worker/worker/training.py
@@ -35,9 +37,11 @@ tests/test_utils.py
35
37
  tests/test_elements_worker/__init__.py
36
38
  tests/test_elements_worker/test_classifications.py
37
39
  tests/test_elements_worker/test_cli.py
40
+ tests/test_elements_worker/test_corpus.py
38
41
  tests/test_elements_worker/test_dataset.py
39
42
  tests/test_elements_worker/test_elements.py
40
43
  tests/test_elements_worker/test_entities.py
44
+ tests/test_elements_worker/test_image.py
41
45
  tests/test_elements_worker/test_metadata.py
42
46
  tests/test_elements_worker/test_task.py
43
47
  tests/test_elements_worker/test_training.py
@@ -0,0 +1,16 @@
1
+ peewee~=3.17
2
+ Pillow==10.3.0
3
+ python-gnupg==0.5.2
4
+ shapely==2.0.3
5
+ teklia-toolbox==0.1.5
6
+ zstandard==0.22.0
7
+
8
+ [docs]
9
+ black==24.4.0
10
+ mkdocs-material==9.5.17
11
+ mkdocstrings-python==1.9.2
12
+
13
+ [tests]
14
+ pytest==8.1.1
15
+ pytest-mock==3.14.0
16
+ pytest-responses==0.5.1
@@ -261,6 +261,12 @@ class Transcription(ArkindexModel):
261
261
  """
262
262
 
263
263
 
264
+ class Image(ArkindexModel):
265
+ """
266
+ Describes an Arkindex image.
267
+ """
268
+
269
+
264
270
  class Dataset(ArkindexModel):
265
271
  """
266
272
  Describes an Arkindex dataset.
@@ -5,7 +5,6 @@ import tarfile
5
5
  import tempfile
6
6
  from pathlib import Path
7
7
 
8
- import zstandard
9
8
  import zstandard as zstd
10
9
 
11
10
  logger = logging.getLogger(__name__)
@@ -38,7 +37,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
38
37
  :param compressed_archive: Path to the target ZST-compressed archive
39
38
  :return: File descriptor and path to the uncompressed tar archive
40
39
  """
41
- dctx = zstandard.ZstdDecompressor()
40
+ dctx = zstd.ZstdDecompressor()
42
41
  archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
43
42
  archive_path = Path(archive_path)
44
43
 
@@ -50,7 +49,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
50
49
  ):
51
50
  dctx.copy_stream(compressed, decompressed)
52
51
  logger.debug(f"Successfully uncompressed archive {compressed_archive}")
53
- except zstandard.ZstdError as e:
52
+ except zstd.ZstdError as e:
54
53
  raise Exception(f"Couldn't uncompressed archive: {e}") from e
55
54
 
56
55
  return archive_fd, archive_path
@@ -129,7 +128,7 @@ def zstd_compress(
129
128
  archive_hasher.update(compressed_chunk)
130
129
  archive_file.write(compressed_chunk)
131
130
  logger.debug(f"Successfully compressed {source}")
132
- except zstandard.ZstdError as e:
131
+ except zstd.ZstdError as e:
133
132
  raise Exception(f"Couldn't compress archive: {e}") from e
134
133
  return file_d, destination, archive_hasher.hexdigest()
135
134
 
@@ -19,9 +19,11 @@ from arkindex_worker.cache import CachedElement
19
19
  from arkindex_worker.models import Dataset, Element, Set
20
20
  from arkindex_worker.worker.base import BaseWorker
21
21
  from arkindex_worker.worker.classification import ClassificationMixin
22
+ from arkindex_worker.worker.corpus import CorpusMixin
22
23
  from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
23
24
  from arkindex_worker.worker.element import ElementMixin
24
25
  from arkindex_worker.worker.entity import EntityMixin
26
+ from arkindex_worker.worker.image import ImageMixin
25
27
  from arkindex_worker.worker.metadata import MetaDataMixin, MetaType # noqa: F401
26
28
  from arkindex_worker.worker.task import TaskMixin
27
29
  from arkindex_worker.worker.transcription import TranscriptionMixin
@@ -57,11 +59,13 @@ class ActivityState(Enum):
57
59
  class ElementsWorker(
58
60
  BaseWorker,
59
61
  ClassificationMixin,
62
+ CorpusMixin,
60
63
  ElementMixin,
61
64
  TranscriptionMixin,
62
65
  WorkerVersionMixin,
63
66
  EntityMixin,
64
67
  MetaDataMixin,
68
+ ImageMixin,
65
69
  ):
66
70
  """
67
71
  Base class for ML workers that operate on Arkindex elements.
@@ -88,7 +92,7 @@ class ElementsWorker(
88
92
  )
89
93
  self.parser.add_argument(
90
94
  "--element",
91
- type=uuid.UUID,
95
+ type=str,
92
96
  nargs="+",
93
97
  help="One or more Arkindex element ID",
94
98
  )
@@ -107,11 +111,23 @@ class ElementsWorker(
107
111
  the cache database when enabled.
108
112
 
109
113
  :return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
110
- and a list of strings representing element IDs otherwise.
114
+ or a list of strings representing element IDs otherwise.
111
115
  """
112
116
  assert not (
113
117
  self.args.elements_list and self.args.element
114
118
  ), "elements-list and element CLI args shouldn't be both set"
119
+
120
+ def invalid_element_id(value: str) -> bool:
121
+ """
122
+ Return whether the ID of an element is a valid UUID or not
123
+ """
124
+ try:
125
+ uuid.UUID(value)
126
+ except Exception:
127
+ return True
128
+
129
+ return False
130
+
115
131
  out = []
116
132
 
117
133
  # Load from the cache when available
@@ -131,6 +147,11 @@ class ElementsWorker(
131
147
  elif self.args.element:
132
148
  out += self.args.element
133
149
 
150
+ invalid_element_ids = list(filter(invalid_element_id, out))
151
+ assert (
152
+ not invalid_element_ids
153
+ ), f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
154
+
134
155
  return out
135
156
 
136
157
  @property
@@ -13,14 +13,8 @@ from tempfile import mkdtemp
13
13
  import gnupg
14
14
  import yaml
15
15
  from apistar.exceptions import ErrorResponse
16
- from tenacity import (
17
- before_sleep_log,
18
- retry,
19
- retry_if_exception,
20
- stop_after_attempt,
21
- wait_exponential,
22
- )
23
16
 
17
+ from arkindex import options_from_env
24
18
  from arkindex_worker import logger
25
19
  from arkindex_worker.cache import (
26
20
  check_version,
@@ -30,7 +24,7 @@ from arkindex_worker.cache import (
30
24
  merge_parents_cache,
31
25
  )
32
26
  from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
33
- from teklia_toolbox.requests import _get_arkindex_client, _is_500_error
27
+ from teklia_toolbox.requests import get_arkindex_client
34
28
 
35
29
 
36
30
  class ExtrasDirNotFoundError(Exception):
@@ -185,7 +179,7 @@ class BaseWorker:
185
179
  Create an ArkindexClient to make API requests towards Arkindex instances.
186
180
  """
187
181
  # Build Arkindex API client from environment variables
188
- self.api_client = _get_arkindex_client()
182
+ self.api_client = get_arkindex_client(**options_from_env())
189
183
  logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
190
184
 
191
185
  def configure_for_developers(self):
@@ -477,23 +471,9 @@ class BaseWorker:
477
471
  # Clean up
478
472
  shutil.rmtree(base_extracted_path)
479
473
 
480
- @retry(
481
- retry=retry_if_exception(_is_500_error),
482
- wait=wait_exponential(multiplier=2, min=3),
483
- reraise=True,
484
- stop=stop_after_attempt(5),
485
- before_sleep=before_sleep_log(logger, logging.INFO),
486
- )
487
474
  def request(self, *args, **kwargs):
488
475
  """
489
476
  Wrapper around the ``ArkindexClient.request`` method.
490
-
491
- The API call will be retried up to 5 times in case of HTTP 5xx errors,
492
- with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
493
- If the 5th call still causes an HTTP 5xx error, the exception is re-raised
494
- and the caller should catch it.
495
-
496
- Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
497
477
  """
498
478
  return self.api_client.request(*args, **kwargs)
499
479
 
@@ -0,0 +1,69 @@
1
+ """
2
+ BaseWorker methods for corpora.
3
+ """
4
+
5
+ from enum import Enum
6
+ from operator import itemgetter
7
+ from tempfile import _TemporaryFileWrapper
8
+
9
+ from arkindex_worker import logger
10
+
11
+
12
+ class CorpusExportState(Enum):
13
+ """
14
+ State of a corpus export.
15
+ """
16
+
17
+ Created = "created"
18
+ """
19
+ The corpus export is created, awaiting its processing.
20
+ """
21
+
22
+ Running = "running"
23
+ """
24
+ The corpus export is being built.
25
+ """
26
+
27
+ Failed = "failed"
28
+ """
29
+ The corpus export failed.
30
+ """
31
+
32
+ Done = "done"
33
+ """
34
+ The corpus export ended in success.
35
+ """
36
+
37
+
38
+ class CorpusMixin:
39
+ def download_latest_export(self) -> _TemporaryFileWrapper:
40
+ """
41
+ Download the latest export in `done` state of the current corpus.
42
+
43
+ :returns: The downloaded export stored in a temporary file.
44
+ """
45
+ # List all exports on the corpus
46
+ exports = self.api_client.paginate("ListExports", id=self.corpus_id)
47
+
48
+ # Find the latest that is in "done" state
49
+ exports: list[dict] = sorted(
50
+ list(
51
+ filter(
52
+ lambda export: export["state"] == CorpusExportState.Done.value,
53
+ exports,
54
+ )
55
+ ),
56
+ key=itemgetter("updated"),
57
+ reverse=True,
58
+ )
59
+ assert (
60
+ len(exports) > 0
61
+ ), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
62
+
63
+ # Download latest export
64
+ export_id: str = exports[0]["id"]
65
+ logger.info(f"Downloading export ({export_id})...")
66
+ export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
67
+ logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
68
+
69
+ return export
@@ -0,0 +1,21 @@
1
+ """
2
+ ElementsWorker methods for images.
3
+ """
4
+
5
+ from arkindex_worker.models import Image
6
+
7
+
8
+ class ImageMixin:
9
+ def create_iiif_url(self, url: str) -> Image:
10
+ """
11
+ Create an image from an existing IIIF image by URL.
12
+ The URL should be of the image's identifier, not of its Image Information request (`/info.json`).
13
+
14
+ :param url: URL of the image.
15
+ :returns: The created image.
16
+ """
17
+ assert url and isinstance(
18
+ url, str
19
+ ), "url shouldn't be null and should be of type str"
20
+
21
+ return Image(self.api_client.request("CreateIIIFURL", body={"url": url}))
@@ -83,6 +83,9 @@ class TrainingMixin:
83
83
 
84
84
  @property
85
85
  def is_finetuning(self) -> bool:
86
+ """
87
+ Whether or not this worker is fine-tuning an existing model version.
88
+ """
86
89
  return bool(self.model_version_id)
87
90
 
88
91
  @skip_if_read_only
@@ -280,8 +283,17 @@ class TrainingMixin:
280
283
  },
281
284
  )
282
285
  except ErrorResponse as e:
286
+ # Temporary fix while waiting for `ValidateModelVersion` refactoring as it can
287
+ # return errors even when the model version is properly validated
288
+ if e.status_code in [403, 500]:
289
+ logger.warning(
290
+ f'An error occurred while validating model version {self.model_version["id"]}, please check its status.'
291
+ )
292
+ return
293
+
283
294
  if e.status_code != 409:
284
295
  raise e
296
+
285
297
  logger.warning(
286
298
  f"An available model version exists with hash {hash}, using it instead of the pending version."
287
299
  )
@@ -4,16 +4,15 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arkindex-base-worker"
7
- version = "0.3.7rc10"
7
+ version = "0.4.0a1"
8
8
  description = "Base Worker to easily build Arkindex ML workflows"
9
9
  license = { file = "LICENSE" }
10
10
  dependencies = [
11
- "peewee==3.17.0",
12
- "Pillow==10.2.0",
13
- "pymdown-extensions==10.7",
11
+ "peewee~=3.17",
12
+ "Pillow==10.3.0",
14
13
  "python-gnupg==0.5.2",
15
14
  "shapely==2.0.3",
16
- "teklia-toolbox==0.1.4",
15
+ "teklia-toolbox==0.1.5",
17
16
  "zstandard==0.22.0",
18
17
  ]
19
18
  authors = [
@@ -32,8 +31,6 @@ classifiers = [
32
31
  "Programming Language :: Python :: 3 :: Only",
33
32
  "Programming Language :: Python :: 3.10",
34
33
  "Programming Language :: Python :: 3.11",
35
- # Topics
36
- "Topic :: Text Processing :: Linguistic",
37
34
  ]
38
35
 
39
36
  [project.urls]
@@ -45,17 +42,13 @@ Authors = "https://teklia.com"
45
42
 
46
43
  [project.optional-dependencies]
47
44
  docs = [
48
- "black==24.2.0",
49
- "doc8==1.1.1",
50
- "mkdocs==1.5.3",
51
- "mkdocs-material==9.5.10",
52
- "mkdocstrings==0.24.0",
53
- "mkdocstrings-python==1.8.0",
54
- "recommonmark==0.7.1",
45
+ "black==24.4.0",
46
+ "mkdocs-material==9.5.17",
47
+ "mkdocstrings-python==1.9.2",
55
48
  ]
56
49
  tests = [
57
- "pytest==8.0.1",
58
- "pytest-mock==3.12.0",
50
+ "pytest==8.1.1",
51
+ "pytest-mock==3.14.0",
59
52
  "pytest-responses==0.5.1",
60
53
  ]
61
54
 
@@ -114,7 +107,3 @@ known-third-party = [
114
107
  "yaml",
115
108
  "zstandard",
116
109
  ]
117
-
118
- [tool.doc8]
119
- ignore-path=["*.egg-info", "public", ".git"]
120
- extensions=[".md"]
@@ -0,0 +1,8 @@
1
+ from pathlib import Path
2
+
3
+ BASE_DIR = Path(__file__).resolve().parent
4
+ FIXTURES_DIR = BASE_DIR / "data"
5
+ SAMPLES_DIR = BASE_DIR / "samples"
6
+
7
+ CORPUS_ID = "11111111-1111-1111-1111-111111111111"
8
+ PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"
@@ -26,11 +26,7 @@ from arkindex_worker.models import Artifact, Dataset, Set
26
26
  from arkindex_worker.worker import BaseWorker, DatasetWorker, ElementsWorker
27
27
  from arkindex_worker.worker.dataset import DatasetState
28
28
  from arkindex_worker.worker.transcription import TextOrientation
29
-
30
- FIXTURES_DIR = Path(__file__).resolve().parent / "data"
31
- SAMPLES_DIR = Path(__file__).resolve().parent / "samples"
32
-
33
- PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"
29
+ from tests import CORPUS_ID, FIXTURES_DIR, PROCESS_ID, SAMPLES_DIR
34
30
 
35
31
  __yaml_cache = {}
36
32
 
@@ -93,7 +89,7 @@ def _setup_api(responses, monkeypatch, _cache_yaml):
93
89
 
94
90
  # Fallback to prod environment
95
91
  if schema_url is None:
96
- schema_url = "https://arkindex.teklia.com/api/v1/openapi/?format=openapi-json"
92
+ schema_url = "https://arkindex.teklia.com/api/v1/openapi/?format=json"
97
93
  monkeypatch.setenv("ARKINDEX_API_SCHEMA_URL", schema_url)
98
94
 
99
95
  # Allow accessing remote API schemas
@@ -159,7 +155,7 @@ def _mock_worker_run_api(responses):
159
155
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
160
156
  "state": "running",
161
157
  "mode": "workers",
162
- "corpus": "11111111-1111-1111-1111-111111111111",
158
+ "corpus": CORPUS_ID,
163
159
  "use_cache": False,
164
160
  "activity_state": "ready",
165
161
  "model_id": None,
@@ -226,7 +222,7 @@ def _mock_worker_run_no_revision_api(responses):
226
222
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
227
223
  "state": "running",
228
224
  "mode": "workers",
229
- "corpus": "11111111-1111-1111-1111-111111111111",
225
+ "corpus": CORPUS_ID,
230
226
  "use_cache": False,
231
227
  "activity_state": "ready",
232
228
  "model_id": None,
@@ -11,7 +11,7 @@ from arkindex.mock import MockApiClient
11
11
  from arkindex_worker import logger
12
12
  from arkindex_worker.worker import BaseWorker, ElementsWorker
13
13
  from arkindex_worker.worker.base import ExtrasDirNotFoundError
14
- from tests.conftest import FIXTURES_DIR
14
+ from tests import CORPUS_ID, FIXTURES_DIR
15
15
 
16
16
 
17
17
  def test_init_default_local_share():
@@ -178,7 +178,7 @@ def test_configure_worker_run(mocker, responses, caplog):
178
178
  "model_version": None,
179
179
  "process": {
180
180
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
181
- "corpus": "11111111-1111-1111-1111-111111111111",
181
+ "corpus": CORPUS_ID,
182
182
  },
183
183
  "summary": "Worker Fake worker @ 123412",
184
184
  }
@@ -270,7 +270,7 @@ def test_configure_user_configuration_defaults(mocker, responses):
270
270
  "model_version": None,
271
271
  "process": {
272
272
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
273
- "corpus": "11111111-1111-1111-1111-111111111111",
273
+ "corpus": CORPUS_ID,
274
274
  },
275
275
  "summary": "Worker Fake worker @ 123412",
276
276
  }
@@ -319,7 +319,7 @@ def test_configure_user_config_debug(mocker, responses, debug):
319
319
  },
320
320
  "process": {
321
321
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
322
- "corpus": "11111111-1111-1111-1111-111111111111",
322
+ "corpus": CORPUS_ID,
323
323
  },
324
324
  "summary": "Worker Fake worker @ 123412",
325
325
  }
@@ -367,7 +367,7 @@ def test_configure_worker_run_missing_conf(mocker, responses):
367
367
  "configuration": {"id": "bbbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", "name": "BBB"},
368
368
  "process": {
369
369
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
370
- "corpus": "11111111-1111-1111-1111-111111111111",
370
+ "corpus": CORPUS_ID,
371
371
  },
372
372
  "summary": "Worker Fake worker @ 123412",
373
373
  }
@@ -409,7 +409,7 @@ def test_configure_worker_run_no_worker_run_conf(mocker, responses):
409
409
  "configuration": None,
410
410
  "process": {
411
411
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
412
- "corpus": "11111111-1111-1111-1111-111111111111",
412
+ "corpus": CORPUS_ID,
413
413
  },
414
414
  "summary": "Worker Fake worker @ 123412",
415
415
  }
@@ -458,7 +458,7 @@ def test_configure_load_model_configuration(mocker, responses):
458
458
  },
459
459
  "process": {
460
460
  "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
461
- "corpus": "11111111-1111-1111-1111-111111111111",
461
+ "corpus": CORPUS_ID,
462
462
  },
463
463
  "summary": "Worker Fake worker @ 123412",
464
464
  }
@@ -669,8 +669,7 @@ def test_find_parents_file_paths(responses, mock_base_worker_with_cache, tmp_pat
669
669
  ):
670
670
  (tmp_path / parent_id).mkdir()
671
671
  file_path = tmp_path / parent_id / filename
672
- with file_path.open("w", encoding="utf-8") as f:
673
- f.write(content)
672
+ file_path.write_text(content)
674
673
 
675
674
  # Configure worker with a specific data directory
676
675
  mock_base_worker_with_cache.task_data_dir = tmp_path