arkindex-base-worker 0.3.7rc10__py3-none-any.whl → 0.4.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/METADATA +10 -16
- arkindex_base_worker-0.4.0a1.dist-info/RECORD +51 -0
- arkindex_worker/models.py +6 -0
- arkindex_worker/utils.py +3 -4
- arkindex_worker/worker/__init__.py +23 -2
- arkindex_worker/worker/base.py +3 -23
- arkindex_worker/worker/corpus.py +69 -0
- arkindex_worker/worker/image.py +21 -0
- arkindex_worker/worker/training.py +12 -0
- tests/__init__.py +8 -0
- tests/conftest.py +4 -8
- tests/test_base_worker.py +8 -9
- tests/test_dataset_worker.py +14 -41
- tests/test_elements_worker/test_classifications.py +22 -39
- tests/test_elements_worker/test_cli.py +3 -11
- tests/test_elements_worker/test_corpus.py +137 -0
- tests/test_elements_worker/test_dataset.py +6 -11
- tests/test_elements_worker/test_elements.py +106 -85
- tests/test_elements_worker/test_entities.py +15 -39
- tests/test_elements_worker/test_image.py +65 -0
- tests/test_elements_worker/test_metadata.py +6 -40
- tests/test_elements_worker/test_task.py +7 -17
- tests/test_elements_worker/test_training.py +35 -0
- tests/test_elements_worker/test_transcriptions.py +10 -27
- tests/test_elements_worker/test_worker.py +2 -1
- tests/test_image.py +3 -5
- arkindex_base_worker-0.3.7rc10.dist-info/RECORD +0 -47
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/LICENSE +0 -0
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/WHEEL +0 -0
- {arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/top_level.txt +0 -0
{arkindex_base_worker-0.3.7rc10.dist-info → arkindex_base_worker-0.4.0a1.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.3.7rc10
+Version: 0.4.0a1
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -37,28 +37,22 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
-Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: peewee
-Requires-Dist: Pillow ==10.
-Requires-Dist: pymdown-extensions ==10.7
+Requires-Dist: peewee ~=3.17
+Requires-Dist: Pillow ==10.3.0
 Requires-Dist: python-gnupg ==0.5.2
 Requires-Dist: shapely ==2.0.3
-Requires-Dist: teklia-toolbox ==0.1.
+Requires-Dist: teklia-toolbox ==0.1.5
 Requires-Dist: zstandard ==0.22.0
 Provides-Extra: docs
-Requires-Dist: black ==24.
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: mkdocs-material ==9.5.10 ; extra == 'docs'
-Requires-Dist: mkdocstrings ==0.24.0 ; extra == 'docs'
-Requires-Dist: mkdocstrings-python ==1.8.0 ; extra == 'docs'
-Requires-Dist: recommonmark ==0.7.1 ; extra == 'docs'
+Requires-Dist: black ==24.4.0 ; extra == 'docs'
+Requires-Dist: mkdocs-material ==9.5.17 ; extra == 'docs'
+Requires-Dist: mkdocstrings-python ==1.9.2 ; extra == 'docs'
 Provides-Extra: tests
-Requires-Dist: pytest ==8.
-Requires-Dist: pytest-mock ==3.
+Requires-Dist: pytest ==8.1.1 ; extra == 'tests'
+Requires-Dist: pytest-mock ==3.14.0 ; extra == 'tests'
 Requires-Dist: pytest-responses ==0.5.1 ; extra == 'tests'
 
 # Arkindex base Worker
@@ -73,7 +67,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
 
 ## Create a new worker using our template
 
-```
+```shell
 pip install --user cookiecutter
 cookiecutter git@gitlab.teklia.com:workers/base-worker.git
 ```
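The `docs` and `tests` extras above are opt-in dependency groups. As a quick illustration of how a consumer would pull one in (the exact command is ours, not part of the package):

```shell
# Install the pre-release together with its test dependencies
pip install "arkindex-base-worker[tests]==0.4.0a1"
```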
arkindex_base_worker-0.4.0a1.dist-info/RECORD
ADDED
@@ -0,0 +1,51 @@
+arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
+arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
+arkindex_worker/image.py,sha256=5ymIGaTm2D7Sp2YYQkbuheuGnx5VJo0_AzYAEIvNGhs,14267
+arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
+arkindex_worker/utils.py,sha256=KXWIACda7D3IpdToaAplLoAgnCK8bKWw7aWUyq-IWUA,7211
+arkindex_worker/worker/__init__.py,sha256=3sJ_EPB7yG-kPfgunbm2B7B7DzoeOi5ZNpQwC_3QuZ0,19429
+arkindex_worker/worker/base.py,sha256=c9u37W1BNHt5RoQV2ZrYUYv6tBs-CjiSgUAAg7p7GA0,18876
+arkindex_worker/worker/classification.py,sha256=JVz-6YEeuavOy7zGfQi4nE_wpj9hwMUZDXTem-hXQY8,10328
+arkindex_worker/worker/corpus.py,sha256=ZHAAYE4PRPXqqaZm71wjrsxYETFqU6TAz-3VYgIXzac,1794
+arkindex_worker/worker/dataset.py,sha256=roX2IMMNA-icteTtRADiFSZiZSRPClqS62ZPJm9s2JI,2923
+arkindex_worker/worker/element.py,sha256=AWK3YJSHWy3j4ajntJloi_2X4zxsgXZ6c6dzphgq3OI,33848
+arkindex_worker/worker/entity.py,sha256=suhycfikC9oTPEWmX48_cnvFEw-Wu5zBA8n_00K4KUk,14714
+arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
+arkindex_worker/worker/metadata.py,sha256=Bouuc_JaXogKykVXOTKDVP3tX--OUQeHoazxIGrGrJI,6702
+arkindex_worker/worker/task.py,sha256=cz3wJNPgogZv1lm_3lm7WScitQtYQtL6H6I7Xokq208,1475
+arkindex_worker/worker/training.py,sha256=hkwCBjVE4bByXzHUmCZF73Bl5JxARdXWjYgFE6ydAT0,10749
+arkindex_worker/worker/transcription.py,sha256=6R7ofcGnNqX4rjT0kRKIE-G9FHq2TJ1tfztNM5sTqYE,20464
+arkindex_worker/worker/version.py,sha256=cs2pdlDxpKRO2Oldvcu54w-D_DQhf1cdeEt4tKX_QYs,1927
+hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
+tests/__init__.py,sha256=6aeTMHf4q_dKY4jIZWg1KT70VKaLvVlzCxh-Uu_cWiQ,241
+tests/conftest.py,sha256=-ZQTV4rg7TgW84-5Ioqndqv8byNILfDOpyUt8wecEiI,21967
+tests/test_base_worker.py,sha256=qG45O3nPbASXN5a5RadXU1BAXn3EIaTK6Hvjj3s4Ozs,24292
+tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
+tests/test_dataset_worker.py,sha256=d9HG36qnO5HXu9vQ0UTBvdTSRR21FVq1FNoXM-vZbPk,22105
+tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
+tests/test_image.py,sha256=Fs9vKYgQ7mEFylbzI4YIO_JyOLeAcs-WxUXpzewxCd8,16188
+tests/test_merge.py,sha256=Q4zCbtZbe0wBfqE56gvAD06c6pDuhqnjKaioFqIgAQw,8331
+tests/test_utils.py,sha256=vpeHMeL7bJQonv5ZEbJmlJikqVKn5VWlVEbvmYFzDYA,1650
+tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
+tests/test_elements_worker/test_classifications.py,sha256=DYRKhPpplFp144GCXKyFG1hz4Ra9vk5FiAN6dhfMP6k,25511
+tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
+tests/test_elements_worker/test_corpus.py,sha256=c_LUHvkJIYgk_wXF06VQPNOoWfiZ06XpjOXrJ7MRiBc,4479
+tests/test_elements_worker/test_dataset.py,sha256=lSXqubhg1EEq2Y2goE8Y2RYaqIpM9Iejq6fGNW2BczU,11411
+tests/test_elements_worker/test_elements.py,sha256=2_kdeo99biCH3Uez6HB8ltS_iIizZ7ir5uOkFjIXfjM,84812
+tests/test_elements_worker/test_entities.py,sha256=jirb_IKAMqMhwxeDgjO-rsr1fTP9GdXwuyhncUjCJFM,33494
+tests/test_elements_worker/test_image.py,sha256=_E3UGdDOwTo1MW5KMS81PrdeSPBPWinWYoQPNy2F9Ro,2077
+tests/test_elements_worker/test_metadata.py,sha256=-cZhlVAh4o2uRnHz8fPf_thfavRnJrtJYN_p4BmHISU,17566
+tests/test_elements_worker/test_task.py,sha256=7Sr3fbjdgWUXJUhJEiC9CwnbhQIQX3rCInmHMIrmA38,5573
+tests/test_elements_worker/test_training.py,sha256=wVYWdMdeSA6T2XyhH5AJJNGemYq3LOViiZvj0dblACA,9468
+tests/test_elements_worker/test_transcriptions.py,sha256=7HDkIW8IDK7pKAfpSdAPB7YOyKyeBJTn2_alvVK46SA,72411
+tests/test_elements_worker/test_worker.py,sha256=AwdP8uSXNQ_SJavXxJV2s3_J3OiCafShVjMV1dgt4xo,17162
+worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
+worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
+worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
+worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
+arkindex_base_worker-0.4.0a1.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.4.0a1.dist-info/METADATA,sha256=PBTlbhWTCvvkkcGqQew6yvJIdncf9mKZ71yI_QSX2iM,3269
+arkindex_base_worker-0.4.0a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+arkindex_base_worker-0.4.0a1.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
+arkindex_base_worker-0.4.0a1.dist-info/RECORD,,
arkindex_worker/models.py
CHANGED
arkindex_worker/utils.py
CHANGED
@@ -5,7 +5,6 @@ import tarfile
 import tempfile
 from pathlib import Path
 
-import zstandard
 import zstandard as zstd
 
 logger = logging.getLogger(__name__)
@@ -38,7 +37,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
     :param compressed_archive: Path to the target ZST-compressed archive
     :return: File descriptor and path to the uncompressed tar archive
     """
-    dctx = zstandard.ZstdDecompressor()
+    dctx = zstd.ZstdDecompressor()
     archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
     archive_path = Path(archive_path)
 
@@ -50,7 +49,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
         ):
             dctx.copy_stream(compressed, decompressed)
         logger.debug(f"Successfully uncompressed archive {compressed_archive}")
-    except zstandard.ZstdError as e:
+    except zstd.ZstdError as e:
        raise Exception(f"Couldn't uncompressed archive: {e}") from e
 
     return archive_fd, archive_path
@@ -129,7 +128,7 @@ def zstd_compress(
                archive_hasher.update(compressed_chunk)
                archive_file.write(compressed_chunk)
        logger.debug(f"Successfully compressed {source}")
-    except zstandard.ZstdError as e:
+    except zstd.ZstdError as e:
        raise Exception(f"Couldn't compress archive: {e}") from e
    return file_d, destination, archive_hasher.hexdigest()
 
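The `zstandard` package is now referenced only through its `zstd` alias. A minimal self-contained sketch of the decompression pattern used by `decompress_zst_archive`, assuming a `.tar.zst` archive on disk (the helper name and paths are ours):

```python
import tempfile
from pathlib import Path

import zstandard as zstd


def decompress_to_tar(compressed_archive: Path) -> Path:
    """Stream-decompress a .tar.zst archive into a temporary .tar file."""
    dctx = zstd.ZstdDecompressor()
    _, archive_path = tempfile.mkstemp(suffix=".tar")
    try:
        with compressed_archive.open("rb") as compressed, open(
            archive_path, "wb"
        ) as decompressed:
            # copy_stream reads compressed bytes and writes decompressed output
            dctx.copy_stream(compressed, decompressed)
    except zstd.ZstdError as e:
        raise Exception(f"Couldn't uncompress archive: {e}") from e
    return Path(archive_path)
```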
arkindex_worker/worker/__init__.py
CHANGED
@@ -19,9 +19,11 @@ from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
+from arkindex_worker.worker.corpus import CorpusMixin
 from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
+from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
@@ -57,11 +59,13 @@ class ActivityState(Enum):
 class ElementsWorker(
     BaseWorker,
     ClassificationMixin,
+    CorpusMixin,
     ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
     MetaDataMixin,
+    ImageMixin,
 ):
     """
     Base class for ML workers that operate on Arkindex elements.
@@ -88,7 +92,7 @@ class ElementsWorker(
         )
         self.parser.add_argument(
             "--element",
-            type=uuid.UUID,
+            type=str,
             nargs="+",
             help="One or more Arkindex element ID",
         )
@@ -107,11 +111,23 @@
         the cache database when enabled.
 
         :return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
-
+            or a list of strings representing element IDs otherwise.
         """
         assert not (
             self.args.elements_list and self.args.element
         ), "elements-list and element CLI args shouldn't be both set"
+
+        def invalid_element_id(value: str) -> bool:
+            """
+            Return whether the ID of an element is a valid UUID or not
+            """
+            try:
+                uuid.UUID(value)
+            except Exception:
+                return True
+
+            return False
+
         out = []
 
         # Load from the cache when available
@@ -131,6 +147,11 @@ class ElementsWorker(
         elif self.args.element:
             out += self.args.element
 
+        invalid_element_ids = list(filter(invalid_element_id, out))
+        assert (
+            not invalid_element_ids
+        ), f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
+
         return out
 
     @property
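`--element` values are now accepted as plain strings and validated by hand, which lets the worker report every invalid ID at once instead of failing on the first one at argparse time. The same check, as a standalone sketch with made-up IDs:

```python
import uuid


def invalid_element_id(value: str) -> bool:
    # Valid element IDs are UUIDs; anything unparseable is flagged
    try:
        uuid.UUID(value)
    except Exception:
        return True
    return False


ids = ["cafecafe-cafe-cafe-cafe-cafecafecafe", "not-a-uuid"]
assert list(filter(invalid_element_id, ids)) == ["not-a-uuid"]
```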
arkindex_worker/worker/base.py
CHANGED
@@ -13,14 +13,8 @@ from tempfile import mkdtemp
 import gnupg
 import yaml
 from apistar.exceptions import ErrorResponse
-from tenacity import (
-    before_sleep_log,
-    retry,
-    retry_if_exception,
-    stop_after_attempt,
-    wait_exponential,
-)
 
+from arkindex import options_from_env
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -30,7 +24,7 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-from teklia_toolbox.requests import
+from teklia_toolbox.requests import get_arkindex_client
 
 
 class ExtrasDirNotFoundError(Exception):
@@ -185,7 +179,7 @@ class BaseWorker:
         Create an ArkindexClient to make API requests towards Arkindex instances.
         """
         # Build Arkindex API client from environment variables
-        self.api_client =
+        self.api_client = get_arkindex_client(**options_from_env())
         logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
 
     def configure_for_developers(self):
@@ -477,23 +471,9 @@
         # Clean up
         shutil.rmtree(base_extracted_path)
 
-    @retry(
-        retry=retry_if_exception(_is_500_error),
-        wait=wait_exponential(multiplier=2, min=3),
-        reraise=True,
-        stop=stop_after_attempt(5),
-        before_sleep=before_sleep_log(logger, logging.INFO),
-    )
     def request(self, *args, **kwargs):
         """
         Wrapper around the ``ArkindexClient.request`` method.
-
-        The API call will be retried up to 5 times in case of HTTP 5xx errors,
-        with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
-        If the 5th call still causes an HTTP 5xx error, the exception is re-raised
-        and the caller should catch it.
-
-        Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
         """
         return self.api_client.request(*args, **kwargs)
 
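The retry decorator removed from `request` relied on tenacity; retries presumably now live behind `teklia_toolbox`'s `get_arkindex_client`. For reference, a self-contained sketch of the removed pattern, where the `_is_500_error` predicate is reimplemented as an assumption:

```python
import logging

from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
)

logger = logging.getLogger(__name__)


def _is_500_error(exc: BaseException) -> bool:
    # Assumed predicate: retry only on HTTP 5xx error responses
    return 500 <= getattr(exc, "status_code", 0) < 600


@retry(
    retry=retry_if_exception(_is_500_error),
    wait=wait_exponential(multiplier=2, min=3),
    reraise=True,
    stop=stop_after_attempt(5),
    before_sleep=before_sleep_log(logger, logging.INFO),
)
def flaky_request():
    ...  # e.g. api_client.request(*args, **kwargs)
```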
arkindex_worker/worker/corpus.py
ADDED
@@ -0,0 +1,69 @@
+"""
+BaseWorker methods for corpora.
+"""
+
+from enum import Enum
+from operator import itemgetter
+from tempfile import _TemporaryFileWrapper
+
+from arkindex_worker import logger
+
+
+class CorpusExportState(Enum):
+    """
+    State of a corpus export.
+    """
+
+    Created = "created"
+    """
+    The corpus export is created, awaiting its processing.
+    """
+
+    Running = "running"
+    """
+    The corpus export is being built.
+    """
+
+    Failed = "failed"
+    """
+    The corpus export failed.
+    """
+
+    Done = "done"
+    """
+    The corpus export ended in success.
+    """
+
+
+class CorpusMixin:
+    def download_latest_export(self) -> _TemporaryFileWrapper:
+        """
+        Download the latest export in `done` state of the current corpus.
+
+        :returns: The downloaded export stored in a temporary file.
+        """
+        # List all exports on the corpus
+        exports = self.api_client.paginate("ListExports", id=self.corpus_id)
+
+        # Find the latest that is in "done" state
+        exports: list[dict] = sorted(
+            list(
+                filter(
+                    lambda export: export["state"] == CorpusExportState.Done.value,
+                    exports,
+                )
+            ),
+            key=itemgetter("updated"),
+            reverse=True,
+        )
+        assert (
+            len(exports) > 0
+        ), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
+
+        # Download latest export
+        export_id: str = exports[0]["id"]
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
+
+        return export
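Since `ElementsWorker` now mixes in `CorpusMixin`, any elements worker can fetch the latest finished export of its corpus. A hedged usage sketch — the worker class and its body are ours, not part of the package:

```python
from arkindex_worker.models import Element
from arkindex_worker.worker import ElementsWorker


class DemoWorker(ElementsWorker):
    def process_element(self, element: Element):
        # Downloads the most recent export in "done" state for self.corpus_id
        export = self.download_latest_export()
        # The export is a named temporary file; .name is its on-disk path
        print(f"Corpus export for {element.id} available at {export.name}")
```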
arkindex_worker/worker/image.py
ADDED
@@ -0,0 +1,21 @@
+"""
+ElementsWorker methods for images.
+"""
+
+from arkindex_worker.models import Image
+
+
+class ImageMixin:
+    def create_iiif_url(self, url: str) -> Image:
+        """
+        Create an image from an existing IIIF image by URL.
+        The URL should be of the image's identifier, not of its Image Information request (`/info.json`).
+
+        :param url: URL of the image.
+        :returns: The created image.
+        """
+        assert url and isinstance(
+            url, str
+        ), "url shouldn't be null and should be of type str"
+
+        return Image(self.api_client.request("CreateIIIFURL", body={"url": url}))
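`create_iiif_url` registers an existing IIIF image on Arkindex from inside a worker. A small hedged example; the URL is made up, and we assume the `CreateIIIFURL` response carries the usual image fields:

```python
# Inside an ElementsWorker method; ImageMixin is now part of ElementsWorker
image = self.create_iiif_url("https://iiif.example.com/iiif/2/some-image")
# `image` wraps the CreateIIIFURL API response
print(image["id"])
```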
arkindex_worker/worker/training.py
CHANGED
@@ -83,6 +83,9 @@ class TrainingMixin:
 
     @property
     def is_finetuning(self) -> bool:
+        """
+        Whether or not this worker is fine-tuning an existing model version.
+        """
         return bool(self.model_version_id)
 
     @skip_if_read_only
@@ -280,8 +283,17 @@
                 },
             )
         except ErrorResponse as e:
+            # Temporary fix while waiting for `ValidateModelVersion` refactoring as it can
+            # return errors even when the model version is properly validated
+            if e.status_code in [403, 500]:
+                logger.warning(
+                    f'An error occurred while validating model version {self.model_version["id"]}, please check its status.'
+                )
+                return
+
             if e.status_code != 409:
                 raise e
+
             logger.warning(
                 f"An available model version exists with hash {hash}, using it instead of the pending version."
             )
tests/__init__.py
CHANGED
tests/conftest.py
CHANGED
@@ -26,11 +26,7 @@ from arkindex_worker.models import Artifact, Dataset, Set
 from arkindex_worker.worker import BaseWorker, DatasetWorker, ElementsWorker
 from arkindex_worker.worker.dataset import DatasetState
 from arkindex_worker.worker.transcription import TextOrientation
-
-FIXTURES_DIR = Path(__file__).resolve().parent / "data"
-SAMPLES_DIR = Path(__file__).resolve().parent / "samples"
-
-PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"
+from tests import CORPUS_ID, FIXTURES_DIR, PROCESS_ID, SAMPLES_DIR
 
 __yaml_cache = {}
 
@@ -93,7 +89,7 @@ def _setup_api(responses, monkeypatch, _cache_yaml):
 
     # Fallback to prod environment
     if schema_url is None:
-        schema_url = "https://arkindex.teklia.com/api/v1/openapi/?format=
+        schema_url = "https://arkindex.teklia.com/api/v1/openapi/?format=json"
     monkeypatch.setenv("ARKINDEX_API_SCHEMA_URL", schema_url)
 
     # Allow accessing remote API schemas
@@ -159,7 +155,7 @@ def _mock_worker_run_api(responses):
         "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
         "state": "running",
         "mode": "workers",
-        "corpus":
+        "corpus": CORPUS_ID,
         "use_cache": False,
         "activity_state": "ready",
         "model_id": None,
@@ -226,7 +222,7 @@ def _mock_worker_run_no_revision_api(responses):
         "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
         "state": "running",
         "mode": "workers",
-        "corpus":
+        "corpus": CORPUS_ID,
         "use_cache": False,
         "activity_state": "ready",
         "model_id": None,
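The fixture constants previously defined in `tests/conftest.py` now come from the new `tests/__init__.py` (+8 lines per the summary, diff not shown). A plausible reconstruction: `FIXTURES_DIR`, `SAMPLES_DIR` and `PROCESS_ID` are copied from the removed conftest lines, while the `CORPUS_ID` value is an assumption:

```python
from pathlib import Path

FIXTURES_DIR = Path(__file__).resolve().parent / "data"
SAMPLES_DIR = Path(__file__).resolve().parent / "samples"

CORPUS_ID = "11111111-1111-1111-1111-111111111111"  # hypothetical value
PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"
```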
tests/test_base_worker.py
CHANGED
@@ -11,7 +11,7 @@ from arkindex.mock import MockApiClient
 from arkindex_worker import logger
 from arkindex_worker.worker import BaseWorker, ElementsWorker
 from arkindex_worker.worker.base import ExtrasDirNotFoundError
-from tests
+from tests import CORPUS_ID, FIXTURES_DIR
 
 
 def test_init_default_local_share():
@@ -178,7 +178,7 @@ def test_configure_worker_run(mocker, responses, caplog):
         "model_version": None,
         "process": {
             "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
-            "corpus":
+            "corpus": CORPUS_ID,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -270,7 +270,7 @@ def test_configure_user_configuration_defaults(mocker, responses):
         "model_version": None,
         "process": {
             "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
-            "corpus":
+            "corpus": CORPUS_ID,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -319,7 +319,7 @@ def test_configure_user_config_debug(mocker, responses, debug):
         },
         "process": {
             "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
-            "corpus":
+            "corpus": CORPUS_ID,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -367,7 +367,7 @@ def test_configure_worker_run_missing_conf(mocker, responses):
         "configuration": {"id": "bbbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", "name": "BBB"},
         "process": {
             "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
-            "corpus":
+            "corpus": CORPUS_ID,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -409,7 +409,7 @@ def test_configure_worker_run_no_worker_run_conf(mocker, responses):
         "configuration": None,
         "process": {
             "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
-            "corpus":
+            "corpus": CORPUS_ID,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -458,7 +458,7 @@ def test_configure_load_model_configuration(mocker, responses):
         },
         "process": {
             "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
-            "corpus":
+            "corpus": CORPUS_ID,
         },
         "summary": "Worker Fake worker @ 123412",
     }
@@ -669,8 +669,7 @@ def test_find_parents_file_paths(responses, mock_base_worker_with_cache, tmp_pat
     ):
         (tmp_path / parent_id).mkdir()
         file_path = tmp_path / parent_id / filename
-
-            f.write(content)
+        file_path.write_text(content)
 
     # Configure worker with a specific data directory
     mock_base_worker_with_cache.task_data_dir = tmp_path
tests/test_dataset_worker.py
CHANGED
@@ -8,7 +8,7 @@ from apistar.exceptions import ErrorResponse
 from arkindex_worker.models import Dataset, Set
 from arkindex_worker.worker import MissingDatasetArchive, check_dataset_set
 from arkindex_worker.worker.dataset import DatasetState
-from tests
+from tests import FIXTURES_DIR, PROCESS_ID
 from tests.test_elements_worker import BASE_API_CALLS
 
 RANDOM_UUID = uuid.uuid4()
@@ -63,22 +63,17 @@ def test_download_dataset_artifact_list_api_error(
     responses.add(
         responses.GET,
         f"http://testserver/api/v1/task/{task_id}/artifacts/",
-        status=500,
+        status=418,
     )
 
     with pytest.raises(ErrorResponse):
         mock_dataset_worker.download_dataset_artifact(default_dataset)
 
-    assert len(responses.calls) == len(BASE_API_CALLS) + 5
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
     assert [
         (call.request.method, call.request.url) for call in responses.calls
     ] == BASE_API_CALLS + [
-        # The API call is retried 5 times
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/"),
+        ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/")
     ]
 
 
@@ -116,22 +111,17 @@ def test_download_dataset_artifact_download_api_error(
     responses.add(
         responses.GET,
         f"http://testserver/api/v1/task/{task_id}/artifact/dataset_id.tar.zst",
-        status=500,
+        status=418,
     )
 
     with pytest.raises(ErrorResponse):
         mock_dataset_worker.download_dataset_artifact(default_dataset)
 
-    assert len(responses.calls) == len(BASE_API_CALLS) + 6
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
     assert [
         (call.request.method, call.request.url) for call in responses.calls
     ] == BASE_API_CALLS + [
         ("GET", f"http://testserver/api/v1/task/{task_id}/artifacts/"),
-        # The API call is retried 5 times
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifact/dataset_id.tar.zst"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifact/dataset_id.tar.zst"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifact/dataset_id.tar.zst"),
-        ("GET", f"http://testserver/api/v1/task/{task_id}/artifact/dataset_id.tar.zst"),
         ("GET", f"http://testserver/api/v1/task/{task_id}/artifact/dataset_id.tar.zst"),
     ]
 
@@ -284,7 +274,7 @@ def test_list_sets_api_error(responses, mock_dataset_worker):
     responses.add(
         responses.GET,
         f"http://testserver/api/v1/process/{PROCESS_ID}/sets/",
-        status=500,
+        status=418,
     )
 
     with pytest.raises(
@@ -393,20 +383,15 @@ def test_list_sets_retrieve_dataset_api_error(
     responses.add(
         responses.GET,
         f"http://testserver/api/v1/datasets/{default_dataset.id}/",
-        status=500,
+        status=418,
     )
 
     with pytest.raises(ErrorResponse):
         next(mock_dev_dataset_worker.list_sets())
 
-    assert len(responses.calls) == 5
+    assert len(responses.calls) == 1
     assert [(call.request.method, call.request.url) for call in responses.calls] == [
-        # The API call is retried 5 times
-        ("GET", f"http://testserver/api/v1/datasets/{default_dataset.id}/"),
-        ("GET", f"http://testserver/api/v1/datasets/{default_dataset.id}/"),
-        ("GET", f"http://testserver/api/v1/datasets/{default_dataset.id}/"),
-        ("GET", f"http://testserver/api/v1/datasets/{default_dataset.id}/"),
-        ("GET", f"http://testserver/api/v1/datasets/{default_dataset.id}/"),
+        ("GET", f"http://testserver/api/v1/datasets/{default_dataset.id}/")
    ]
 
 
@@ -494,22 +479,17 @@ def test_run_download_dataset_artifact_api_error(
     responses.add(
         responses.GET,
         f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        status=500,
+        status=418,
     )
 
     with pytest.raises(SystemExit):
         mock_dataset_worker.run()
 
-    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + 5
+    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + 1
     assert [
         (call.request.method, call.request.url) for call in responses.calls
     ] == BASE_API_CALLS * 2 + [
-        # The API call is retried 5 times
-        ("GET", f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/"),
-        ("GET", f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/"),
+        ("GET", f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/")
     ]
 
     assert [(level, message) for _, level, message in caplog.record_tuples] == [
@@ -519,16 +499,9 @@
             "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
         ),
         (logging.INFO, "Downloading artifact for Dataset (dataset_id)"),
-        *[
-            (
-                logging.INFO,
-                f"Retrying arkindex_worker.worker.base.BaseWorker.request in {retry} seconds as it raised ErrorResponse: .",
-            )
-            for retry in [3.0, 4.0, 8.0, 16.0]
-        ],
         (
             logging.WARNING,
-            "An API error occurred while processing Set (train) from Dataset (dataset_id): 500 Internal Server Error - None",
+            "An API error occurred while processing Set (train) from Dataset (dataset_id): 418 I'm a Teapot - None",
         ),
         (
             logging.ERROR,