arkindex-base-worker 0.3.7rc10__tar.gz → 0.4.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/PKG-INFO +10 -16
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/README.md +1 -1
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/PKG-INFO +10 -16
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/SOURCES.txt +4 -0
- arkindex_base_worker-0.4.0a1/arkindex_base_worker.egg-info/requires.txt +16 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/models.py +6 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/utils.py +3 -4
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/__init__.py +23 -2
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/base.py +3 -23
- arkindex_base_worker-0.4.0a1/arkindex_worker/worker/corpus.py +69 -0
- arkindex_base_worker-0.4.0a1/arkindex_worker/worker/image.py +21 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/training.py +12 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/pyproject.toml +9 -20
- arkindex_base_worker-0.4.0a1/tests/__init__.py +8 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/conftest.py +4 -8
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_base_worker.py +8 -9
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_dataset_worker.py +14 -41
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_classifications.py +22 -39
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_cli.py +3 -11
- arkindex_base_worker-0.4.0a1/tests/test_elements_worker/test_corpus.py +137 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_dataset.py +6 -11
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_elements.py +106 -85
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_entities.py +15 -39
- arkindex_base_worker-0.4.0a1/tests/test_elements_worker/test_image.py +65 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_metadata.py +6 -40
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_task.py +7 -17
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_training.py +35 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_transcriptions.py +10 -27
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/test_worker.py +2 -1
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_image.py +3 -5
- arkindex-base-worker-0.3.7rc10/arkindex_base_worker.egg-info/requires.txt +0 -21
- arkindex-base-worker-0.3.7rc10/worker-demo/tests/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/LICENSE +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/cache.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/image.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/classification.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/dataset.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/element.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/entity.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/metadata.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/task.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/transcription.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/version.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/hooks/pre_gen_project.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/setup.cfg +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_cache.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_element.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_merge.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/tests/test_utils.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1/worker-demo}/tests/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/tests/conftest.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/tests/test_worker.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/worker-demo/worker_demo/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0a1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -37,28 +37,22 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
37
37
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
38
|
Classifier: Programming Language :: Python :: 3.10
|
|
39
39
|
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
-
Classifier: Topic :: Text Processing :: Linguistic
|
|
41
40
|
Requires-Python: >=3.10
|
|
42
41
|
Description-Content-Type: text/markdown
|
|
43
42
|
License-File: LICENSE
|
|
44
|
-
Requires-Dist: peewee
|
|
45
|
-
Requires-Dist: Pillow==10.
|
|
46
|
-
Requires-Dist: pymdown-extensions==10.7
|
|
43
|
+
Requires-Dist: peewee~=3.17
|
|
44
|
+
Requires-Dist: Pillow==10.3.0
|
|
47
45
|
Requires-Dist: python-gnupg==0.5.2
|
|
48
46
|
Requires-Dist: shapely==2.0.3
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
47
|
+
Requires-Dist: teklia-toolbox==0.1.5
|
|
50
48
|
Requires-Dist: zstandard==0.22.0
|
|
51
49
|
Provides-Extra: docs
|
|
52
|
-
Requires-Dist: black==24.
|
|
53
|
-
Requires-Dist:
|
|
54
|
-
Requires-Dist:
|
|
55
|
-
Requires-Dist: mkdocs-material==9.5.10; extra == "docs"
|
|
56
|
-
Requires-Dist: mkdocstrings==0.24.0; extra == "docs"
|
|
57
|
-
Requires-Dist: mkdocstrings-python==1.8.0; extra == "docs"
|
|
58
|
-
Requires-Dist: recommonmark==0.7.1; extra == "docs"
|
|
50
|
+
Requires-Dist: black==24.4.0; extra == "docs"
|
|
51
|
+
Requires-Dist: mkdocs-material==9.5.17; extra == "docs"
|
|
52
|
+
Requires-Dist: mkdocstrings-python==1.9.2; extra == "docs"
|
|
59
53
|
Provides-Extra: tests
|
|
60
|
-
Requires-Dist: pytest==8.
|
|
61
|
-
Requires-Dist: pytest-mock==3.
|
|
54
|
+
Requires-Dist: pytest==8.1.1; extra == "tests"
|
|
55
|
+
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
62
56
|
Requires-Dist: pytest-responses==0.5.1; extra == "tests"
|
|
63
57
|
|
|
64
58
|
# Arkindex base Worker
|
|
@@ -73,7 +67,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
|
|
|
73
67
|
|
|
74
68
|
## Create a new worker using our template
|
|
75
69
|
|
|
76
|
-
```
|
|
70
|
+
```shell
|
|
77
71
|
pip install --user cookiecutter
|
|
78
72
|
cookiecutter git@gitlab.teklia.com:workers/base-worker.git
|
|
79
73
|
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0a1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -37,28 +37,22 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
37
37
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
38
|
Classifier: Programming Language :: Python :: 3.10
|
|
39
39
|
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
-
Classifier: Topic :: Text Processing :: Linguistic
|
|
41
40
|
Requires-Python: >=3.10
|
|
42
41
|
Description-Content-Type: text/markdown
|
|
43
42
|
License-File: LICENSE
|
|
44
|
-
Requires-Dist: peewee
|
|
45
|
-
Requires-Dist: Pillow==10.
|
|
46
|
-
Requires-Dist: pymdown-extensions==10.7
|
|
43
|
+
Requires-Dist: peewee~=3.17
|
|
44
|
+
Requires-Dist: Pillow==10.3.0
|
|
47
45
|
Requires-Dist: python-gnupg==0.5.2
|
|
48
46
|
Requires-Dist: shapely==2.0.3
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
47
|
+
Requires-Dist: teklia-toolbox==0.1.5
|
|
50
48
|
Requires-Dist: zstandard==0.22.0
|
|
51
49
|
Provides-Extra: docs
|
|
52
|
-
Requires-Dist: black==24.
|
|
53
|
-
Requires-Dist:
|
|
54
|
-
Requires-Dist:
|
|
55
|
-
Requires-Dist: mkdocs-material==9.5.10; extra == "docs"
|
|
56
|
-
Requires-Dist: mkdocstrings==0.24.0; extra == "docs"
|
|
57
|
-
Requires-Dist: mkdocstrings-python==1.8.0; extra == "docs"
|
|
58
|
-
Requires-Dist: recommonmark==0.7.1; extra == "docs"
|
|
50
|
+
Requires-Dist: black==24.4.0; extra == "docs"
|
|
51
|
+
Requires-Dist: mkdocs-material==9.5.17; extra == "docs"
|
|
52
|
+
Requires-Dist: mkdocstrings-python==1.9.2; extra == "docs"
|
|
59
53
|
Provides-Extra: tests
|
|
60
|
-
Requires-Dist: pytest==8.
|
|
61
|
-
Requires-Dist: pytest-mock==3.
|
|
54
|
+
Requires-Dist: pytest==8.1.1; extra == "tests"
|
|
55
|
+
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
62
56
|
Requires-Dist: pytest-responses==0.5.1; extra == "tests"
|
|
63
57
|
|
|
64
58
|
# Arkindex base Worker
|
|
@@ -73,7 +67,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
|
|
|
73
67
|
|
|
74
68
|
## Create a new worker using our template
|
|
75
69
|
|
|
76
|
-
```
|
|
70
|
+
```shell
|
|
77
71
|
pip install --user cookiecutter
|
|
78
72
|
cookiecutter git@gitlab.teklia.com:workers/base-worker.git
|
|
79
73
|
```
|
|
@@ -14,9 +14,11 @@ arkindex_worker/utils.py
|
|
|
14
14
|
arkindex_worker/worker/__init__.py
|
|
15
15
|
arkindex_worker/worker/base.py
|
|
16
16
|
arkindex_worker/worker/classification.py
|
|
17
|
+
arkindex_worker/worker/corpus.py
|
|
17
18
|
arkindex_worker/worker/dataset.py
|
|
18
19
|
arkindex_worker/worker/element.py
|
|
19
20
|
arkindex_worker/worker/entity.py
|
|
21
|
+
arkindex_worker/worker/image.py
|
|
20
22
|
arkindex_worker/worker/metadata.py
|
|
21
23
|
arkindex_worker/worker/task.py
|
|
22
24
|
arkindex_worker/worker/training.py
|
|
@@ -35,9 +37,11 @@ tests/test_utils.py
|
|
|
35
37
|
tests/test_elements_worker/__init__.py
|
|
36
38
|
tests/test_elements_worker/test_classifications.py
|
|
37
39
|
tests/test_elements_worker/test_cli.py
|
|
40
|
+
tests/test_elements_worker/test_corpus.py
|
|
38
41
|
tests/test_elements_worker/test_dataset.py
|
|
39
42
|
tests/test_elements_worker/test_elements.py
|
|
40
43
|
tests/test_elements_worker/test_entities.py
|
|
44
|
+
tests/test_elements_worker/test_image.py
|
|
41
45
|
tests/test_elements_worker/test_metadata.py
|
|
42
46
|
tests/test_elements_worker/test_task.py
|
|
43
47
|
tests/test_elements_worker/test_training.py
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
peewee~=3.17
|
|
2
|
+
Pillow==10.3.0
|
|
3
|
+
python-gnupg==0.5.2
|
|
4
|
+
shapely==2.0.3
|
|
5
|
+
teklia-toolbox==0.1.5
|
|
6
|
+
zstandard==0.22.0
|
|
7
|
+
|
|
8
|
+
[docs]
|
|
9
|
+
black==24.4.0
|
|
10
|
+
mkdocs-material==9.5.17
|
|
11
|
+
mkdocstrings-python==1.9.2
|
|
12
|
+
|
|
13
|
+
[tests]
|
|
14
|
+
pytest==8.1.1
|
|
15
|
+
pytest-mock==3.14.0
|
|
16
|
+
pytest-responses==0.5.1
|
|
@@ -5,7 +5,6 @@ import tarfile
|
|
|
5
5
|
import tempfile
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
import zstandard
|
|
9
8
|
import zstandard as zstd
|
|
10
9
|
|
|
11
10
|
logger = logging.getLogger(__name__)
|
|
@@ -38,7 +37,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
|
|
|
38
37
|
:param compressed_archive: Path to the target ZST-compressed archive
|
|
39
38
|
:return: File descriptor and path to the uncompressed tar archive
|
|
40
39
|
"""
|
|
41
|
-
dctx =
|
|
40
|
+
dctx = zstd.ZstdDecompressor()
|
|
42
41
|
archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
|
|
43
42
|
archive_path = Path(archive_path)
|
|
44
43
|
|
|
@@ -50,7 +49,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
|
|
|
50
49
|
):
|
|
51
50
|
dctx.copy_stream(compressed, decompressed)
|
|
52
51
|
logger.debug(f"Successfully uncompressed archive {compressed_archive}")
|
|
53
|
-
except
|
|
52
|
+
except zstd.ZstdError as e:
|
|
54
53
|
raise Exception(f"Couldn't uncompressed archive: {e}") from e
|
|
55
54
|
|
|
56
55
|
return archive_fd, archive_path
|
|
@@ -129,7 +128,7 @@ def zstd_compress(
|
|
|
129
128
|
archive_hasher.update(compressed_chunk)
|
|
130
129
|
archive_file.write(compressed_chunk)
|
|
131
130
|
logger.debug(f"Successfully compressed {source}")
|
|
132
|
-
except
|
|
131
|
+
except zstd.ZstdError as e:
|
|
133
132
|
raise Exception(f"Couldn't compress archive: {e}") from e
|
|
134
133
|
return file_d, destination, archive_hasher.hexdigest()
|
|
135
134
|
|
{arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/__init__.py
RENAMED
|
@@ -19,9 +19,11 @@ from arkindex_worker.cache import CachedElement
|
|
|
19
19
|
from arkindex_worker.models import Dataset, Element, Set
|
|
20
20
|
from arkindex_worker.worker.base import BaseWorker
|
|
21
21
|
from arkindex_worker.worker.classification import ClassificationMixin
|
|
22
|
+
from arkindex_worker.worker.corpus import CorpusMixin
|
|
22
23
|
from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
|
|
23
24
|
from arkindex_worker.worker.element import ElementMixin
|
|
24
25
|
from arkindex_worker.worker.entity import EntityMixin
|
|
26
|
+
from arkindex_worker.worker.image import ImageMixin
|
|
25
27
|
from arkindex_worker.worker.metadata import MetaDataMixin, MetaType # noqa: F401
|
|
26
28
|
from arkindex_worker.worker.task import TaskMixin
|
|
27
29
|
from arkindex_worker.worker.transcription import TranscriptionMixin
|
|
@@ -57,11 +59,13 @@ class ActivityState(Enum):
|
|
|
57
59
|
class ElementsWorker(
|
|
58
60
|
BaseWorker,
|
|
59
61
|
ClassificationMixin,
|
|
62
|
+
CorpusMixin,
|
|
60
63
|
ElementMixin,
|
|
61
64
|
TranscriptionMixin,
|
|
62
65
|
WorkerVersionMixin,
|
|
63
66
|
EntityMixin,
|
|
64
67
|
MetaDataMixin,
|
|
68
|
+
ImageMixin,
|
|
65
69
|
):
|
|
66
70
|
"""
|
|
67
71
|
Base class for ML workers that operate on Arkindex elements.
|
|
@@ -88,7 +92,7 @@ class ElementsWorker(
|
|
|
88
92
|
)
|
|
89
93
|
self.parser.add_argument(
|
|
90
94
|
"--element",
|
|
91
|
-
type=
|
|
95
|
+
type=str,
|
|
92
96
|
nargs="+",
|
|
93
97
|
help="One or more Arkindex element ID",
|
|
94
98
|
)
|
|
@@ -107,11 +111,23 @@ class ElementsWorker(
|
|
|
107
111
|
the cache database when enabled.
|
|
108
112
|
|
|
109
113
|
:return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
|
|
110
|
-
|
|
114
|
+
or a list of strings representing element IDs otherwise.
|
|
111
115
|
"""
|
|
112
116
|
assert not (
|
|
113
117
|
self.args.elements_list and self.args.element
|
|
114
118
|
), "elements-list and element CLI args shouldn't be both set"
|
|
119
|
+
|
|
120
|
+
def invalid_element_id(value: str) -> bool:
|
|
121
|
+
"""
|
|
122
|
+
Return whether the ID of an element is a valid UUID or not
|
|
123
|
+
"""
|
|
124
|
+
try:
|
|
125
|
+
uuid.UUID(value)
|
|
126
|
+
except Exception:
|
|
127
|
+
return True
|
|
128
|
+
|
|
129
|
+
return False
|
|
130
|
+
|
|
115
131
|
out = []
|
|
116
132
|
|
|
117
133
|
# Load from the cache when available
|
|
@@ -131,6 +147,11 @@ class ElementsWorker(
|
|
|
131
147
|
elif self.args.element:
|
|
132
148
|
out += self.args.element
|
|
133
149
|
|
|
150
|
+
invalid_element_ids = list(filter(invalid_element_id, out))
|
|
151
|
+
assert (
|
|
152
|
+
not invalid_element_ids
|
|
153
|
+
), f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
|
|
154
|
+
|
|
134
155
|
return out
|
|
135
156
|
|
|
136
157
|
@property
|
{arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/base.py
RENAMED
|
@@ -13,14 +13,8 @@ from tempfile import mkdtemp
|
|
|
13
13
|
import gnupg
|
|
14
14
|
import yaml
|
|
15
15
|
from apistar.exceptions import ErrorResponse
|
|
16
|
-
from tenacity import (
|
|
17
|
-
before_sleep_log,
|
|
18
|
-
retry,
|
|
19
|
-
retry_if_exception,
|
|
20
|
-
stop_after_attempt,
|
|
21
|
-
wait_exponential,
|
|
22
|
-
)
|
|
23
16
|
|
|
17
|
+
from arkindex import options_from_env
|
|
24
18
|
from arkindex_worker import logger
|
|
25
19
|
from arkindex_worker.cache import (
|
|
26
20
|
check_version,
|
|
@@ -30,7 +24,7 @@ from arkindex_worker.cache import (
|
|
|
30
24
|
merge_parents_cache,
|
|
31
25
|
)
|
|
32
26
|
from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
|
|
33
|
-
from teklia_toolbox.requests import
|
|
27
|
+
from teklia_toolbox.requests import get_arkindex_client
|
|
34
28
|
|
|
35
29
|
|
|
36
30
|
class ExtrasDirNotFoundError(Exception):
|
|
@@ -185,7 +179,7 @@ class BaseWorker:
|
|
|
185
179
|
Create an ArkindexClient to make API requests towards Arkindex instances.
|
|
186
180
|
"""
|
|
187
181
|
# Build Arkindex API client from environment variables
|
|
188
|
-
self.api_client =
|
|
182
|
+
self.api_client = get_arkindex_client(**options_from_env())
|
|
189
183
|
logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
|
|
190
184
|
|
|
191
185
|
def configure_for_developers(self):
|
|
@@ -477,23 +471,9 @@ class BaseWorker:
|
|
|
477
471
|
# Clean up
|
|
478
472
|
shutil.rmtree(base_extracted_path)
|
|
479
473
|
|
|
480
|
-
@retry(
|
|
481
|
-
retry=retry_if_exception(_is_500_error),
|
|
482
|
-
wait=wait_exponential(multiplier=2, min=3),
|
|
483
|
-
reraise=True,
|
|
484
|
-
stop=stop_after_attempt(5),
|
|
485
|
-
before_sleep=before_sleep_log(logger, logging.INFO),
|
|
486
|
-
)
|
|
487
474
|
def request(self, *args, **kwargs):
|
|
488
475
|
"""
|
|
489
476
|
Wrapper around the ``ArkindexClient.request`` method.
|
|
490
|
-
|
|
491
|
-
The API call will be retried up to 5 times in case of HTTP 5xx errors,
|
|
492
|
-
with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
|
|
493
|
-
If the 5th call still causes an HTTP 5xx error, the exception is re-raised
|
|
494
|
-
and the caller should catch it.
|
|
495
|
-
|
|
496
|
-
Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
|
|
497
477
|
"""
|
|
498
478
|
return self.api_client.request(*args, **kwargs)
|
|
499
479
|
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BaseWorker methods for corpora.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from operator import itemgetter
|
|
7
|
+
from tempfile import _TemporaryFileWrapper
|
|
8
|
+
|
|
9
|
+
from arkindex_worker import logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CorpusExportState(Enum):
|
|
13
|
+
"""
|
|
14
|
+
State of a corpus export.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
Created = "created"
|
|
18
|
+
"""
|
|
19
|
+
The corpus export is created, awaiting its processing.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
Running = "running"
|
|
23
|
+
"""
|
|
24
|
+
The corpus export is being built.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
Failed = "failed"
|
|
28
|
+
"""
|
|
29
|
+
The corpus export failed.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
Done = "done"
|
|
33
|
+
"""
|
|
34
|
+
The corpus export ended in success.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CorpusMixin:
|
|
39
|
+
def download_latest_export(self) -> _TemporaryFileWrapper:
|
|
40
|
+
"""
|
|
41
|
+
Download the latest export in `done` state of the current corpus.
|
|
42
|
+
|
|
43
|
+
:returns: The downloaded export stored in a temporary file.
|
|
44
|
+
"""
|
|
45
|
+
# List all exports on the corpus
|
|
46
|
+
exports = self.api_client.paginate("ListExports", id=self.corpus_id)
|
|
47
|
+
|
|
48
|
+
# Find the latest that is in "done" state
|
|
49
|
+
exports: list[dict] = sorted(
|
|
50
|
+
list(
|
|
51
|
+
filter(
|
|
52
|
+
lambda export: export["state"] == CorpusExportState.Done.value,
|
|
53
|
+
exports,
|
|
54
|
+
)
|
|
55
|
+
),
|
|
56
|
+
key=itemgetter("updated"),
|
|
57
|
+
reverse=True,
|
|
58
|
+
)
|
|
59
|
+
assert (
|
|
60
|
+
len(exports) > 0
|
|
61
|
+
), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
|
|
62
|
+
|
|
63
|
+
# Download latest export
|
|
64
|
+
export_id: str = exports[0]["id"]
|
|
65
|
+
logger.info(f"Downloading export ({export_id})...")
|
|
66
|
+
export: _TemporaryFileWrapper = self.request("DownloadExport", id=export_id)
|
|
67
|
+
logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
|
|
68
|
+
|
|
69
|
+
return export
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ElementsWorker methods for images.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from arkindex_worker.models import Image
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ImageMixin:
|
|
9
|
+
def create_iiif_url(self, url: str) -> Image:
|
|
10
|
+
"""
|
|
11
|
+
Create an image from an existing IIIF image by URL.
|
|
12
|
+
The URL should be of the image's identifier, not of its Image Information request (`/info.json`).
|
|
13
|
+
|
|
14
|
+
:param url: URL of the image.
|
|
15
|
+
:returns: The created image.
|
|
16
|
+
"""
|
|
17
|
+
assert url and isinstance(
|
|
18
|
+
url, str
|
|
19
|
+
), "url shouldn't be null and should be of type str"
|
|
20
|
+
|
|
21
|
+
return Image(self.api_client.request("CreateIIIFURL", body={"url": url}))
|
{arkindex-base-worker-0.3.7rc10 → arkindex_base_worker-0.4.0a1}/arkindex_worker/worker/training.py
RENAMED
|
@@ -83,6 +83,9 @@ class TrainingMixin:
|
|
|
83
83
|
|
|
84
84
|
@property
|
|
85
85
|
def is_finetuning(self) -> bool:
|
|
86
|
+
"""
|
|
87
|
+
Whether or not this worker is fine-tuning an existing model version.
|
|
88
|
+
"""
|
|
86
89
|
return bool(self.model_version_id)
|
|
87
90
|
|
|
88
91
|
@skip_if_read_only
|
|
@@ -280,8 +283,17 @@ class TrainingMixin:
|
|
|
280
283
|
},
|
|
281
284
|
)
|
|
282
285
|
except ErrorResponse as e:
|
|
286
|
+
# Temporary fix while waiting for `ValidateModelVersion` refactoring as it can
|
|
287
|
+
# return errors even when the model version is properly validated
|
|
288
|
+
if e.status_code in [403, 500]:
|
|
289
|
+
logger.warning(
|
|
290
|
+
f'An error occurred while validating model version {self.model_version["id"]}, please check its status.'
|
|
291
|
+
)
|
|
292
|
+
return
|
|
293
|
+
|
|
283
294
|
if e.status_code != 409:
|
|
284
295
|
raise e
|
|
296
|
+
|
|
285
297
|
logger.warning(
|
|
286
298
|
f"An available model version exists with hash {hash}, using it instead of the pending version."
|
|
287
299
|
)
|
|
@@ -4,16 +4,15 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "arkindex-base-worker"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.4.0a1"
|
|
8
8
|
description = "Base Worker to easily build Arkindex ML workflows"
|
|
9
9
|
license = { file = "LICENSE" }
|
|
10
10
|
dependencies = [
|
|
11
|
-
"peewee
|
|
12
|
-
"Pillow==10.
|
|
13
|
-
"pymdown-extensions==10.7",
|
|
11
|
+
"peewee~=3.17",
|
|
12
|
+
"Pillow==10.3.0",
|
|
14
13
|
"python-gnupg==0.5.2",
|
|
15
14
|
"shapely==2.0.3",
|
|
16
|
-
"teklia-toolbox==0.1.
|
|
15
|
+
"teklia-toolbox==0.1.5",
|
|
17
16
|
"zstandard==0.22.0",
|
|
18
17
|
]
|
|
19
18
|
authors = [
|
|
@@ -32,8 +31,6 @@ classifiers = [
|
|
|
32
31
|
"Programming Language :: Python :: 3 :: Only",
|
|
33
32
|
"Programming Language :: Python :: 3.10",
|
|
34
33
|
"Programming Language :: Python :: 3.11",
|
|
35
|
-
# Topics
|
|
36
|
-
"Topic :: Text Processing :: Linguistic",
|
|
37
34
|
]
|
|
38
35
|
|
|
39
36
|
[project.urls]
|
|
@@ -45,17 +42,13 @@ Authors = "https://teklia.com"
|
|
|
45
42
|
|
|
46
43
|
[project.optional-dependencies]
|
|
47
44
|
docs = [
|
|
48
|
-
"black==24.
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"mkdocs-material==9.5.10",
|
|
52
|
-
"mkdocstrings==0.24.0",
|
|
53
|
-
"mkdocstrings-python==1.8.0",
|
|
54
|
-
"recommonmark==0.7.1",
|
|
45
|
+
"black==24.4.0",
|
|
46
|
+
"mkdocs-material==9.5.17",
|
|
47
|
+
"mkdocstrings-python==1.9.2",
|
|
55
48
|
]
|
|
56
49
|
tests = [
|
|
57
|
-
"pytest==8.
|
|
58
|
-
"pytest-mock==3.
|
|
50
|
+
"pytest==8.1.1",
|
|
51
|
+
"pytest-mock==3.14.0",
|
|
59
52
|
"pytest-responses==0.5.1",
|
|
60
53
|
]
|
|
61
54
|
|
|
@@ -114,7 +107,3 @@ known-third-party = [
|
|
|
114
107
|
"yaml",
|
|
115
108
|
"zstandard",
|
|
116
109
|
]
|
|
117
|
-
|
|
118
|
-
[tool.doc8]
|
|
119
|
-
ignore-path=["*.egg-info", "public", ".git"]
|
|
120
|
-
extensions=[".md"]
|
|
@@ -26,11 +26,7 @@ from arkindex_worker.models import Artifact, Dataset, Set
|
|
|
26
26
|
from arkindex_worker.worker import BaseWorker, DatasetWorker, ElementsWorker
|
|
27
27
|
from arkindex_worker.worker.dataset import DatasetState
|
|
28
28
|
from arkindex_worker.worker.transcription import TextOrientation
|
|
29
|
-
|
|
30
|
-
FIXTURES_DIR = Path(__file__).resolve().parent / "data"
|
|
31
|
-
SAMPLES_DIR = Path(__file__).resolve().parent / "samples"
|
|
32
|
-
|
|
33
|
-
PROCESS_ID = "cafecafe-cafe-cafe-cafe-cafecafecafe"
|
|
29
|
+
from tests import CORPUS_ID, FIXTURES_DIR, PROCESS_ID, SAMPLES_DIR
|
|
34
30
|
|
|
35
31
|
__yaml_cache = {}
|
|
36
32
|
|
|
@@ -93,7 +89,7 @@ def _setup_api(responses, monkeypatch, _cache_yaml):
|
|
|
93
89
|
|
|
94
90
|
# Fallback to prod environment
|
|
95
91
|
if schema_url is None:
|
|
96
|
-
schema_url = "https://arkindex.teklia.com/api/v1/openapi/?format=
|
|
92
|
+
schema_url = "https://arkindex.teklia.com/api/v1/openapi/?format=json"
|
|
97
93
|
monkeypatch.setenv("ARKINDEX_API_SCHEMA_URL", schema_url)
|
|
98
94
|
|
|
99
95
|
# Allow accessing remote API schemas
|
|
@@ -159,7 +155,7 @@ def _mock_worker_run_api(responses):
|
|
|
159
155
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
160
156
|
"state": "running",
|
|
161
157
|
"mode": "workers",
|
|
162
|
-
"corpus":
|
|
158
|
+
"corpus": CORPUS_ID,
|
|
163
159
|
"use_cache": False,
|
|
164
160
|
"activity_state": "ready",
|
|
165
161
|
"model_id": None,
|
|
@@ -226,7 +222,7 @@ def _mock_worker_run_no_revision_api(responses):
|
|
|
226
222
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
227
223
|
"state": "running",
|
|
228
224
|
"mode": "workers",
|
|
229
|
-
"corpus":
|
|
225
|
+
"corpus": CORPUS_ID,
|
|
230
226
|
"use_cache": False,
|
|
231
227
|
"activity_state": "ready",
|
|
232
228
|
"model_id": None,
|
|
@@ -11,7 +11,7 @@ from arkindex.mock import MockApiClient
|
|
|
11
11
|
from arkindex_worker import logger
|
|
12
12
|
from arkindex_worker.worker import BaseWorker, ElementsWorker
|
|
13
13
|
from arkindex_worker.worker.base import ExtrasDirNotFoundError
|
|
14
|
-
from tests
|
|
14
|
+
from tests import CORPUS_ID, FIXTURES_DIR
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def test_init_default_local_share():
|
|
@@ -178,7 +178,7 @@ def test_configure_worker_run(mocker, responses, caplog):
|
|
|
178
178
|
"model_version": None,
|
|
179
179
|
"process": {
|
|
180
180
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
181
|
-
"corpus":
|
|
181
|
+
"corpus": CORPUS_ID,
|
|
182
182
|
},
|
|
183
183
|
"summary": "Worker Fake worker @ 123412",
|
|
184
184
|
}
|
|
@@ -270,7 +270,7 @@ def test_configure_user_configuration_defaults(mocker, responses):
|
|
|
270
270
|
"model_version": None,
|
|
271
271
|
"process": {
|
|
272
272
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
273
|
-
"corpus":
|
|
273
|
+
"corpus": CORPUS_ID,
|
|
274
274
|
},
|
|
275
275
|
"summary": "Worker Fake worker @ 123412",
|
|
276
276
|
}
|
|
@@ -319,7 +319,7 @@ def test_configure_user_config_debug(mocker, responses, debug):
|
|
|
319
319
|
},
|
|
320
320
|
"process": {
|
|
321
321
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
322
|
-
"corpus":
|
|
322
|
+
"corpus": CORPUS_ID,
|
|
323
323
|
},
|
|
324
324
|
"summary": "Worker Fake worker @ 123412",
|
|
325
325
|
}
|
|
@@ -367,7 +367,7 @@ def test_configure_worker_run_missing_conf(mocker, responses):
|
|
|
367
367
|
"configuration": {"id": "bbbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", "name": "BBB"},
|
|
368
368
|
"process": {
|
|
369
369
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
370
|
-
"corpus":
|
|
370
|
+
"corpus": CORPUS_ID,
|
|
371
371
|
},
|
|
372
372
|
"summary": "Worker Fake worker @ 123412",
|
|
373
373
|
}
|
|
@@ -409,7 +409,7 @@ def test_configure_worker_run_no_worker_run_conf(mocker, responses):
|
|
|
409
409
|
"configuration": None,
|
|
410
410
|
"process": {
|
|
411
411
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
412
|
-
"corpus":
|
|
412
|
+
"corpus": CORPUS_ID,
|
|
413
413
|
},
|
|
414
414
|
"summary": "Worker Fake worker @ 123412",
|
|
415
415
|
}
|
|
@@ -458,7 +458,7 @@ def test_configure_load_model_configuration(mocker, responses):
|
|
|
458
458
|
},
|
|
459
459
|
"process": {
|
|
460
460
|
"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
|
|
461
|
-
"corpus":
|
|
461
|
+
"corpus": CORPUS_ID,
|
|
462
462
|
},
|
|
463
463
|
"summary": "Worker Fake worker @ 123412",
|
|
464
464
|
}
|
|
@@ -669,8 +669,7 @@ def test_find_parents_file_paths(responses, mock_base_worker_with_cache, tmp_pat
|
|
|
669
669
|
):
|
|
670
670
|
(tmp_path / parent_id).mkdir()
|
|
671
671
|
file_path = tmp_path / parent_id / filename
|
|
672
|
-
|
|
673
|
-
f.write(content)
|
|
672
|
+
file_path.write_text(content)
|
|
674
673
|
|
|
675
674
|
# Configure worker with a specific data directory
|
|
676
675
|
mock_base_worker_with_cache.task_data_dir = tmp_path
|