arkindex-base-worker 0.4.0rc1__py3-none-any.whl → 0.4.0rc3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- {arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/METADATA +5 -5
- arkindex_base_worker-0.4.0rc3.dist-info/RECORD +52 -0
- {arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/WHEEL +1 -1
- arkindex_worker/worker/__init__.py +24 -111
- arkindex_worker/worker/base.py +9 -1
- arkindex_worker/worker/classification.py +1 -1
- arkindex_worker/worker/corpus.py +21 -6
- arkindex_worker/worker/dataset.py +70 -0
- arkindex_worker/worker/element.py +17 -0
- arkindex_worker/worker/entity.py +1 -1
- arkindex_worker/worker/process.py +63 -0
- arkindex_worker/worker/task.py +1 -2
- arkindex_worker/worker/training.py +1 -1
- tests/__init__.py +1 -1
- tests/conftest.py +10 -3
- tests/test_dataset_worker.py +6 -3
- tests/test_elements_worker/test_classifications.py +1 -1
- tests/test_elements_worker/test_corpus.py +32 -1
- tests/test_elements_worker/test_dataset.py +1 -1
- tests/test_elements_worker/test_elements.py +270 -3
- tests/test_elements_worker/test_entities.py +1 -1
- tests/test_elements_worker/test_image.py +2 -1
- tests/test_elements_worker/test_metadata.py +1 -1
- tests/test_elements_worker/test_task.py +1 -1
- tests/test_elements_worker/test_transcriptions.py +1 -1
- tests/test_elements_worker/test_worker.py +1 -1
- arkindex_base_worker-0.4.0rc1.dist-info/RECORD +0 -51
- {arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/LICENSE +0 -0
- {arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/top_level.txt +0 -0
{arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.4.0rc1
+Version: 0.4.0rc3
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -40,17 +40,17 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: humanize ==4.
+Requires-Dist: humanize ==4.10.0
 Requires-Dist: peewee ~=3.17
 Requires-Dist: Pillow ==10.4.0
 Requires-Dist: python-gnupg ==0.5.2
 Requires-Dist: shapely ==2.0.5
-Requires-Dist: teklia-toolbox ==0.1.
+Requires-Dist: teklia-toolbox ==0.1.7b1
 Requires-Dist: zstandard ==0.22.0
 Provides-Extra: docs
 Requires-Dist: black ==24.4.2 ; extra == 'docs'
-Requires-Dist: mkdocs-material ==9.5.
-Requires-Dist: mkdocstrings-python ==1.
+Requires-Dist: mkdocs-material ==9.5.33 ; extra == 'docs'
+Requires-Dist: mkdocstrings-python ==1.11.1 ; extra == 'docs'
 Provides-Extra: tests
 Requires-Dist: pytest ==8.3.2 ; extra == 'tests'
 Requires-Dist: pytest-mock ==3.14.0 ; extra == 'tests'
arkindex_base_worker-0.4.0rc3.dist-info/RECORD
ADDED
@@ -0,0 +1,52 @@
+arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
+arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
+arkindex_worker/image.py,sha256=oEgVCrSHiGh3D5-UXfM6PvT17TttSxC0115irpvB3Dw,18581
+arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
+arkindex_worker/utils.py,sha256=q1EeLdC6ebYIH-C0LOAqw2cNpjCjVoP-Vbr-39mF4w0,9884
+arkindex_worker/worker/__init__.py,sha256=tcqxrox9EpOjaN2EQgXumiABKpWHLsJiynPC2_sZuOQ,15880
+arkindex_worker/worker/base.py,sha256=7Pmw-UQSxV-xkW8NO5cXsxJ8W8szzyppMaNjq_az81A,19844
+arkindex_worker/worker/classification.py,sha256=zECSNzGCZFzoPoDVZN4kuGYRNLzMQLBaRt3q1jnBSaA,10952
+arkindex_worker/worker/corpus.py,sha256=0TQFOwZ6Te-CZi6lgkZY1wzyJ5wO9LAmcVQtqHvZpPk,2291
+arkindex_worker/worker/dataset.py,sha256=LwzKwNFX4FqfLxh29LSvJydPwRw3VHaB1wjuFhUshsE,5267
+arkindex_worker/worker/element.py,sha256=Qvvq9kJnAHNATHW7zi96eIY1x-0MsR-T5rrSJg6e9Y4,45309
+arkindex_worker/worker/entity.py,sha256=ThhP22xOYR5Z4P1VH_pOl_y_uDKZFeQVDqxO6aRkIhg,15227
+arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
+arkindex_worker/worker/metadata.py,sha256=VRajtd2kaBvar9GercX4knvR6l1WFYjoCdJWU9ccKgk,7291
+arkindex_worker/worker/process.py,sha256=I1rBt3Y8bV4zcPr8N1E2NRZ0UClSTqhExsO9CPcP41E,1012
+arkindex_worker/worker/task.py,sha256=r1j7_qbdNu2Z8H8HbGzO3P3qdx-2N1pBbUPFDca0rqg,1519
+arkindex_worker/worker/training.py,sha256=H8FmCdzGcDW-WMMwcgvmZPlN5tPHwGo0BXn12qmzj8g,10875
+arkindex_worker/worker/transcription.py,sha256=52RY9kYsiR1sz9FxOigyo12Ker3VDbQ4U42gK9DpR3g,21146
+arkindex_worker/worker/version.py,sha256=JIT7OI3Mo7RPkNrjOB9hfqrsG-FYygz_zi4l8PbkuAo,1960
+hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
+tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
+tests/conftest.py,sha256=2ocZ2x-mZQrNe9zvWwhWk2_4ExdaBHIB74SvtDlExRE,21580
+tests/test_base_worker.py,sha256=2EIYcd_3f9O0zB5WiGIQV0Cn9wndLvnEnSfcAE1qWWU,30607
+tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
+tests/test_dataset_worker.py,sha256=gApYz0LArHr1cNn079_fa_BQABF6RVQYuM1Tc4m3NsQ,22089
+tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
+tests/test_image.py,sha256=J3jqB5OhcdCpB6n0UnwivxrMlne8YjFLXhq1gBMANrs,26711
+tests/test_merge.py,sha256=TuOeUS0UCz66DPOQFFhc4NQBxIjZL9f5czi4XnvGrr4,8270
+tests/test_utils.py,sha256=_WJUPnt-pM_TQ0er4yjPZy-u_LePrHq1lxwk_teky7M,2544
+tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
+tests/test_elements_worker/test_classifications.py,sha256=GtVyi9bg4YTd7nyw8u6IjQZYBwFMwoVZdrfSBc5UybU,27780
+tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
+tests/test_elements_worker/test_corpus.py,sha256=OAbwgaQtHmcmPkcAl9Kuceun_BvMasnZvYj4_EdfugY,5483
+tests/test_elements_worker/test_dataset.py,sha256=00IlOZv9YFlZ23rGXyR-HLbKLQxGelZ1Bf9lEZYA0IY,11412
+tests/test_elements_worker/test_elements.py,sha256=l5YTfm0CzBTQyZvdOplhhza-gpPSz-8RVix1YUzAwhM,115497
+tests/test_elements_worker/test_entities.py,sha256=nrCvkdJdjsyOrbD6R-H8NvxREZxciiR6CGIObXzeg50,36182
+tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
+tests/test_elements_worker/test_metadata.py,sha256=Xfggy-vxw5DZ3hFKx3sB7OYb2d1tu1RiNK8fvKJIaBs,22294
+tests/test_elements_worker/test_task.py,sha256=wTUWqN9UhfKmJn3IcFY75EW4I1ulRhisflmY1kmP47s,5574
+tests/test_elements_worker/test_training.py,sha256=Qxi9EzGr_uKcn2Fh5aE6jNrq1K8QKLiOiSew4upASPs,8721
+tests/test_elements_worker/test_transcriptions.py,sha256=iq-nR_st7Q9E_nD7knrKGY57g36J6nYSEzbPk9y-cxY,77061
+tests/test_elements_worker/test_worker.py,sha256=VdprIWezB3dJdE8vNOrS71RQugqUysHlveOWTQate-8,10804
+worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
+worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
+worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
+worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
+arkindex_base_worker-0.4.0rc3.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.4.0rc3.dist-info/METADATA,sha256=eDT7HxTvEz2yg4U_lbzkuigNWFu4JTqaLTnY0fqSCiM,3306
+arkindex_base_worker-0.4.0rc3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+arkindex_base_worker-0.4.0rc3.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
+arkindex_base_worker-0.4.0rc3.dist-info/RECORD,,
arkindex_worker/worker/__init__.py
CHANGED
@@ -4,16 +4,13 @@ Base classes to implement Arkindex workers.
 
 import contextlib
 import json
-import os
 import sys
 import uuid
-from
-from
-from enum import Enum
+from collections.abc import Iterable
+from itertools import chain
 from pathlib import Path
 
-from
-
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Dataset, Element, Set
@@ -21,47 +18,27 @@ from arkindex_worker.utils import pluralize
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.corpus import CorpusMixin
-from arkindex_worker.worker.dataset import
+from arkindex_worker.worker.dataset import (
+    DatasetMixin,
+    DatasetState,
+    MissingDatasetArchive,
+)
 from arkindex_worker.worker.element import ElementMixin
 from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.image import ImageMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
+from arkindex_worker.worker.process import ActivityState, ProcessMode
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin
 
 
-class ActivityState(Enum):
-    """
-    Processing state of an element.
-    """
-
-    Queued = "queued"
-    """
-    The element has not yet been processed by a worker.
-    """
-
-    Started = "started"
-    """
-    The element is being processed by a worker.
-    """
-
-    Processed = "processed"
-    """
-    The element has been successfully processed by a worker.
-    """
-
-    Error = "error"
-    """
-    An error occurred while processing this element.
-    """
-
-
 class ElementsWorker(
+    ElementMixin,
+    DatasetMixin,
     BaseWorker,
     ClassificationMixin,
     CorpusMixin,
-    ElementMixin,
     TranscriptionMixin,
     WorkerVersionMixin,
     EntityMixin,
@@ -96,22 +73,7 @@ class ElementsWorker(
 
         self._worker_version_cache = {}
 
-    def add_arguments(self):
-        """Define specific ``argparse`` arguments for this worker"""
-        self.parser.add_argument(
-            "--elements-list",
-            help="JSON elements list to use",
-            type=open,
-            default=os.environ.get("TASK_ELEMENTS"),
-        )
-        self.parser.add_argument(
-            "--element",
-            type=str,
-            nargs="+",
-            help="One or more Arkindex element ID",
-        )
-
-    def get_elements(self) -> Iterable[CachedElement] | list[str]:
+    def get_elements(self) -> Iterable[CachedElement] | list[str] | list[Element]:
         """
         List the elements to be processed, either from the CLI arguments or
         the cache database when enabled.
@@ -143,15 +105,20 @@ class ElementsWorker(
         )
         if self.use_cache and cache_query.exists():
            return cache_query
-        # Process elements from JSON file
        elif self.args.elements_list:
+            # Process elements from JSON file
            data = json.load(self.args.elements_list)
            assert isinstance(data, list), "Elements list must be a list"
            assert len(data), "No elements in elements list"
            out += list(filter(None, [element.get("id") for element in data]))
-        # Add any extra element from CLI
        elif self.args.element:
+            # Add any extra element from CLI
            out += self.args.element
+        elif self.process_mode == ProcessMode.Dataset or self.args.set:
+            # Elements from datasets
+            return list(
+                chain.from_iterable(map(self.list_set_elements, self.list_sets()))
+            )
 
         invalid_element_ids = list(filter(invalid_element_id, out))
         assert (
@@ -166,7 +133,8 @@ class ElementsWorker(
         Whether or not WorkerActivity support has been enabled on the DataImport
         used to run this worker.
         """
-        if self.is_read_only:
+        if self.is_read_only or self.process_mode == ProcessMode.Dataset:
+            # Worker activities are also disabled when running an ElementsWorker in a Dataset process.
             return False
         assert (
             self.process_information
@@ -200,7 +168,7 @@ class ElementsWorker(
         for i, item in enumerate(elements, start=1):
             element = None
             try:
-                if
+                if isinstance(item, CachedElement | Element):
                     # Just use the result of get_elements as the element
                     element = item
                 else:
@@ -316,29 +284,7 @@ class ElementsWorker(
         return True
 
 
-
-    values = value.split(":")
-    if len(values) != 2:
-        raise ArgumentTypeError(
-            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
-        )
-
-    dataset_id, set_name = values
-    try:
-        dataset_id = uuid.UUID(dataset_id)
-        return (dataset_id, set_name)
-    except (TypeError, ValueError) as e:
-        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
-
-
-class MissingDatasetArchive(Exception):
-    """
-    Exception raised when the compressed archive associated to
-    a dataset isn't found in its task artifacts.
-    """
-
-
-class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
+class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
     """
     Base class for ML workers that operate on Arkindex dataset sets.
 
@@ -361,19 +307,6 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         # Set as an instance variable as dataset workers might use it to easily extract its content
         self.downloaded_dataset_artifact: Path | None = None
 
-    def add_arguments(self):
-        """Define specific ``argparse`` arguments for this worker"""
-        self.parser.add_argument(
-            "--set",
-            type=check_dataset_set,
-            nargs="+",
-            help="""
-            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
-            (e.g.: "12341234-1234-1234-1234-123412341234:train")
-            """,
-            default=[],
-        )
-
     def cleanup_downloaded_artifact(self) -> None:
         """
         Cleanup the downloaded dataset artifact if any
@@ -421,30 +354,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         :param set: The set to process.
         """
 
-    def list_sets(self) -> Iterator[Set]:
-        """
-        List the sets to be processed, either from the CLI arguments or using the
-        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
-
-        :returns: An iterator of ``Set`` objects.
-        """
-        if not self.is_read_only:
-            yield from self.list_process_sets()
-
-        datasets: dict[uuid.UUID, Dataset] = {}
-        for dataset_id, set_name in self.args.set:
-            # Retrieving dataset information is not already cached
-            if dataset_id not in datasets:
-                datasets[dataset_id] = Dataset(
-                    **self.api_client.request("RetrieveDataset", id=dataset_id)
-                )
-
-            yield Set(name=set_name, dataset=datasets[dataset_id])
-
     def run(self):
         """
         Implements an Arkindex worker that goes through each dataset set returned by
-        [list_sets][arkindex_worker.worker.
+        [list_sets][arkindex_worker.worker.dataset.DatasetMixin.list_sets].
 
         It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
         catching exceptions.
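With the get_elements changes above, an ElementsWorker can also be fed dataset sets directly. The snippet below is only an illustrative sketch, not code from the package: the PrintWorker class and its module are hypothetical, and it assumes a configured Arkindex worker run (or developer mode).

from arkindex_worker.models import Element
from arkindex_worker.worker import ElementsWorker


class PrintWorker(ElementsWorker):
    def process_element(self, element: Element):
        # Elements now come either from --element / --elements-list or, in a
        # Dataset process or with --set, from the selected dataset sets.
        print(element.id, element.name)


if __name__ == "__main__":
    # e.g. python worker.py --set 12341234-1234-1234-1234-123412341234:train
    PrintWorker(description="Print processed elements").run()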
arkindex_worker/worker/base.py
CHANGED
@@ -12,9 +12,9 @@ from tempfile import mkdtemp
 
 import gnupg
 import yaml
-from apistar.exceptions import ErrorResponse
 
 from arkindex import options_from_env
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -24,6 +24,7 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
+from arkindex_worker.worker.process import ProcessMode
 from teklia_toolbox.requests import get_arkindex_client
 
 
@@ -156,6 +157,13 @@ class BaseWorker:
             raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
         return self._corpus_id
 
+    @property
+    def process_mode(self) -> ProcessMode | None:
+        """Mode of the process being run. Returns None when read-only."""
+        if self.is_read_only:
+            return
+        return ProcessMode(self.process_information["mode"])
+
     @property
     def is_read_only(self) -> bool:
         """
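A minimal sketch of how a worker might use the new BaseWorker.process_mode property shown above; the ModeAwareWorker class is a placeholder, not part of the package.

from arkindex_worker.worker import ElementsWorker
from arkindex_worker.worker.process import ProcessMode


class ModeAwareWorker(ElementsWorker):
    def process_element(self, element):
        # process_mode is None in read-only/developer mode, otherwise the
        # ProcessMode parsed from the process information.
        if self.process_mode == ProcessMode.Dataset:
            # Worker activities are disabled in Dataset processes (see the
            # store_activity change above), so skip activity bookkeeping here.
            pass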
arkindex_worker/worker/classification.py
CHANGED
@@ -2,9 +2,9 @@
 ElementsWorker methods for classifications and ML classes.
 """
 
-from apistar.exceptions import ErrorResponse
 from peewee import IntegrityError
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
arkindex_worker/worker/corpus.py
CHANGED
@@ -5,6 +5,7 @@ BaseWorker methods for corpora.
 from enum import Enum
 from operator import itemgetter
 from tempfile import _TemporaryFileWrapper
+from uuid import UUID
 
 from arkindex_worker import logger
 
@@ -36,6 +37,25 @@ class CorpusExportState(Enum):
 
 
 class CorpusMixin:
+    def download_export(self, export_id: str) -> _TemporaryFileWrapper:
+        """
+        Download an export.
+
+        :param export_id: UUID of the export to download
+        :returns: The downloaded export stored in a temporary file.
+        """
+        try:
+            UUID(export_id)
+        except ValueError as e:
+            raise ValueError("export_id is not a valid uuid.") from e
+
+        logger.info(f"Downloading export ({export_id})...")
+        export: _TemporaryFileWrapper = self.api_client.request(
+            "DownloadExport", id=export_id
+        )
+        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
+        return export
+
     def download_latest_export(self) -> _TemporaryFileWrapper:
         """
         Download the latest export in `done` state of the current corpus.
@@ -62,10 +82,5 @@
 
         # Download latest export
         export_id: str = exports[0]["id"]
-        logger.info(f"Downloading export ({export_id})...")
-        export: _TemporaryFileWrapper = self.api_client.request(
-            "DownloadExport", id=export_id
-        )
-        logger.info(f"Downloaded export ({export_id}) @ `{export.name}`")
 
-        return export
+        return self.download_export(export_id)
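Example use of the new CorpusMixin.download_export helper introduced above. This is only a sketch: the ExportWorker class and the export UUID are placeholders, and a reachable Arkindex instance is assumed.

from arkindex_worker import logger
from arkindex_worker.worker import ElementsWorker


class ExportWorker(ElementsWorker):
    def process_element(self, element):
        # DownloadExport stores the SQLite export in a temporary file;
        # a malformed UUID raises ValueError before any API call is made.
        export = self.download_export("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff")
        logger.info(f"Export available at {export.name}")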
arkindex_worker/worker/dataset.py
CHANGED
@@ -2,6 +2,8 @@
 BaseWorker methods for datasets.
 """
 
+import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterator
 from enum import Enum
 
@@ -36,7 +38,55 @@ class DatasetState(Enum):
    """
 
 
+class MissingDatasetArchive(Exception):
+    """
+    Exception raised when the compressed archive associated to
+    a dataset isn't found in its task artifacts.
+    """
+
+
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    """The `--set` argument should have the following format:
+    <dataset_id>:<set_name>
+
+    Args:
+        value (str): Provided argument.
+
+    Raises:
+        ArgumentTypeError: When the value is invalid.
+
+    Returns:
+        tuple[uuid.UUID, str]: The ID of the dataset parsed as UUID and the name of the set.
+    """
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class DatasetMixin:
+    def add_arguments(self) -> None:
+        """Define specific ``argparse`` arguments for the worker using this mixin"""
+        self.parser.add_argument(
+            "--set",
+            type=check_dataset_set,
+            nargs="+",
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
+        )
+        super().add_arguments()
+
     def list_process_sets(self) -> Iterator[Set]:
         """
         List dataset sets associated to the worker's process. This helper is not available in developer mode.
@@ -73,6 +123,26 @@ class DatasetMixin:
 
         return map(lambda result: Element(**result["element"]), results)
 
+    def list_sets(self) -> Iterator[Set]:
+        """
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
+
+        :returns: An iterator of ``Set`` objects.
+        """
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information if not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.api_client.request("RetrieveDataset", id=dataset_id)
+                )
+
+            yield Set(name=set_name, dataset=datasets[dataset_id])
+
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
         """
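The new check_dataset_set helper can be exercised on its own; the short example below mirrors the --set argument definition added above and assumes only that the 0.4.0rc3 wheel is installed.

from argparse import ArgumentParser

from arkindex_worker.worker.dataset import check_dataset_set

parser = ArgumentParser()
# Same definition as DatasetMixin.add_arguments: each value must follow
# "<dataset_uuid>:<set_name>" and is parsed into a (UUID, set name) tuple.
parser.add_argument("--set", type=check_dataset_set, nargs="+", default=[])

args = parser.parse_args(["--set", "12341234-1234-1234-1234-123412341234:train"])
dataset_id, set_name = args.set[0]
print(dataset_id, set_name)
# 12341234-1234-1234-1234-123412341234 train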
arkindex_worker/worker/element.py
CHANGED
@@ -2,6 +2,7 @@
 ElementsWorker methods for elements and element types.
 """
 
+import os
 from collections.abc import Iterable
 from operator import attrgetter
 from typing import NamedTuple
@@ -38,6 +39,22 @@ class MissingTypeError(Exception):
 
 
 class ElementMixin:
+    def add_arguments(self):
+        """Define specific ``argparse`` arguments for the worker using this mixin"""
+        self.parser.add_argument(
+            "--elements-list",
+            help="JSON elements list to use",
+            type=open,
+            default=os.environ.get("TASK_ELEMENTS"),
+        )
+        self.parser.add_argument(
+            "--element",
+            type=str,
+            nargs="+",
+            help="One or more Arkindex element ID",
+        )
+        super().add_arguments()
+
     def list_corpus_types(self):
         """
         Loads available element types in corpus.
|
arkindex_worker/worker/entity.py
CHANGED
|
@@ -302,7 +302,7 @@ class EntityMixin:
|
|
|
302
302
|
|
|
303
303
|
created_entities = [
|
|
304
304
|
created_entity
|
|
305
|
-
for batch in make_batches(entities, "
|
|
305
|
+
for batch in make_batches(entities, "entity", batch_size)
|
|
306
306
|
for created_entity in self.api_client.request(
|
|
307
307
|
"CreateTranscriptionEntities",
|
|
308
308
|
id=transcription.id,
|
|
arkindex_worker/worker/process.py
ADDED
@@ -0,0 +1,63 @@
+from enum import Enum
+
+
+class ActivityState(Enum):
+    """
+    Processing state of an element.
+    """
+
+    Queued = "queued"
+    """
+    The element has not yet been processed by a worker.
+    """
+
+    Started = "started"
+    """
+    The element is being processed by a worker.
+    """
+
+    Processed = "processed"
+    """
+    The element has been successfully processed by a worker.
+    """
+
+    Error = "error"
+    """
+    An error occurred while processing this element.
+    """
+
+
+class ProcessMode(Enum):
+    """
+    Mode of the process of the worker.
+    """
+
+    Files = "files"
+    """
+    Processes of files (images, PDFs, IIIF, ...) imports.
+    """
+
+    Workers = "workers"
+    """
+    Processes of worker executions.
+    """
+
+    Template = "template"
+    """
+    Process templates.
+    """
+
+    S3 = "s3"
+    """
+    Processes of imports from an S3-compatible storage.
+    """
+
+    Local = "local"
+    """
+    Local processes.
+    """
+
+    Dataset = "dataset"
+    """
+    Dataset processes.
+    """
arkindex_worker/worker/task.py
CHANGED
@@ -9,8 +9,8 @@ from typing import NewType
 from uuid import UUID
 
 import requests
-from apistar.exceptions import ErrorResponse
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker import logger
 from arkindex_worker.utils import close_delete_file, create_tar_zst_archive
 
tests/__init__.py
CHANGED
tests/conftest.py
CHANGED
@@ -23,10 +23,15 @@ from arkindex_worker.cache import (
     init_cache_db,
 )
 from arkindex_worker.models import Artifact, Dataset, Set
-from arkindex_worker.worker import
+from arkindex_worker.worker import (
+    BaseWorker,
+    DatasetWorker,
+    ElementsWorker,
+    ProcessMode,
+)
 from arkindex_worker.worker.dataset import DatasetState
 from arkindex_worker.worker.transcription import TextOrientation
-from tests import CORPUS_ID,
+from tests import CORPUS_ID, SAMPLES_DIR
 
 __yaml_cache = {}
 
@@ -601,7 +606,9 @@ def mock_dataset_worker(monkeypatch, mocker, _mock_worker_run_api):
 
     dataset_worker = DatasetWorker()
     dataset_worker.configure()
-
+
+    # Update process mode
+    dataset_worker.process_information["mode"] = ProcessMode.Dataset
 
     assert not dataset_worker.is_read_only
 
tests/test_dataset_worker.py
CHANGED
@@ -3,11 +3,14 @@ import uuid
 from argparse import ArgumentTypeError
 
 import pytest
-from apistar.exceptions import ErrorResponse
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker.models import Dataset, Set
-from arkindex_worker.worker import
-
+from arkindex_worker.worker.dataset import (
+    DatasetState,
+    MissingDatasetArchive,
+    check_dataset_set,
+)
 from tests import FIXTURES_DIR, PROCESS_ID
 from tests.test_elements_worker import BASE_API_CALLS
 
tests/test_elements_worker/test_classifications.py
CHANGED
@@ -3,8 +3,8 @@ import re
 from uuid import UUID
 
 import pytest
-from apistar.exceptions import ErrorResponse
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker.cache import CachedClassification, CachedElement
 from arkindex_worker.models import Element
 from arkindex_worker.utils import DEFAULT_BATCH_SIZE
tests/test_elements_worker/test_corpus.py
CHANGED
@@ -2,8 +2,8 @@ import re
 import uuid
 
 import pytest
-from apistar.exceptions import ErrorResponse
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker.worker.corpus import CorpusExportState
 from tests import CORPUS_ID
 from tests.test_elements_worker import BASE_API_CALLS
@@ -135,3 +135,34 @@ def test_download_latest_export(responses, mock_elements_worker):
         ("GET", f"http://testserver/api/v1/corpus/{CORPUS_ID}/export/"),
         ("GET", f"http://testserver/api/v1/export/{export_id}/"),
     ]
+
+
+def test_download_export_not_a_uuid(responses, mock_elements_worker):
+    with pytest.raises(ValueError, match="export_id is not a valid uuid."):
+        mock_elements_worker.download_export("mon export")
+
+
+def test_download_export(responses, mock_elements_worker):
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/export/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff/",
+        status=302,
+        body=b"some SQLite export",
+        content_type="application/x-sqlite3",
+        stream=True,
+    )
+
+    export = mock_elements_worker.download_export(
+        "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff"
+    )
+    assert export.name == "/tmp/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff"
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        (
+            "GET",
+            "http://testserver/api/v1/export/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff/",
+        ),
+    ]
tests/test_elements_worker/test_dataset.py
CHANGED
@@ -2,8 +2,8 @@ import json
 import logging
 
 import pytest
-from apistar.exceptions import ErrorResponse
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker.models import Dataset, Element, Set
 from arkindex_worker.worker.dataset import DatasetState
 from tests import PROCESS_ID
tests/test_elements_worker/test_elements.py
CHANGED
@@ -4,9 +4,9 @@ from argparse import Namespace
 from uuid import UUID
 
 import pytest
-from apistar.exceptions import ErrorResponse
 from responses import matchers
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker.cache import (
     SQL_VERSION,
     CachedElement,
@@ -17,7 +17,9 @@ from arkindex_worker.cache import (
 from arkindex_worker.models import Element
 from arkindex_worker.utils import DEFAULT_BATCH_SIZE
 from arkindex_worker.worker import ElementsWorker
+from arkindex_worker.worker.dataset import DatasetState
 from arkindex_worker.worker.element import MissingTypeError
+from arkindex_worker.worker.process import ProcessMode
 from tests import CORPUS_ID
 
 from . import BASE_API_CALLS
@@ -208,10 +210,12 @@ def test_get_elements_element_arg_not_uuid(mocker, mock_elements_worker):
         "arkindex_worker.worker.base.argparse.ArgumentParser.parse_args",
         return_value=Namespace(
             element=["volumeid", "pageid"],
+            config={},
             verbose=False,
             elements_list=None,
             database=None,
-            dev=
+            dev=True,
+            set=[],
         ),
     )
 
@@ -232,10 +236,12 @@ def test_get_elements_element_arg(mocker, mock_elements_worker):
                 "11111111-1111-1111-1111-111111111111",
                 "22222222-2222-2222-2222-222222222222",
             ],
+            config={},
             verbose=False,
             elements_list=None,
             database=None,
-            dev=
+            dev=True,
+            set=[],
         ),
     )
 
@@ -250,6 +256,264 @@ def test_get_elements_element_arg(mocker, mock_elements_worker):
     ]
 
 
+def test_get_elements_dataset_set_arg(responses, mocker, mock_elements_worker):
+    mocker.patch(
+        "arkindex_worker.worker.base.argparse.ArgumentParser.parse_args",
+        return_value=Namespace(
+            element=[],
+            config={},
+            verbose=False,
+            elements_list=None,
+            database=None,
+            dev=True,
+            set=[(UUID("11111111-1111-1111-1111-111111111111"), "train")],
+        ),
+    )
+
+    # Mock RetrieveDataset call
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/datasets/11111111-1111-1111-1111-111111111111/",
+        status=200,
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "name": "My dataset",
+            "description": "A dataset about cats.",
+            "sets": ["train", "dev", "test"],
+            "state": DatasetState.Complete.value,
+        },
+        content_type="application/json",
+    )
+
+    # Mock ListSetElements call
+    element = {
+        "id": "22222222-2222-2222-2222-222222222222",
+        "type": "page",
+        "name": "1",
+        "corpus": {
+            "id": "11111111-1111-1111-1111-111111111111",
+        },
+        "thumbnail_url": "http://example.com",
+        "zone": {
+            "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+            "polygon": [[0, 0], [0, 0], [0, 0]],
+            "image": {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "path": "string",
+                "width": 0,
+                "height": 0,
+                "url": "http://example.com",
+                "s3_url": "string",
+                "status": "checked",
+                "server": {
+                    "display_name": "string",
+                    "url": "http://example.com",
+                    "max_width": 2147483647,
+                    "max_height": 2147483647,
+                },
+            },
+            "url": "http://example.com",
+        },
+        "rotation_angle": 0,
+        "mirrored": False,
+        "created": "2019-08-24T14:15:22Z",
+        "classes": [
+            {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "ml_class": {
+                    "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                    "name": "string",
+                },
+                "state": "pending",
+                "confidence": 0,
+                "high_confidence": True,
+                "worker_run": {
+                    "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                    "summary": "string",
+                },
+            }
+        ],
+        "metadata": [
+            {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "type": "text",
+                "name": "string",
+                "value": "string",
+                "dates": [{"type": "exact", "year": 0, "month": 1, "day": 1}],
+            }
+        ],
+        "transcriptions": [
+            {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "text": "string",
+                "confidence": 0,
+                "orientation": "horizontal-lr",
+                "worker_run": {
+                    "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                    "summary": "string",
+                },
+            }
+        ],
+        "has_children": True,
+        "worker_run": {
+            "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+            "summary": "string",
+        },
+        "confidence": 1,
+    }
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/datasets/11111111-1111-1111-1111-111111111111/elements/?set=train&with_count=true",
+        status=200,
+        json={
+            "next": None,
+            "previous": None,
+            "results": [
+                {
+                    "set": "train",
+                    "element": element,
+                }
+            ],
+            "count": 1,
+        },
+        content_type="application/json",
+    )
+
+    worker = ElementsWorker()
+    worker.configure()
+
+    elt_list = worker.get_elements()
+
+    assert elt_list == [
+        Element(**element),
+    ]
+
+
+def test_get_elements_dataset_set_api(responses, mocker, mock_elements_worker):
+    # Mock ListProcessSets call
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/process/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff/sets/",
+        status=200,
+        json={
+            "next": None,
+            "previous": None,
+            "results": [
+                {
+                    "id": "33333333-3333-3333-3333-333333333333",
+                    "dataset": {"id": "11111111-1111-1111-1111-111111111111"},
+                    "set_name": "train",
+                }
+            ],
+            "count": 1,
+        },
+        content_type="application/json",
+    )
+
+    # Mock ListSetElements call
+    element = {
+        "id": "22222222-2222-2222-2222-222222222222",
+        "type": "page",
+        "name": "1",
+        "corpus": {
+            "id": "11111111-1111-1111-1111-111111111111",
+        },
+        "thumbnail_url": "http://example.com",
+        "zone": {
+            "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+            "polygon": [[0, 0], [0, 0], [0, 0]],
+            "image": {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "path": "string",
+                "width": 0,
+                "height": 0,
+                "url": "http://example.com",
+                "s3_url": "string",
+                "status": "checked",
+                "server": {
+                    "display_name": "string",
+                    "url": "http://example.com",
+                    "max_width": 2147483647,
+                    "max_height": 2147483647,
+                },
+            },
+            "url": "http://example.com",
+        },
+        "rotation_angle": 0,
+        "mirrored": False,
+        "created": "2019-08-24T14:15:22Z",
+        "classes": [
+            {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "ml_class": {
+                    "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                    "name": "string",
+                },
+                "state": "pending",
+                "confidence": 0,
+                "high_confidence": True,
+                "worker_run": {
+                    "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                    "summary": "string",
+                },
+            }
+        ],
+        "metadata": [
+            {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "type": "text",
+                "name": "string",
+                "value": "string",
+                "dates": [{"type": "exact", "year": 0, "month": 1, "day": 1}],
+            }
+        ],
+        "transcriptions": [
+            {
+                "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                "text": "string",
+                "confidence": 0,
+                "orientation": "horizontal-lr",
+                "worker_run": {
+                    "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+                    "summary": "string",
+                },
+            }
+        ],
+        "has_children": True,
+        "worker_run": {
+            "id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
+            "summary": "string",
+        },
+        "confidence": 1,
+    }
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/datasets/11111111-1111-1111-1111-111111111111/elements/?set=train&with_count=true",
+        status=200,
+        json={
+            "next": None,
+            "previous": None,
+            "results": [
+                {
+                    "set": "train",
+                    "element": element,
+                }
+            ],
+            "count": 1,
+        },
+        content_type="application/json",
+    )
+
+    # Update ProcessMode to Dataset
+    mock_elements_worker.process_information["mode"] = ProcessMode.Dataset
+
+    elt_list = mock_elements_worker.get_elements()
+
+    assert elt_list == [
+        Element(**element),
+    ]
+
+
 def test_get_elements_both_args_error(mocker, mock_elements_worker, tmp_path):
     elements_path = tmp_path / "elements.json"
     elements_path.write_text(
@@ -270,6 +534,7 @@ def test_get_elements_both_args_error(mocker, mock_elements_worker, tmp_path):
             elements_list=elements_path.open(),
             database=None,
             dev=False,
+            set=[],
         ),
     )
 
@@ -295,6 +560,7 @@ def test_database_arg(mocker, mock_elements_worker, tmp_path):
             elements_list=None,
             database=database_path,
             dev=False,
+            set=[],
         ),
     )
 
@@ -319,6 +585,7 @@ def test_database_arg_cache_missing_version_table(
             elements_list=None,
             database=database_path,
             dev=False,
+            set=[],
        ),
    )
 
tests/test_elements_worker/test_entities.py
CHANGED
@@ -2,8 +2,8 @@ import json
 import re
 
 import pytest
-from apistar.exceptions import ErrorResponse
 
+from arkindex.exceptions import ErrorResponse
 from arkindex.mock import MockApiClient
 from arkindex_worker.cache import CachedElement
 from arkindex_worker.models import Element
tests/test_elements_worker/test_transcriptions.py
CHANGED
@@ -3,9 +3,9 @@ import re
 from uuid import UUID
 
 import pytest
-from apistar.exceptions import ErrorResponse
 from playhouse.shortcuts import model_to_dict
 
+from arkindex.exceptions import ErrorResponse
 from arkindex_worker.cache import CachedElement, CachedTranscription
 from arkindex_worker.models import Element
 from arkindex_worker.utils import DEFAULT_BATCH_SIZE
arkindex_base_worker-0.4.0rc1.dist-info/RECORD
DELETED
@@ -1,51 +0,0 @@
-arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
-arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
-arkindex_worker/image.py,sha256=oEgVCrSHiGh3D5-UXfM6PvT17TttSxC0115irpvB3Dw,18581
-arkindex_worker/models.py,sha256=bPQzGZNs5a6z6DEcygsa8T33VOqPlMUbwKzHqlKzwbw,9923
-arkindex_worker/utils.py,sha256=q1EeLdC6ebYIH-C0LOAqw2cNpjCjVoP-Vbr-39mF4w0,9884
-arkindex_worker/worker/__init__.py,sha256=Xzn20bD4THFcnDfPjZeE-uU41m_whs_3yA0WjZb9uqk,18195
-arkindex_worker/worker/base.py,sha256=wyEJB5_zcy4cTvqSXMhX8DLaWQVgvIKO77-uovcprq4,19539
-arkindex_worker/worker/classification.py,sha256=ECm1cnQPOj_9m-CoO0e182ElSySAUOoyddHrORbShhc,10951
-arkindex_worker/worker/corpus.py,sha256=s9bCxOszJMwRq1WWAmKjWq888mjDfbaJ18Wo7h-rNOw,1827
-arkindex_worker/worker/dataset.py,sha256=UXElhhARca9m7Himp-yxD5dAqWbdxDKWOUJUGgeCZXI,2934
-arkindex_worker/worker/element.py,sha256=1qTnz9Y4nbTSxn274-sRmM2stzT5wJrsbshxXHlBoPw,44789
-arkindex_worker/worker/entity.py,sha256=qGjQvOVXfP84rER0Dkui6q-rb9nTWerHVG0Z5voB8pU,15229
-arkindex_worker/worker/image.py,sha256=t_Az6IGnj0EZyvcA4XxfPikOUjn_pztgsyxTkFZhaXU,621
-arkindex_worker/worker/metadata.py,sha256=VRajtd2kaBvar9GercX4knvR6l1WFYjoCdJWU9ccKgk,7291
-arkindex_worker/worker/task.py,sha256=1O9zrWXxe3na3TOcoHX5Pxn1875v7EU08BSsCPnb62g,1519
-arkindex_worker/worker/training.py,sha256=qnBFEk11JOWWPLTbjF-lZ9iFBdTPpQzZAzQ9a03J1j4,10874
-arkindex_worker/worker/transcription.py,sha256=52RY9kYsiR1sz9FxOigyo12Ker3VDbQ4U42gK9DpR3g,21146
-arkindex_worker/worker/version.py,sha256=JIT7OI3Mo7RPkNrjOB9hfqrsG-FYygz_zi4l8PbkuAo,1960
-hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
-tests/__init__.py,sha256=6aeTMHf4q_dKY4jIZWg1KT70VKaLvVlzCxh-Uu_cWiQ,241
-tests/conftest.py,sha256=KNBZ0xMC9xX2pKQXp_4XwVU07JGeTSFeM4rN2RpipfY,21522
-tests/test_base_worker.py,sha256=2EIYcd_3f9O0zB5WiGIQV0Cn9wndLvnEnSfcAE1qWWU,30607
-tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
-tests/test_dataset_worker.py,sha256=d9HG36qnO5HXu9vQ0UTBvdTSRR21FVq1FNoXM-vZbPk,22105
-tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
-tests/test_image.py,sha256=J3jqB5OhcdCpB6n0UnwivxrMlne8YjFLXhq1gBMANrs,26711
-tests/test_merge.py,sha256=TuOeUS0UCz66DPOQFFhc4NQBxIjZL9f5czi4XnvGrr4,8270
-tests/test_utils.py,sha256=_WJUPnt-pM_TQ0er4yjPZy-u_LePrHq1lxwk_teky7M,2544
-tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
-tests/test_elements_worker/test_classifications.py,sha256=fXZ8cSzIWwZ6LHsY7tKsy9-Pp9fKyKUStIXS4ViBcek,27779
-tests/test_elements_worker/test_cli.py,sha256=a23i1pUDbXi23MUtbWwGEcLLrmc_YlrbDgOG3h66wLM,2620
-tests/test_elements_worker/test_corpus.py,sha256=c_LUHvkJIYgk_wXF06VQPNOoWfiZ06XpjOXrJ7MRiBc,4479
-tests/test_elements_worker/test_dataset.py,sha256=lSXqubhg1EEq2Y2goE8Y2RYaqIpM9Iejq6fGNW2BczU,11411
-tests/test_elements_worker/test_elements.py,sha256=PBVRIQB8yTCCa22A0VJKIsJSa4gvagDVZVtZT8mlZF0,107199
-tests/test_elements_worker/test_entities.py,sha256=oav2dtvWWavQe1l3Drbxw1Ta2ocUJEVxJfDQ_r6-rYQ,36181
-tests/test_elements_worker/test_image.py,sha256=_E3UGdDOwTo1MW5KMS81PrdeSPBPWinWYoQPNy2F9Ro,2077
-tests/test_elements_worker/test_metadata.py,sha256=cm2NNaXxBYmYMkPexSPVTAqb2skDTB4mliwQCLz8Y98,22293
-tests/test_elements_worker/test_task.py,sha256=7Sr3fbjdgWUXJUhJEiC9CwnbhQIQX3rCInmHMIrmA38,5573
-tests/test_elements_worker/test_training.py,sha256=Qxi9EzGr_uKcn2Fh5aE6jNrq1K8QKLiOiSew4upASPs,8721
-tests/test_elements_worker/test_transcriptions.py,sha256=FNY6E26iTKqe7LP9LO72By4oV4g9hBIZYTU9BAc_w7I,77060
-tests/test_elements_worker/test_worker.py,sha256=AuFDyqncIusT-rMMY4sEay9MqGvoNuSuZQq-5rHN02U,10803
-worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
-worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
-worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
-worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
-arkindex_base_worker-0.4.0rc1.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
-arkindex_base_worker-0.4.0rc1.dist-info/METADATA,sha256=22DYiI2CtAzJ9d0P21Y2ZlAoBFX_Ks-yRQMoYlMO5KM,3303
-arkindex_base_worker-0.4.0rc1.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
-arkindex_base_worker-0.4.0rc1.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
-arkindex_base_worker-0.4.0rc1.dist-info/RECORD,,
{arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/LICENSE
RENAMED
File without changes
{arkindex_base_worker-0.4.0rc1.dist-info → arkindex_base_worker-0.4.0rc3.dist-info}/top_level.txt
RENAMED
File without changes