arkindex-base-worker 0.3.6rc5__py3-none-any.whl → 0.3.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/METADATA +14 -16
- arkindex_base_worker-0.3.7.post1.dist-info/RECORD +47 -0
- {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/top_level.txt +2 -0
- arkindex_worker/cache.py +14 -0
- arkindex_worker/image.py +29 -19
- arkindex_worker/models.py +14 -2
- arkindex_worker/utils.py +17 -3
- arkindex_worker/worker/__init__.py +122 -125
- arkindex_worker/worker/base.py +25 -45
- arkindex_worker/worker/classification.py +18 -25
- arkindex_worker/worker/dataset.py +24 -18
- arkindex_worker/worker/element.py +45 -6
- arkindex_worker/worker/entity.py +35 -4
- arkindex_worker/worker/metadata.py +21 -11
- arkindex_worker/worker/training.py +16 -0
- arkindex_worker/worker/transcription.py +45 -5
- arkindex_worker/worker/version.py +22 -0
- hooks/pre_gen_project.py +3 -0
- tests/conftest.py +15 -7
- tests/test_base_worker.py +0 -6
- tests/test_dataset_worker.py +292 -410
- tests/test_elements_worker/test_classifications.py +365 -539
- tests/test_elements_worker/test_cli.py +1 -1
- tests/test_elements_worker/test_dataset.py +97 -116
- tests/test_elements_worker/test_elements.py +227 -61
- tests/test_elements_worker/test_entities.py +22 -2
- tests/test_elements_worker/test_metadata.py +53 -27
- tests/test_elements_worker/test_training.py +35 -0
- tests/test_elements_worker/test_transcriptions.py +149 -16
- tests/test_elements_worker/test_worker.py +19 -6
- tests/test_image.py +37 -0
- tests/test_utils.py +23 -1
- worker-demo/tests/__init__.py +0 -0
- worker-demo/tests/conftest.py +32 -0
- worker-demo/tests/test_worker.py +12 -0
- worker-demo/worker_demo/__init__.py +6 -0
- worker-demo/worker_demo/worker.py +19 -0
- arkindex_base_worker-0.3.6rc5.dist-info/RECORD +0 -41
- {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/LICENSE +0 -0
|
@@ -4,13 +4,13 @@ import logging
|
|
|
4
4
|
import pytest
|
|
5
5
|
from apistar.exceptions import ErrorResponse
|
|
6
6
|
|
|
7
|
-
from arkindex_worker.models import Dataset
|
|
7
|
+
from arkindex_worker.models import Dataset, Element, Set
|
|
8
8
|
from arkindex_worker.worker.dataset import DatasetState
|
|
9
9
|
from tests.conftest import PROCESS_ID
|
|
10
10
|
from tests.test_elements_worker import BASE_API_CALLS
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
def
|
|
13
|
+
def test_list_process_sets_readonly_error(mock_dataset_worker):
|
|
14
14
|
# Set worker in read_only mode
|
|
15
15
|
mock_dataset_worker.worker_run_id = None
|
|
16
16
|
assert mock_dataset_worker.is_read_only
|
|
@@ -18,73 +18,91 @@ def test_list_process_datasets_readonly_error(mock_dataset_worker):
|
|
|
18
18
|
with pytest.raises(
|
|
19
19
|
AssertionError, match="This helper is not available in read-only mode."
|
|
20
20
|
):
|
|
21
|
-
mock_dataset_worker.
|
|
21
|
+
mock_dataset_worker.list_process_sets()
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
def
|
|
24
|
+
def test_list_process_sets_api_error(responses, mock_dataset_worker):
|
|
25
25
|
responses.add(
|
|
26
26
|
responses.GET,
|
|
27
|
-
f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
27
|
+
f"http://testserver/api/v1/process/{PROCESS_ID}/sets/",
|
|
28
28
|
status=500,
|
|
29
29
|
)
|
|
30
30
|
|
|
31
31
|
with pytest.raises(
|
|
32
32
|
Exception, match="Stopping pagination as data will be incomplete"
|
|
33
33
|
):
|
|
34
|
-
next(mock_dataset_worker.
|
|
34
|
+
next(mock_dataset_worker.list_process_sets())
|
|
35
35
|
|
|
36
36
|
assert len(responses.calls) == len(BASE_API_CALLS) + 5
|
|
37
37
|
assert [
|
|
38
38
|
(call.request.method, call.request.url) for call in responses.calls
|
|
39
39
|
] == BASE_API_CALLS + [
|
|
40
40
|
# The API call is retried 5 times
|
|
41
|
-
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
42
|
-
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
43
|
-
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
44
|
-
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
45
|
-
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
41
|
+
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
|
|
42
|
+
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
|
|
43
|
+
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
|
|
44
|
+
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
|
|
45
|
+
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
|
|
46
46
|
]
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
def
|
|
49
|
+
def test_list_process_sets(
|
|
50
50
|
responses,
|
|
51
51
|
mock_dataset_worker,
|
|
52
52
|
):
|
|
53
53
|
expected_results = [
|
|
54
54
|
{
|
|
55
|
-
"id": "
|
|
56
|
-
"
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
55
|
+
"id": "set_1",
|
|
56
|
+
"dataset": {
|
|
57
|
+
"id": "dataset_1",
|
|
58
|
+
"name": "Dataset 1",
|
|
59
|
+
"description": "My first great dataset",
|
|
60
|
+
"sets": [
|
|
61
|
+
{"id": "set_1", "name": "train"},
|
|
62
|
+
{"id": "set_2", "name": "val"},
|
|
63
|
+
],
|
|
64
|
+
"state": "open",
|
|
65
|
+
"corpus_id": "corpus_id",
|
|
66
|
+
"creator": "test@teklia.com",
|
|
67
|
+
"task_id": "task_id_1",
|
|
68
|
+
},
|
|
69
|
+
"set_name": "train",
|
|
63
70
|
},
|
|
64
71
|
{
|
|
65
|
-
"id": "
|
|
66
|
-
"
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
72
|
+
"id": "set_2",
|
|
73
|
+
"dataset": {
|
|
74
|
+
"id": "dataset_1",
|
|
75
|
+
"name": "Dataset 1",
|
|
76
|
+
"description": "My first great dataset",
|
|
77
|
+
"sets": [
|
|
78
|
+
{"id": "set_1", "name": "train"},
|
|
79
|
+
{"id": "set_2", "name": "val"},
|
|
80
|
+
],
|
|
81
|
+
"state": "open",
|
|
82
|
+
"corpus_id": "corpus_id",
|
|
83
|
+
"creator": "test@teklia.com",
|
|
84
|
+
"task_id": "task_id_1",
|
|
85
|
+
},
|
|
86
|
+
"set_name": "val",
|
|
73
87
|
},
|
|
74
88
|
{
|
|
75
|
-
"id": "
|
|
76
|
-
"
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
89
|
+
"id": "set_3",
|
|
90
|
+
"dataset": {
|
|
91
|
+
"id": "dataset_2",
|
|
92
|
+
"name": "Dataset 2",
|
|
93
|
+
"description": "My second great dataset",
|
|
94
|
+
"sets": [{"id": "set_3", "name": "my_set"}],
|
|
95
|
+
"state": "complete",
|
|
96
|
+
"corpus_id": "corpus_id",
|
|
97
|
+
"creator": "test@teklia.com",
|
|
98
|
+
"task_id": "task_id_2",
|
|
99
|
+
},
|
|
100
|
+
"set_name": "my_set",
|
|
83
101
|
},
|
|
84
102
|
]
|
|
85
103
|
responses.add(
|
|
86
104
|
responses.GET,
|
|
87
|
-
f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
105
|
+
f"http://testserver/api/v1/process/{PROCESS_ID}/sets/",
|
|
88
106
|
status=200,
|
|
89
107
|
json={
|
|
90
108
|
"count": 3,
|
|
@@ -93,50 +111,54 @@ def test_list_process_datasets(
|
|
|
93
111
|
},
|
|
94
112
|
)
|
|
95
113
|
|
|
96
|
-
for idx,
|
|
97
|
-
assert isinstance(
|
|
98
|
-
assert
|
|
114
|
+
for idx, dataset_set in enumerate(mock_dataset_worker.list_process_sets()):
|
|
115
|
+
assert isinstance(dataset_set, Set)
|
|
116
|
+
assert dataset_set.name == expected_results[idx]["set_name"]
|
|
117
|
+
|
|
118
|
+
assert isinstance(dataset_set.dataset, Dataset)
|
|
119
|
+
assert dataset_set.dataset == expected_results[idx]["dataset"]
|
|
99
120
|
|
|
100
121
|
assert len(responses.calls) == len(BASE_API_CALLS) + 1
|
|
101
122
|
assert [
|
|
102
123
|
(call.request.method, call.request.url) for call in responses.calls
|
|
103
124
|
] == BASE_API_CALLS + [
|
|
104
|
-
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/
|
|
125
|
+
("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
|
|
105
126
|
]
|
|
106
127
|
|
|
107
128
|
|
|
108
129
|
@pytest.mark.parametrize(
|
|
109
130
|
("payload", "error"),
|
|
110
131
|
[
|
|
111
|
-
#
|
|
132
|
+
# Set
|
|
112
133
|
(
|
|
113
|
-
{"
|
|
114
|
-
"
|
|
134
|
+
{"dataset_set": None},
|
|
135
|
+
"dataset_set shouldn't be null and should be a Set",
|
|
115
136
|
),
|
|
116
137
|
(
|
|
117
|
-
{"
|
|
118
|
-
"
|
|
138
|
+
{"dataset_set": "not Set type"},
|
|
139
|
+
"dataset_set shouldn't be null and should be a Set",
|
|
119
140
|
),
|
|
120
141
|
],
|
|
121
142
|
)
|
|
122
|
-
def
|
|
143
|
+
def test_list_set_elements_wrong_param_dataset_set(mock_dataset_worker, payload, error):
|
|
123
144
|
with pytest.raises(AssertionError, match=error):
|
|
124
|
-
mock_dataset_worker.
|
|
145
|
+
mock_dataset_worker.list_set_elements(**payload)
|
|
125
146
|
|
|
126
147
|
|
|
127
|
-
def
|
|
128
|
-
responses, mock_dataset_worker, default_dataset
|
|
148
|
+
def test_list_set_elements_api_error(
|
|
149
|
+
responses, mock_dataset_worker, default_dataset, default_train_set
|
|
129
150
|
):
|
|
151
|
+
query_params = f"?set={default_train_set.name}&with_count=true"
|
|
130
152
|
responses.add(
|
|
131
153
|
responses.GET,
|
|
132
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
|
|
154
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
133
155
|
status=500,
|
|
134
156
|
)
|
|
135
157
|
|
|
136
158
|
with pytest.raises(
|
|
137
159
|
Exception, match="Stopping pagination as data will be incomplete"
|
|
138
160
|
):
|
|
139
|
-
next(mock_dataset_worker.
|
|
161
|
+
next(mock_dataset_worker.list_set_elements(dataset_set=default_train_set))
|
|
140
162
|
|
|
141
163
|
assert len(responses.calls) == len(BASE_API_CALLS) + 5
|
|
142
164
|
assert [
|
|
@@ -145,69 +167,40 @@ def test_list_dataset_elements_api_error(
|
|
|
145
167
|
# The API call is retried 5 times
|
|
146
168
|
(
|
|
147
169
|
"GET",
|
|
148
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements
|
|
170
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
149
171
|
),
|
|
150
172
|
(
|
|
151
173
|
"GET",
|
|
152
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements
|
|
174
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
153
175
|
),
|
|
154
176
|
(
|
|
155
177
|
"GET",
|
|
156
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements
|
|
178
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
157
179
|
),
|
|
158
180
|
(
|
|
159
181
|
"GET",
|
|
160
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements
|
|
182
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
161
183
|
),
|
|
162
184
|
(
|
|
163
185
|
"GET",
|
|
164
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements
|
|
186
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
165
187
|
),
|
|
166
188
|
]
|
|
167
189
|
|
|
168
190
|
|
|
169
|
-
def
|
|
191
|
+
def test_list_set_elements(
|
|
170
192
|
responses,
|
|
171
193
|
mock_dataset_worker,
|
|
172
194
|
default_dataset,
|
|
195
|
+
default_train_set,
|
|
173
196
|
):
|
|
174
197
|
expected_results = [
|
|
175
198
|
{
|
|
176
|
-
"set": "
|
|
177
|
-
"element": {
|
|
178
|
-
"id": "0000",
|
|
179
|
-
"type": "page",
|
|
180
|
-
"name": "Test",
|
|
181
|
-
"corpus": {},
|
|
182
|
-
"thumbnail_url": None,
|
|
183
|
-
"zone": {},
|
|
184
|
-
"best_classes": None,
|
|
185
|
-
"has_children": None,
|
|
186
|
-
"worker_version_id": None,
|
|
187
|
-
"worker_run_id": None,
|
|
188
|
-
},
|
|
189
|
-
},
|
|
190
|
-
{
|
|
191
|
-
"set": "set_1",
|
|
192
|
-
"element": {
|
|
193
|
-
"id": "1111",
|
|
194
|
-
"type": "page",
|
|
195
|
-
"name": "Test 2",
|
|
196
|
-
"corpus": {},
|
|
197
|
-
"thumbnail_url": None,
|
|
198
|
-
"zone": {},
|
|
199
|
-
"best_classes": None,
|
|
200
|
-
"has_children": None,
|
|
201
|
-
"worker_version_id": None,
|
|
202
|
-
"worker_run_id": None,
|
|
203
|
-
},
|
|
204
|
-
},
|
|
205
|
-
{
|
|
206
|
-
"set": "set_2",
|
|
199
|
+
"set": "train",
|
|
207
200
|
"element": {
|
|
208
|
-
"id": "
|
|
201
|
+
"id": "element_1",
|
|
209
202
|
"type": "page",
|
|
210
|
-
"name": "
|
|
203
|
+
"name": "1",
|
|
211
204
|
"corpus": {},
|
|
212
205
|
"thumbnail_url": None,
|
|
213
206
|
"zone": {},
|
|
@@ -216,41 +209,29 @@ def test_list_dataset_elements(
|
|
|
216
209
|
"worker_version_id": None,
|
|
217
210
|
"worker_run_id": None,
|
|
218
211
|
},
|
|
219
|
-
}
|
|
220
|
-
{
|
|
221
|
-
"set": "set_3",
|
|
222
|
-
"element": {
|
|
223
|
-
"id": "3333",
|
|
224
|
-
"type": "page",
|
|
225
|
-
"name": "Test 4",
|
|
226
|
-
"corpus": {},
|
|
227
|
-
"thumbnail_url": None,
|
|
228
|
-
"zone": {},
|
|
229
|
-
"best_classes": None,
|
|
230
|
-
"has_children": None,
|
|
231
|
-
"worker_version_id": None,
|
|
232
|
-
"worker_run_id": None,
|
|
233
|
-
},
|
|
234
|
-
},
|
|
212
|
+
}
|
|
235
213
|
]
|
|
214
|
+
expected_results.append({**expected_results[-1]})
|
|
215
|
+
expected_results[-1]["element"]["id"] = "element_2"
|
|
216
|
+
expected_results[-1]["element"]["name"] = "2"
|
|
217
|
+
|
|
218
|
+
query_params = f"?set={default_train_set.name}&with_count=true"
|
|
236
219
|
responses.add(
|
|
237
220
|
responses.GET,
|
|
238
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
|
|
221
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
239
222
|
status=200,
|
|
240
223
|
json={
|
|
241
|
-
"count":
|
|
224
|
+
"count": 2,
|
|
242
225
|
"next": None,
|
|
243
226
|
"results": expected_results,
|
|
244
227
|
},
|
|
245
228
|
)
|
|
246
229
|
|
|
247
230
|
for idx, element in enumerate(
|
|
248
|
-
mock_dataset_worker.
|
|
231
|
+
mock_dataset_worker.list_set_elements(dataset_set=default_train_set)
|
|
249
232
|
):
|
|
250
|
-
assert element
|
|
251
|
-
|
|
252
|
-
expected_results[idx]["element"],
|
|
253
|
-
)
|
|
233
|
+
assert isinstance(element, Element)
|
|
234
|
+
assert element == expected_results[idx]["element"]
|
|
254
235
|
|
|
255
236
|
assert len(responses.calls) == len(BASE_API_CALLS) + 1
|
|
256
237
|
assert [
|
|
@@ -258,8 +239,8 @@ def test_list_dataset_elements(
|
|
|
258
239
|
] == BASE_API_CALLS + [
|
|
259
240
|
(
|
|
260
241
|
"GET",
|
|
261
|
-
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements
|
|
262
|
-
)
|
|
242
|
+
f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
|
|
243
|
+
)
|
|
263
244
|
]
|
|
264
245
|
|
|
265
246
|
|