arkindex-base-worker 0.3.6rc5__py3-none-any.whl → 0.3.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/METADATA +14 -16
  2. arkindex_base_worker-0.3.7.post1.dist-info/RECORD +47 -0
  3. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/top_level.txt +2 -0
  5. arkindex_worker/cache.py +14 -0
  6. arkindex_worker/image.py +29 -19
  7. arkindex_worker/models.py +14 -2
  8. arkindex_worker/utils.py +17 -3
  9. arkindex_worker/worker/__init__.py +122 -125
  10. arkindex_worker/worker/base.py +25 -45
  11. arkindex_worker/worker/classification.py +18 -25
  12. arkindex_worker/worker/dataset.py +24 -18
  13. arkindex_worker/worker/element.py +45 -6
  14. arkindex_worker/worker/entity.py +35 -4
  15. arkindex_worker/worker/metadata.py +21 -11
  16. arkindex_worker/worker/training.py +16 -0
  17. arkindex_worker/worker/transcription.py +45 -5
  18. arkindex_worker/worker/version.py +22 -0
  19. hooks/pre_gen_project.py +3 -0
  20. tests/conftest.py +15 -7
  21. tests/test_base_worker.py +0 -6
  22. tests/test_dataset_worker.py +292 -410
  23. tests/test_elements_worker/test_classifications.py +365 -539
  24. tests/test_elements_worker/test_cli.py +1 -1
  25. tests/test_elements_worker/test_dataset.py +97 -116
  26. tests/test_elements_worker/test_elements.py +227 -61
  27. tests/test_elements_worker/test_entities.py +22 -2
  28. tests/test_elements_worker/test_metadata.py +53 -27
  29. tests/test_elements_worker/test_training.py +35 -0
  30. tests/test_elements_worker/test_transcriptions.py +149 -16
  31. tests/test_elements_worker/test_worker.py +19 -6
  32. tests/test_image.py +37 -0
  33. tests/test_utils.py +23 -1
  34. worker-demo/tests/__init__.py +0 -0
  35. worker-demo/tests/conftest.py +32 -0
  36. worker-demo/tests/test_worker.py +12 -0
  37. worker-demo/worker_demo/__init__.py +6 -0
  38. worker-demo/worker_demo/worker.py +19 -0
  39. arkindex_base_worker-0.3.6rc5.dist-info/RECORD +0 -41
  40. {arkindex_base_worker-0.3.6rc5.dist-info → arkindex_base_worker-0.3.7.post1.dist-info}/LICENSE +0 -0
@@ -24,7 +24,7 @@ def test_cli_default(monkeypatch):
24
24
  )
25
25
  )
26
26
 
27
- monkeypatch.setenv("TASK_ELEMENTS", path)
27
+ monkeypatch.setenv("TASK_ELEMENTS", str(path))
28
28
  monkeypatch.setattr(sys, "argv", ["worker"])
29
29
  worker = ElementsWorker()
30
30
  worker.configure()
@@ -4,13 +4,13 @@ import logging
4
4
  import pytest
5
5
  from apistar.exceptions import ErrorResponse
6
6
 
7
- from arkindex_worker.models import Dataset
7
+ from arkindex_worker.models import Dataset, Element, Set
8
8
  from arkindex_worker.worker.dataset import DatasetState
9
9
  from tests.conftest import PROCESS_ID
10
10
  from tests.test_elements_worker import BASE_API_CALLS
11
11
 
12
12
 
13
- def test_list_process_datasets_readonly_error(mock_dataset_worker):
13
+ def test_list_process_sets_readonly_error(mock_dataset_worker):
14
14
  # Set worker in read_only mode
15
15
  mock_dataset_worker.worker_run_id = None
16
16
  assert mock_dataset_worker.is_read_only
@@ -18,73 +18,91 @@ def test_list_process_datasets_readonly_error(mock_dataset_worker):
18
18
  with pytest.raises(
19
19
  AssertionError, match="This helper is not available in read-only mode."
20
20
  ):
21
- mock_dataset_worker.list_process_datasets()
21
+ mock_dataset_worker.list_process_sets()
22
22
 
23
23
 
24
- def test_list_process_datasets_api_error(responses, mock_dataset_worker):
24
+ def test_list_process_sets_api_error(responses, mock_dataset_worker):
25
25
  responses.add(
26
26
  responses.GET,
27
- f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/",
27
+ f"http://testserver/api/v1/process/{PROCESS_ID}/sets/",
28
28
  status=500,
29
29
  )
30
30
 
31
31
  with pytest.raises(
32
32
  Exception, match="Stopping pagination as data will be incomplete"
33
33
  ):
34
- next(mock_dataset_worker.list_process_datasets())
34
+ next(mock_dataset_worker.list_process_sets())
35
35
 
36
36
  assert len(responses.calls) == len(BASE_API_CALLS) + 5
37
37
  assert [
38
38
  (call.request.method, call.request.url) for call in responses.calls
39
39
  ] == BASE_API_CALLS + [
40
40
  # The API call is retried 5 times
41
- ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/"),
42
- ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/"),
43
- ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/"),
44
- ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/"),
45
- ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/"),
41
+ ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
42
+ ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
43
+ ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
44
+ ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
45
+ ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
46
46
  ]
47
47
 
48
48
 
49
- def test_list_process_datasets(
49
+ def test_list_process_sets(
50
50
  responses,
51
51
  mock_dataset_worker,
52
52
  ):
53
53
  expected_results = [
54
54
  {
55
- "id": "dataset_1",
56
- "name": "Dataset 1",
57
- "description": "My first great dataset",
58
- "sets": ["train", "val", "test"],
59
- "state": "open",
60
- "corpus_id": "corpus_id",
61
- "creator": "test@teklia.com",
62
- "task_id": "task_id_1",
55
+ "id": "set_1",
56
+ "dataset": {
57
+ "id": "dataset_1",
58
+ "name": "Dataset 1",
59
+ "description": "My first great dataset",
60
+ "sets": [
61
+ {"id": "set_1", "name": "train"},
62
+ {"id": "set_2", "name": "val"},
63
+ ],
64
+ "state": "open",
65
+ "corpus_id": "corpus_id",
66
+ "creator": "test@teklia.com",
67
+ "task_id": "task_id_1",
68
+ },
69
+ "set_name": "train",
63
70
  },
64
71
  {
65
- "id": "dataset_2",
66
- "name": "Dataset 2",
67
- "description": "My second great dataset",
68
- "sets": ["train", "val"],
69
- "state": "complete",
70
- "corpus_id": "corpus_id",
71
- "creator": "test@teklia.com",
72
- "task_id": "task_id_2",
72
+ "id": "set_2",
73
+ "dataset": {
74
+ "id": "dataset_1",
75
+ "name": "Dataset 1",
76
+ "description": "My first great dataset",
77
+ "sets": [
78
+ {"id": "set_1", "name": "train"},
79
+ {"id": "set_2", "name": "val"},
80
+ ],
81
+ "state": "open",
82
+ "corpus_id": "corpus_id",
83
+ "creator": "test@teklia.com",
84
+ "task_id": "task_id_1",
85
+ },
86
+ "set_name": "val",
73
87
  },
74
88
  {
75
- "id": "dataset_3",
76
- "name": "Dataset 3 (TRASHME)",
77
- "description": "My third dataset, in error",
78
- "sets": ["nonsense", "random set"],
79
- "state": "error",
80
- "corpus_id": "corpus_id",
81
- "creator": "test@teklia.com",
82
- "task_id": "task_id_3",
89
+ "id": "set_3",
90
+ "dataset": {
91
+ "id": "dataset_2",
92
+ "name": "Dataset 2",
93
+ "description": "My second great dataset",
94
+ "sets": [{"id": "set_3", "name": "my_set"}],
95
+ "state": "complete",
96
+ "corpus_id": "corpus_id",
97
+ "creator": "test@teklia.com",
98
+ "task_id": "task_id_2",
99
+ },
100
+ "set_name": "my_set",
83
101
  },
84
102
  ]
85
103
  responses.add(
86
104
  responses.GET,
87
- f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/",
105
+ f"http://testserver/api/v1/process/{PROCESS_ID}/sets/",
88
106
  status=200,
89
107
  json={
90
108
  "count": 3,
@@ -93,50 +111,54 @@ def test_list_process_datasets(
93
111
  },
94
112
  )
95
113
 
96
- for idx, dataset in enumerate(mock_dataset_worker.list_process_datasets()):
97
- assert isinstance(dataset, Dataset)
98
- assert dataset == expected_results[idx]
114
+ for idx, dataset_set in enumerate(mock_dataset_worker.list_process_sets()):
115
+ assert isinstance(dataset_set, Set)
116
+ assert dataset_set.name == expected_results[idx]["set_name"]
117
+
118
+ assert isinstance(dataset_set.dataset, Dataset)
119
+ assert dataset_set.dataset == expected_results[idx]["dataset"]
99
120
 
100
121
  assert len(responses.calls) == len(BASE_API_CALLS) + 1
101
122
  assert [
102
123
  (call.request.method, call.request.url) for call in responses.calls
103
124
  ] == BASE_API_CALLS + [
104
- ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/datasets/"),
125
+ ("GET", f"http://testserver/api/v1/process/{PROCESS_ID}/sets/"),
105
126
  ]
106
127
 
107
128
 
108
129
  @pytest.mark.parametrize(
109
130
  ("payload", "error"),
110
131
  [
111
- # Dataset
132
+ # Set
112
133
  (
113
- {"dataset": None},
114
- "dataset shouldn't be null and should be a Dataset",
134
+ {"dataset_set": None},
135
+ "dataset_set shouldn't be null and should be a Set",
115
136
  ),
116
137
  (
117
- {"dataset": "not Dataset type"},
118
- "dataset shouldn't be null and should be a Dataset",
138
+ {"dataset_set": "not Set type"},
139
+ "dataset_set shouldn't be null and should be a Set",
119
140
  ),
120
141
  ],
121
142
  )
122
- def test_list_dataset_elements_wrong_param_dataset(mock_dataset_worker, payload, error):
143
+ def test_list_set_elements_wrong_param_dataset_set(mock_dataset_worker, payload, error):
123
144
  with pytest.raises(AssertionError, match=error):
124
- mock_dataset_worker.list_dataset_elements(**payload)
145
+ mock_dataset_worker.list_set_elements(**payload)
125
146
 
126
147
 
127
- def test_list_dataset_elements_api_error(
128
- responses, mock_dataset_worker, default_dataset
148
+ def test_list_set_elements_api_error(
149
+ responses, mock_dataset_worker, default_dataset, default_train_set
129
150
  ):
151
+ query_params = f"?set={default_train_set.name}&with_count=true"
130
152
  responses.add(
131
153
  responses.GET,
132
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
154
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
133
155
  status=500,
134
156
  )
135
157
 
136
158
  with pytest.raises(
137
159
  Exception, match="Stopping pagination as data will be incomplete"
138
160
  ):
139
- next(mock_dataset_worker.list_dataset_elements(dataset=default_dataset))
161
+ next(mock_dataset_worker.list_set_elements(dataset_set=default_train_set))
140
162
 
141
163
  assert len(responses.calls) == len(BASE_API_CALLS) + 5
142
164
  assert [
@@ -145,69 +167,40 @@ def test_list_dataset_elements_api_error(
145
167
  # The API call is retried 5 times
146
168
  (
147
169
  "GET",
148
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
170
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
149
171
  ),
150
172
  (
151
173
  "GET",
152
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
174
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
153
175
  ),
154
176
  (
155
177
  "GET",
156
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
178
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
157
179
  ),
158
180
  (
159
181
  "GET",
160
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
182
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
161
183
  ),
162
184
  (
163
185
  "GET",
164
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
186
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
165
187
  ),
166
188
  ]
167
189
 
168
190
 
169
- def test_list_dataset_elements(
191
+ def test_list_set_elements(
170
192
  responses,
171
193
  mock_dataset_worker,
172
194
  default_dataset,
195
+ default_train_set,
173
196
  ):
174
197
  expected_results = [
175
198
  {
176
- "set": "set_1",
177
- "element": {
178
- "id": "0000",
179
- "type": "page",
180
- "name": "Test",
181
- "corpus": {},
182
- "thumbnail_url": None,
183
- "zone": {},
184
- "best_classes": None,
185
- "has_children": None,
186
- "worker_version_id": None,
187
- "worker_run_id": None,
188
- },
189
- },
190
- {
191
- "set": "set_1",
192
- "element": {
193
- "id": "1111",
194
- "type": "page",
195
- "name": "Test 2",
196
- "corpus": {},
197
- "thumbnail_url": None,
198
- "zone": {},
199
- "best_classes": None,
200
- "has_children": None,
201
- "worker_version_id": None,
202
- "worker_run_id": None,
203
- },
204
- },
205
- {
206
- "set": "set_2",
199
+ "set": "train",
207
200
  "element": {
208
- "id": "2222",
201
+ "id": "element_1",
209
202
  "type": "page",
210
- "name": "Test 3",
203
+ "name": "1",
211
204
  "corpus": {},
212
205
  "thumbnail_url": None,
213
206
  "zone": {},
@@ -216,41 +209,29 @@ def test_list_dataset_elements(
216
209
  "worker_version_id": None,
217
210
  "worker_run_id": None,
218
211
  },
219
- },
220
- {
221
- "set": "set_3",
222
- "element": {
223
- "id": "3333",
224
- "type": "page",
225
- "name": "Test 4",
226
- "corpus": {},
227
- "thumbnail_url": None,
228
- "zone": {},
229
- "best_classes": None,
230
- "has_children": None,
231
- "worker_version_id": None,
232
- "worker_run_id": None,
233
- },
234
- },
212
+ }
235
213
  ]
214
+ expected_results.append({**expected_results[-1]})
215
+ expected_results[-1]["element"]["id"] = "element_2"
216
+ expected_results[-1]["element"]["name"] = "2"
217
+
218
+ query_params = f"?set={default_train_set.name}&with_count=true"
236
219
  responses.add(
237
220
  responses.GET,
238
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
221
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
239
222
  status=200,
240
223
  json={
241
- "count": 4,
224
+ "count": 2,
242
225
  "next": None,
243
226
  "results": expected_results,
244
227
  },
245
228
  )
246
229
 
247
230
  for idx, element in enumerate(
248
- mock_dataset_worker.list_dataset_elements(dataset=default_dataset)
231
+ mock_dataset_worker.list_set_elements(dataset_set=default_train_set)
249
232
  ):
250
- assert element == (
251
- expected_results[idx]["set"],
252
- expected_results[idx]["element"],
253
- )
233
+ assert isinstance(element, Element)
234
+ assert element == expected_results[idx]["element"]
254
235
 
255
236
  assert len(responses.calls) == len(BASE_API_CALLS) + 1
256
237
  assert [
@@ -258,8 +239,8 @@ def test_list_dataset_elements(
258
239
  ] == BASE_API_CALLS + [
259
240
  (
260
241
  "GET",
261
- f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
262
- ),
242
+ f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/{query_params}",
243
+ )
263
244
  ]
264
245
 
265
246