arkindex-base-worker 0.4.0rc5__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +10 -13
- arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
- arkindex_worker/__init__.py +3 -0
- arkindex_worker/cache.py +6 -25
- arkindex_worker/image.py +105 -66
- arkindex_worker/utils.py +2 -1
- arkindex_worker/worker/__init__.py +22 -32
- arkindex_worker/worker/base.py +16 -9
- arkindex_worker/worker/classification.py +36 -34
- arkindex_worker/worker/corpus.py +3 -3
- arkindex_worker/worker/dataset.py +9 -9
- arkindex_worker/worker/element.py +261 -231
- arkindex_worker/worker/entity.py +137 -206
- arkindex_worker/worker/image.py +3 -3
- arkindex_worker/worker/metadata.py +27 -38
- arkindex_worker/worker/process.py +24 -0
- arkindex_worker/worker/task.py +9 -9
- arkindex_worker/worker/training.py +15 -11
- arkindex_worker/worker/transcription.py +77 -71
- examples/standalone/python/worker.py +171 -0
- examples/tooled/python/worker.py +50 -0
- tests/conftest.py +22 -36
- tests/test_base_worker.py +1 -1
- tests/test_cache.py +1 -2
- tests/test_dataset_worker.py +1 -1
- tests/test_elements_worker/test_element.py +200 -26
- tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
- tests/test_elements_worker/test_metadata.py +0 -47
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_training.py +8 -8
- tests/test_elements_worker/test_worker.py +61 -14
- tests/test_image.py +244 -126
- tests/test_merge.py +0 -7
- tests/test_utils.py +37 -0
- arkindex_base_worker-0.4.0rc5.dist-info/RECORD +0 -60
- arkindex_worker/worker/version.py +0 -58
- tests/test_elements_worker/test_entity_list_and_check.py +0 -160
- tests/test_elements_worker/test_version.py +0 -60
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0
arkindex_worker/worker/entity.py
CHANGED
|
@@ -8,20 +8,17 @@ from warnings import warn
|
|
|
8
8
|
|
|
9
9
|
from peewee import IntegrityError
|
|
10
10
|
|
|
11
|
+
from arkindex.exceptions import ErrorResponse
|
|
11
12
|
from arkindex_worker import logger
|
|
12
13
|
from arkindex_worker.cache import (
|
|
13
|
-
CachedEntity,
|
|
14
14
|
CachedTranscriptionEntity,
|
|
15
15
|
unsupported_cache,
|
|
16
16
|
)
|
|
17
|
-
from arkindex_worker.models import
|
|
18
|
-
from arkindex_worker.utils import
|
|
19
|
-
pluralize,
|
|
20
|
-
)
|
|
17
|
+
from arkindex_worker.models import Transcription
|
|
18
|
+
from arkindex_worker.utils import pluralize
|
|
21
19
|
|
|
22
20
|
|
|
23
21
|
class Entity(TypedDict):
|
|
24
|
-
name: str
|
|
25
22
|
type_id: str
|
|
26
23
|
length: int
|
|
27
24
|
offset: int
|
|
@@ -36,24 +33,85 @@ class MissingEntityType(Exception):
|
|
|
36
33
|
|
|
37
34
|
|
|
38
35
|
class EntityMixin:
|
|
36
|
+
def list_corpus_entity_types(self):
|
|
37
|
+
"""
|
|
38
|
+
Loads available entity types in corpus.
|
|
39
|
+
"""
|
|
40
|
+
self.entity_types = {
|
|
41
|
+
entity_type["name"]: entity_type["id"]
|
|
42
|
+
for entity_type in self.api_client.paginate(
|
|
43
|
+
"ListCorpusEntityTypes", id=self.corpus_id
|
|
44
|
+
)
|
|
45
|
+
}
|
|
46
|
+
count = len(self.entity_types)
|
|
47
|
+
logger.info(
|
|
48
|
+
f"Loaded {count} entity {pluralize('type', count)} in corpus ({self.corpus_id})."
|
|
49
|
+
)
|
|
50
|
+
|
|
39
51
|
@unsupported_cache
|
|
52
|
+
def create_entity_type(self, name: str) -> None:
|
|
53
|
+
"""
|
|
54
|
+
Create an entity type on the given corpus.
|
|
55
|
+
|
|
56
|
+
:param name: Name of the entity type.
|
|
57
|
+
"""
|
|
58
|
+
assert name and isinstance(name, str), (
|
|
59
|
+
"name shouldn't be null and should be of type str"
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
try:
|
|
63
|
+
entity_type = self.api_client.request(
|
|
64
|
+
"CreateEntityType",
|
|
65
|
+
body={
|
|
66
|
+
"name": name,
|
|
67
|
+
"corpus": self.corpus_id,
|
|
68
|
+
},
|
|
69
|
+
)
|
|
70
|
+
self.entity_types[name] = entity_type["id"]
|
|
71
|
+
logger.info(f"Created a new entity type with name `{name}`.")
|
|
72
|
+
except ErrorResponse as e:
|
|
73
|
+
# Only reload for 400 errors
|
|
74
|
+
if e.status_code != 400:
|
|
75
|
+
raise
|
|
76
|
+
|
|
77
|
+
# Reload and make sure we have the element type now
|
|
78
|
+
logger.warning(
|
|
79
|
+
f"Unable to create the entity type `{name}`. Refreshing corpus entity types cache."
|
|
80
|
+
)
|
|
81
|
+
self.list_corpus_entity_types()
|
|
82
|
+
assert name in self.entity_types, (
|
|
83
|
+
f"Missing entity type `{name}` even after refreshing."
|
|
84
|
+
)
|
|
85
|
+
|
|
40
86
|
def check_required_entity_types(
|
|
41
87
|
self, entity_types: list[str], create_missing: bool = True
|
|
42
|
-
):
|
|
43
|
-
"""
|
|
88
|
+
) -> None:
|
|
89
|
+
"""
|
|
90
|
+
Check that every entity type needed is available in the corpus.
|
|
44
91
|
Missing ones may be created automatically if needed.
|
|
45
92
|
|
|
46
93
|
:param entity_types: Entity type names to search.
|
|
47
94
|
:param create_missing: Whether the missing types should be created. Defaults to True.
|
|
48
|
-
:raises MissingEntityType: When an entity type is missing and cannot
|
|
95
|
+
:raises MissingEntityType: When an entity type is missing and cannot be created.
|
|
49
96
|
"""
|
|
50
|
-
|
|
97
|
+
assert entity_types and isinstance(entity_types, list), (
|
|
98
|
+
"entity_types shouldn't be null and should be of type list"
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
for index, entity_type in enumerate(entity_types):
|
|
102
|
+
assert isinstance(entity_type, str), (
|
|
103
|
+
f"Entity type at index {index} in entity_types: Should be of type str"
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
assert create_missing is not None and isinstance(create_missing, bool), (
|
|
107
|
+
"create_missing shouldn't be null and should be of type bool"
|
|
108
|
+
)
|
|
109
|
+
|
|
51
110
|
if not self.entity_types:
|
|
52
|
-
# Load entity_types of corpus
|
|
53
111
|
self.list_corpus_entity_types()
|
|
54
112
|
|
|
55
113
|
for entity_type in entity_types:
|
|
56
|
-
# Do nothing if type already exists
|
|
114
|
+
# Do nothing if the type already exists
|
|
57
115
|
if entity_type in self.entity_types:
|
|
58
116
|
continue
|
|
59
117
|
|
|
@@ -63,98 +121,23 @@ class EntityMixin:
|
|
|
63
121
|
f"Entity type `{entity_type}` was not in the corpus."
|
|
64
122
|
)
|
|
65
123
|
|
|
66
|
-
# Create type if non-existent
|
|
67
|
-
self.
|
|
68
|
-
"CreateEntityType",
|
|
69
|
-
body={
|
|
70
|
-
"name": entity_type,
|
|
71
|
-
"corpus": self.corpus_id,
|
|
72
|
-
},
|
|
73
|
-
)["id"]
|
|
74
|
-
logger.info(f"Created a new entity type with name `{entity_type}`.")
|
|
75
|
-
|
|
76
|
-
def create_entity(
|
|
77
|
-
self,
|
|
78
|
-
name: str,
|
|
79
|
-
type: str,
|
|
80
|
-
metas=None,
|
|
81
|
-
validated=None,
|
|
82
|
-
):
|
|
83
|
-
"""
|
|
84
|
-
Create an entity on the given corpus.
|
|
85
|
-
If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.
|
|
86
|
-
|
|
87
|
-
:param name: Name of the entity.
|
|
88
|
-
:param type: Type of the entity.
|
|
89
|
-
"""
|
|
90
|
-
assert name and isinstance(
|
|
91
|
-
name, str
|
|
92
|
-
), "name shouldn't be null and should be of type str"
|
|
93
|
-
assert type and isinstance(
|
|
94
|
-
type, str
|
|
95
|
-
), "type shouldn't be null and should be of type str"
|
|
96
|
-
metas = metas or {}
|
|
97
|
-
if metas:
|
|
98
|
-
assert isinstance(metas, dict), "metas should be of type dict"
|
|
99
|
-
if validated is not None:
|
|
100
|
-
assert isinstance(validated, bool), "validated should be of type bool"
|
|
101
|
-
if self.is_read_only:
|
|
102
|
-
logger.warning("Cannot create entity as this worker is in read-only mode")
|
|
103
|
-
return
|
|
104
|
-
|
|
105
|
-
# Retrieve entity_type ID
|
|
106
|
-
if not self.entity_types:
|
|
107
|
-
# Load entity_types of corpus
|
|
108
|
-
self.list_corpus_entity_types()
|
|
109
|
-
|
|
110
|
-
entity_type_id = self.entity_types.get(type)
|
|
111
|
-
assert entity_type_id, f"Entity type `{type}` not found in the corpus."
|
|
112
|
-
|
|
113
|
-
entity = self.api_client.request(
|
|
114
|
-
"CreateEntity",
|
|
115
|
-
body={
|
|
116
|
-
"name": name,
|
|
117
|
-
"type_id": entity_type_id,
|
|
118
|
-
"metas": metas,
|
|
119
|
-
"validated": validated,
|
|
120
|
-
"corpus": self.corpus_id,
|
|
121
|
-
"worker_run_id": self.worker_run_id,
|
|
122
|
-
},
|
|
123
|
-
)
|
|
124
|
-
|
|
125
|
-
if self.use_cache:
|
|
126
|
-
# Store entity in local cache
|
|
127
|
-
try:
|
|
128
|
-
to_insert = [
|
|
129
|
-
{
|
|
130
|
-
"id": entity["id"],
|
|
131
|
-
"type": type,
|
|
132
|
-
"name": name,
|
|
133
|
-
"validated": validated if validated is not None else False,
|
|
134
|
-
"metas": metas,
|
|
135
|
-
"worker_run_id": self.worker_run_id,
|
|
136
|
-
}
|
|
137
|
-
]
|
|
138
|
-
CachedEntity.insert_many(to_insert).execute()
|
|
139
|
-
except IntegrityError as e:
|
|
140
|
-
logger.warning(f"Couldn't save created entity in local cache: {e}")
|
|
141
|
-
|
|
142
|
-
return entity["id"]
|
|
124
|
+
# Create the type if non-existent
|
|
125
|
+
self.create_entity_type(entity_type)
|
|
143
126
|
|
|
144
127
|
def create_transcription_entity(
|
|
145
128
|
self,
|
|
146
129
|
transcription: Transcription,
|
|
147
|
-
|
|
130
|
+
type_id: str,
|
|
148
131
|
offset: int,
|
|
149
132
|
length: int,
|
|
150
133
|
confidence: float | None = None,
|
|
151
134
|
) -> dict[str, str | int] | None:
|
|
152
135
|
"""
|
|
153
|
-
Create
|
|
136
|
+
Create an entity on an existing transcription.
|
|
154
137
|
If cache support is enabled, a `CachedTranscriptionEntity` will also be created.
|
|
155
138
|
|
|
156
139
|
:param transcription: Transcription to create the entity on.
|
|
157
|
-
:param
|
|
140
|
+
:param type_id: UUID of the entity type.
|
|
158
141
|
:param offset: Starting position of the entity in the transcription's text,
|
|
159
142
|
as a 0-based index.
|
|
160
143
|
:param length: Length of the entity in the transcription's text.
|
|
@@ -162,18 +145,18 @@ class EntityMixin:
|
|
|
162
145
|
:returns: A dict as returned by the ``CreateTranscriptionEntity`` API endpoint,
|
|
163
146
|
or None if the worker is in read-only mode.
|
|
164
147
|
"""
|
|
165
|
-
assert transcription and isinstance(
|
|
166
|
-
transcription
|
|
167
|
-
)
|
|
168
|
-
assert
|
|
169
|
-
|
|
170
|
-
)
|
|
171
|
-
assert (
|
|
172
|
-
offset
|
|
173
|
-
)
|
|
174
|
-
assert (
|
|
175
|
-
length
|
|
176
|
-
)
|
|
148
|
+
assert transcription and isinstance(transcription, Transcription), (
|
|
149
|
+
"transcription shouldn't be null and should be a Transcription"
|
|
150
|
+
)
|
|
151
|
+
assert type_id and isinstance(type_id, str), (
|
|
152
|
+
"type_id shouldn't be null and should be of type str"
|
|
153
|
+
)
|
|
154
|
+
assert offset is not None and isinstance(offset, int) and offset >= 0, (
|
|
155
|
+
"offset shouldn't be null and should be a positive integer"
|
|
156
|
+
)
|
|
157
|
+
assert length is not None and isinstance(length, int) and length > 0, (
|
|
158
|
+
"length shouldn't be null and should be a strictly positive integer"
|
|
159
|
+
)
|
|
177
160
|
assert (
|
|
178
161
|
confidence is None or isinstance(confidence, float) and 0 <= confidence <= 1
|
|
179
162
|
), "confidence should be null or a float in [0..1] range"
|
|
@@ -184,7 +167,7 @@ class EntityMixin:
|
|
|
184
167
|
return
|
|
185
168
|
|
|
186
169
|
body = {
|
|
187
|
-
"
|
|
170
|
+
"type_id": type_id,
|
|
188
171
|
"length": length,
|
|
189
172
|
"offset": offset,
|
|
190
173
|
"worker_run_id": self.worker_run_id,
|
|
@@ -192,7 +175,7 @@ class EntityMixin:
|
|
|
192
175
|
if confidence is not None:
|
|
193
176
|
body["confidence"] = confidence
|
|
194
177
|
|
|
195
|
-
|
|
178
|
+
tr_entity = self.api_client.request(
|
|
196
179
|
"CreateTranscriptionEntity",
|
|
197
180
|
id=transcription.id,
|
|
198
181
|
body=body,
|
|
@@ -203,7 +186,7 @@ class EntityMixin:
|
|
|
203
186
|
try:
|
|
204
187
|
CachedTranscriptionEntity.create(
|
|
205
188
|
transcription=transcription.id,
|
|
206
|
-
|
|
189
|
+
type=tr_entity["type"]["name"],
|
|
207
190
|
offset=offset,
|
|
208
191
|
length=length,
|
|
209
192
|
worker_run_id=self.worker_run_id,
|
|
@@ -213,7 +196,8 @@ class EntityMixin:
|
|
|
213
196
|
logger.warning(
|
|
214
197
|
f"Couldn't save created transcription entity in local cache: {e}"
|
|
215
198
|
)
|
|
216
|
-
|
|
199
|
+
|
|
200
|
+
return tr_entity
|
|
217
201
|
|
|
218
202
|
@unsupported_cache
|
|
219
203
|
def create_transcription_entities(
|
|
@@ -222,14 +206,11 @@ class EntityMixin:
|
|
|
222
206
|
entities: list[Entity],
|
|
223
207
|
) -> list[dict[str, str]]:
|
|
224
208
|
"""
|
|
225
|
-
Create multiple entities
|
|
209
|
+
Create multiple entities on a transcription in a single API request.
|
|
226
210
|
|
|
227
211
|
:param transcription: Transcription to create the entity on.
|
|
228
212
|
:param entities: List of dicts, one per element. Each dict can have the following keys:
|
|
229
213
|
|
|
230
|
-
name (str)
|
|
231
|
-
Required. Name of the entity.
|
|
232
|
-
|
|
233
214
|
type_id (str)
|
|
234
215
|
Required. ID of the EntityType of the entity.
|
|
235
216
|
|
|
@@ -242,49 +223,45 @@ class EntityMixin:
|
|
|
242
223
|
confidence (float or None)
|
|
243
224
|
Optional confidence score, between 0.0 and 1.0.
|
|
244
225
|
|
|
245
|
-
:return: List of
|
|
226
|
+
:return: List of strings, holding the UUID of each created object.
|
|
246
227
|
"""
|
|
247
|
-
assert transcription and isinstance(
|
|
248
|
-
transcription
|
|
249
|
-
)
|
|
228
|
+
assert transcription and isinstance(transcription, Transcription), (
|
|
229
|
+
"transcription shouldn't be null and should be of type Transcription"
|
|
230
|
+
)
|
|
250
231
|
|
|
251
|
-
assert entities and isinstance(
|
|
252
|
-
entities
|
|
253
|
-
)
|
|
232
|
+
assert entities and isinstance(entities, list), (
|
|
233
|
+
"entities shouldn't be null and should be of type list"
|
|
234
|
+
)
|
|
254
235
|
|
|
255
236
|
for index, entity in enumerate(entities):
|
|
256
|
-
assert isinstance(
|
|
257
|
-
|
|
258
|
-
)
|
|
259
|
-
|
|
260
|
-
name = entity.get("name")
|
|
261
|
-
assert (
|
|
262
|
-
name and isinstance(name, str)
|
|
263
|
-
), f"Entity at index {index} in entities: name shouldn't be null and should be of type str"
|
|
237
|
+
assert isinstance(entity, dict), (
|
|
238
|
+
f"Entity at index {index} in entities: Should be of type dict"
|
|
239
|
+
)
|
|
264
240
|
|
|
265
241
|
type_id = entity.get("type_id")
|
|
266
|
-
assert (
|
|
267
|
-
type_id and
|
|
268
|
-
)
|
|
242
|
+
assert type_id and isinstance(type_id, str), (
|
|
243
|
+
f"Entity at index {index} in entities: type_id shouldn't be null and should be of type str"
|
|
244
|
+
)
|
|
269
245
|
|
|
270
246
|
offset = entity.get("offset")
|
|
271
|
-
assert (
|
|
272
|
-
|
|
273
|
-
)
|
|
247
|
+
assert offset is not None and isinstance(offset, int) and offset >= 0, (
|
|
248
|
+
f"Entity at index {index} in entities: offset shouldn't be null and should be a positive integer"
|
|
249
|
+
)
|
|
274
250
|
|
|
275
251
|
length = entity.get("length")
|
|
276
|
-
assert (
|
|
277
|
-
|
|
278
|
-
)
|
|
252
|
+
assert length is not None and isinstance(length, int) and length > 0, (
|
|
253
|
+
f"Entity at index {index} in entities: length shouldn't be null and should be a strictly positive integer"
|
|
254
|
+
)
|
|
279
255
|
|
|
280
256
|
confidence = entity.get("confidence")
|
|
281
|
-
assert (
|
|
282
|
-
confidence
|
|
283
|
-
|
|
284
|
-
|
|
257
|
+
assert confidence is None or (
|
|
258
|
+
isinstance(confidence, float) and 0 <= confidence <= 1
|
|
259
|
+
), (
|
|
260
|
+
f"Entity at index {index} in entities: confidence should be None or a float in [0..1] range"
|
|
261
|
+
)
|
|
285
262
|
|
|
286
263
|
assert len(entities) == len(
|
|
287
|
-
set(map(itemgetter("offset", "length", "
|
|
264
|
+
set(map(itemgetter("offset", "length", "type_id"), entities))
|
|
288
265
|
), "entities should be unique"
|
|
289
266
|
|
|
290
267
|
if self.is_read_only:
|
|
@@ -293,16 +270,16 @@ class EntityMixin:
|
|
|
293
270
|
)
|
|
294
271
|
return
|
|
295
272
|
|
|
296
|
-
|
|
273
|
+
created_tr_entities = self.api_client.request(
|
|
297
274
|
"CreateTranscriptionEntities",
|
|
298
275
|
id=transcription.id,
|
|
299
276
|
body={
|
|
300
277
|
"worker_run_id": self.worker_run_id,
|
|
301
|
-
"
|
|
278
|
+
"transcription_entities": entities,
|
|
302
279
|
},
|
|
303
|
-
)["
|
|
280
|
+
)["transcription_entities"]
|
|
304
281
|
|
|
305
|
-
return
|
|
282
|
+
return created_tr_entities
|
|
306
283
|
|
|
307
284
|
def list_transcription_entities(
|
|
308
285
|
self,
|
|
@@ -325,9 +302,9 @@ class EntityMixin:
|
|
|
325
302
|
:param worker_run: Restrict to entities created by a worker run with this UUID. Set to False to look for manually created entities.
|
|
326
303
|
"""
|
|
327
304
|
query_params = {}
|
|
328
|
-
assert transcription and isinstance(
|
|
329
|
-
transcription
|
|
330
|
-
)
|
|
305
|
+
assert transcription and isinstance(transcription, Transcription), (
|
|
306
|
+
"transcription shouldn't be null and should be a Transcription"
|
|
307
|
+
)
|
|
331
308
|
|
|
332
309
|
if worker_version is not None:
|
|
333
310
|
warn(
|
|
@@ -335,71 +312,25 @@ class EntityMixin:
|
|
|
335
312
|
DeprecationWarning,
|
|
336
313
|
stacklevel=1,
|
|
337
314
|
)
|
|
338
|
-
assert isinstance(
|
|
339
|
-
worker_version
|
|
340
|
-
)
|
|
315
|
+
assert isinstance(worker_version, str | bool), (
|
|
316
|
+
"worker_version should be of type str or bool"
|
|
317
|
+
)
|
|
341
318
|
|
|
342
319
|
if isinstance(worker_version, bool):
|
|
343
|
-
assert (
|
|
344
|
-
worker_version
|
|
345
|
-
)
|
|
320
|
+
assert worker_version is False, (
|
|
321
|
+
"if of type bool, worker_version can only be set to False"
|
|
322
|
+
)
|
|
346
323
|
query_params["worker_version"] = worker_version
|
|
347
324
|
if worker_run is not None:
|
|
348
|
-
assert isinstance(
|
|
349
|
-
worker_run
|
|
350
|
-
)
|
|
325
|
+
assert isinstance(worker_run, str | bool), (
|
|
326
|
+
"worker_run should be of type str or bool"
|
|
327
|
+
)
|
|
351
328
|
if isinstance(worker_run, bool):
|
|
352
|
-
assert (
|
|
353
|
-
worker_run
|
|
354
|
-
)
|
|
329
|
+
assert worker_run is False, (
|
|
330
|
+
"if of type bool, worker_run can only be set to False"
|
|
331
|
+
)
|
|
355
332
|
query_params["worker_run"] = worker_run
|
|
356
333
|
|
|
357
334
|
return self.api_client.paginate(
|
|
358
335
|
"ListTranscriptionEntities", id=transcription.id, **query_params
|
|
359
336
|
)
|
|
360
|
-
|
|
361
|
-
def list_corpus_entities(
|
|
362
|
-
self,
|
|
363
|
-
name: str | None = None,
|
|
364
|
-
parent: Element | None = None,
|
|
365
|
-
):
|
|
366
|
-
"""
|
|
367
|
-
List all entities in the worker's corpus and store them in the ``self.entities`` cache.
|
|
368
|
-
:param name: Filter entities by part of their name (case-insensitive)
|
|
369
|
-
:param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
|
|
370
|
-
"""
|
|
371
|
-
query_params = {}
|
|
372
|
-
|
|
373
|
-
if name is not None:
|
|
374
|
-
assert name and isinstance(name, str), "name should be of type str"
|
|
375
|
-
query_params["name"] = name
|
|
376
|
-
|
|
377
|
-
if parent is not None:
|
|
378
|
-
assert isinstance(parent, Element), "parent should be of type Element"
|
|
379
|
-
query_params["parent"] = parent.id
|
|
380
|
-
|
|
381
|
-
self.entities = {
|
|
382
|
-
entity["id"]: entity
|
|
383
|
-
for entity in self.api_client.paginate(
|
|
384
|
-
"ListCorpusEntities", id=self.corpus_id, **query_params
|
|
385
|
-
)
|
|
386
|
-
}
|
|
387
|
-
count = len(self.entities)
|
|
388
|
-
logger.info(
|
|
389
|
-
f'Loaded {count} {pluralize("entity", count)} in corpus ({self.corpus_id})'
|
|
390
|
-
)
|
|
391
|
-
|
|
392
|
-
def list_corpus_entity_types(self):
|
|
393
|
-
"""
|
|
394
|
-
Loads available entity types in corpus.
|
|
395
|
-
"""
|
|
396
|
-
self.entity_types = {
|
|
397
|
-
entity_type["name"]: entity_type["id"]
|
|
398
|
-
for entity_type in self.api_client.paginate(
|
|
399
|
-
"ListCorpusEntityTypes", id=self.corpus_id
|
|
400
|
-
)
|
|
401
|
-
}
|
|
402
|
-
count = len(self.entity_types)
|
|
403
|
-
logger.info(
|
|
404
|
-
f'Loaded {count} entity {pluralize("type", count)} in corpus ({self.corpus_id}).'
|
|
405
|
-
)
|
arkindex_worker/worker/image.py
CHANGED
|
@@ -14,8 +14,8 @@ class ImageMixin:
|
|
|
14
14
|
:param url: URL of the image.
|
|
15
15
|
:returns: The created image.
|
|
16
16
|
"""
|
|
17
|
-
assert url and isinstance(
|
|
18
|
-
url
|
|
19
|
-
)
|
|
17
|
+
assert url and isinstance(url, str), (
|
|
18
|
+
"url shouldn't be null and should be of type str"
|
|
19
|
+
)
|
|
20
20
|
|
|
21
21
|
return Image(self.api_client.request("CreateIIIFURL", body={"url": url}))
|
|
@@ -64,7 +64,6 @@ class MetaDataMixin:
|
|
|
64
64
|
type: MetaType,
|
|
65
65
|
name: str,
|
|
66
66
|
value: str,
|
|
67
|
-
entity: str | None = None,
|
|
68
67
|
) -> str:
|
|
69
68
|
"""
|
|
70
69
|
Create a metadata on the given element through API.
|
|
@@ -73,23 +72,20 @@ class MetaDataMixin:
|
|
|
73
72
|
:param type: Type of the metadata.
|
|
74
73
|
:param name: Name of the metadata.
|
|
75
74
|
:param value: Value of the metadata.
|
|
76
|
-
:param entity: UUID of an entity this metadata is related to.
|
|
77
75
|
:returns: UUID of the created metadata.
|
|
78
76
|
"""
|
|
79
|
-
assert element and isinstance(
|
|
80
|
-
element
|
|
81
|
-
)
|
|
82
|
-
assert type and isinstance(
|
|
83
|
-
type
|
|
84
|
-
)
|
|
85
|
-
assert name and isinstance(
|
|
86
|
-
name
|
|
87
|
-
)
|
|
88
|
-
assert value and isinstance(
|
|
89
|
-
value
|
|
90
|
-
)
|
|
91
|
-
if entity:
|
|
92
|
-
assert isinstance(entity, str), "entity should be of type str"
|
|
77
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
78
|
+
"element shouldn't be null and should be of type Element or CachedElement"
|
|
79
|
+
)
|
|
80
|
+
assert type and isinstance(type, MetaType), (
|
|
81
|
+
"type shouldn't be null and should be of type MetaType"
|
|
82
|
+
)
|
|
83
|
+
assert name and isinstance(name, str), (
|
|
84
|
+
"name shouldn't be null and should be of type str"
|
|
85
|
+
)
|
|
86
|
+
assert value and isinstance(value, str), (
|
|
87
|
+
"value shouldn't be null and should be of type str"
|
|
88
|
+
)
|
|
93
89
|
if self.is_read_only:
|
|
94
90
|
logger.warning("Cannot create metadata as this worker is in read-only mode")
|
|
95
91
|
return
|
|
@@ -101,7 +97,6 @@ class MetaDataMixin:
|
|
|
101
97
|
"type": type.value,
|
|
102
98
|
"name": name,
|
|
103
99
|
"value": value,
|
|
104
|
-
"entity_id": entity,
|
|
105
100
|
"worker_run_id": self.worker_run_id,
|
|
106
101
|
},
|
|
107
102
|
)
|
|
@@ -125,48 +120,42 @@ class MetaDataMixin:
|
|
|
125
120
|
- type: MetaType
|
|
126
121
|
- name: str
|
|
127
122
|
- value: str | int | float
|
|
128
|
-
- entity_id: str | None
|
|
129
123
|
:param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
|
|
130
124
|
|
|
131
125
|
:returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
|
|
132
126
|
"""
|
|
133
|
-
assert element and isinstance(
|
|
134
|
-
element
|
|
135
|
-
)
|
|
127
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
128
|
+
"element shouldn't be null and should be of type Element or CachedElement"
|
|
129
|
+
)
|
|
136
130
|
|
|
137
|
-
assert metadata_list and isinstance(
|
|
138
|
-
metadata_list
|
|
139
|
-
)
|
|
131
|
+
assert metadata_list and isinstance(metadata_list, list), (
|
|
132
|
+
"metadata_list shouldn't be null and should be of type list of dict"
|
|
133
|
+
)
|
|
140
134
|
|
|
141
135
|
# Make a copy to avoid modifying the metadata_list argument
|
|
142
136
|
metas = []
|
|
143
137
|
for index, metadata in enumerate(metadata_list):
|
|
144
|
-
assert isinstance(
|
|
145
|
-
|
|
146
|
-
)
|
|
138
|
+
assert isinstance(metadata, dict), (
|
|
139
|
+
f"Element at index {index} in metadata_list: Should be of type dict"
|
|
140
|
+
)
|
|
147
141
|
|
|
148
142
|
assert metadata.get("type") and isinstance(
|
|
149
143
|
metadata.get("type"), MetaType
|
|
150
144
|
), "type shouldn't be null and should be of type MetaType"
|
|
151
145
|
|
|
152
|
-
assert metadata.get("name") and isinstance(
|
|
153
|
-
|
|
154
|
-
)
|
|
146
|
+
assert metadata.get("name") and isinstance(metadata.get("name"), str), (
|
|
147
|
+
"name shouldn't be null and should be of type str"
|
|
148
|
+
)
|
|
155
149
|
|
|
156
150
|
assert metadata.get("value") is not None and isinstance(
|
|
157
151
|
metadata.get("value"), str | float | int
|
|
158
152
|
), "value shouldn't be null and should be of type (str or float or int)"
|
|
159
153
|
|
|
160
|
-
assert metadata.get("entity_id") is None or isinstance(
|
|
161
|
-
metadata.get("entity_id"), str
|
|
162
|
-
), "entity_id should be None or a str"
|
|
163
|
-
|
|
164
154
|
metas.append(
|
|
165
155
|
{
|
|
166
156
|
"type": metadata.get("type").value,
|
|
167
157
|
"name": metadata.get("name"),
|
|
168
158
|
"value": metadata.get("value"),
|
|
169
|
-
"entity_id": metadata.get("entity_id"),
|
|
170
159
|
}
|
|
171
160
|
)
|
|
172
161
|
|
|
@@ -199,9 +188,9 @@ class MetaDataMixin:
|
|
|
199
188
|
:param element: The element to list metadata on.
|
|
200
189
|
:param load_parents: Also include all metadata from the element's parents in the response.
|
|
201
190
|
"""
|
|
202
|
-
assert element and isinstance(
|
|
203
|
-
element
|
|
204
|
-
)
|
|
191
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
192
|
+
"element shouldn't be null and should be of type Element or CachedElement"
|
|
193
|
+
)
|
|
205
194
|
|
|
206
195
|
query_params = {}
|
|
207
196
|
if load_parents is not None:
|
|
@@ -1,5 +1,11 @@
|
|
|
1
|
+
from collections.abc import Iterator
|
|
1
2
|
from enum import Enum
|
|
2
3
|
|
|
4
|
+
from arkindex_worker.cache import unsupported_cache
|
|
5
|
+
|
|
6
|
+
# Increases the number of elements returned per page by the API
|
|
7
|
+
PROCESS_ELEMENTS_PAGE_SIZE = 500
|
|
8
|
+
|
|
3
9
|
|
|
4
10
|
class ActivityState(Enum):
|
|
5
11
|
"""
|
|
@@ -66,3 +72,21 @@ class ProcessMode(Enum):
|
|
|
66
72
|
"""
|
|
67
73
|
Export processes.
|
|
68
74
|
"""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class ProcessMixin:
|
|
78
|
+
@unsupported_cache
|
|
79
|
+
def list_process_elements(self, with_image: bool = False) -> Iterator[dict]:
|
|
80
|
+
"""
|
|
81
|
+
List the elements of a process.
|
|
82
|
+
|
|
83
|
+
:param with_image: whether or not to include zone and image information in the elements response.
|
|
84
|
+
:returns: the process' elements.
|
|
85
|
+
"""
|
|
86
|
+
return self.api_client.paginate(
|
|
87
|
+
"ListProcessElements",
|
|
88
|
+
id=self.process_information["id"],
|
|
89
|
+
with_image=with_image,
|
|
90
|
+
allow_missing_data=True,
|
|
91
|
+
page_size=PROCESS_ELEMENTS_PAGE_SIZE,
|
|
92
|
+
)
|