arkindex-base-worker 0.3.5rc6__py3-none-any.whl → 0.3.6rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arkindex_base_worker-0.3.6rc2.dist-info/METADATA +39 -0
- arkindex_base_worker-0.3.6rc2.dist-info/RECORD +40 -0
- arkindex_worker/__init__.py +0 -1
- arkindex_worker/cache.py +19 -25
- arkindex_worker/image.py +16 -17
- arkindex_worker/models.py +24 -21
- arkindex_worker/utils.py +18 -19
- arkindex_worker/worker/__init__.py +17 -27
- arkindex_worker/worker/base.py +12 -7
- arkindex_worker/worker/classification.py +13 -15
- arkindex_worker/worker/dataset.py +3 -4
- arkindex_worker/worker/element.py +80 -76
- arkindex_worker/worker/entity.py +28 -30
- arkindex_worker/worker/metadata.py +21 -27
- arkindex_worker/worker/task.py +2 -3
- arkindex_worker/worker/training.py +25 -26
- arkindex_worker/worker/transcription.py +37 -34
- arkindex_worker/worker/version.py +1 -2
- tests/conftest.py +56 -76
- tests/test_base_worker.py +38 -32
- tests/test_cache.py +14 -7
- tests/test_dataset_worker.py +25 -22
- tests/test_element.py +0 -1
- tests/test_elements_worker/__init__.py +0 -1
- tests/test_elements_worker/test_classifications.py +0 -1
- tests/test_elements_worker/test_cli.py +22 -17
- tests/test_elements_worker/test_dataset.py +9 -10
- tests/test_elements_worker/test_elements.py +58 -63
- tests/test_elements_worker/test_entities.py +10 -20
- tests/test_elements_worker/test_metadata.py +72 -96
- tests/test_elements_worker/test_task.py +22 -20
- tests/test_elements_worker/test_training.py +20 -13
- tests/test_elements_worker/test_transcriptions.py +6 -10
- tests/test_elements_worker/test_worker.py +16 -14
- tests/test_image.py +21 -20
- tests/test_merge.py +5 -6
- tests/test_utils.py +0 -1
- arkindex_base_worker-0.3.5rc6.dist-info/METADATA +0 -27
- arkindex_base_worker-0.3.5rc6.dist-info/RECORD +0 -42
- arkindex_worker/git.py +0 -392
- tests/test_git.py +0 -480
- {arkindex_base_worker-0.3.5rc6.dist-info → arkindex_base_worker-0.3.6rc2.dist-info}/WHEEL +0 -0
- {arkindex_base_worker-0.3.5rc6.dist-info → arkindex_base_worker-0.3.6rc2.dist-info}/top_level.txt +0 -0
arkindex_worker/worker/base.py
CHANGED
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
"""
The base class for all Arkindex workers.
"""
@@ -9,7 +8,6 @@ import os
import shutil
from pathlib import Path
from tempfile import mkdtemp
-from typing import List, Optional

import gnupg
import yaml
@@ -52,15 +50,15 @@ class ExtrasDirNotFoundError(Exception):
    """


-class BaseWorker(object):
+class BaseWorker:
    """
    Base class for Arkindex workers.
    """

    def __init__(
        self,
-        description:
-        support_cache:
+        description: str | None = "Arkindex Base Worker",
+        support_cache: bool | None = False,
    ):
        """
        Initialize the worker.
@@ -217,6 +215,9 @@ class BaseWorker(object):
        # Define model_version_id from environment
        self.model_version_id = os.environ.get("ARKINDEX_MODEL_VERSION_ID")

+        # Define model_details from environment
+        self.model_details = {"id": os.environ.get("ARKINDEX_MODEL_ID")}
+
        # Load all required secrets
        self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}

@@ -259,6 +260,9 @@ class BaseWorker(object):
        # Set model_version ID as worker attribute
        self.model_version_id = model_version.get("id")

+        # Set model details as worker attribute
+        self.model_details = model_version.get("model")
+
        # Retrieve initial configuration from API
        self.config = worker_version["configuration"].get("configuration", {})
        if "user_configuration" in worker_version["configuration"]:
@@ -347,7 +351,8 @@ class BaseWorker(object):

        try:
            gpg = gnupg.GPG()
-
+            with path.open("rb") as gpg_file:
+                decrypted = gpg.decrypt_file(gpg_file)
            assert (
                decrypted.ok
            ), f"GPG error: {decrypted.status} - {decrypted.stderr}"
@@ -406,7 +411,7 @@ class BaseWorker(object):
        )
        return extras_dir

-    def find_parents_file_paths(self, filename: Path) ->
+    def find_parents_file_paths(self, filename: Path) -> list[Path]:
        """
        Find the paths of a specific file from the parent tasks.
        Only works if the task_parents attributes is updated, so if the cache is supported,
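The new model_details attribute mirrors model_version_id: in developer mode it is built from the ARKINDEX_MODEL_ID environment variable, and when running on Arkindex it comes from the worker run's model version payload. A minimal sketch of how a worker might read it; the MyWorker subclass and its process_element body are illustrative, not part of the package:

from arkindex_worker import logger
from arkindex_worker.worker import ElementsWorker


class MyWorker(ElementsWorker):
    """Hypothetical worker that reports which model it runs with."""

    def process_element(self, element):
        # self.model_details is a dict such as {"id": ...}: either the
        # ARKINDEX_MODEL_ID environment variable in developer mode, or the
        # "model" payload of the worker run's model version on Arkindex.
        model_id = (self.model_details or {}).get("id")
        logger.info(f"Processing {element.id} with model {model_id}")


if __name__ == "__main__":
    MyWorker(description="Model-aware worker").run()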
arkindex_worker/worker/classification.py
CHANGED
@@ -1,9 +1,7 @@
-# -*- coding: utf-8 -*-
"""
ElementsWorker methods for classifications and ML classes.
"""

-from typing import Dict, List, Optional, Union
from uuid import UUID

from apistar.exceptions import ErrorResponse
@@ -14,7 +12,7 @@ from arkindex_worker.cache import CachedClassification, CachedElement
from arkindex_worker.models import Element


-class ClassificationMixin(object):
+class ClassificationMixin:
    def load_corpus_classes(self):
        """
        Load all ML classes available in the worker's corpus and store them in the ``self.classes`` cache.
@@ -91,11 +89,11 @@ class ClassificationMixin(object):

    def create_classification(
        self,
-        element:
+        element: Element | CachedElement,
        ml_class: str,
        confidence: float,
-        high_confidence:
-    ) ->
+        high_confidence: bool = False,
+    ) -> dict[str, str]:
        """
        Create a classification on the given element through the API.

@@ -106,7 +104,7 @@ class ClassificationMixin(object):
        :returns: The created classification, as returned by the ``CreateClassification`` API endpoint.
        """
        assert element and isinstance(
-            element,
+            element, Element | CachedElement
        ), "element shouldn't be null and should be an Element or CachedElement"
        assert ml_class and isinstance(
            ml_class, str
@@ -180,9 +178,9 @@ class ClassificationMixin(object):

    def create_classifications(
        self,
-        element:
-        classifications:
-    ) ->
+        element: Element | CachedElement,
+        classifications: list[dict[str, str | float | bool]],
+    ) -> list[dict[str, str | float | bool]]:
        """
        Create multiple classifications at once on the given element through the API.

@@ -196,7 +194,7 @@ class ClassificationMixin(object):
            the ``CreateClassifications`` API endpoint.
        """
        assert element and isinstance(
-            element,
+            element, Element | CachedElement
        ), "element shouldn't be null and should be an Element or CachedElement"
        assert classifications and isinstance(
            classifications, list
@@ -204,17 +202,17 @@ class ClassificationMixin(object):

        for index, classification in enumerate(classifications):
            ml_class_id = classification.get("ml_class_id")
-            assert
-                ml_class_id, str
+            assert (
+                ml_class_id and isinstance(ml_class_id, str)
            ), f"Classification at index {index} in classifications: ml_class_id shouldn't be null and should be of type str"

            # Make sure it's a valid UUID
            try:
                UUID(ml_class_id)
-            except ValueError:
+            except ValueError as e:
                raise ValueError(
                    f"Classification at index {index} in classifications: ml_class_id is not a valid uuid."
-                )
+                ) from e

            confidence = classification.get("confidence")
            assert (
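With the switch to Element | CachedElement unions and built-in generics, the classification helpers are called the same way as before. A hedged sketch of both helpers from inside a worker's process_element; the subclass name, ML class name and UUIDs are placeholders:

from arkindex_worker.worker import ElementsWorker


class ClassifierWorker(ElementsWorker):  # hypothetical subclass
    def process_element(self, element):
        # Single classification: the ML class is referenced by name.
        self.create_classification(
            element,
            ml_class="handwritten",
            confidence=0.92,
            high_confidence=False,
        )

        # Bulk creation: each entry needs an ml_class_id (a valid UUID string)
        # and a confidence in the [0..1] range, as enforced by the assertions above.
        self.create_classifications(
            element,
            classifications=[
                {"ml_class_id": "11111111-1111-1111-1111-111111111111", "confidence": 0.75},
                {"ml_class_id": "22222222-2222-2222-2222-222222222222", "confidence": 0.25},
            ],
        )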
arkindex_worker/worker/dataset.py
CHANGED
@@ -1,10 +1,9 @@
-# -*- coding: utf-8 -*-
"""
BaseWorker methods for datasets.
"""

+from collections.abc import Iterator
from enum import Enum
-from typing import Iterator, Tuple

from arkindex_worker import logger
from arkindex_worker.models import Dataset, Element
@@ -36,7 +35,7 @@ class DatasetState(Enum):
    """


-class DatasetMixin(object):
+class DatasetMixin:
    def list_process_datasets(self) -> Iterator[Dataset]:
        """
        List datasets associated to the worker's process. This helper is not available in developer mode.
@@ -51,7 +50,7 @@ class DatasetMixin(object):

        return map(Dataset, list(results))

-    def list_dataset_elements(self, dataset: Dataset) -> Iterator[
+    def list_dataset_elements(self, dataset: Dataset) -> Iterator[tuple[str, Element]]:
        """
        List elements in a dataset.
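list_dataset_elements now advertises that it yields (set_name, element) pairs. A sketch of consuming both helpers, assuming worker is an already-configured worker exposing the DatasetMixin methods shown above; the grouping logic is illustrative:

from collections import defaultdict


def group_dataset_elements_by_set(worker):
    """Group every element of every dataset of the process by its set name."""
    sets = defaultdict(list)
    for dataset in worker.list_process_datasets():
        # Each item is a (set_name, Element) tuple, per the new annotation.
        for set_name, element in worker.list_dataset_elements(dataset):
            sets[set_name].append(element.id)
    return sets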
arkindex_worker/worker/element.py
CHANGED
@@ -1,8 +1,8 @@
-# -*- coding: utf-8 -*-
"""
ElementsWorker methods for elements and element types.
"""
-from
+from collections.abc import Iterable
+from typing import NamedTuple
from uuid import UUID

from peewee import IntegrityError
@@ -28,11 +28,10 @@ class MissingTypeError(Exception):
    """


-class ElementMixin(object):
-    def create_required_types(self, element_types:
+class ElementMixin:
+    def create_required_types(self, element_types: list[ElementType]):
        """Creates given element types in the corpus.

-        :param Corpus corpus: The corpus to create types on.
        :param element_types: The missing element types to create.
        """
        for element_type in element_types:
@@ -87,9 +86,9 @@ class ElementMixin(object):
        element: Element,
        type: str,
        name: str,
-        polygon:
-        confidence:
-        slim_output:
+        polygon: list[list[int | float]],
+        confidence: float | None = None,
+        slim_output: bool = True,
    ) -> str:
        """
        Create a child element on the given element through the API.
@@ -118,7 +117,7 @@ class ElementMixin(object):
            isinstance(point, list) and len(point) == 2 for point in polygon
        ), "polygon points should be lists of two items"
        assert all(
-            isinstance(coord,
+            isinstance(coord, int | float) for point in polygon for coord in point
        ), "polygon points should be lists of two numbers"
        assert confidence is None or (
            isinstance(confidence, float) and 0 <= confidence <= 1
@@ -147,11 +146,9 @@ class ElementMixin(object):

    def create_elements(
        self,
-        parent:
-        elements:
-
-        ],
-    ) -> List[Dict[str, str]]:
+        parent: Element | CachedElement,
+        elements: list[dict[str, str | list[list[int | float]] | float | None]],
+    ) -> list[dict[str, str]]:
        """
        Create child elements on the given element in a single API request.

@@ -196,18 +193,18 @@ class ElementMixin(object):
            ), f"Element at index {index} in elements: Should be of type dict"

            name = element.get("name")
-            assert
-                name, str
+            assert (
+                name and isinstance(name, str)
            ), f"Element at index {index} in elements: name shouldn't be null and should be of type str"

            type = element.get("type")
-            assert
-                type, str
+            assert (
+                type and isinstance(type, str)
            ), f"Element at index {index} in elements: type shouldn't be null and should be of type str"

            polygon = element.get("polygon")
-            assert
-                polygon, list
+            assert (
+                polygon and isinstance(polygon, list)
            ), f"Element at index {index} in elements: polygon shouldn't be null and should be of type list"
            assert (
                len(polygon) >= 3
@@ -216,12 +213,13 @@ class ElementMixin(object):
                isinstance(point, list) and len(point) == 2 for point in polygon
            ), f"Element at index {index} in elements: polygon points should be lists of two items"
            assert all(
-                isinstance(coord,
+                isinstance(coord, int | float) for point in polygon for coord in point
            ), f"Element at index {index} in elements: polygon points should be lists of two numbers"

            confidence = element.get("confidence")
-            assert
-
+            assert (
+                confidence is None
+                or (isinstance(confidence, float) and 0 <= confidence <= 1)
            ), f"Element at index {index} in elements: confidence should be None or a float in [0..1] range"

        if self.is_read_only:
@@ -273,7 +271,7 @@ class ElementMixin(object):
        return created_ids

    def partial_update_element(
-        self, element:
+        self, element: Element | CachedElement, **kwargs
    ) -> dict:
        """
        Partially updates an element through the API.
@@ -293,7 +291,7 @@ class ElementMixin(object):
        :returns: A dict from the ``PartialUpdateElement`` API endpoint,
        """
        assert element and isinstance(
-            element,
+            element, Element | CachedElement
        ), "element shouldn't be null and should be an Element or CachedElement"

        if "type" in kwargs:
@@ -310,7 +308,7 @@ class ElementMixin(object):
                isinstance(point, list) and len(point) == 2 for point in polygon
            ), "polygon points should be lists of two items"
            assert all(
-                isinstance(coord,
+                isinstance(coord, int | float) for point in polygon for coord in point
            ), "polygon points should be lists of two numbers"

        if "confidence" in kwargs:
@@ -364,21 +362,21 @@ class ElementMixin(object):

    def list_element_children(
        self,
-        element:
-        folder:
-        name:
-        recursive:
-        transcription_worker_version:
-        transcription_worker_run:
-        type:
-        with_classes:
-        with_corpus:
-        with_metadata:
-        with_has_children:
-        with_zone:
-        worker_version:
-        worker_run:
-    ) ->
+        element: Element | CachedElement,
+        folder: bool | None = None,
+        name: str | None = None,
+        recursive: bool | None = None,
+        transcription_worker_version: str | bool | None = None,
+        transcription_worker_run: str | bool | None = None,
+        type: str | None = None,
+        with_classes: bool | None = None,
+        with_corpus: bool | None = None,
+        with_metadata: bool | None = None,
+        with_has_children: bool | None = None,
+        with_zone: bool | None = None,
+        worker_version: str | bool | None = None,
+        worker_run: str | bool | None = None,
+    ) -> Iterable[dict] | Iterable[CachedElement]:
        """
        List children of an element.

@@ -413,7 +411,7 @@ class ElementMixin(object):
            or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
        """
        assert element and isinstance(
-            element,
+            element, Element | CachedElement
        ), "element shouldn't be null and should be an Element or CachedElement"
        query_params = {}
        if folder is not None:
@@ -427,7 +425,7 @@ class ElementMixin(object):
            query_params["recursive"] = recursive
        if transcription_worker_version is not None:
            assert isinstance(
-                transcription_worker_version,
+                transcription_worker_version, str | bool
            ), "transcription_worker_version should be of type str or bool"
            if isinstance(transcription_worker_version, bool):
                assert (
@@ -436,7 +434,7 @@ class ElementMixin(object):
            query_params["transcription_worker_version"] = transcription_worker_version
        if transcription_worker_run is not None:
            assert isinstance(
-                transcription_worker_run,
+                transcription_worker_run, str | bool
            ), "transcription_worker_run should be of type str or bool"
            if isinstance(transcription_worker_run, bool):
                assert (
@@ -467,7 +465,7 @@ class ElementMixin(object):
            query_params["with_zone"] = with_zone
        if worker_version is not None:
            assert isinstance(
-                worker_version,
+                worker_version, str | bool
            ), "worker_version should be of type str or bool"
            if isinstance(worker_version, bool):
                assert (
@@ -476,7 +474,7 @@ class ElementMixin(object):
            query_params["worker_version"] = worker_version
        if worker_run is not None:
            assert isinstance(
-                worker_run,
+                worker_run, str | bool
            ), "worker_run should be of type str or bool"
            if isinstance(worker_run, bool):
                assert (
@@ -486,11 +484,14 @@ class ElementMixin(object):

        if self.use_cache:
            # Checking that we only received query_params handled by the cache
-            assert
-
-
-
-
+            assert (
+                set(query_params.keys())
+                <= {
+                    "type",
+                    "worker_version",
+                    "worker_run",
+                }
+            ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"

            query = CachedElement.select().where(CachedElement.parent_id == element.id)
            if type:
@@ -523,21 +524,21 @@ class ElementMixin(object):

    def list_element_parents(
        self,
-        element:
-        folder:
-        name:
-        recursive:
-        transcription_worker_version:
-        transcription_worker_run:
-        type:
-        with_classes:
-        with_corpus:
-        with_metadata:
-        with_has_children:
-        with_zone:
-        worker_version:
-        worker_run:
-    ) ->
+        element: Element | CachedElement,
+        folder: bool | None = None,
+        name: str | None = None,
+        recursive: bool | None = None,
+        transcription_worker_version: str | bool | None = None,
+        transcription_worker_run: str | bool | None = None,
+        type: str | None = None,
+        with_classes: bool | None = None,
+        with_corpus: bool | None = None,
+        with_metadata: bool | None = None,
+        with_has_children: bool | None = None,
+        with_zone: bool | None = None,
+        worker_version: str | bool | None = None,
+        worker_run: str | bool | None = None,
+    ) -> Iterable[dict] | Iterable[CachedElement]:
        """
        List parents of an element.

@@ -572,7 +573,7 @@ class ElementMixin(object):
            or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
        """
        assert element and isinstance(
-            element,
+            element, Element | CachedElement
        ), "element shouldn't be null and should be an Element or CachedElement"
        query_params = {}
        if folder is not None:
@@ -586,7 +587,7 @@ class ElementMixin(object):
            query_params["recursive"] = recursive
        if transcription_worker_version is not None:
            assert isinstance(
-                transcription_worker_version,
+                transcription_worker_version, str | bool
            ), "transcription_worker_version should be of type str or bool"
            if isinstance(transcription_worker_version, bool):
                assert (
@@ -595,7 +596,7 @@ class ElementMixin(object):
            query_params["transcription_worker_version"] = transcription_worker_version
        if transcription_worker_run is not None:
            assert isinstance(
-                transcription_worker_run,
+                transcription_worker_run, str | bool
            ), "transcription_worker_run should be of type str or bool"
            if isinstance(transcription_worker_run, bool):
                assert (
@@ -626,7 +627,7 @@ class ElementMixin(object):
            query_params["with_zone"] = with_zone
        if worker_version is not None:
            assert isinstance(
-                worker_version,
+                worker_version, str | bool
            ), "worker_version should be of type str or bool"
            if isinstance(worker_version, bool):
                assert (
@@ -635,7 +636,7 @@ class ElementMixin(object):
            query_params["worker_version"] = worker_version
        if worker_run is not None:
            assert isinstance(
-                worker_run,
+                worker_run, str | bool
            ), "worker_run should be of type str or bool"
            if isinstance(worker_run, bool):
                assert (
@@ -645,11 +646,14 @@ class ElementMixin(object):

        if self.use_cache:
            # Checking that we only received query_params handled by the cache
-            assert
-
-
-
-
+            assert (
+                set(query_params.keys())
+                <= {
+                    "type",
+                    "worker_version",
+                    "worker_run",
+                }
+            ), "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"

            parent_ids = CachedElement.select(CachedElement.parent_id).where(
                CachedElement.id == element.id
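The bulk create_elements payload keys and the cache-mode restriction on list_element_children filters are both visible in the assertions above. A hedged sketch of both calls; the subclass name, type slugs and polygon coordinates are made up:

from arkindex_worker import logger
from arkindex_worker.worker import ElementsWorker


class SegmenterWorker(ElementsWorker):  # hypothetical subclass
    def process_element(self, element):
        # Each entry must carry a name, a type, a polygon of at least 3 points
        # (each point being an [x, y] pair of numbers) and an optional confidence.
        created = self.create_elements(
            parent=element,
            elements=[
                {
                    "name": "line 1",
                    "type": "text_line",
                    "polygon": [[0, 0], [100, 0], [100, 20], [0, 20]],
                    "confidence": 0.87,
                },
            ],
        )
        logger.info(f"Created {len(created)} elements")

        # When the local cache is enabled, only the type / worker_version /
        # worker_run filters are accepted, as enforced by the assertion above.
        children = list(self.list_element_children(element, type="text_line"))
        logger.info(f"{element.id} now has {len(children)} text_line children")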
arkindex_worker/worker/entity.py
CHANGED
@@ -1,10 +1,9 @@
-# -*- coding: utf-8 -*-
"""
ElementsWorker methods for entities.
"""

from operator import itemgetter
-from typing import
+from typing import TypedDict

from peewee import IntegrityError

@@ -12,16 +11,13 @@ from arkindex_worker import logger
from arkindex_worker.cache import CachedEntity, CachedTranscriptionEntity
from arkindex_worker.models import Element, Transcription

-
-
-
-
-
-
-
-        "confidence": Optional[float],
-    },
-)
+
+class Entity(TypedDict):
+    name: str
+    type_id: str
+    length: int
+    offset: int
+    confidence: float | None


class MissingEntityType(Exception):
@@ -31,9 +27,9 @@ class MissingEntityType(Exception):
    """


-class EntityMixin(object):
+class EntityMixin:
    def check_required_entity_types(
-        self, entity_types:
+        self, entity_types: list[str], create_missing: bool = True
    ):
        """Checks that every entity type needed is available in the corpus.
        Missing ones may be created automatically if needed.
@@ -71,7 +67,7 @@ class EntityMixin(object):
        self,
        name: str,
        type: str,
-        metas=
+        metas=None,
        validated=None,
    ):
        """
@@ -87,6 +83,7 @@ class EntityMixin(object):
        assert type and isinstance(
            type, str
        ), "type shouldn't be null and should be of type str"
+        metas = metas or {}
        if metas:
            assert isinstance(metas, dict), "metas should be of type dict"
        if validated is not None:
@@ -140,8 +137,8 @@ class EntityMixin(object):
        entity: str,
        offset: int,
        length: int,
-        confidence:
-    ) ->
+        confidence: float | None = None,
+    ) -> dict[str, str | int] | None:
        """
        Create a link between an existing entity and an existing transcription.
        If cache support is enabled, a `CachedTranscriptionEntity` will also be created.
@@ -211,8 +208,8 @@ class EntityMixin(object):
    def create_transcription_entities(
        self,
        transcription: Transcription,
-        entities:
-    ) ->
+        entities: list[Entity],
+    ) -> list[dict[str, str]]:
        """
        Create multiple entities attached to a transcription in a single API request.

@@ -250,13 +247,13 @@ class EntityMixin(object):
            ), f"Entity at index {index} in entities: Should be of type dict"

            name = entity.get("name")
-            assert
-                name, str
+            assert (
+                name and isinstance(name, str)
            ), f"Entity at index {index} in entities: name shouldn't be null and should be of type str"

            type_id = entity.get("type_id")
-            assert
-                type_id, str
+            assert (
+                type_id and isinstance(type_id, str)
            ), f"Entity at index {index} in entities: type_id shouldn't be null and should be of type str"

            offset = entity.get("offset")
@@ -270,8 +267,9 @@ class EntityMixin(object):
            ), f"Entity at index {index} in entities: length shouldn't be null and should be a strictly positive integer"

            confidence = entity.get("confidence")
-            assert
-
+            assert (
+                confidence is None
+                or (isinstance(confidence, float) and 0 <= confidence <= 1)
            ), f"Entity at index {index} in entities: confidence should be None or a float in [0..1] range"

        assert len(entities) == len(
@@ -298,7 +296,7 @@ class EntityMixin(object):
    def list_transcription_entities(
        self,
        transcription: Transcription,
-        worker_version:
+        worker_version: str | bool | None = None,
    ):
        """
        List existing entities on a transcription
@@ -314,7 +312,7 @@ class EntityMixin(object):

        if worker_version is not None:
            assert isinstance(
-                worker_version,
+                worker_version, str | bool
            ), "worker_version should be of type str or bool"

            if isinstance(worker_version, bool):
@@ -329,14 +327,14 @@ class EntityMixin(object):

    def list_corpus_entities(
        self,
-        name:
-        parent:
+        name: str | None = None,
+        parent: Element | None = None,
    ):
        """
        List all entities in the worker's corpus
        This method does not support cache
        :param name: Filter entities by part of their name (case-insensitive)
-        :param parent
+        :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
        """
        query_params = {}

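The ad-hoc Entity definition is now a class-based TypedDict, so the entries passed to create_transcription_entities can be type-checked. A sketch under the assumption that worker is a configured worker exposing EntityMixin, transcription is a Transcription previously created or retrieved by that worker, and a "person" entity type exists in the corpus (person_type_id and the entity values are hypothetical):

from arkindex_worker.worker.entity import Entity


def tag_people(worker, transcription, person_type_id: str) -> None:
    """Attach two hypothetical 'person' entities to a transcription."""
    # Make sure the entity types we rely on exist, creating them if allowed.
    worker.check_required_entity_types(["person"])

    # Keys follow the Entity TypedDict: name, type_id, offset, length, confidence.
    entities: list[Entity] = [
        {"name": "Jane Doe", "type_id": person_type_id, "offset": 0, "length": 8, "confidence": 0.9},
        {"name": "John Doe", "type_id": person_type_id, "offset": 12, "length": 8, "confidence": None},
    ]
    created = worker.create_transcription_entities(transcription, entities)
    print(f"Created {len(created)} transcription entities")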