arkindex-base-worker 0.4.0rc5__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +10 -13
- arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
- arkindex_worker/__init__.py +3 -0
- arkindex_worker/cache.py +6 -25
- arkindex_worker/image.py +105 -66
- arkindex_worker/utils.py +2 -1
- arkindex_worker/worker/__init__.py +22 -32
- arkindex_worker/worker/base.py +16 -9
- arkindex_worker/worker/classification.py +36 -34
- arkindex_worker/worker/corpus.py +3 -3
- arkindex_worker/worker/dataset.py +9 -9
- arkindex_worker/worker/element.py +261 -231
- arkindex_worker/worker/entity.py +137 -206
- arkindex_worker/worker/image.py +3 -3
- arkindex_worker/worker/metadata.py +27 -38
- arkindex_worker/worker/process.py +24 -0
- arkindex_worker/worker/task.py +9 -9
- arkindex_worker/worker/training.py +15 -11
- arkindex_worker/worker/transcription.py +77 -71
- examples/standalone/python/worker.py +171 -0
- examples/tooled/python/worker.py +50 -0
- tests/conftest.py +22 -36
- tests/test_base_worker.py +1 -1
- tests/test_cache.py +1 -2
- tests/test_dataset_worker.py +1 -1
- tests/test_elements_worker/test_element.py +200 -26
- tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
- tests/test_elements_worker/test_metadata.py +0 -47
- tests/test_elements_worker/test_process.py +89 -0
- tests/test_elements_worker/test_training.py +8 -8
- tests/test_elements_worker/test_worker.py +61 -14
- tests/test_image.py +244 -126
- tests/test_merge.py +0 -7
- tests/test_utils.py +37 -0
- arkindex_base_worker-0.4.0rc5.dist-info/RECORD +0 -60
- arkindex_worker/worker/version.py +0 -58
- tests/test_elements_worker/test_entity_list_and_check.py +0 -160
- tests/test_elements_worker/test_version.py +0 -60
- {arkindex_base_worker-0.4.0rc5.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0
arkindex_worker/worker/base.py
CHANGED
@@ -146,6 +146,13 @@ class BaseWorker:
         # Define API Client
         self.setup_api_client()
 
+        # Known and available classes in processed corpus
+        self.classes = {}
+        # Known and available entity types in processed corpus
+        self.entity_types = {}
+        # Known and available element types in processed corpus
+        self.corpus_types = {}
+
     @property
     def corpus_id(self) -> str:
         """
@@ -268,12 +275,12 @@ class BaseWorker:
         # Retrieve initial configuration from API
         self.config = worker_version["configuration"].get("configuration", {})
         if "user_configuration" in worker_version["configuration"]:
-            # Add
+            # Add missing values (using the provided default if set) to user_configuration
            for key, value in worker_version["configuration"][
                 "user_configuration"
             ].items():
-                if
-                    self.user_configuration[key] = value
+                if key not in self.model_configuration:
+                    self.user_configuration[key] = value.get("default")
 
         # Load all required secrets
         required_secrets = worker_version["configuration"].get("secrets", [])
@@ -305,9 +312,9 @@ class BaseWorker:
 
         if self.use_cache:
             if self.args.database is not None:
-                assert (
-                    self.args.database
-                )
+                assert self.args.database.is_file(), (
+                    f"Database in {self.args.database} does not exist"
+                )
                 self.cache_path = self.args.database
             else:
                 cache_dir = self.task_data_dir / self.task_id
@@ -378,9 +385,9 @@ class BaseWorker:
             gpg = gnupg.GPG()
             with path.open("rb") as gpg_file:
                 decrypted = gpg.decrypt_file(gpg_file)
-            assert (
-                decrypted.
-            )
+            assert decrypted.ok, (
+                f"GPG error: {decrypted.status} - {decrypted.stderr}"
+            )
             secret = decrypted.data.decode("utf-8")
             logging.info(f"Loaded local secret {name}")
         except Exception as e:
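In 0.5.0, BaseWorker now initializes empty self.classes, self.entity_types and self.corpus_types caches at construction time, and fills user_configuration keys from their declared default when the key is not already handled by the model configuration. The snippet below is a minimal, standalone sketch of that default-merging logic; merge_user_configuration and the sample threshold/language keys are illustrative names, not part of the library.

# Standalone sketch of the default-merging behaviour shown in the hunk above.
# The dictionaries are simplified stand-ins, not the real API payloads.
def merge_user_configuration(worker_version: dict, model_configuration: dict, user_configuration: dict) -> dict:
    """Fill user configuration keys from their declared defaults."""
    for key, value in worker_version["configuration"].get("user_configuration", {}).items():
        # Keys already covered by the model configuration keep their model-provided value
        if key not in model_configuration:
            user_configuration[key] = value.get("default")
    return user_configuration


worker_version = {
    "configuration": {
        "user_configuration": {
            "threshold": {"type": "float", "default": 0.5},
            "language": {"type": "string", "default": "fr"},
        }
    }
}
print(merge_user_configuration(worker_version, model_configuration={"language": "en"}, user_configuration={}))
# {'threshold': 0.5}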
arkindex_worker/worker/classification.py
CHANGED
@@ -27,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f
+            f"Loaded {len(self.classes)} ML {pluralize('class', len(self.classes))} in corpus ({self.corpus_id})"
         )
 
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -49,7 +49,7 @@ class ClassificationMixin:
                 "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
             )
             ml_class_id = self.classes[ml_class] = response["id"]
-            logger.debug(f"Created ML class {response['id']}")
+            logger.debug(f"Created a new ML class {response['id']}")
         except ErrorResponse as e:
             # Only reload for 400 errors
             if e.status_code != 400:
@@ -57,12 +57,12 @@ class ClassificationMixin:
 
             # Reload and make sure we have the class
             logger.info(
-                f"
+                f"Unable to create the ML class `{ml_class}`. Refreshing corpus classes cache."
             )
             self.load_corpus_classes()
-            assert (
-                ml_class
-            )
+            assert ml_class in self.classes, (
+                f"Missing ML class {ml_class} even after refreshing."
+            )
             ml_class_id = self.classes[ml_class]
 
         return ml_class_id
@@ -86,9 +86,9 @@ class ClassificationMixin:
             ),
             None,
         )
-        assert (
-
-        )
+        assert ml_class_name is not None, (
+            f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
+        )
         return ml_class_name
 
     def create_classification(
@@ -107,18 +107,18 @@ class ClassificationMixin:
         :param high_confidence: Whether or not the classification is of high confidence.
         :returns: The created classification, as returned by the ``CreateClassification`` API endpoint.
         """
-        assert element and isinstance(
-            element
-        )
-        assert ml_class and isinstance(
-            ml_class
-        )
-        assert (
-
-        )
-        assert isinstance(
-            high_confidence
-        )
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
+        assert ml_class and isinstance(ml_class, str), (
+            "ml_class shouldn't be null and should be of type str"
+        )
+        assert isinstance(confidence, float) and 0 <= confidence <= 1, (
+            "confidence shouldn't be null and should be a float in [0..1] range"
+        )
+        assert isinstance(high_confidence, bool), (
+            "high_confidence shouldn't be null and should be of type bool"
+        )
         if self.is_read_only:
             logger.warning(
                 "Cannot create classification as this worker is in read-only mode"
@@ -198,31 +198,33 @@ class ClassificationMixin:
         :returns: List of created classifications, as returned in the ``classifications`` field by
            the ``CreateClassifications`` API endpoint.
         """
-        assert element and isinstance(
-            element
-        )
-        assert classifications and isinstance(
-            classifications
-        )
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
+        assert classifications and isinstance(classifications, list), (
+            "classifications shouldn't be null and should be of type list"
+        )
 
         for index, classification in enumerate(classifications):
             ml_class = classification.get("ml_class")
-            assert (
-                ml_class and
-            )
+            assert ml_class and isinstance(ml_class, str), (
+                f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
+            )
 
             confidence = classification.get("confidence")
             assert (
                 confidence is not None
                 and isinstance(confidence, float)
                 and 0 <= confidence <= 1
-            ),
+            ), (
+                f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
+            )
 
             high_confidence = classification.get("high_confidence")
             if high_confidence is not None:
-                assert isinstance(
-                    high_confidence
-                )
+                assert isinstance(high_confidence, bool), (
+                    f"Classification at index {index} in classifications: high_confidence should be of type bool"
+                )
 
         if self.is_read_only:
             logger.warning(
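The tightened assertions above also document the payload shape that create_classifications expects: each entry must provide an ml_class string and a confidence float in the [0..1] range, plus an optional boolean high_confidence. The following sketch mirrors those checks outside the worker; validate_classifications and the sample payload are hypothetical and only illustrate the expected structure.

# Hypothetical standalone helper mirroring the checks added in ClassificationMixin.create_classifications.
def validate_classifications(classifications: list) -> None:
    assert classifications and isinstance(classifications, list), (
        "classifications shouldn't be null and should be of type list"
    )
    for index, classification in enumerate(classifications):
        ml_class = classification.get("ml_class")
        assert ml_class and isinstance(ml_class, str), (
            f"Classification at index {index}: ml_class shouldn't be null and should be of type str"
        )
        confidence = classification.get("confidence")
        assert (
            confidence is not None
            and isinstance(confidence, float)
            and 0 <= confidence <= 1
        ), f"Classification at index {index}: confidence should be a float in [0..1] range"
        high_confidence = classification.get("high_confidence")
        if high_confidence is not None:
            assert isinstance(high_confidence, bool), (
                f"Classification at index {index}: high_confidence should be of type bool"
            )


# Example payload that passes the checks above
validate_classifications([
    {"ml_class": "handwritten", "confidence": 0.87, "high_confidence": True},
    {"ml_class": "printed", "confidence": 0.42},
])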
arkindex_worker/worker/corpus.py
CHANGED
@@ -76,9 +76,9 @@ class CorpusMixin:
             key=itemgetter("updated"),
             reverse=True,
         )
-        assert (
-
-        )
+        assert len(exports) > 0, (
+            f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
+        )
 
         # Download latest export
         export_id: str = exports[0]["id"]
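In the corpus hunk, only the assertion message changes: the mixin still sorts the available exports by their updated timestamp, newest first, and takes the first one, failing when no export in the Done state exists. A rough standalone sketch of that selection follows; the export dicts and the filter on a literal "done" state are simplified assumptions, not the exact API payloads.

from operator import itemgetter

# Made-up export payloads standing in for the real ListCorpusExports results
exports = [
    {"id": "export-a", "state": "done", "updated": "2024-01-02T10:00:00Z"},
    {"id": "export-b", "state": "done", "updated": "2024-03-05T08:30:00Z"},
]

# Keep only finished exports, newest first
exports = sorted(
    (export for export in exports if export["state"] == "done"),
    key=itemgetter("updated"),
    reverse=True,
)
assert len(exports) > 0, "No available exports found for the corpus"

# Download latest export
export_id: str = exports[0]["id"]
print(export_id)  # export-b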
arkindex_worker/worker/dataset.py
CHANGED
@@ -113,9 +113,9 @@ class DatasetMixin:
         :param dataset_set: Set to find elements in.
         :returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
         """
-        assert dataset_set and isinstance(
-            dataset_set
-        )
+        assert dataset_set and isinstance(dataset_set, Set), (
+            "dataset_set shouldn't be null and should be a Set"
+        )
 
         results = self.api_client.paginate(
             "ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
@@ -152,12 +152,12 @@ class DatasetMixin:
         :param state: State of the dataset.
         :returns: The updated ``Dataset`` object from the ``PartialUpdateDataset`` API endpoint.
         """
-        assert dataset and isinstance(
-            dataset
-        )
-        assert state and isinstance(
-            state
-        )
+        assert dataset and isinstance(dataset, Dataset), (
+            "dataset shouldn't be null and should be a Dataset"
+        )
+        assert state and isinstance(state, DatasetState), (
+            "state shouldn't be null and should be a str from DatasetState"
+        )
 
         if self.is_read_only:
             logger.warning("Cannot update dataset as this worker is in read-only mode")
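The dataset hunks validate their arguments against the library's Dataset model and DatasetState enum before touching the API. The sketch below reproduces that validation pattern with stand-in types; ExampleDataset, ExampleState and update_state are hypothetical and only demonstrate the checks, they are not the library's actual classes or enum members.

from dataclasses import dataclass
from enum import Enum


# Stand-in types: not the real arkindex_worker models, just enough to show the checks
@dataclass
class ExampleDataset:
    id: str
    name: str


class ExampleState(Enum):
    OPEN = "open"
    COMPLETE = "complete"


def update_state(dataset: ExampleDataset, state: ExampleState) -> dict:
    assert dataset and isinstance(dataset, ExampleDataset), (
        "dataset shouldn't be null and should be a Dataset"
    )
    assert state and isinstance(state, ExampleState), (
        "state shouldn't be null and should be a str from DatasetState"
    )
    # A real worker would call the PartialUpdateDataset endpoint here
    return {"id": dataset.id, "state": state.value}


print(update_state(ExampleDataset(id="1234", name="Training set"), ExampleState.COMPLETE))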