arkindex-base-worker 0.5.0a1__tar.gz → 0.5.0a3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/PKG-INFO +2 -2
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_base_worker.egg-info/PKG-INFO +2 -2
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_base_worker.egg-info/requires.txt +1 -1
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/__init__.py +3 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/cache.py +3 -3
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/image.py +31 -24
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/__init__.py +17 -17
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/base.py +6 -6
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/classification.py +34 -32
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/corpus.py +3 -3
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/dataset.py +9 -9
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/element.py +193 -189
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/entity.py +61 -60
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/image.py +3 -3
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/metadata.py +27 -27
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/task.py +9 -9
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/training.py +15 -11
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/transcription.py +77 -71
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/pyproject.toml +2 -2
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_training.py +6 -6
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/LICENSE +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/README.md +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/models.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/utils.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/process.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/version.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/hooks/pre_gen_project.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/setup.cfg +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/__init__.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/conftest.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_base_worker.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_cache.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_dataset_worker.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_element.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_classification.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_corpus.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_dataset.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_element.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_element_create_multiple.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_element_create_single.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_element_list_children.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_element_list_parents.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_entity_create.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_entity_list_and_check.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_image.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_metadata.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_process.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_transcription_create.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_transcription_create_with_elements.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_transcription_list.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_version.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_elements_worker/test_worker.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_image.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_merge.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/tests/test_utils.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/worker-demo/tests/__init__.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/worker-demo/tests/conftest.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/worker-demo/tests/test_worker.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/worker-demo/worker_demo/__init__.py +0 -0
- {arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/worker-demo/worker_demo/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.5.0a1
|
|
3
|
+
Version: 0.5.0a3
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -46,7 +46,7 @@ Requires-Dist: peewee~=3.17
|
|
|
46
46
|
Requires-Dist: Pillow==11.0.0
|
|
47
47
|
Requires-Dist: python-gnupg==0.5.3
|
|
48
48
|
Requires-Dist: shapely==2.0.6
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.8
|
|
50
50
|
Requires-Dist: zstandard==0.23.0
|
|
51
51
|
Provides-Extra: docs
|
|
52
52
|
Requires-Dist: black==24.10.0; extra == "docs"
|
{arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_base_worker.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.5.0a1
|
|
3
|
+
Version: 0.5.0a3
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -46,7 +46,7 @@ Requires-Dist: peewee~=3.17
|
|
|
46
46
|
Requires-Dist: Pillow==11.0.0
|
|
47
47
|
Requires-Dist: python-gnupg==0.5.3
|
|
48
48
|
Requires-Dist: shapely==2.0.6
|
|
49
|
-
Requires-Dist: teklia-toolbox==0.1.
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.8
|
|
50
50
|
Requires-Dist: zstandard==0.23.0
|
|
51
51
|
Provides-Extra: docs
|
|
52
52
|
Requires-Dist: black==24.10.0; extra == "docs"
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import importlib.metadata
|
|
1
2
|
import logging
|
|
2
3
|
|
|
3
4
|
logging.basicConfig(
|
|
@@ -5,3 +6,5 @@ logging.basicConfig(
|
|
|
5
6
|
format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
|
|
6
7
|
)
|
|
7
8
|
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
VERSION = importlib.metadata.version("arkindex-base-worker")
|
|
@@ -327,9 +327,9 @@ def check_version(cache_path: str | Path):
|
|
|
327
327
|
except OperationalError:
|
|
328
328
|
version = None
|
|
329
329
|
|
|
330
|
-
assert (
|
|
331
|
-
version
|
|
332
|
-
)
|
|
330
|
+
assert version == SQL_VERSION, (
|
|
331
|
+
f"The SQLite database {cache_path} does not have the correct cache version, it should be {SQL_VERSION}"
|
|
332
|
+
)
|
|
333
333
|
|
|
334
334
|
|
|
335
335
|
def merge_parents_cache(paths: list, current_database: Path):
|
|
@@ -27,7 +27,7 @@ from tenacity import (
|
|
|
27
27
|
wait_exponential,
|
|
28
28
|
)
|
|
29
29
|
|
|
30
|
-
from arkindex_worker import logger
|
|
30
|
+
from arkindex_worker import VERSION, logger
|
|
31
31
|
from arkindex_worker.utils import pluralize
|
|
32
32
|
from teklia_toolbox.requests import should_verify_cert
|
|
33
33
|
|
|
@@ -41,6 +41,8 @@ DOWNLOAD_TIMEOUT = (30, 60)
|
|
|
41
41
|
|
|
42
42
|
BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])
|
|
43
43
|
|
|
44
|
+
# Specific User-Agent to bypass potential server limitations
|
|
45
|
+
IIIF_USER_AGENT = f"Teklia/Workers {VERSION}"
|
|
44
46
|
# To parse IIIF Urls
|
|
45
47
|
IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
|
|
46
48
|
# Full size of the region
|
|
@@ -326,7 +328,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
|
|
|
326
328
|
def _retry_log(retry_state, *args, **kwargs):
|
|
327
329
|
logger.warning(
|
|
328
330
|
f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
|
|
329
|
-
f
|
|
331
|
+
f"retrying in {retry_state.idle_for} {pluralize('second', retry_state.idle_for)}"
|
|
330
332
|
)
|
|
331
333
|
|
|
332
334
|
|
|
@@ -339,7 +341,12 @@ def _retry_log(retry_state, *args, **kwargs):
|
|
|
339
341
|
)
|
|
340
342
|
def _retried_request(url, *args, method=requests.get, **kwargs):
|
|
341
343
|
resp = method(
|
|
342
|
-
url,
|
|
344
|
+
url,
|
|
345
|
+
*args,
|
|
346
|
+
headers={"User-Agent": IIIF_USER_AGENT},
|
|
347
|
+
timeout=DOWNLOAD_TIMEOUT,
|
|
348
|
+
verify=should_verify_cert(url),
|
|
349
|
+
**kwargs,
|
|
343
350
|
)
|
|
344
351
|
resp.raise_for_status()
|
|
345
352
|
return resp
|
|
@@ -359,9 +366,9 @@ def download_tiles(url: str) -> Image:
|
|
|
359
366
|
|
|
360
367
|
image_width, image_height = info.get("width"), info.get("height")
|
|
361
368
|
assert image_width and image_height, "Missing image dimensions in info.json"
|
|
362
|
-
assert info.get(
|
|
363
|
-
"tiles"
|
|
364
|
-
)
|
|
369
|
+
assert info.get("tiles"), (
|
|
370
|
+
"Image cannot be retrieved at full size and tiles are not supported"
|
|
371
|
+
)
|
|
365
372
|
|
|
366
373
|
# Take the biggest available tile size
|
|
367
374
|
tile = sorted(info["tiles"], key=lambda tile: tile.get("width", 0), reverse=True)[0]
|
|
@@ -435,15 +442,15 @@ def trim_polygon(
|
|
|
435
442
|
is entirely outside of the image's bounds.
|
|
436
443
|
"""
|
|
437
444
|
|
|
438
|
-
assert isinstance(
|
|
439
|
-
polygon
|
|
440
|
-
)
|
|
441
|
-
assert all(
|
|
442
|
-
|
|
443
|
-
)
|
|
444
|
-
assert all(
|
|
445
|
-
|
|
446
|
-
)
|
|
445
|
+
assert isinstance(polygon, list | tuple), (
|
|
446
|
+
"Input polygon must be a valid list or tuple of points."
|
|
447
|
+
)
|
|
448
|
+
assert all(isinstance(point, list | tuple) for point in polygon), (
|
|
449
|
+
"Polygon points must be tuples or lists."
|
|
450
|
+
)
|
|
451
|
+
assert all(len(point) == 2 for point in polygon), (
|
|
452
|
+
"Polygon points must be tuples or lists of 2 elements."
|
|
453
|
+
)
|
|
447
454
|
assert all(
|
|
448
455
|
isinstance(point[0], int) and isinstance(point[1], int) for point in polygon
|
|
449
456
|
), "Polygon point coordinates must be integers."
|
|
@@ -494,12 +501,12 @@ def revert_orientation(
|
|
|
494
501
|
from arkindex_worker.cache import CachedElement
|
|
495
502
|
from arkindex_worker.models import Element
|
|
496
503
|
|
|
497
|
-
assert element and isinstance(
|
|
498
|
-
element
|
|
499
|
-
)
|
|
500
|
-
assert polygon and isinstance(
|
|
501
|
-
polygon
|
|
502
|
-
)
|
|
504
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
505
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
506
|
+
)
|
|
507
|
+
assert polygon and isinstance(polygon, list), (
|
|
508
|
+
"polygon shouldn't be null and should be a list"
|
|
509
|
+
)
|
|
503
510
|
assert isinstance(reverse, bool), "Reverse should be a bool"
|
|
504
511
|
# Rotating with Pillow can cause it to move the image around, as the image cannot have negative coordinates
|
|
505
512
|
# and must be a rectangle. This means the origin point of any coordinates from an image is invalid, and the
|
|
@@ -507,9 +514,9 @@ def revert_orientation(
|
|
|
507
514
|
# To properly undo the mirroring and rotation implicitly applied by open_image, we first need to find the center
|
|
508
515
|
# of the rotated bounding box.
|
|
509
516
|
if isinstance(element, Element):
|
|
510
|
-
assert (
|
|
511
|
-
element
|
|
512
|
-
)
|
|
517
|
+
assert element.zone and element.zone.polygon, (
|
|
518
|
+
"element should have a zone and a polygon"
|
|
519
|
+
)
|
|
513
520
|
parent_ring = LinearRing(element.zone.polygon)
|
|
514
521
|
elif isinstance(element, CachedElement):
|
|
515
522
|
assert element.polygon, "cached element should have a polygon"
|
{arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/__init__.py
RENAMED
|
@@ -82,9 +82,9 @@ class ElementsWorker(
|
|
|
82
82
|
:return: An iterable of [CachedElement][arkindex_worker.cache.CachedElement] when cache support is enabled,
|
|
83
83
|
or a list of strings representing element IDs otherwise.
|
|
84
84
|
"""
|
|
85
|
-
assert not (
|
|
86
|
-
|
|
87
|
-
)
|
|
85
|
+
assert not (self.args.elements_list and self.args.element), (
|
|
86
|
+
"elements-list and element CLI args shouldn't be both set"
|
|
87
|
+
)
|
|
88
88
|
|
|
89
89
|
def invalid_element_id(value: str) -> bool:
|
|
90
90
|
"""
|
|
@@ -125,9 +125,9 @@ class ElementsWorker(
|
|
|
125
125
|
return {item["id"] for item in self.list_process_elements()}
|
|
126
126
|
|
|
127
127
|
invalid_element_ids = list(filter(invalid_element_id, out))
|
|
128
|
-
assert (
|
|
129
|
-
|
|
130
|
-
)
|
|
128
|
+
assert not invalid_element_ids, (
|
|
129
|
+
f"These element IDs are invalid: {', '.join(invalid_element_ids)}"
|
|
130
|
+
)
|
|
131
131
|
|
|
132
132
|
return out
|
|
133
133
|
|
|
@@ -144,9 +144,9 @@ class ElementsWorker(
|
|
|
144
144
|
# Worker activities are also disabled when running an ElementsWorker in a Dataset process
|
|
145
145
|
# and when running export processes.
|
|
146
146
|
return False
|
|
147
|
-
assert (
|
|
148
|
-
|
|
149
|
-
)
|
|
147
|
+
assert self.process_information, (
|
|
148
|
+
"Worker must be configured to access its process activity state"
|
|
149
|
+
)
|
|
150
150
|
return self.process_information.get("activity_state") == "ready"
|
|
151
151
|
|
|
152
152
|
def run(self):
|
|
@@ -221,7 +221,7 @@ class ElementsWorker(
|
|
|
221
221
|
with contextlib.suppress(Exception):
|
|
222
222
|
self.update_activity(element.id, ActivityState.Error)
|
|
223
223
|
|
|
224
|
-
message = f
|
|
224
|
+
message = f"Ran on {count} {pluralize('element', count)}: {count - failed} completed, {failed} failed"
|
|
225
225
|
if failed:
|
|
226
226
|
logger.error(message)
|
|
227
227
|
if failed >= count: # Everything failed!
|
|
@@ -256,9 +256,9 @@ class ElementsWorker(
|
|
|
256
256
|
)
|
|
257
257
|
return True
|
|
258
258
|
|
|
259
|
-
assert element_id and isinstance(
|
|
260
|
-
element_id
|
|
261
|
-
)
|
|
259
|
+
assert element_id and isinstance(element_id, uuid.UUID | str), (
|
|
260
|
+
"element_id shouldn't be null and should be an UUID or str"
|
|
261
|
+
)
|
|
262
262
|
assert isinstance(state, ActivityState), "state should be an ActivityState"
|
|
263
263
|
|
|
264
264
|
try:
|
|
@@ -382,9 +382,9 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
|
|
|
382
382
|
failed = 0
|
|
383
383
|
for i, dataset_set in enumerate(dataset_sets, start=1):
|
|
384
384
|
try:
|
|
385
|
-
assert (
|
|
386
|
-
|
|
387
|
-
)
|
|
385
|
+
assert dataset_set.dataset.state == DatasetState.Complete.value, (
|
|
386
|
+
"When processing a set, its dataset state should be Complete."
|
|
387
|
+
)
|
|
388
388
|
|
|
389
389
|
logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
|
|
390
390
|
self.download_dataset_artifact(dataset_set.dataset)
|
|
@@ -405,7 +405,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
|
|
|
405
405
|
# Cleanup the latest downloaded dataset artifact
|
|
406
406
|
self.cleanup_downloaded_artifact()
|
|
407
407
|
|
|
408
|
-
message = f
|
|
408
|
+
message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"
|
|
409
409
|
if failed:
|
|
410
410
|
logger.error(message)
|
|
411
411
|
if failed >= count: # Everything failed!
|
{arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/base.py
RENAMED
|
@@ -305,9 +305,9 @@ class BaseWorker:
|
|
|
305
305
|
|
|
306
306
|
if self.use_cache:
|
|
307
307
|
if self.args.database is not None:
|
|
308
|
-
assert (
|
|
309
|
-
self.args.database
|
|
310
|
-
)
|
|
308
|
+
assert self.args.database.is_file(), (
|
|
309
|
+
f"Database in {self.args.database} does not exist"
|
|
310
|
+
)
|
|
311
311
|
self.cache_path = self.args.database
|
|
312
312
|
else:
|
|
313
313
|
cache_dir = self.task_data_dir / self.task_id
|
|
@@ -378,9 +378,9 @@ class BaseWorker:
|
|
|
378
378
|
gpg = gnupg.GPG()
|
|
379
379
|
with path.open("rb") as gpg_file:
|
|
380
380
|
decrypted = gpg.decrypt_file(gpg_file)
|
|
381
|
-
assert (
|
|
382
|
-
decrypted.
|
|
383
|
-
)
|
|
381
|
+
assert decrypted.ok, (
|
|
382
|
+
f"GPG error: {decrypted.status} - {decrypted.stderr}"
|
|
383
|
+
)
|
|
384
384
|
secret = decrypted.data.decode("utf-8")
|
|
385
385
|
logging.info(f"Loaded local secret {name}")
|
|
386
386
|
except Exception as e:
|
|
@@ -27,7 +27,7 @@ class ClassificationMixin:
|
|
|
27
27
|
)
|
|
28
28
|
self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
|
|
29
29
|
logger.info(
|
|
30
|
-
f
|
|
30
|
+
f"Loaded {len(self.classes)} ML {pluralize('class', len(self.classes))} in corpus ({self.corpus_id})"
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
def get_ml_class_id(self, ml_class: str) -> str:
|
|
@@ -60,9 +60,9 @@ class ClassificationMixin:
|
|
|
60
60
|
f"Reloading corpus classes to see if {ml_class} already exists"
|
|
61
61
|
)
|
|
62
62
|
self.load_corpus_classes()
|
|
63
|
-
assert (
|
|
64
|
-
ml_class
|
|
65
|
-
)
|
|
63
|
+
assert ml_class in self.classes, (
|
|
64
|
+
"Missing class {ml_class} even after reloading"
|
|
65
|
+
)
|
|
66
66
|
ml_class_id = self.classes[ml_class]
|
|
67
67
|
|
|
68
68
|
return ml_class_id
|
|
@@ -86,9 +86,9 @@ class ClassificationMixin:
|
|
|
86
86
|
),
|
|
87
87
|
None,
|
|
88
88
|
)
|
|
89
|
-
assert (
|
|
90
|
-
|
|
91
|
-
)
|
|
89
|
+
assert ml_class_name is not None, (
|
|
90
|
+
f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
|
|
91
|
+
)
|
|
92
92
|
return ml_class_name
|
|
93
93
|
|
|
94
94
|
def create_classification(
|
|
@@ -107,18 +107,18 @@ class ClassificationMixin:
|
|
|
107
107
|
:param high_confidence: Whether or not the classification is of high confidence.
|
|
108
108
|
:returns: The created classification, as returned by the ``CreateClassification`` API endpoint.
|
|
109
109
|
"""
|
|
110
|
-
assert element and isinstance(
|
|
111
|
-
element
|
|
112
|
-
)
|
|
113
|
-
assert ml_class and isinstance(
|
|
114
|
-
ml_class
|
|
115
|
-
)
|
|
116
|
-
assert (
|
|
117
|
-
|
|
118
|
-
)
|
|
119
|
-
assert isinstance(
|
|
120
|
-
high_confidence
|
|
121
|
-
)
|
|
110
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
111
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
112
|
+
)
|
|
113
|
+
assert ml_class and isinstance(ml_class, str), (
|
|
114
|
+
"ml_class shouldn't be null and should be of type str"
|
|
115
|
+
)
|
|
116
|
+
assert isinstance(confidence, float) and 0 <= confidence <= 1, (
|
|
117
|
+
"confidence shouldn't be null and should be a float in [0..1] range"
|
|
118
|
+
)
|
|
119
|
+
assert isinstance(high_confidence, bool), (
|
|
120
|
+
"high_confidence shouldn't be null and should be of type bool"
|
|
121
|
+
)
|
|
122
122
|
if self.is_read_only:
|
|
123
123
|
logger.warning(
|
|
124
124
|
"Cannot create classification as this worker is in read-only mode"
|
|
@@ -198,31 +198,33 @@ class ClassificationMixin:
|
|
|
198
198
|
:returns: List of created classifications, as returned in the ``classifications`` field by
|
|
199
199
|
the ``CreateClassifications`` API endpoint.
|
|
200
200
|
"""
|
|
201
|
-
assert element and isinstance(
|
|
202
|
-
element
|
|
203
|
-
)
|
|
204
|
-
assert classifications and isinstance(
|
|
205
|
-
classifications
|
|
206
|
-
)
|
|
201
|
+
assert element and isinstance(element, Element | CachedElement), (
|
|
202
|
+
"element shouldn't be null and should be an Element or CachedElement"
|
|
203
|
+
)
|
|
204
|
+
assert classifications and isinstance(classifications, list), (
|
|
205
|
+
"classifications shouldn't be null and should be of type list"
|
|
206
|
+
)
|
|
207
207
|
|
|
208
208
|
for index, classification in enumerate(classifications):
|
|
209
209
|
ml_class = classification.get("ml_class")
|
|
210
|
-
assert (
|
|
211
|
-
ml_class and
|
|
212
|
-
)
|
|
210
|
+
assert ml_class and isinstance(ml_class, str), (
|
|
211
|
+
f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
|
|
212
|
+
)
|
|
213
213
|
|
|
214
214
|
confidence = classification.get("confidence")
|
|
215
215
|
assert (
|
|
216
216
|
confidence is not None
|
|
217
217
|
and isinstance(confidence, float)
|
|
218
218
|
and 0 <= confidence <= 1
|
|
219
|
-
),
|
|
219
|
+
), (
|
|
220
|
+
f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
|
|
221
|
+
)
|
|
220
222
|
|
|
221
223
|
high_confidence = classification.get("high_confidence")
|
|
222
224
|
if high_confidence is not None:
|
|
223
|
-
assert isinstance(
|
|
224
|
-
high_confidence
|
|
225
|
-
)
|
|
225
|
+
assert isinstance(high_confidence, bool), (
|
|
226
|
+
f"Classification at index {index} in classifications: high_confidence should be of type bool"
|
|
227
|
+
)
|
|
226
228
|
|
|
227
229
|
if self.is_read_only:
|
|
228
230
|
logger.warning(
|
{arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/corpus.py
RENAMED
|
@@ -76,9 +76,9 @@ class CorpusMixin:
|
|
|
76
76
|
key=itemgetter("updated"),
|
|
77
77
|
reverse=True,
|
|
78
78
|
)
|
|
79
|
-
assert (
|
|
80
|
-
|
|
81
|
-
)
|
|
79
|
+
assert len(exports) > 0, (
|
|
80
|
+
f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
|
|
81
|
+
)
|
|
82
82
|
|
|
83
83
|
# Download latest export
|
|
84
84
|
export_id: str = exports[0]["id"]
|
{arkindex_base_worker-0.5.0a1 → arkindex_base_worker-0.5.0a3}/arkindex_worker/worker/dataset.py
RENAMED
|
@@ -113,9 +113,9 @@ class DatasetMixin:
|
|
|
113
113
|
:param dataset_set: Set to find elements in.
|
|
114
114
|
:returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
|
|
115
115
|
"""
|
|
116
|
-
assert dataset_set and isinstance(
|
|
117
|
-
dataset_set
|
|
118
|
-
)
|
|
116
|
+
assert dataset_set and isinstance(dataset_set, Set), (
|
|
117
|
+
"dataset_set shouldn't be null and should be a Set"
|
|
118
|
+
)
|
|
119
119
|
|
|
120
120
|
results = self.api_client.paginate(
|
|
121
121
|
"ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
|
|
@@ -152,12 +152,12 @@ class DatasetMixin:
|
|
|
152
152
|
:param state: State of the dataset.
|
|
153
153
|
:returns: The updated ``Dataset`` object from the ``PartialUpdateDataset`` API endpoint.
|
|
154
154
|
"""
|
|
155
|
-
assert dataset and isinstance(
|
|
156
|
-
dataset
|
|
157
|
-
)
|
|
158
|
-
assert state and isinstance(
|
|
159
|
-
state
|
|
160
|
-
)
|
|
155
|
+
assert dataset and isinstance(dataset, Dataset), (
|
|
156
|
+
"dataset shouldn't be null and should be a Dataset"
|
|
157
|
+
)
|
|
158
|
+
assert state and isinstance(state, DatasetState), (
|
|
159
|
+
"state shouldn't be null and should be a str from DatasetState"
|
|
160
|
+
)
|
|
161
161
|
|
|
162
162
|
if self.is_read_only:
|
|
163
163
|
logger.warning("Cannot update dataset as this worker is in read-only mode")
|