arkindex-base-worker 0.3.6rc1__tar.gz → 0.3.6rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arkindex-base-worker-0.3.6rc2/PKG-INFO +39 -0
- arkindex-base-worker-0.3.6rc2/arkindex_base_worker.egg-info/PKG-INFO +39 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_base_worker.egg-info/SOURCES.txt +0 -2
- arkindex-base-worker-0.3.6rc2/arkindex_base_worker.egg-info/requires.txt +9 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/__init__.py +0 -1
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/cache.py +19 -25
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/image.py +16 -17
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/models.py +17 -21
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/utils.py +16 -17
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/__init__.py +14 -23
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/base.py +12 -7
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/classification.py +13 -15
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/dataset.py +3 -4
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/element.py +80 -75
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/entity.py +27 -29
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/metadata.py +19 -25
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/task.py +2 -3
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/training.py +21 -22
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/transcription.py +37 -34
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/version.py +1 -2
- arkindex-base-worker-0.3.6rc2/pyproject.toml +83 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/setup.py +2 -12
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/conftest.py +55 -75
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_base_worker.py +37 -31
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_cache.py +14 -7
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_dataset_worker.py +4 -4
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_element.py +0 -1
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/__init__.py +0 -1
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_classifications.py +0 -1
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_cli.py +22 -17
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_dataset.py +9 -10
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_elements.py +58 -63
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_entities.py +10 -20
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_metadata.py +72 -96
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_task.py +9 -10
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_training.py +20 -13
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_transcriptions.py +6 -10
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_elements_worker/test_worker.py +16 -14
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_image.py +21 -20
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_merge.py +5 -6
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/test_utils.py +0 -1
- arkindex-base-worker-0.3.6rc1/PKG-INFO +0 -26
- arkindex-base-worker-0.3.6rc1/arkindex_base_worker.egg-info/PKG-INFO +0 -26
- arkindex-base-worker-0.3.6rc1/arkindex_base_worker.egg-info/requires.txt +0 -19
- arkindex-base-worker-0.3.6rc1/arkindex_worker/git.py +0 -392
- arkindex-base-worker-0.3.6rc1/pyproject.toml +0 -24
- arkindex-base-worker-0.3.6rc1/tests/test_git.py +0 -480
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/README.md +0 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/setup.cfg +0 -0
- {arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/tests/__init__.py +0 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: arkindex-base-worker
|
|
3
|
+
Version: 0.3.6rc2
|
|
4
|
+
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
|
+
Author-email: Teklia <contact@teklia.com>
|
|
6
|
+
Maintainer-email: Teklia <contact@teklia.com>
|
|
7
|
+
Project-URL: Homepage, https://workers.arkindex.org
|
|
8
|
+
Project-URL: Documentation, https://workers.arkindex.org
|
|
9
|
+
Project-URL: Repository, https://gitlab.teklia.com/workers/base-worker
|
|
10
|
+
Project-URL: Bug Tracker, https://gitlab.teklia.com/workers/base-worker/issues
|
|
11
|
+
Project-URL: Authors, https://teklia.com
|
|
12
|
+
Keywords: python
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Provides-Extra: docs
|
|
22
|
+
Requires-Dist: black==23.11.0; extra == "docs"
|
|
23
|
+
Requires-Dist: doc8==1.1.1; extra == "docs"
|
|
24
|
+
Requires-Dist: mkdocs==1.5.3; extra == "docs"
|
|
25
|
+
Requires-Dist: mkdocs-material==9.4.8; extra == "docs"
|
|
26
|
+
Requires-Dist: mkdocstrings==0.23.0; extra == "docs"
|
|
27
|
+
Requires-Dist: mkdocstrings-python==1.7.3; extra == "docs"
|
|
28
|
+
Requires-Dist: recommonmark==0.7.1; extra == "docs"
|
|
29
|
+
|
|
30
|
+
# Arkindex base Worker
|
|
31
|
+
|
|
32
|
+
An easy to use Python 3 high level API client, to build ML tasks.
|
|
33
|
+
|
|
34
|
+
## Create a new worker using our template
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
pip install --user cookiecutter
|
|
38
|
+
cookiecutter git@gitlab.teklia.com:workers/base-worker.git
|
|
39
|
+
```
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: arkindex-base-worker
|
|
3
|
+
Version: 0.3.6rc2
|
|
4
|
+
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
|
+
Author-email: Teklia <contact@teklia.com>
|
|
6
|
+
Maintainer-email: Teklia <contact@teklia.com>
|
|
7
|
+
Project-URL: Homepage, https://workers.arkindex.org
|
|
8
|
+
Project-URL: Documentation, https://workers.arkindex.org
|
|
9
|
+
Project-URL: Repository, https://gitlab.teklia.com/workers/base-worker
|
|
10
|
+
Project-URL: Bug Tracker, https://gitlab.teklia.com/workers/base-worker/issues
|
|
11
|
+
Project-URL: Authors, https://teklia.com
|
|
12
|
+
Keywords: python
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Provides-Extra: docs
|
|
22
|
+
Requires-Dist: black==23.11.0; extra == "docs"
|
|
23
|
+
Requires-Dist: doc8==1.1.1; extra == "docs"
|
|
24
|
+
Requires-Dist: mkdocs==1.5.3; extra == "docs"
|
|
25
|
+
Requires-Dist: mkdocs-material==9.4.8; extra == "docs"
|
|
26
|
+
Requires-Dist: mkdocstrings==0.23.0; extra == "docs"
|
|
27
|
+
Requires-Dist: mkdocstrings-python==1.7.3; extra == "docs"
|
|
28
|
+
Requires-Dist: recommonmark==0.7.1; extra == "docs"
|
|
29
|
+
|
|
30
|
+
# Arkindex base Worker
|
|
31
|
+
|
|
32
|
+
An easy to use Python 3 high level API client, to build ML tasks.
|
|
33
|
+
|
|
34
|
+
## Create a new worker using our template
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
pip install --user cookiecutter
|
|
38
|
+
cookiecutter git@gitlab.teklia.com:workers/base-worker.git
|
|
39
|
+
```
|
|
@@ -9,7 +9,6 @@ arkindex_base_worker.egg-info/requires.txt
|
|
|
9
9
|
arkindex_base_worker.egg-info/top_level.txt
|
|
10
10
|
arkindex_worker/__init__.py
|
|
11
11
|
arkindex_worker/cache.py
|
|
12
|
-
arkindex_worker/git.py
|
|
13
12
|
arkindex_worker/image.py
|
|
14
13
|
arkindex_worker/models.py
|
|
15
14
|
arkindex_worker/utils.py
|
|
@@ -30,7 +29,6 @@ tests/test_base_worker.py
|
|
|
30
29
|
tests/test_cache.py
|
|
31
30
|
tests/test_dataset_worker.py
|
|
32
31
|
tests/test_element.py
|
|
33
|
-
tests/test_git.py
|
|
34
32
|
tests/test_image.py
|
|
35
33
|
tests/test_merge.py
|
|
36
34
|
tests/test_utils.py
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
Database mappings and helper methods for the experimental worker caching feature.
|
|
4
3
|
|
|
@@ -10,7 +9,6 @@ reducing network usage.
|
|
|
10
9
|
import json
|
|
11
10
|
import sqlite3
|
|
12
11
|
from pathlib import Path
|
|
13
|
-
from typing import Optional, Union
|
|
14
12
|
|
|
15
13
|
from peewee import (
|
|
16
14
|
SQL,
|
|
@@ -106,8 +104,8 @@ class CachedElement(Model):
|
|
|
106
104
|
def open_image(
|
|
107
105
|
self,
|
|
108
106
|
*args,
|
|
109
|
-
max_width:
|
|
110
|
-
max_height:
|
|
107
|
+
max_width: int | None = None,
|
|
108
|
+
max_height: int | None = None,
|
|
111
109
|
**kwargs,
|
|
112
110
|
) -> Image:
|
|
113
111
|
"""
|
|
@@ -145,17 +143,15 @@ class CachedElement(Model):
|
|
|
145
143
|
if max_width is None and max_height is None:
|
|
146
144
|
resize = "full"
|
|
147
145
|
else:
|
|
148
|
-
# Do not resize for polygons that do not exactly match the images
|
|
149
|
-
# as the resize is made directly by the IIIF server using the box parameter
|
|
150
146
|
if (
|
|
147
|
+
# Do not resize for polygons that do not exactly match the images
|
|
148
|
+
# as the resize is made directly by the IIIF server using the box parameter
|
|
151
149
|
bounding_box.width != self.image.width
|
|
152
150
|
or bounding_box.height != self.image.height
|
|
153
|
-
)
|
|
154
|
-
resize
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
elif (max_width is None or self.image.width <= max_width) and (
|
|
158
|
-
max_height is None or self.image.height <= max_height
|
|
151
|
+
) or (
|
|
152
|
+
# Do not resize when the image is below the maximum size
|
|
153
|
+
(max_width is None or self.image.width <= max_width)
|
|
154
|
+
and (max_height is None or self.image.height <= max_height)
|
|
159
155
|
):
|
|
160
156
|
resize = "full"
|
|
161
157
|
else:
|
|
@@ -319,22 +315,21 @@ def create_version_table():
|
|
|
319
315
|
Version.create(version=SQL_VERSION)
|
|
320
316
|
|
|
321
317
|
|
|
322
|
-
def check_version(cache_path:
|
|
318
|
+
def check_version(cache_path: str | Path):
|
|
323
319
|
"""
|
|
324
320
|
Check the validity of the SQLite version
|
|
325
321
|
|
|
326
322
|
:param cache_path: Path towards a local SQLite database
|
|
327
323
|
"""
|
|
328
|
-
with SqliteDatabase(cache_path) as provided_db:
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
version = None
|
|
324
|
+
with SqliteDatabase(cache_path) as provided_db, provided_db.bind_ctx([Version]):
|
|
325
|
+
try:
|
|
326
|
+
version = Version.get().version
|
|
327
|
+
except OperationalError:
|
|
328
|
+
version = None
|
|
334
329
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
330
|
+
assert (
|
|
331
|
+
version == SQL_VERSION
|
|
332
|
+
), f"The SQLite database {cache_path} does not have the correct cache version, it should be {SQL_VERSION}"
|
|
338
333
|
|
|
339
334
|
|
|
340
335
|
def merge_parents_cache(paths: list, current_database: Path):
|
|
@@ -358,9 +353,8 @@ def merge_parents_cache(paths: list, current_database: Path):
|
|
|
358
353
|
# Check that the parent cache uses a compatible version
|
|
359
354
|
check_version(path)
|
|
360
355
|
|
|
361
|
-
with SqliteDatabase(path) as source:
|
|
362
|
-
|
|
363
|
-
source.create_tables(MODELS)
|
|
356
|
+
with SqliteDatabase(path) as source, source.bind_ctx(MODELS):
|
|
357
|
+
source.create_tables(MODELS)
|
|
364
358
|
|
|
365
359
|
logger.info(f"Merging parent db {path} into {current_database}")
|
|
366
360
|
statements = [
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
Helper methods to download and open IIIF images, and manage polygons.
|
|
4
3
|
"""
|
|
@@ -7,7 +6,7 @@ from collections import namedtuple
|
|
|
7
6
|
from io import BytesIO
|
|
8
7
|
from math import ceil
|
|
9
8
|
from pathlib import Path
|
|
10
|
-
from typing import TYPE_CHECKING
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
11
10
|
|
|
12
11
|
import requests
|
|
13
12
|
from PIL import Image
|
|
@@ -42,9 +41,9 @@ IIIF_MAX = "max"
|
|
|
42
41
|
|
|
43
42
|
def open_image(
|
|
44
43
|
path: str,
|
|
45
|
-
mode:
|
|
46
|
-
rotation_angle:
|
|
47
|
-
mirrored:
|
|
44
|
+
mode: str | None = "RGB",
|
|
45
|
+
rotation_angle: int | None = 0,
|
|
46
|
+
mirrored: bool | None = False,
|
|
48
47
|
) -> Image:
|
|
49
48
|
"""
|
|
50
49
|
Open an image from a path or a URL.
|
|
@@ -71,7 +70,7 @@ def open_image(
|
|
|
71
70
|
else:
|
|
72
71
|
try:
|
|
73
72
|
image = Image.open(path)
|
|
74
|
-
except (
|
|
73
|
+
except (OSError, ValueError):
|
|
75
74
|
image = download_image(path)
|
|
76
75
|
|
|
77
76
|
if image.mode != mode:
|
|
@@ -141,14 +140,14 @@ def download_image(url: str) -> Image:
|
|
|
141
140
|
return image
|
|
142
141
|
|
|
143
142
|
|
|
144
|
-
def polygon_bounding_box(polygon:
|
|
143
|
+
def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
|
|
145
144
|
"""
|
|
146
145
|
Compute the rectangle bounding box of a polygon.
|
|
147
146
|
|
|
148
147
|
:param polygon: Polygon to get the bounding box of.
|
|
149
148
|
:returns: Bounding box of this polygon.
|
|
150
149
|
"""
|
|
151
|
-
x_coords, y_coords = zip(*polygon)
|
|
150
|
+
x_coords, y_coords = zip(*polygon, strict=True)
|
|
152
151
|
x, y = min(x_coords), min(y_coords)
|
|
153
152
|
width, height = max(x_coords) - x, max(y_coords) - y
|
|
154
153
|
return BoundingBox(x, y, width, height)
|
|
@@ -248,8 +247,8 @@ def download_tiles(url: str) -> Image:
|
|
|
248
247
|
|
|
249
248
|
|
|
250
249
|
def trim_polygon(
|
|
251
|
-
polygon:
|
|
252
|
-
) ->
|
|
250
|
+
polygon: list[list[int]], image_width: int, image_height: int
|
|
251
|
+
) -> list[list[int]]:
|
|
253
252
|
"""
|
|
254
253
|
Trim a polygon to an image's boundaries, with non-negative coordinates.
|
|
255
254
|
|
|
@@ -265,10 +264,10 @@ def trim_polygon(
|
|
|
265
264
|
"""
|
|
266
265
|
|
|
267
266
|
assert isinstance(
|
|
268
|
-
polygon,
|
|
267
|
+
polygon, list | tuple
|
|
269
268
|
), "Input polygon must be a valid list or tuple of points."
|
|
270
269
|
assert all(
|
|
271
|
-
isinstance(point,
|
|
270
|
+
isinstance(point, list | tuple) for point in polygon
|
|
272
271
|
), "Polygon points must be tuples or lists."
|
|
273
272
|
assert all(
|
|
274
273
|
len(point) == 2 for point in polygon
|
|
@@ -301,10 +300,10 @@ def trim_polygon(
|
|
|
301
300
|
|
|
302
301
|
|
|
303
302
|
def revert_orientation(
|
|
304
|
-
element:
|
|
305
|
-
polygon:
|
|
306
|
-
reverse:
|
|
307
|
-
) ->
|
|
303
|
+
element: "Element | CachedElement",
|
|
304
|
+
polygon: list[list[int | float]],
|
|
305
|
+
reverse: bool = False,
|
|
306
|
+
) -> list[list[int]]:
|
|
308
307
|
"""
|
|
309
308
|
Update the coordinates of the polygon of a child element based on the orientation of
|
|
310
309
|
its parent.
|
|
@@ -324,7 +323,7 @@ def revert_orientation(
|
|
|
324
323
|
from arkindex_worker.models import Element
|
|
325
324
|
|
|
326
325
|
assert element and isinstance(
|
|
327
|
-
element,
|
|
326
|
+
element, Element | CachedElement
|
|
328
327
|
), "element shouldn't be null and should be an Element or CachedElement"
|
|
329
328
|
assert polygon and isinstance(
|
|
330
329
|
polygon, list
|
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
Wrappers around API results to provide more convenient attribute access and IIIF helpers.
|
|
4
3
|
"""
|
|
5
4
|
|
|
6
5
|
import tempfile
|
|
6
|
+
from collections.abc import Generator
|
|
7
7
|
from contextlib import contextmanager
|
|
8
|
-
from typing import Generator, List, Optional
|
|
9
8
|
|
|
10
9
|
from PIL import Image
|
|
11
10
|
from requests import HTTPError
|
|
@@ -34,10 +33,10 @@ class MagicDict(dict):
|
|
|
34
33
|
def __getattr__(self, name):
|
|
35
34
|
try:
|
|
36
35
|
return self[name]
|
|
37
|
-
except KeyError:
|
|
36
|
+
except KeyError as e:
|
|
38
37
|
raise AttributeError(
|
|
39
|
-
"{} object has no attribute '{}'"
|
|
40
|
-
)
|
|
38
|
+
f"{self.__class__.__name__} object has no attribute '{name}'"
|
|
39
|
+
) from e
|
|
41
40
|
|
|
42
41
|
def __setattr__(self, name, value):
|
|
43
42
|
return super().__setitem__(name, value)
|
|
@@ -74,7 +73,7 @@ class Element(MagicDict):
|
|
|
74
73
|
parts[-3] = size
|
|
75
74
|
return "/".join(parts)
|
|
76
75
|
|
|
77
|
-
def image_url(self, size: str = "full") ->
|
|
76
|
+
def image_url(self, size: str = "full") -> str | None:
|
|
78
77
|
"""
|
|
79
78
|
Build an URL to access the image.
|
|
80
79
|
When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
|
|
@@ -89,10 +88,10 @@ class Element(MagicDict):
|
|
|
89
88
|
url = self.zone.image.url
|
|
90
89
|
if not url.endswith("/"):
|
|
91
90
|
url += "/"
|
|
92
|
-
return "{}full/{}/0/default.jpg"
|
|
91
|
+
return f"{url}full/{size}/0/default.jpg"
|
|
93
92
|
|
|
94
93
|
@property
|
|
95
|
-
def polygon(self) ->
|
|
94
|
+
def polygon(self) -> list[float]:
|
|
96
95
|
"""
|
|
97
96
|
Access an Element's polygon.
|
|
98
97
|
This is a shortcut to an Element's polygon, normally accessed via
|
|
@@ -101,7 +100,7 @@ class Element(MagicDict):
|
|
|
101
100
|
the [CachedElement][arkindex_worker.cache.CachedElement].polygon field.
|
|
102
101
|
"""
|
|
103
102
|
if not self.get("zone"):
|
|
104
|
-
raise ValueError("Element {} has no zone"
|
|
103
|
+
raise ValueError(f"Element {self.id} has no zone")
|
|
105
104
|
return self.zone.polygon
|
|
106
105
|
|
|
107
106
|
@property
|
|
@@ -122,11 +121,11 @@ class Element(MagicDict):
|
|
|
122
121
|
def open_image(
|
|
123
122
|
self,
|
|
124
123
|
*args,
|
|
125
|
-
max_width:
|
|
126
|
-
max_height:
|
|
127
|
-
use_full_image:
|
|
124
|
+
max_width: int | None = None,
|
|
125
|
+
max_height: int | None = None,
|
|
126
|
+
use_full_image: bool | None = False,
|
|
128
127
|
**kwargs,
|
|
129
|
-
) -> Image:
|
|
128
|
+
) -> Image.Image:
|
|
130
129
|
"""
|
|
131
130
|
Open this element's image using Pillow, rotating and mirroring it according
|
|
132
131
|
to the ``rotation_angle`` and ``mirrored`` attributes.
|
|
@@ -173,7 +172,7 @@ class Element(MagicDict):
|
|
|
173
172
|
)
|
|
174
173
|
|
|
175
174
|
if not self.get("zone"):
|
|
176
|
-
raise ValueError("Element {} has no zone"
|
|
175
|
+
raise ValueError(f"Element {self.id} has no zone")
|
|
177
176
|
|
|
178
177
|
if self.requires_tiles:
|
|
179
178
|
if max_width is None and max_height is None:
|
|
@@ -194,10 +193,7 @@ class Element(MagicDict):
|
|
|
194
193
|
else:
|
|
195
194
|
resize = f"{max_width or ''},{max_height or ''}"
|
|
196
195
|
|
|
197
|
-
if use_full_image
|
|
198
|
-
url = self.image_url(resize)
|
|
199
|
-
else:
|
|
200
|
-
url = self.resize_zone_url(resize)
|
|
196
|
+
url = self.image_url(resize) if use_full_image else self.resize_zone_url(resize)
|
|
201
197
|
|
|
202
198
|
try:
|
|
203
199
|
return open_image(
|
|
@@ -215,13 +211,13 @@ class Element(MagicDict):
|
|
|
215
211
|
# This element uses an S3 URL: the URL may have expired.
|
|
216
212
|
# Call the API to get a fresh URL and try again
|
|
217
213
|
# TODO: this should be done by the worker
|
|
218
|
-
raise NotImplementedError
|
|
214
|
+
raise NotImplementedError from e
|
|
219
215
|
return open_image(self.image_url(resize), *args, **kwargs)
|
|
220
216
|
raise
|
|
221
217
|
|
|
222
218
|
@contextmanager
|
|
223
219
|
def open_image_tempfile(
|
|
224
|
-
self, format:
|
|
220
|
+
self, format: str | None = "jpeg", *args, **kwargs
|
|
225
221
|
) -> Generator[tempfile.NamedTemporaryFile, None, None]:
|
|
226
222
|
"""
|
|
227
223
|
Get the element's image as a temporary file stored on the disk.
|
|
@@ -249,7 +245,7 @@ class Element(MagicDict):
|
|
|
249
245
|
type_name = self.type["display_name"]
|
|
250
246
|
else:
|
|
251
247
|
type_name = str(self.type)
|
|
252
|
-
return "{} {} ({
|
|
248
|
+
return f"{type_name} {self.name} ({self.id})"
|
|
253
249
|
|
|
254
250
|
|
|
255
251
|
class ArkindexModel(MagicDict):
|
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
import hashlib
|
|
3
2
|
import logging
|
|
4
3
|
import os
|
|
5
4
|
import tarfile
|
|
6
5
|
import tempfile
|
|
7
6
|
from pathlib import Path
|
|
8
|
-
from typing import Optional, Tuple, Union
|
|
9
7
|
|
|
10
8
|
import zstandard
|
|
11
9
|
import zstandard as zstd
|
|
@@ -16,7 +14,7 @@ CHUNK_SIZE = 1024
|
|
|
16
14
|
"""Chunk Size used for ZSTD compression"""
|
|
17
15
|
|
|
18
16
|
|
|
19
|
-
def decompress_zst_archive(compressed_archive: Path) -> Tuple[int, Path]:
|
|
17
|
+
def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
|
|
20
18
|
"""
|
|
21
19
|
Decompress a ZST-compressed tar archive in data dir. The tar archive is not extracted.
|
|
22
20
|
This returns the path to the archive and the file descriptor.
|
|
@@ -29,18 +27,19 @@ def decompress_zst_archive(compressed_archive: Path) -> Tuple[int, Path]:
|
|
|
29
27
|
"""
|
|
30
28
|
dctx = zstandard.ZstdDecompressor()
|
|
31
29
|
archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
|
|
30
|
+
archive_path = Path(archive_path)
|
|
32
31
|
|
|
33
32
|
logger.debug(f"Uncompressing file to {archive_path}")
|
|
34
33
|
try:
|
|
35
|
-
with open(
|
|
36
|
-
|
|
34
|
+
with compressed_archive.open("rb") as compressed, archive_path.open(
|
|
35
|
+
"wb"
|
|
37
36
|
) as decompressed:
|
|
38
37
|
dctx.copy_stream(compressed, decompressed)
|
|
39
38
|
logger.debug(f"Successfully uncompressed archive {compressed_archive}")
|
|
40
39
|
except zstandard.ZstdError as e:
|
|
41
|
-
raise Exception(f"Couldn't uncompressed archive: {e}")
|
|
40
|
+
raise Exception(f"Couldn't uncompressed archive: {e}") from e
|
|
42
41
|
|
|
43
|
-
return archive_fd,
|
|
42
|
+
return archive_fd, archive_path
|
|
44
43
|
|
|
45
44
|
|
|
46
45
|
def extract_tar_archive(archive_path: Path, destination: Path):
|
|
@@ -54,12 +53,12 @@ def extract_tar_archive(archive_path: Path, destination: Path):
|
|
|
54
53
|
with tarfile.open(archive_path) as tar_archive:
|
|
55
54
|
tar_archive.extractall(destination)
|
|
56
55
|
except tarfile.ReadError as e:
|
|
57
|
-
raise Exception(f"Couldn't handle the decompressed Tar archive: {e}")
|
|
56
|
+
raise Exception(f"Couldn't handle the decompressed Tar archive: {e}") from e
|
|
58
57
|
|
|
59
58
|
|
|
60
59
|
def extract_tar_zst_archive(
|
|
61
60
|
compressed_archive: Path, destination: Path
|
|
62
|
-
) ->
|
|
61
|
+
) -> tuple[int, Path]:
|
|
63
62
|
"""
|
|
64
63
|
Extract a ZST-compressed tar archive's content to a specific destination
|
|
65
64
|
|
|
@@ -89,8 +88,8 @@ def close_delete_file(file_descriptor: int, file_path: Path):
|
|
|
89
88
|
|
|
90
89
|
|
|
91
90
|
def zstd_compress(
|
|
92
|
-
source: Path, destination:
|
|
93
|
-
) ->
|
|
91
|
+
source: Path, destination: Path | None = None
|
|
92
|
+
) -> tuple[int | None, Path, str]:
|
|
94
93
|
"""Compress a file using the Zstandard compression algorithm.
|
|
95
94
|
|
|
96
95
|
:param source: Path to the file to compress.
|
|
@@ -117,13 +116,13 @@ def zstd_compress(
|
|
|
117
116
|
archive_file.write(compressed_chunk)
|
|
118
117
|
logger.debug(f"Successfully compressed {source}")
|
|
119
118
|
except zstandard.ZstdError as e:
|
|
120
|
-
raise Exception(f"Couldn't compress archive: {e}")
|
|
119
|
+
raise Exception(f"Couldn't compress archive: {e}") from e
|
|
121
120
|
return file_d, destination, archive_hasher.hexdigest()
|
|
122
121
|
|
|
123
122
|
|
|
124
123
|
def create_tar_archive(
|
|
125
|
-
path: Path, destination:
|
|
126
|
-
) ->
|
|
124
|
+
path: Path, destination: Path | None = None
|
|
125
|
+
) -> tuple[int | None, Path, str]:
|
|
127
126
|
"""Create a tar archive using the content at specified location.
|
|
128
127
|
|
|
129
128
|
:param path: Path to the file to archive
|
|
@@ -153,7 +152,7 @@ def create_tar_archive(
|
|
|
153
152
|
files.append(p)
|
|
154
153
|
logger.debug(f"Successfully created Tar archive from files @ {path}")
|
|
155
154
|
except tarfile.TarError as e:
|
|
156
|
-
raise Exception(f"Couldn't create Tar archive: {e}")
|
|
155
|
+
raise Exception(f"Couldn't create Tar archive: {e}") from e
|
|
157
156
|
|
|
158
157
|
# Sort by path
|
|
159
158
|
files.sort()
|
|
@@ -168,8 +167,8 @@ def create_tar_archive(
|
|
|
168
167
|
|
|
169
168
|
|
|
170
169
|
def create_tar_zst_archive(
|
|
171
|
-
source: Path, destination:
|
|
172
|
-
) ->
|
|
170
|
+
source: Path, destination: Path | None = None
|
|
171
|
+
) -> tuple[int | None, Path, str, str]:
|
|
173
172
|
"""Helper to create a TAR+ZST archive from a source folder.
|
|
174
173
|
|
|
175
174
|
:param source: Path to the folder whose content should be archived.
|
{arkindex-base-worker-0.3.6rc1 → arkindex-base-worker-0.3.6rc2}/arkindex_worker/worker/__init__.py
RENAMED
|
@@ -1,17 +1,16 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
Base classes to implement Arkindex workers.
|
|
4
3
|
"""
|
|
5
|
-
|
|
4
|
+
import contextlib
|
|
6
5
|
import json
|
|
7
6
|
import os
|
|
8
7
|
import sys
|
|
9
8
|
import uuid
|
|
9
|
+
from collections.abc import Iterable, Iterator
|
|
10
10
|
from enum import Enum
|
|
11
11
|
from itertools import groupby
|
|
12
12
|
from operator import itemgetter
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import Iterable, Iterator, List, Tuple, Union
|
|
15
14
|
|
|
16
15
|
from apistar.exceptions import ErrorResponse
|
|
17
16
|
|
|
@@ -102,7 +101,7 @@ class ElementsWorker(
|
|
|
102
101
|
|
|
103
102
|
self._worker_version_cache = {}
|
|
104
103
|
|
|
105
|
-
def list_elements(self) ->
|
|
104
|
+
def list_elements(self) -> Iterable[CachedElement] | list[str]:
|
|
106
105
|
"""
|
|
107
106
|
List the elements to be processed, either from the CLI arguments or
|
|
108
107
|
the cache database when enabled.
|
|
@@ -227,21 +226,17 @@ class ElementsWorker(
|
|
|
227
226
|
)
|
|
228
227
|
if element:
|
|
229
228
|
# Try to update the activity to error state regardless of the response
|
|
230
|
-
|
|
229
|
+
with contextlib.suppress(Exception):
|
|
231
230
|
self.update_activity(element.id, ActivityState.Error)
|
|
232
|
-
except Exception:
|
|
233
|
-
pass
|
|
234
231
|
|
|
235
232
|
if failed:
|
|
236
233
|
logger.error(
|
|
237
|
-
"Ran on {} elements: {} completed, {} failed"
|
|
238
|
-
count, count - failed, failed
|
|
239
|
-
)
|
|
234
|
+
f"Ran on {count} elements: {count - failed} completed, {failed} failed"
|
|
240
235
|
)
|
|
241
236
|
if failed >= count: # Everything failed!
|
|
242
237
|
sys.exit(1)
|
|
243
238
|
|
|
244
|
-
def process_element(self, element:
|
|
239
|
+
def process_element(self, element: Element | CachedElement):
|
|
245
240
|
"""
|
|
246
241
|
Override this method to implement your worker and process a single Arkindex element at once.
|
|
247
242
|
|
|
@@ -251,7 +246,7 @@ class ElementsWorker(
|
|
|
251
246
|
"""
|
|
252
247
|
|
|
253
248
|
def update_activity(
|
|
254
|
-
self, element_id:
|
|
249
|
+
self, element_id: str | uuid.UUID, state: ActivityState
|
|
255
250
|
) -> bool:
|
|
256
251
|
"""
|
|
257
252
|
Update the WorkerActivity for this element and worker.
|
|
@@ -269,7 +264,7 @@ class ElementsWorker(
|
|
|
269
264
|
return True
|
|
270
265
|
|
|
271
266
|
assert element_id and isinstance(
|
|
272
|
-
element_id,
|
|
267
|
+
element_id, uuid.UUID | str
|
|
273
268
|
), "element_id shouldn't be null and should be an UUID or str"
|
|
274
269
|
assert isinstance(state, ActivityState), "state should be an ActivityState"
|
|
275
270
|
|
|
@@ -382,7 +377,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
|
|
|
382
377
|
|
|
383
378
|
def list_dataset_elements_per_split(
|
|
384
379
|
self, dataset: Dataset
|
|
385
|
-
) -> Iterator[
|
|
380
|
+
) -> Iterator[tuple[str, list[Element]]]:
|
|
386
381
|
"""
|
|
387
382
|
List the elements in the dataset, grouped by split, using the
|
|
388
383
|
[list_dataset_elements][arkindex_worker.worker.dataset.DatasetMixin.list_dataset_elements] method.
|
|
@@ -392,8 +387,8 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
|
|
|
392
387
|
"""
|
|
393
388
|
|
|
394
389
|
def format_split(
|
|
395
|
-
split:
|
|
396
|
-
) ->
|
|
390
|
+
split: tuple[str, Iterator[tuple[str, Element]]]
|
|
391
|
+
) -> tuple[str, list[Element]]:
|
|
397
392
|
return (split[0], list(map(itemgetter(1), list(split[1]))))
|
|
398
393
|
|
|
399
394
|
return map(
|
|
@@ -435,7 +430,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
|
|
|
435
430
|
"""
|
|
436
431
|
self.configure()
|
|
437
432
|
|
|
438
|
-
datasets:
|
|
433
|
+
datasets: list[Dataset] | list[str] = list(self.list_datasets())
|
|
439
434
|
if not datasets:
|
|
440
435
|
logger.warning("No datasets to process, stopping.")
|
|
441
436
|
sys.exit(1)
|
|
@@ -499,16 +494,12 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
|
|
|
499
494
|
)
|
|
500
495
|
if dataset and self.generator:
|
|
501
496
|
# Try to update the state to Error regardless of the response
|
|
502
|
-
|
|
497
|
+
with contextlib.suppress(Exception):
|
|
503
498
|
self.update_dataset_state(dataset, DatasetState.Error)
|
|
504
|
-
except Exception:
|
|
505
|
-
pass
|
|
506
499
|
|
|
507
500
|
if failed:
|
|
508
501
|
logger.error(
|
|
509
|
-
"Ran on {} datasets: {} completed, {} failed"
|
|
510
|
-
count, count - failed, failed
|
|
511
|
-
)
|
|
502
|
+
f"Ran on {count} datasets: {count - failed} completed, {failed} failed"
|
|
512
503
|
)
|
|
513
504
|
if failed >= count: # Everything failed!
|
|
514
505
|
sys.exit(1)
|