arkindex-base-worker 0.3.7rc5__tar.gz → 0.5.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/PKG-INFO +18 -19
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/README.md +1 -1
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_base_worker.egg-info/PKG-INFO +18 -19
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_base_worker.egg-info/SOURCES.txt +25 -8
- arkindex_base_worker-0.5.0a1/arkindex_base_worker.egg-info/requires.txt +17 -0
- arkindex_base_worker-0.5.0a1/arkindex_base_worker.egg-info/top_level.txt +6 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/cache.py +1 -1
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/image.py +167 -2
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/models.py +18 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/utils.py +98 -4
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/__init__.py +117 -218
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/base.py +39 -46
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/classification.py +34 -18
- arkindex_base_worker-0.5.0a1/arkindex_worker/worker/corpus.py +86 -0
- arkindex_base_worker-0.5.0a1/arkindex_worker/worker/dataset.py +173 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/element.py +352 -91
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/entity.py +13 -11
- arkindex_base_worker-0.5.0a1/arkindex_worker/worker/image.py +21 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/metadata.py +26 -16
- arkindex_base_worker-0.5.0a1/arkindex_worker/worker/process.py +92 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/task.py +5 -4
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/training.py +25 -10
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/transcription.py +89 -68
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/worker/version.py +3 -1
- arkindex_base_worker-0.5.0a1/hooks/pre_gen_project.py +3 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/pyproject.toml +27 -14
- arkindex_base_worker-0.5.0a1/tests/__init__.py +8 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/conftest.py +47 -58
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_base_worker.py +212 -12
- arkindex_base_worker-0.5.0a1/tests/test_dataset_worker.py +704 -0
- arkindex-base-worker-0.3.7rc5/tests/test_elements_worker/test_classifications.py → arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_classification.py +216 -100
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_elements_worker/test_cli.py +3 -11
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_corpus.py +168 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_elements_worker/test_dataset.py +106 -157
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_element.py +427 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_element_create_multiple.py +715 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_element_create_single.py +528 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_element_list_children.py +969 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_element_list_parents.py +530 -0
- arkindex-base-worker-0.3.7rc5/tests/test_elements_worker/test_entities.py → arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_entity_create.py +37 -195
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_entity_list_and_check.py +160 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_image.py +66 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_elements_worker/test_metadata.py +252 -161
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_process.py +89 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_elements_worker/test_task.py +8 -18
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_elements_worker/test_training.py +17 -8
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_transcription_create.py +873 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_transcription_create_with_elements.py +951 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_transcription_list.py +450 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_version.py +60 -0
- arkindex_base_worker-0.5.0a1/tests/test_elements_worker/test_worker.py +798 -0
- arkindex_base_worker-0.5.0a1/tests/test_image.py +919 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_merge.py +1 -2
- arkindex_base_worker-0.5.0a1/tests/test_utils.py +120 -0
- arkindex_base_worker-0.5.0a1/worker-demo/tests/conftest.py +32 -0
- arkindex_base_worker-0.5.0a1/worker-demo/tests/test_worker.py +12 -0
- arkindex_base_worker-0.5.0a1/worker-demo/worker_demo/__init__.py +6 -0
- arkindex_base_worker-0.5.0a1/worker-demo/worker_demo/worker.py +19 -0
- arkindex-base-worker-0.3.7rc5/arkindex_base_worker.egg-info/requires.txt +0 -17
- arkindex-base-worker-0.3.7rc5/arkindex_base_worker.egg-info/top_level.txt +0 -2
- arkindex-base-worker-0.3.7rc5/arkindex_worker/worker/dataset.py +0 -110
- arkindex-base-worker-0.3.7rc5/docs-requirements.txt +0 -7
- arkindex-base-worker-0.3.7rc5/requirements.txt +0 -8
- arkindex-base-worker-0.3.7rc5/setup.py +0 -4
- arkindex-base-worker-0.3.7rc5/tests/test_dataset_worker.py +0 -847
- arkindex-base-worker-0.3.7rc5/tests/test_elements_worker/test_elements.py +0 -2713
- arkindex-base-worker-0.3.7rc5/tests/test_elements_worker/test_transcriptions.py +0 -2119
- arkindex-base-worker-0.3.7rc5/tests/test_elements_worker/test_worker.py +0 -513
- arkindex-base-worker-0.3.7rc5/tests/test_image.py +0 -586
- arkindex-base-worker-0.3.7rc5/tests/test_utils.py +0 -35
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/LICENSE +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/arkindex_worker/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/setup.cfg +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_cache.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_element.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex_base_worker-0.5.0a1/worker-demo}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.3.7rc5
|
|
3
|
+
Version: 0.5.0a1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -37,26 +37,25 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
37
37
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
38
|
Classifier: Programming Language :: Python :: 3.10
|
|
39
39
|
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
-
Classifier:
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Description-Content-Type: text/markdown
|
|
43
43
|
License-File: LICENSE
|
|
44
|
-
Requires-Dist:
|
|
45
|
-
Requires-Dist: peewee
|
|
46
|
-
Requires-Dist: Pillow==
|
|
47
|
-
Requires-Dist:
|
|
48
|
-
Requires-Dist:
|
|
49
|
-
Requires-Dist:
|
|
50
|
-
Requires-Dist:
|
|
51
|
-
Requires-Dist: zstandard==0.22.0
|
|
44
|
+
Requires-Dist: humanize==4.11.0
|
|
45
|
+
Requires-Dist: peewee~=3.17
|
|
46
|
+
Requires-Dist: Pillow==11.0.0
|
|
47
|
+
Requires-Dist: python-gnupg==0.5.3
|
|
48
|
+
Requires-Dist: shapely==2.0.6
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.7
|
|
50
|
+
Requires-Dist: zstandard==0.23.0
|
|
52
51
|
Provides-Extra: docs
|
|
53
|
-
Requires-Dist: black==24.
|
|
54
|
-
Requires-Dist:
|
|
55
|
-
Requires-Dist:
|
|
56
|
-
|
|
57
|
-
Requires-Dist:
|
|
58
|
-
Requires-Dist:
|
|
59
|
-
Requires-Dist:
|
|
52
|
+
Requires-Dist: black==24.10.0; extra == "docs"
|
|
53
|
+
Requires-Dist: mkdocs-material==9.5.48; extra == "docs"
|
|
54
|
+
Requires-Dist: mkdocstrings-python==1.12.2; extra == "docs"
|
|
55
|
+
Provides-Extra: tests
|
|
56
|
+
Requires-Dist: pytest==8.3.4; extra == "tests"
|
|
57
|
+
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
58
|
+
Requires-Dist: pytest-responses==0.5.1; extra == "tests"
|
|
60
59
|
|
|
61
60
|
# Arkindex base Worker
|
|
62
61
|
|
|
@@ -70,7 +69,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
|
|
|
70
69
|
|
|
71
70
|
## Create a new worker using our template
|
|
72
71
|
|
|
73
|
-
```
|
|
72
|
+
```shell
|
|
74
73
|
pip install --user cookiecutter
|
|
75
74
|
cookiecutter git@gitlab.teklia.com:workers/base-worker.git
|
|
76
75
|
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: arkindex-base-worker
|
|
3
|
-
Version: 0.3.7rc5
|
|
3
|
+
Version: 0.5.0a1
|
|
4
4
|
Summary: Base Worker to easily build Arkindex ML workflows
|
|
5
5
|
Author-email: Teklia <contact@teklia.com>
|
|
6
6
|
Maintainer-email: Teklia <contact@teklia.com>
|
|
@@ -37,26 +37,25 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
37
37
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
38
|
Classifier: Programming Language :: Python :: 3.10
|
|
39
39
|
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
-
Classifier:
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Description-Content-Type: text/markdown
|
|
43
43
|
License-File: LICENSE
|
|
44
|
-
Requires-Dist:
|
|
45
|
-
Requires-Dist: peewee
|
|
46
|
-
Requires-Dist: Pillow==
|
|
47
|
-
Requires-Dist:
|
|
48
|
-
Requires-Dist:
|
|
49
|
-
Requires-Dist:
|
|
50
|
-
Requires-Dist:
|
|
51
|
-
Requires-Dist: zstandard==0.22.0
|
|
44
|
+
Requires-Dist: humanize==4.11.0
|
|
45
|
+
Requires-Dist: peewee~=3.17
|
|
46
|
+
Requires-Dist: Pillow==11.0.0
|
|
47
|
+
Requires-Dist: python-gnupg==0.5.3
|
|
48
|
+
Requires-Dist: shapely==2.0.6
|
|
49
|
+
Requires-Dist: teklia-toolbox==0.1.7
|
|
50
|
+
Requires-Dist: zstandard==0.23.0
|
|
52
51
|
Provides-Extra: docs
|
|
53
|
-
Requires-Dist: black==24.
|
|
54
|
-
Requires-Dist:
|
|
55
|
-
Requires-Dist:
|
|
56
|
-
|
|
57
|
-
Requires-Dist:
|
|
58
|
-
Requires-Dist:
|
|
59
|
-
Requires-Dist:
|
|
52
|
+
Requires-Dist: black==24.10.0; extra == "docs"
|
|
53
|
+
Requires-Dist: mkdocs-material==9.5.48; extra == "docs"
|
|
54
|
+
Requires-Dist: mkdocstrings-python==1.12.2; extra == "docs"
|
|
55
|
+
Provides-Extra: tests
|
|
56
|
+
Requires-Dist: pytest==8.3.4; extra == "tests"
|
|
57
|
+
Requires-Dist: pytest-mock==3.14.0; extra == "tests"
|
|
58
|
+
Requires-Dist: pytest-responses==0.5.1; extra == "tests"
|
|
60
59
|
|
|
61
60
|
# Arkindex base Worker
|
|
62
61
|
|
|
@@ -70,7 +69,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
|
|
|
70
69
|
|
|
71
70
|
## Create a new worker using our template
|
|
72
71
|
|
|
73
|
-
```
|
|
72
|
+
```shell
|
|
74
73
|
pip install --user cookiecutter
|
|
75
74
|
cookiecutter git@gitlab.teklia.com:workers/base-worker.git
|
|
76
75
|
```
|
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
LICENSE
|
|
2
2
|
README.md
|
|
3
|
-
docs-requirements.txt
|
|
4
3
|
pyproject.toml
|
|
5
|
-
requirements.txt
|
|
6
|
-
setup.py
|
|
7
4
|
arkindex_base_worker.egg-info/PKG-INFO
|
|
8
5
|
arkindex_base_worker.egg-info/SOURCES.txt
|
|
9
6
|
arkindex_base_worker.egg-info/dependency_links.txt
|
|
@@ -17,14 +14,18 @@ arkindex_worker/utils.py
|
|
|
17
14
|
arkindex_worker/worker/__init__.py
|
|
18
15
|
arkindex_worker/worker/base.py
|
|
19
16
|
arkindex_worker/worker/classification.py
|
|
17
|
+
arkindex_worker/worker/corpus.py
|
|
20
18
|
arkindex_worker/worker/dataset.py
|
|
21
19
|
arkindex_worker/worker/element.py
|
|
22
20
|
arkindex_worker/worker/entity.py
|
|
21
|
+
arkindex_worker/worker/image.py
|
|
23
22
|
arkindex_worker/worker/metadata.py
|
|
23
|
+
arkindex_worker/worker/process.py
|
|
24
24
|
arkindex_worker/worker/task.py
|
|
25
25
|
arkindex_worker/worker/training.py
|
|
26
26
|
arkindex_worker/worker/transcription.py
|
|
27
27
|
arkindex_worker/worker/version.py
|
|
28
|
+
hooks/pre_gen_project.py
|
|
28
29
|
tests/__init__.py
|
|
29
30
|
tests/conftest.py
|
|
30
31
|
tests/test_base_worker.py
|
|
@@ -35,13 +36,29 @@ tests/test_image.py
|
|
|
35
36
|
tests/test_merge.py
|
|
36
37
|
tests/test_utils.py
|
|
37
38
|
tests/test_elements_worker/__init__.py
|
|
38
|
-
tests/test_elements_worker/
|
|
39
|
+
tests/test_elements_worker/test_classification.py
|
|
39
40
|
tests/test_elements_worker/test_cli.py
|
|
41
|
+
tests/test_elements_worker/test_corpus.py
|
|
40
42
|
tests/test_elements_worker/test_dataset.py
|
|
41
|
-
tests/test_elements_worker/
|
|
42
|
-
tests/test_elements_worker/
|
|
43
|
+
tests/test_elements_worker/test_element.py
|
|
44
|
+
tests/test_elements_worker/test_element_create_multiple.py
|
|
45
|
+
tests/test_elements_worker/test_element_create_single.py
|
|
46
|
+
tests/test_elements_worker/test_element_list_children.py
|
|
47
|
+
tests/test_elements_worker/test_element_list_parents.py
|
|
48
|
+
tests/test_elements_worker/test_entity_create.py
|
|
49
|
+
tests/test_elements_worker/test_entity_list_and_check.py
|
|
50
|
+
tests/test_elements_worker/test_image.py
|
|
43
51
|
tests/test_elements_worker/test_metadata.py
|
|
52
|
+
tests/test_elements_worker/test_process.py
|
|
44
53
|
tests/test_elements_worker/test_task.py
|
|
45
54
|
tests/test_elements_worker/test_training.py
|
|
46
|
-
tests/test_elements_worker/
|
|
47
|
-
tests/test_elements_worker/
|
|
55
|
+
tests/test_elements_worker/test_transcription_create.py
|
|
56
|
+
tests/test_elements_worker/test_transcription_create_with_elements.py
|
|
57
|
+
tests/test_elements_worker/test_transcription_list.py
|
|
58
|
+
tests/test_elements_worker/test_version.py
|
|
59
|
+
tests/test_elements_worker/test_worker.py
|
|
60
|
+
worker-demo/tests/__init__.py
|
|
61
|
+
worker-demo/tests/conftest.py
|
|
62
|
+
worker-demo/tests/test_worker.py
|
|
63
|
+
worker-demo/worker_demo/__init__.py
|
|
64
|
+
worker-demo/worker_demo/worker.py
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
humanize==4.11.0
|
|
2
|
+
peewee~=3.17
|
|
3
|
+
Pillow==11.0.0
|
|
4
|
+
python-gnupg==0.5.3
|
|
5
|
+
shapely==2.0.6
|
|
6
|
+
teklia-toolbox==0.1.7
|
|
7
|
+
zstandard==0.23.0
|
|
8
|
+
|
|
9
|
+
[docs]
|
|
10
|
+
black==24.10.0
|
|
11
|
+
mkdocs-material==9.5.48
|
|
12
|
+
mkdocstrings-python==1.12.2
|
|
13
|
+
|
|
14
|
+
[tests]
|
|
15
|
+
pytest==8.3.4
|
|
16
|
+
pytest-mock==3.14.0
|
|
17
|
+
pytest-responses==0.5.1
|
|
@@ -380,7 +380,7 @@ def unsupported_cache(func):
|
|
|
380
380
|
def wrapper(self, *args, **kwargs):
|
|
381
381
|
results = func(self, *args, **kwargs)
|
|
382
382
|
|
|
383
|
-
if
|
|
383
|
+
if self.use_cache:
|
|
384
384
|
logger.warning(
|
|
385
385
|
f"This API helper `{func.__name__}` did not update the cache database"
|
|
386
386
|
)
|
|
@@ -2,13 +2,20 @@
|
|
|
2
2
|
Helper methods to download and open IIIF images, and manage polygons.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import base64
|
|
6
|
+
import functools
|
|
7
|
+
import os
|
|
5
8
|
import re
|
|
9
|
+
import tempfile
|
|
6
10
|
from collections import namedtuple
|
|
11
|
+
from collections.abc import Generator, Iterator
|
|
7
12
|
from io import BytesIO
|
|
8
13
|
from math import ceil
|
|
9
14
|
from pathlib import Path
|
|
10
15
|
from typing import TYPE_CHECKING
|
|
11
16
|
|
|
17
|
+
import humanize
|
|
18
|
+
import numpy as np
|
|
12
19
|
import requests
|
|
13
20
|
from PIL import Image
|
|
14
21
|
from shapely.affinity import rotate, scale, translate
|
|
@@ -21,6 +28,8 @@ from tenacity import (
|
|
|
21
28
|
)
|
|
22
29
|
|
|
23
30
|
from arkindex_worker import logger
|
|
31
|
+
from arkindex_worker.utils import pluralize
|
|
32
|
+
from teklia_toolbox.requests import should_verify_cert
|
|
24
33
|
|
|
25
34
|
# Avoid circular imports error when type checking
|
|
26
35
|
if TYPE_CHECKING:
|
|
@@ -38,8 +47,57 @@ IIIF_URL = re.compile(r"\w+:\/{2}.+\/.+\/.+\/.+\/(?P<size>.+)\/!?\d+\/\w+\.\w+")
|
|
|
38
47
|
IIIF_FULL = "full"
|
|
39
48
|
# Maximum size available
|
|
40
49
|
IIIF_MAX = "max"
|
|
50
|
+
# Ratios to resize images: 1.0, 0.95, [...], 0.1, 0.05
|
|
51
|
+
IMAGE_RATIOS = np.arange(1, 0, -0.05).round(2).tolist()
|
|
41
52
|
|
|
42
53
|
|
|
54
|
+
def update_pillow_image_size_limit(func):
    """
    Decorator temporarily overriding the Pillow Image size limit around a call.

    The wrapped function accepts an extra keyword-only ``max_image_pixels``
    argument. It is consumed by the wrapper and is not forwarded to ``func``.

    :param func: The function to wrap.
    :return: The wrapped function.
    """

    @functools.wraps(func)
    def wrapper(
        *args,
        # NOTE: this default is read from the environment only once, when the
        # decorator is applied, not on every call.
        max_image_pixels: str | int | None = os.getenv("ARKINDEX_MAX_IMAGE_PIXELS"),
        **kwargs,
    ):
        """
        Wrapper to update Pillow Image size limit and restore it at the end of the function.

        :param *args: Positional arguments passed to the function.
        :param max_image_pixels: Pillow Image size limit to use. ``None`` keeps the
            current limit, ``0`` disables the limit entirely.
        :param **kwargs: Keyword arguments passed to the function.
        :return: Whatever the wrapped function returns.
        """
        initial_limit = Image.MAX_IMAGE_PIXELS

        # Override Pillow Image size limit
        if max_image_pixels is not None:
            # Converted before touching the global, so an invalid value raises
            # without leaving Pillow in a modified state
            max_image_pixels = int(max_image_pixels)
            # Override Pillow limit for detecting decompression bombs, disabled if set to 0
            if max_image_pixels == 0:
                logger.warning(
                    "Pillow Image size limit is completely disabled, make sure you trust the image source."
                )
                Image.MAX_IMAGE_PIXELS = None
            else:
                Image.MAX_IMAGE_PIXELS = max_image_pixels

        try:
            return func(*args, **kwargs)
        finally:
            # Restore initial Pillow Image size limit, whether the call succeeded or raised
            Image.MAX_IMAGE_PIXELS = initial_limit

    return wrapper
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@update_pillow_image_size_limit
|
|
43
101
|
def open_image(
|
|
44
102
|
path: str,
|
|
45
103
|
mode: str | None = "RGB",
|
|
@@ -147,6 +205,111 @@ def upload_image(image: Image, url: str) -> requests.Response:
|
|
|
147
205
|
return resp
|
|
148
206
|
|
|
149
207
|
|
|
208
|
+
def resized_images(
    *args,
    element: "Element",
    max_pixels_short: int | None = None,
    max_pixels_long: int | None = None,
    max_bytes: int | None = None,
    use_base64: bool = False,
    **kwargs,
) -> Iterator[tempfile._TemporaryFileWrapper | str]:
    """
    Build resized images according to pixel and byte limits.

    Candidate sizes are tried from the largest to the smallest, following
    ``IMAGE_RATIOS``. Every candidate that fits within ``max_bytes`` is yielded,
    so callers wanting only the largest acceptable image should stop after the
    first item. Nothing is yielded when no candidate fits.

    Each temporary file is yielded from inside its ``open_image_tempfile``
    context, so it should be used before advancing the iterator.

    :param *args: Positional arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :param element: Element whose image needs to be resized.
    :param max_pixels_short: Maximum pixel size of the resized images' short side.
    :param max_pixels_long: Maximum pixel size of the resized images' long side.
    :param max_bytes: Maximum byte size of the resized images.
    :param use_base64: Whether or not to encode resized images in base64 before calculating their size.
    :param **kwargs: Keyword arguments passed to [arkindex_worker.models.Element.open_image_tempfile][].
    :returns: An iterator of temporary files for resized images OR an iterator of base64-encoded strings if `use_base64` is set.
    """
    _, _, element_width, element_height = polygon_bounding_box(element.polygon)
    logger.info(
        f"This element's image dimensions are ({element_width} x {element_height})."
    )

    # Map the short/long side limits onto width/height depending on the orientation
    portrait_format = element_width <= element_height
    max_pixels_width, max_pixels_height = (
        (max_pixels_short, max_pixels_long)
        if portrait_format
        else (max_pixels_long, max_pixels_short)
    )

    # The image dimension is already within the pixel limitation, no need to resize the image
    if max_pixels_width and max_pixels_width >= element_width:
        max_pixels_width = None
    if max_pixels_height and max_pixels_height >= element_height:
        max_pixels_height = None

    if (max_pixels_width and element_width > max_pixels_width) or (
        max_pixels_height and element_height > max_pixels_height
    ):
        logger.warning(
            f"Maximum image dimensions supported are ({max_pixels_width or element_width} x {max_pixels_height or element_height})."
        )
        logger.warning("The image will be resized.")

    # No limitations provided, we keep the image initial dimensions
    if max_pixels_width is None and max_pixels_height is None:
        open_image_param, max_value = (
            ("max_height", element_height)
            if portrait_format
            else ("max_width", element_width)
        )
    # A limitation is only given for the height, we resize it
    elif max_pixels_width is None:
        open_image_param, max_value = ("max_height", max_pixels_height)
    # A limitation is only given for the width, we resize it
    elif max_pixels_height is None:
        open_image_param, max_value = ("max_width", max_pixels_width)
    # Limitations are provided for both sides:
    # - we resize only the one with the biggest scale factor
    # - the remaining one will automatically fall within the other limitation
    else:
        width_rescaling_factor = element_width / max_pixels_width
        height_rescaling_factor = element_height / max_pixels_height
        open_image_param, max_value = (
            ("max_height", max_pixels_height)
            if height_rescaling_factor > width_rescaling_factor
            else ("max_width", max_pixels_width)
        )

    # Distinct candidate sizes, never larger than the selected limit
    resized_pixels = {
        min(round(ratio * max_value), max_value) for ratio in IMAGE_RATIOS
    }
    for resized_pixel in sorted(resized_pixels, reverse=True):
        with element.open_image_tempfile(
            *args, **{**kwargs, open_image_param: resized_pixel}
        ) as image:
            # Only the dimensions are needed: release the Pillow image right away.
            # Pillow does not close a file object it did not open itself, so the
            # temporary file stays usable after this inner block.
            with Image.open(image) as pillow_image:
                resized_width, resized_height = pillow_image.size

            if resized_width != element_width or resized_height != element_height:
                logger.warning(
                    f"The image was resized to ({resized_width} x {resized_height})."
                )

            image_size = Path(image.name).stat().st_size
            if use_base64:
                # The base64 string replaces the file object, and its length is
                # what gets compared to the byte limit
                image = base64.b64encode(Path(image.name).read_bytes()).decode("utf-8")
                image_size = len(image)

            # The image is still too heavy
            if max_bytes and image_size > max_bytes:
                logger.warning(f"The image size is {humanize.naturalsize(image_size)}.")
                logger.warning(
                    f"Maximum image input size supported is {humanize.naturalsize(max_bytes)}."
                )
                logger.warning("The image will be resized.")
                continue

            yield image
|
|
311
|
+
|
|
312
|
+
|
|
150
313
|
def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
|
|
151
314
|
"""
|
|
152
315
|
Compute the rectangle bounding box of a polygon.
|
|
@@ -163,7 +326,7 @@ def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
|
|
|
163
326
|
def _retry_log(retry_state, *args, **kwargs):
|
|
164
327
|
logger.warning(
|
|
165
328
|
f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
|
|
166
|
-
f
|
|
329
|
+
f'retrying in {retry_state.idle_for} {pluralize("second", retry_state.idle_for)}'
|
|
167
330
|
)
|
|
168
331
|
|
|
169
332
|
|
|
@@ -175,7 +338,9 @@ def _retry_log(retry_state, *args, **kwargs):
|
|
|
175
338
|
reraise=True,
|
|
176
339
|
)
|
|
177
340
|
def _retried_request(url, *args, method=requests.get, **kwargs):
|
|
178
|
-
resp = method(
|
|
341
|
+
resp = method(
|
|
342
|
+
url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
|
|
343
|
+
)
|
|
179
344
|
resp.raise_for_status()
|
|
180
345
|
return resp
|
|
181
346
|
|
|
@@ -20,6 +20,8 @@ class MagicDict(dict):
|
|
|
20
20
|
Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
|
|
21
21
|
Allows for nested access: foo.bar.baz
|
|
22
22
|
"""
|
|
23
|
+
if isinstance(item, Dataset):
|
|
24
|
+
return item
|
|
23
25
|
if isinstance(item, list):
|
|
24
26
|
return list(map(self._magify, item))
|
|
25
27
|
if isinstance(item, dict):
|
|
@@ -259,6 +261,12 @@ class Transcription(ArkindexModel):
|
|
|
259
261
|
"""
|
|
260
262
|
|
|
261
263
|
|
|
264
|
+
class Image(ArkindexModel):
    """Model describing an image from Arkindex."""
|
|
268
|
+
|
|
269
|
+
|
|
262
270
|
class Dataset(ArkindexModel):
|
|
263
271
|
"""
|
|
264
272
|
Describes an Arkindex dataset.
|
|
@@ -272,6 +280,16 @@ class Dataset(ArkindexModel):
|
|
|
272
280
|
return f"{self.id}.tar.zst"
|
|
273
281
|
|
|
274
282
|
|
|
283
|
+
class Set(MagicDict):
    """
    Model describing one set of an Arkindex dataset.
    """

    def __str__(self):
        # ArkindexModel.__str__ is not reused here: no Set ID is retrieved,
        # so the set is identified by its name and its parent dataset instead
        return "{} ({}) from {}".format(
            self.__class__.__name__, self.name, self.dataset
        )
|
|
291
|
+
|
|
292
|
+
|
|
275
293
|
class Artifact(ArkindexModel):
|
|
276
294
|
"""
|
|
277
295
|
Describes an Arkindex artifact.
|
|
@@ -1,15 +1,54 @@
|
|
|
1
1
|
import hashlib
|
|
2
|
+
import inspect
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
4
5
|
import tarfile
|
|
5
6
|
import tempfile
|
|
7
|
+
from collections.abc import Callable, Generator
|
|
8
|
+
from itertools import islice
|
|
6
9
|
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
7
11
|
|
|
8
|
-
import zstandard
|
|
9
12
|
import zstandard as zstd
|
|
10
13
|
|
|
11
14
|
logger = logging.getLogger(__name__)
|
|
12
15
|
|
|
16
|
+
|
|
17
|
+
def pluralize(singular: str, count: int) -> str:
    """Return the singular or plural form of a noun depending on a count.

    Pluralization is deliberately simplistic: a few known irregular nouns are
    looked up in a table, any other noun simply gets an ``s`` appended.

    :param str singular: A singular noun describing an object
    :param int count: The object count, to determine whether to pluralize or not
    :return str: The noun in its singular or plural form
    """
    if count != 1:
        irregular_plurals = {
            "child": "children",
            "class": "classes",
            "entity": "entities",
            "metadata": "metadata",
        }
        return irregular_plurals.get(singular, singular + "s")

    # Exactly one object: keep the noun untouched
    return singular
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
MANUAL_SOURCE = "manual"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_source_id(value: str) -> bool | str | None:
|
|
43
|
+
"""
|
|
44
|
+
Parse a UUID argument (Worker Version, Worker Run, ...) to use it directly in the API.
|
|
45
|
+
Arkindex API filters generally expect `False` to filter manual sources.
|
|
46
|
+
"""
|
|
47
|
+
if value == MANUAL_SOURCE:
|
|
48
|
+
return False
|
|
49
|
+
return value or None
|
|
50
|
+
|
|
51
|
+
|
|
13
52
|
CHUNK_SIZE = 1024
|
|
14
53
|
"""Chunk Size used for ZSTD compression"""
|
|
15
54
|
|
|
@@ -25,7 +64,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
|
|
|
25
64
|
:param compressed_archive: Path to the target ZST-compressed archive
|
|
26
65
|
:return: File descriptor and path to the uncompressed tar archive
|
|
27
66
|
"""
|
|
28
|
-
dctx =
|
|
67
|
+
dctx = zstd.ZstdDecompressor()
|
|
29
68
|
archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
|
|
30
69
|
archive_path = Path(archive_path)
|
|
31
70
|
|
|
@@ -37,7 +76,7 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
|
|
|
37
76
|
):
|
|
38
77
|
dctx.copy_stream(compressed, decompressed)
|
|
39
78
|
logger.debug(f"Successfully uncompressed archive {compressed_archive}")
|
|
40
|
-
except
|
|
79
|
+
except zstd.ZstdError as e:
|
|
41
80
|
raise Exception(f"Couldn't uncompressed archive: {e}") from e
|
|
42
81
|
|
|
43
82
|
return archive_fd, archive_path
|
|
@@ -116,7 +155,7 @@ def zstd_compress(
|
|
|
116
155
|
archive_hasher.update(compressed_chunk)
|
|
117
156
|
archive_file.write(compressed_chunk)
|
|
118
157
|
logger.debug(f"Successfully compressed {source}")
|
|
119
|
-
except
|
|
158
|
+
except zstd.ZstdError as e:
|
|
120
159
|
raise Exception(f"Couldn't compress archive: {e}") from e
|
|
121
160
|
return file_d, destination, archive_hasher.hexdigest()
|
|
122
161
|
|
|
@@ -184,3 +223,58 @@ def create_tar_zst_archive(
|
|
|
184
223
|
close_delete_file(tar_fd, tar_archive)
|
|
185
224
|
|
|
186
225
|
return zst_fd, zst_archive, zst_hash, tar_hash
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
DEFAULT_BATCH_SIZE = 50
|
|
229
|
+
"""Batch size used for bulk publication to Arkindex"""
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def batch_publication(func: Callable) -> Callable:
    """
    Decorator for functions that should raise an error when the value passed through the ``batch_size`` parameter is **not** a strictly positive integer.

    The check is run on the bound arguments, so it applies whether ``batch_size``
    is passed positionally, by keyword, or left to its default value.

    :param func: The function to wrap with the ``batch_size`` check
    :return: The function passing the ``batch_size`` check
    :raises AssertionError: When the wrapped function is called with an invalid ``batch_size``
    """
    # Local import so that this block does not rely on a module-level import
    import functools

    signature = inspect.signature(func)

    # functools.wraps keeps the name, docstring, qualified name and module of the
    # wrapped function, and exposes it through ``__wrapped__``
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        bound_func = signature.bind(self, *args, **kwargs)
        # Make the default value visible when ``batch_size`` was not passed
        bound_func.apply_defaults()
        batch_size = bound_func.arguments.get("batch_size")
        # An assertion is kept on purpose: callers rely on AssertionError here
        assert (
            batch_size is not None and isinstance(batch_size, int) and batch_size > 0
        ), "batch_size shouldn't be null and should be a strictly positive integer"

        return func(self, *args, **kwargs)

    return wrapper
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def make_batches(
    objects: list, singular_name: str, batch_size: int
) -> Generator[list[Any]]:
    """Yield the items of a list as successive batches of at most ``batch_size`` items.

    A message is logged once before batching starts, then once for every
    batch right before it is yielded.

    :param objects: The object list to divide in batches of ``batch_size`` size
    :param singular_name: The singular form of the noun associated with the object list
    :param batch_size: The maximum size of each batch to split the object list
    :return: A generator of successive batches containing ``batch_size`` items from ``objects``
    """
    total = len(objects)
    logger.info(
        f"Creating batches of size {batch_size} to process {total} {pluralize(singular_name, total)}"
    )

    remaining = iter(objects)
    batch_number = 1
    while True:
        # Consume up to ``batch_size`` items; an empty result means we are done
        batch = list(islice(remaining, batch_size))
        if not batch:
            break

        batch_length = len(batch)
        logger.info(
            f"Processing batch {batch_number} containing {batch_length} {pluralize(singular_name, batch_length)}..."
        )

        yield batch

        batch_number += 1
|