arkindex-base-worker 0.3.6rc4__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. arkindex_base_worker-0.3.7.dist-info/LICENSE +21 -0
  2. arkindex_base_worker-0.3.7.dist-info/METADATA +77 -0
  3. arkindex_base_worker-0.3.7.dist-info/RECORD +47 -0
  4. {arkindex_base_worker-0.3.6rc4.dist-info → arkindex_base_worker-0.3.7.dist-info}/WHEEL +1 -1
  5. {arkindex_base_worker-0.3.6rc4.dist-info → arkindex_base_worker-0.3.7.dist-info}/top_level.txt +2 -0
  6. arkindex_worker/cache.py +14 -0
  7. arkindex_worker/image.py +29 -19
  8. arkindex_worker/models.py +14 -2
  9. arkindex_worker/utils.py +17 -3
  10. arkindex_worker/worker/__init__.py +122 -125
  11. arkindex_worker/worker/base.py +24 -24
  12. arkindex_worker/worker/classification.py +18 -25
  13. arkindex_worker/worker/dataset.py +24 -18
  14. arkindex_worker/worker/element.py +100 -19
  15. arkindex_worker/worker/entity.py +35 -4
  16. arkindex_worker/worker/metadata.py +21 -11
  17. arkindex_worker/worker/training.py +13 -0
  18. arkindex_worker/worker/transcription.py +45 -5
  19. arkindex_worker/worker/version.py +22 -0
  20. hooks/pre_gen_project.py +3 -0
  21. tests/conftest.py +16 -8
  22. tests/test_base_worker.py +0 -6
  23. tests/test_dataset_worker.py +291 -409
  24. tests/test_elements_worker/test_classifications.py +365 -539
  25. tests/test_elements_worker/test_cli.py +1 -1
  26. tests/test_elements_worker/test_dataset.py +97 -116
  27. tests/test_elements_worker/test_elements.py +354 -76
  28. tests/test_elements_worker/test_entities.py +22 -2
  29. tests/test_elements_worker/test_metadata.py +53 -27
  30. tests/test_elements_worker/test_training.py +35 -0
  31. tests/test_elements_worker/test_transcriptions.py +149 -16
  32. tests/test_elements_worker/test_worker.py +19 -6
  33. tests/test_image.py +37 -0
  34. tests/test_utils.py +23 -1
  35. worker-demo/tests/__init__.py +0 -0
  36. worker-demo/tests/conftest.py +32 -0
  37. worker-demo/tests/test_worker.py +12 -0
  38. worker-demo/worker_demo/__init__.py +6 -0
  39. worker-demo/worker_demo/worker.py +19 -0
  40. arkindex_base_worker-0.3.6rc4.dist-info/METADATA +0 -47
  41. arkindex_base_worker-0.3.6rc4.dist-info/RECORD +0 -40
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Teklia
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,77 @@
1
+ Metadata-Version: 2.1
2
+ Name: arkindex-base-worker
3
+ Version: 0.3.7
4
+ Summary: Base Worker to easily build Arkindex ML workflows
5
+ Author-email: Teklia <contact@teklia.com>
6
+ Maintainer-email: Teklia <contact@teklia.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2023 Teklia
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: Homepage, https://workers.arkindex.org
30
+ Project-URL: Documentation, https://workers.arkindex.org
31
+ Project-URL: Repository, https://gitlab.teklia.com/workers/base-worker
32
+ Project-URL: Bug Tracker, https://gitlab.teklia.com/workers/base-worker/issues
33
+ Project-URL: Authors, https://teklia.com
34
+ Keywords: python
35
+ Classifier: Development Status :: 5 - Production/Stable
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Programming Language :: Python :: 3 :: Only
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Topic :: Text Processing :: Linguistic
41
+ Requires-Python: >=3.10
42
+ Description-Content-Type: text/markdown
43
+ License-File: LICENSE
44
+ Requires-Dist: peewee ==3.17.1
45
+ Requires-Dist: Pillow ==10.3.0
46
+ Requires-Dist: pymdown-extensions ==10.7.1
47
+ Requires-Dist: python-gnupg ==0.5.2
48
+ Requires-Dist: shapely ==2.0.3
49
+ Requires-Dist: teklia-toolbox ==0.1.4
50
+ Requires-Dist: zstandard ==0.22.0
51
+ Provides-Extra: docs
52
+ Requires-Dist: black ==24.4.0 ; extra == 'docs'
53
+ Requires-Dist: doc8 ==1.1.1 ; extra == 'docs'
54
+ Requires-Dist: mkdocs-material ==9.5.17 ; extra == 'docs'
55
+ Requires-Dist: mkdocstrings-python ==1.9.2 ; extra == 'docs'
56
+ Requires-Dist: recommonmark ==0.7.1 ; extra == 'docs'
57
+ Provides-Extra: tests
58
+ Requires-Dist: pytest ==8.1.1 ; extra == 'tests'
59
+ Requires-Dist: pytest-mock ==3.14.0 ; extra == 'tests'
60
+ Requires-Dist: pytest-responses ==0.5.1 ; extra == 'tests'
61
+
62
+ # Arkindex base Worker
63
+
64
+ An easy to use Python 3 high level API client, to build ML tasks.
65
+
66
+ This is an open-source project, licensed using [the MIT license](https://opensource.org/license/mit/).
67
+
68
+ ## Documentation
69
+
70
+ The [documentation](https://workers.arkindex.org/) is made with [Material for MkDocs](https://github.com/squidfunk/mkdocs-material) and is hosted by [GitLab Pages](https://docs.gitlab.com/ee/user/project/pages/).
71
+
72
+ ## Create a new worker using our template
73
+
74
+ ```
75
+ pip install --user cookiecutter
76
+ cookiecutter git@gitlab.teklia.com:workers/base-worker.git
77
+ ```
@@ -0,0 +1,47 @@
1
+ arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
2
+ arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
3
+ arkindex_worker/image.py,sha256=5ymIGaTm2D7Sp2YYQkbuheuGnx5VJo0_AzYAEIvNGhs,14267
4
+ arkindex_worker/models.py,sha256=xSvOadkNg3rgccic1xLgonzP28ugzmcGw0IUqXn51Cc,9844
5
+ arkindex_worker/utils.py,sha256=0Mu7Fa8DVcHn19pg-FIXqMDpfgzQkb7QR9IAlAi-x_k,7243
6
+ arkindex_worker/worker/__init__.py,sha256=U-_zOrQ09xmpBF9SmrTVj_UwnsCjFueV5G2hJAFEwv0,18806
7
+ arkindex_worker/worker/base.py,sha256=qtkCGfpGn7SWsQZRJ5cpW0gQ4tV_cyR_AHbuHZr53z4,19585
8
+ arkindex_worker/worker/classification.py,sha256=JVz-6YEeuavOy7zGfQi4nE_wpj9hwMUZDXTem-hXQY8,10328
9
+ arkindex_worker/worker/dataset.py,sha256=roX2IMMNA-icteTtRADiFSZiZSRPClqS62ZPJm9s2JI,2923
10
+ arkindex_worker/worker/element.py,sha256=AWK3YJSHWy3j4ajntJloi_2X4zxsgXZ6c6dzphgq3OI,33848
11
+ arkindex_worker/worker/entity.py,sha256=suhycfikC9oTPEWmX48_cnvFEw-Wu5zBA8n_00K4KUk,14714
12
+ arkindex_worker/worker/metadata.py,sha256=Bouuc_JaXogKykVXOTKDVP3tX--OUQeHoazxIGrGrJI,6702
13
+ arkindex_worker/worker/task.py,sha256=cz3wJNPgogZv1lm_3lm7WScitQtYQtL6H6I7Xokq208,1475
14
+ arkindex_worker/worker/training.py,sha256=YYnLNi4lsB0fEDj8Xh73z2Amt1LIfPdpuGzagOEtgDE,10648
15
+ arkindex_worker/worker/transcription.py,sha256=6R7ofcGnNqX4rjT0kRKIE-G9FHq2TJ1tfztNM5sTqYE,20464
16
+ arkindex_worker/worker/version.py,sha256=cs2pdlDxpKRO2Oldvcu54w-D_DQhf1cdeEt4tKX_QYs,1927
17
+ hooks/pre_gen_project.py,sha256=xQJERv3vv9VzIqcBHI281eeWLWREXUF4mMw7PvJHHXM,269
18
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ tests/conftest.py,sha256=Oi5SJic4TNwDj8Pm0WHgg657yB7_JKxbLC0HYPI3RUc,22134
20
+ tests/test_base_worker.py,sha256=Uq6_MpLW23gmKFXkU-SyDUaA_4dlViLBGG4e3gpBBz0,24512
21
+ tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
22
+ tests/test_dataset_worker.py,sha256=1joFRFmkL6XfPL9y1NYB_5QO-5FF56rwigAHrqtJMMA,23848
23
+ tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
24
+ tests/test_image.py,sha256=FZv8njLxh45sVgmY71UFHt0lv1cHr0cK4rrtPhQleX8,16262
25
+ tests/test_merge.py,sha256=Q4zCbtZbe0wBfqE56gvAD06c6pDuhqnjKaioFqIgAQw,8331
26
+ tests/test_utils.py,sha256=vpeHMeL7bJQonv5ZEbJmlJikqVKn5VWlVEbvmYFzDYA,1650
27
+ tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
28
+ tests/test_elements_worker/test_classifications.py,sha256=vU6al1THtDSmERyVscMXaqiRPwTllcpRUHyeyBQ8M9U,26417
29
+ tests/test_elements_worker/test_cli.py,sha256=BsFTswLti63WAZ2pf6ipiZKWJJyCQuSfuKnSlESuK8g,2878
30
+ tests/test_elements_worker/test_dataset.py,sha256=hityecntzrldkuBHBWApYDkXSzSySdG3AZXJlM_sCOM,11777
31
+ tests/test_elements_worker/test_elements.py,sha256=6XKtgXSVQJnTSgTHWwEVsAtIwLBapjYjUYPUdjxcHsY,84971
32
+ tests/test_elements_worker/test_entities.py,sha256=yi1mXzvKvNwUNMzo0UZ56YOIJstYHcLyeepPJ8f10MQ,34557
33
+ tests/test_elements_worker/test_metadata.py,sha256=YMYmkUSEp4WKNBm3QLcrg4yn6qVTWQ_aZzSu9Xygr80,18756
34
+ tests/test_elements_worker/test_task.py,sha256=FCpxE9UpouKXgjGvWgNHEai_Hiy2d1YmqRG-_v2s27s,6312
35
+ tests/test_elements_worker/test_training.py,sha256=3PGH6dAc2eSBD7w6ivrt1yAh6sCoici4nuIS9zdw6S8,9476
36
+ tests/test_elements_worker/test_transcriptions.py,sha256=WVJG26sZyY66fu-Eka9A1_WWIeNI2scogjypzURnp8A,73468
37
+ tests/test_elements_worker/test_worker.py,sha256=7-jGJVT3yMGpIyN96Uafz5eIUrO4ieNLgw0k1D8BhGc,17163
38
+ worker-demo/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc,1002
40
+ worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
41
+ worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
42
+ worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
43
+ arkindex_base_worker-0.3.7.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
44
+ arkindex_base_worker-0.3.7.dist-info/METADATA,sha256=AH2_i5Ne_vAPAYdQhlFhJQogSzDuLFtxueFsDMpkbMw,3458
45
+ arkindex_base_worker-0.3.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
46
+ arkindex_base_worker-0.3.7.dist-info/top_level.txt,sha256=58NuslgxQC2vT4DiqZEgO4JqJRrYa2yeNI9QvkbfGQU,40
47
+ arkindex_base_worker-0.3.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,2 +1,4 @@
1
1
  arkindex_worker
2
+ hooks
2
3
  tests
4
+ worker-demo
arkindex_worker/cache.py CHANGED
@@ -374,3 +374,17 @@ def merge_parents_cache(paths: list, current_database: Path):
374
374
  for statement in statements:
375
375
  cursor.execute(statement)
376
376
  connection.commit()
377
+
378
+
379
def unsupported_cache(func):
    """
    Decorator for API helper methods that do not write their results to the cache database.

    The wrapped method is called first; afterwards a warning is logged when the
    instance is not read-only and uses a cache, because the cache was left untouched.

    :param func: Method to wrap. Its instance must expose `is_read_only` and `use_cache`.
    :returns: The wrapped method, which returns whatever `func` returns.
    """
    # Imported locally so this helper does not depend on a module-level `functools` import
    from functools import wraps

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        results = func(self, *args, **kwargs)

        # Warn only when a cache is actually in use and the worker is not read-only:
        # without a cache there is nothing that could have been updated.
        if not self.is_read_only and self.use_cache:
            logger.warning(
                f"This API helper `{func.__name__}` did not update the cache database"
            )

        return results

    return wrapper
arkindex_worker/image.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """
2
2
  Helper methods to download and open IIIF images, and manage polygons.
3
3
  """
4
+
4
5
  import re
5
6
  from collections import namedtuple
6
7
  from io import BytesIO
@@ -20,6 +21,7 @@ from tenacity import (
20
21
  )
21
22
 
22
23
  from arkindex_worker import logger
24
+ from teklia_toolbox.requests import should_verify_cert
23
25
 
24
26
  # Avoid circular imports error when type checking
25
27
  if TYPE_CHECKING:
@@ -114,32 +116,38 @@ def download_image(url: str) -> Image:
114
116
  )
115
117
  else:
116
118
  raise e
117
- except requests.exceptions.SSLError:
118
- logger.warning(
119
- "An SSLError occurred during image download, retrying with a weaker and unsafe SSL configuration"
120
- )
121
-
122
- # Saving current ciphers
123
- previous_ciphers = requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS
124
-
125
- # Downgrading ciphers to download the image
126
- requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL:@SECLEVEL=1"
127
- resp = _retried_request(url)
128
-
129
- # Restoring previous ciphers
130
- requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = previous_ciphers
131
119
 
132
120
  # Preprocess the image and prepare it for classification
133
121
  image = Image.open(BytesIO(resp.content))
134
122
  logger.info(
135
- "Downloaded image {} - size={}x{} in {}".format(
136
- url, image.size[0], image.size[1], resp.elapsed
137
- )
123
+ f"Downloaded image {url} - size={image.size[0]}x{image.size[1]} in {resp.elapsed}"
138
124
  )
139
125
 
140
126
  return image
141
127
 
142
128
 
129
def upload_image(image: Image, url: str) -> requests.Response:
    """
    Upload a Pillow image to a URL.

    The image is encoded as JPEG in memory and sent with an HTTP PUT request.

    :param image: Pillow image to upload.
    :param url: Destination URL.
    :returns: The upload response.
    :raises AssertionError: When the destination URL does not start with `http`.
    """
    assert url.startswith("http"), "Destination URL for the image must be HTTP(S)"

    # Encode the image as JPEG into an in-memory buffer
    buffer = BytesIO()
    image.save(buffer, format="jpeg")

    # Send immutable bytes rather than the stream itself: `_retried_request` may
    # issue the request several times, and a file-like body already read by a
    # failed attempt would be sent empty on the following one.
    resp = _retried_request(url, method=requests.put, data=buffer.getvalue())
    logger.info(f"Uploaded image to {url} in {resp.elapsed}")

    return resp
149
+
150
+
143
151
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
144
152
  """
145
153
  Compute the rectangle bounding box of a polygon.
@@ -167,8 +175,10 @@ def _retry_log(retry_state, *args, **kwargs):
167
175
  before_sleep=_retry_log,
168
176
  reraise=True,
169
177
  )
170
- def _retried_request(url):
171
- resp = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
178
+ def _retried_request(url, *args, method=requests.get, **kwargs):
179
+ resp = method(
180
+ url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
181
+ )
172
182
  resp.raise_for_status()
173
183
  return resp
174
184
 
arkindex_worker/models.py CHANGED
@@ -20,6 +20,8 @@ class MagicDict(dict):
20
20
  Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
21
21
  Allows for nested access: foo.bar.baz
22
22
  """
23
+ if isinstance(item, Dataset):
24
+ return item
23
25
  if isinstance(item, list):
24
26
  return list(map(self._magify, item))
25
27
  if isinstance(item, dict):
@@ -75,10 +77,10 @@ class Element(MagicDict):
75
77
 
76
78
  def image_url(self, size: str = "full") -> str | None:
77
79
  """
78
- Build an URL to access the image.
80
+ Build a URL to access the image.
79
81
  When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
80
82
  :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
81
- :returns: An URL to the image, or None if the element does not have an image.
83
+ :returns: A URL to the image, or None if the element does not have an image.
82
84
  """
83
85
  if not self.get("zone"):
84
86
  return
@@ -272,6 +274,16 @@ class Dataset(ArkindexModel):
272
274
  return f"{self.id}.tar.zst"
273
275
 
274
276
 
277
class Set(MagicDict):
    """
    Represents one set of an Arkindex dataset.
    """

    def __str__(self):
        # The Set ID is not retrieved, so ArkindexModel.__str__ is not used here
        class_name = self.__class__.__name__
        return f"{class_name} ({self.name}) from {self.dataset}"
285
+
286
+
275
287
  class Artifact(ArkindexModel):
276
288
  """
277
289
  Describes an Arkindex artifact.
arkindex_worker/utils.py CHANGED
@@ -10,6 +10,19 @@ import zstandard as zstd
10
10
 
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
+ MANUAL_SOURCE = "manual"
14
+
15
+
16
+ def parse_source_id(value: str) -> bool | str | None:
17
+ """
18
+ Parse a UUID argument (Worker Version, Worker Run, ...) to use it directly in the API.
19
+ Arkindex API filters generally expect `False` to filter manual sources.
20
+ """
21
+ if value == MANUAL_SOURCE:
22
+ return False
23
+ return value or None
24
+
25
+
13
26
  CHUNK_SIZE = 1024
14
27
  """Chunk Size used for ZSTD compression"""
15
28
 
@@ -31,9 +44,10 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
31
44
 
32
45
  logger.debug(f"Uncompressing file to {archive_path}")
33
46
  try:
34
- with compressed_archive.open("rb") as compressed, archive_path.open(
35
- "wb"
36
- ) as decompressed:
47
+ with (
48
+ compressed_archive.open("rb") as compressed,
49
+ archive_path.open("wb") as decompressed,
50
+ ):
37
51
  dctx.copy_stream(compressed, decompressed)
38
52
  logger.debug(f"Successfully uncompressed archive {compressed_archive}")
39
53
  except zstandard.ZstdError as e: