arkindex-base-worker 0.3.6rc5__tar.gz → 0.3.7.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/PKG-INFO +14 -16
  2. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/README.md +1 -1
  3. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_base_worker.egg-info/PKG-INFO +14 -16
  4. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_base_worker.egg-info/SOURCES.txt +7 -5
  5. arkindex_base_worker-0.3.7.post1/arkindex_base_worker.egg-info/requires.txt +16 -0
  6. arkindex_base_worker-0.3.7.post1/arkindex_base_worker.egg-info/top_level.txt +6 -0
  7. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/cache.py +14 -0
  8. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/image.py +29 -19
  9. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/models.py +14 -2
  10. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/utils.py +17 -3
  11. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/__init__.py +122 -125
  12. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/base.py +25 -45
  13. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/classification.py +18 -25
  14. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/dataset.py +24 -18
  15. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/element.py +45 -6
  16. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/entity.py +35 -4
  17. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/metadata.py +21 -11
  18. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/training.py +16 -0
  19. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/transcription.py +45 -5
  20. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/version.py +22 -0
  21. arkindex_base_worker-0.3.7.post1/hooks/pre_gen_project.py +3 -0
  22. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/pyproject.toml +28 -8
  23. arkindex_base_worker-0.3.7.post1/setup.cfg +4 -0
  24. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/conftest.py +15 -7
  25. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_base_worker.py +0 -6
  26. arkindex_base_worker-0.3.7.post1/tests/test_dataset_worker.py +728 -0
  27. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_classifications.py +365 -539
  28. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_cli.py +1 -1
  29. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_dataset.py +97 -116
  30. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_elements.py +227 -61
  31. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_entities.py +22 -2
  32. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_metadata.py +53 -27
  33. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_training.py +35 -0
  34. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_transcriptions.py +149 -16
  35. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_worker.py +19 -6
  36. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_image.py +37 -0
  37. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_utils.py +23 -1
  38. arkindex_base_worker-0.3.7.post1/worker-demo/tests/__init__.py +0 -0
  39. arkindex_base_worker-0.3.7.post1/worker-demo/tests/conftest.py +32 -0
  40. arkindex_base_worker-0.3.7.post1/worker-demo/tests/test_worker.py +12 -0
  41. arkindex_base_worker-0.3.7.post1/worker-demo/worker_demo/__init__.py +6 -0
  42. arkindex_base_worker-0.3.7.post1/worker-demo/worker_demo/worker.py +19 -0
  43. arkindex-base-worker-0.3.6rc5/arkindex_base_worker.egg-info/requires.txt +0 -17
  44. arkindex-base-worker-0.3.6rc5/arkindex_base_worker.egg-info/top_level.txt +0 -2
  45. arkindex-base-worker-0.3.6rc5/docs-requirements.txt +0 -7
  46. arkindex-base-worker-0.3.6rc5/requirements.txt +0 -8
  47. arkindex-base-worker-0.3.6rc5/setup.cfg +0 -8
  48. arkindex-base-worker-0.3.6rc5/setup.py +0 -4
  49. arkindex-base-worker-0.3.6rc5/tests/test_dataset_worker.py +0 -846
  50. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/LICENSE +0 -0
  51. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  52. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/__init__.py +0 -0
  53. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/arkindex_worker/worker/task.py +0 -0
  54. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/__init__.py +0 -0
  55. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_cache.py +0 -0
  56. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_element.py +0 -0
  57. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/__init__.py +0 -0
  58. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_elements_worker/test_task.py +0 -0
  59. {arkindex-base-worker-0.3.6rc5 → arkindex_base_worker-0.3.7.post1}/tests/test_merge.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.6rc5
3
+ Version: 0.3.7.post1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -41,22 +41,20 @@ Classifier: Topic :: Text Processing :: Linguistic
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: arkindex-client==1.0.14
45
- Requires-Dist: peewee==3.17.0
46
- Requires-Dist: Pillow==10.1.0
47
- Requires-Dist: pymdown-extensions==10.3.1
48
- Requires-Dist: python-gnupg==0.5.1
49
- Requires-Dist: shapely==2.0.2
50
- Requires-Dist: tenacity==8.2.3
44
+ Requires-Dist: peewee~=3.17
45
+ Requires-Dist: Pillow==10.3.0
46
+ Requires-Dist: python-gnupg==0.5.2
47
+ Requires-Dist: shapely==2.0.3
48
+ Requires-Dist: teklia-toolbox==0.1.5
51
49
  Requires-Dist: zstandard==0.22.0
52
50
  Provides-Extra: docs
53
- Requires-Dist: black==23.11.0; extra == "docs"
54
- Requires-Dist: doc8==1.1.1; extra == "docs"
55
- Requires-Dist: mkdocs==1.5.3; extra == "docs"
56
- Requires-Dist: mkdocs-material==9.4.8; extra == "docs"
57
- Requires-Dist: mkdocstrings==0.23.0; extra == "docs"
58
- Requires-Dist: mkdocstrings-python==1.7.3; extra == "docs"
59
- Requires-Dist: recommonmark==0.7.1; extra == "docs"
51
+ Requires-Dist: black==24.4.0; extra == "docs"
52
+ Requires-Dist: mkdocs-material==9.5.17; extra == "docs"
53
+ Requires-Dist: mkdocstrings-python==1.9.2; extra == "docs"
54
+ Provides-Extra: tests
55
+ Requires-Dist: pytest==8.1.1; extra == "tests"
56
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
57
+ Requires-Dist: pytest-responses==0.5.1; extra == "tests"
60
58
 
61
59
  # Arkindex base Worker
62
60
 
@@ -70,7 +68,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
70
68
 
71
69
  ## Create a new worker using our template
72
70
 
73
- ```
71
+ ```shell
74
72
  pip install --user cookiecutter
75
73
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
76
74
  ```
@@ -10,7 +10,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
10
10
 
11
11
  ## Create a new worker using our template
12
12
 
13
- ```
13
+ ```shell
14
14
  pip install --user cookiecutter
15
15
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
16
16
  ```
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.6rc5
3
+ Version: 0.3.7.post1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -41,22 +41,20 @@ Classifier: Topic :: Text Processing :: Linguistic
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: arkindex-client==1.0.14
45
- Requires-Dist: peewee==3.17.0
46
- Requires-Dist: Pillow==10.1.0
47
- Requires-Dist: pymdown-extensions==10.3.1
48
- Requires-Dist: python-gnupg==0.5.1
49
- Requires-Dist: shapely==2.0.2
50
- Requires-Dist: tenacity==8.2.3
44
+ Requires-Dist: peewee~=3.17
45
+ Requires-Dist: Pillow==10.3.0
46
+ Requires-Dist: python-gnupg==0.5.2
47
+ Requires-Dist: shapely==2.0.3
48
+ Requires-Dist: teklia-toolbox==0.1.5
51
49
  Requires-Dist: zstandard==0.22.0
52
50
  Provides-Extra: docs
53
- Requires-Dist: black==23.11.0; extra == "docs"
54
- Requires-Dist: doc8==1.1.1; extra == "docs"
55
- Requires-Dist: mkdocs==1.5.3; extra == "docs"
56
- Requires-Dist: mkdocs-material==9.4.8; extra == "docs"
57
- Requires-Dist: mkdocstrings==0.23.0; extra == "docs"
58
- Requires-Dist: mkdocstrings-python==1.7.3; extra == "docs"
59
- Requires-Dist: recommonmark==0.7.1; extra == "docs"
51
+ Requires-Dist: black==24.4.0; extra == "docs"
52
+ Requires-Dist: mkdocs-material==9.5.17; extra == "docs"
53
+ Requires-Dist: mkdocstrings-python==1.9.2; extra == "docs"
54
+ Provides-Extra: tests
55
+ Requires-Dist: pytest==8.1.1; extra == "tests"
56
+ Requires-Dist: pytest-mock==3.14.0; extra == "tests"
57
+ Requires-Dist: pytest-responses==0.5.1; extra == "tests"
60
58
 
61
59
  # Arkindex base Worker
62
60
 
@@ -70,7 +68,7 @@ The [documentation](https://workers.arkindex.org/) is made with [Material for Mk
70
68
 
71
69
  ## Create a new worker using our template
72
70
 
73
- ```
71
+ ```shell
74
72
  pip install --user cookiecutter
75
73
  cookiecutter git@gitlab.teklia.com:workers/base-worker.git
76
74
  ```
@@ -1,10 +1,6 @@
1
1
  LICENSE
2
2
  README.md
3
- docs-requirements.txt
4
3
  pyproject.toml
5
- requirements.txt
6
- setup.cfg
7
- setup.py
8
4
  arkindex_base_worker.egg-info/PKG-INFO
9
5
  arkindex_base_worker.egg-info/SOURCES.txt
10
6
  arkindex_base_worker.egg-info/dependency_links.txt
@@ -26,6 +22,7 @@ arkindex_worker/worker/task.py
26
22
  arkindex_worker/worker/training.py
27
23
  arkindex_worker/worker/transcription.py
28
24
  arkindex_worker/worker/version.py
25
+ hooks/pre_gen_project.py
29
26
  tests/__init__.py
30
27
  tests/conftest.py
31
28
  tests/test_base_worker.py
@@ -45,4 +42,9 @@ tests/test_elements_worker/test_metadata.py
45
42
  tests/test_elements_worker/test_task.py
46
43
  tests/test_elements_worker/test_training.py
47
44
  tests/test_elements_worker/test_transcriptions.py
48
- tests/test_elements_worker/test_worker.py
45
+ tests/test_elements_worker/test_worker.py
46
+ worker-demo/tests/__init__.py
47
+ worker-demo/tests/conftest.py
48
+ worker-demo/tests/test_worker.py
49
+ worker-demo/worker_demo/__init__.py
50
+ worker-demo/worker_demo/worker.py
@@ -0,0 +1,16 @@
1
+ peewee~=3.17
2
+ Pillow==10.3.0
3
+ python-gnupg==0.5.2
4
+ shapely==2.0.3
5
+ teklia-toolbox==0.1.5
6
+ zstandard==0.22.0
7
+
8
+ [docs]
9
+ black==24.4.0
10
+ mkdocs-material==9.5.17
11
+ mkdocstrings-python==1.9.2
12
+
13
+ [tests]
14
+ pytest==8.1.1
15
+ pytest-mock==3.14.0
16
+ pytest-responses==0.5.1
@@ -0,0 +1,6 @@
1
+ arkindex_worker
2
+ dist
3
+ docs
4
+ hooks
5
+ tests
6
+ worker-demo
@@ -374,3 +374,17 @@ def merge_parents_cache(paths: list, current_database: Path):
374
374
  for statement in statements:
375
375
  cursor.execute(statement)
376
376
  connection.commit()
377
+
378
+
379
+ def unsupported_cache(func):
380
+ def wrapper(self, *args, **kwargs):
381
+ results = func(self, *args, **kwargs)
382
+
383
+ if not (self.is_read_only or self.use_cache):
384
+ logger.warning(
385
+ f"This API helper `{func.__name__}` did not update the cache database"
386
+ )
387
+
388
+ return results
389
+
390
+ return wrapper
@@ -1,6 +1,7 @@
1
1
  """
2
2
  Helper methods to download and open IIIF images, and manage polygons.
3
3
  """
4
+
4
5
  import re
5
6
  from collections import namedtuple
6
7
  from io import BytesIO
@@ -20,6 +21,7 @@ from tenacity import (
20
21
  )
21
22
 
22
23
  from arkindex_worker import logger
24
+ from teklia_toolbox.requests import should_verify_cert
23
25
 
24
26
  # Avoid circular imports error when type checking
25
27
  if TYPE_CHECKING:
@@ -114,32 +116,38 @@ def download_image(url: str) -> Image:
114
116
  )
115
117
  else:
116
118
  raise e
117
- except requests.exceptions.SSLError:
118
- logger.warning(
119
- "An SSLError occurred during image download, retrying with a weaker and unsafe SSL configuration"
120
- )
121
-
122
- # Saving current ciphers
123
- previous_ciphers = requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS
124
-
125
- # Downgrading ciphers to download the image
126
- requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL:@SECLEVEL=1"
127
- resp = _retried_request(url)
128
-
129
- # Restoring previous ciphers
130
- requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = previous_ciphers
131
119
 
132
120
  # Preprocess the image and prepare it for classification
133
121
  image = Image.open(BytesIO(resp.content))
134
122
  logger.info(
135
- "Downloaded image {} - size={}x{} in {}".format(
136
- url, image.size[0], image.size[1], resp.elapsed
137
- )
123
+ f"Downloaded image {url} - size={image.size[0]}x{image.size[1]} in {resp.elapsed}"
138
124
  )
139
125
 
140
126
  return image
141
127
 
142
128
 
129
+ def upload_image(image: Image, url: str) -> requests.Response:
130
+ """
131
+ Upload a Pillow image to a URL.
132
+
133
+ :param image: Pillow image to upload.
134
+ :param url: Destination URL.
135
+ :returns: The upload response.
136
+ """
137
+ assert url.startswith("http"), "Destination URL for the image must be HTTP(S)"
138
+
139
+ # Retrieve a binarized version of the image
140
+ image_bytes = BytesIO()
141
+ image.save(image_bytes, format="jpeg")
142
+ image_bytes.seek(0)
143
+
144
+ # Upload the image
145
+ resp = _retried_request(url, method=requests.put, data=image_bytes)
146
+ logger.info(f"Uploaded image to {url} in {resp.elapsed}")
147
+
148
+ return resp
149
+
150
+
143
151
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
144
152
  """
145
153
  Compute the rectangle bounding box of a polygon.
@@ -167,8 +175,10 @@ def _retry_log(retry_state, *args, **kwargs):
167
175
  before_sleep=_retry_log,
168
176
  reraise=True,
169
177
  )
170
- def _retried_request(url):
171
- resp = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
178
+ def _retried_request(url, *args, method=requests.get, **kwargs):
179
+ resp = method(
180
+ url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
181
+ )
172
182
  resp.raise_for_status()
173
183
  return resp
174
184
 
@@ -20,6 +20,8 @@ class MagicDict(dict):
20
20
  Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
21
21
  Allows for nested access: foo.bar.baz
22
22
  """
23
+ if isinstance(item, Dataset):
24
+ return item
23
25
  if isinstance(item, list):
24
26
  return list(map(self._magify, item))
25
27
  if isinstance(item, dict):
@@ -75,10 +77,10 @@ class Element(MagicDict):
75
77
 
76
78
  def image_url(self, size: str = "full") -> str | None:
77
79
  """
78
- Build an URL to access the image.
80
+ Build a URL to access the image.
79
81
  When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
80
82
  :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
81
- :returns: An URL to the image, or None if the element does not have an image.
83
+ :returns: A URL to the image, or None if the element does not have an image.
82
84
  """
83
85
  if not self.get("zone"):
84
86
  return
@@ -272,6 +274,16 @@ class Dataset(ArkindexModel):
272
274
  return f"{self.id}.tar.zst"
273
275
 
274
276
 
277
+ class Set(MagicDict):
278
+ """
279
+ Describes an Arkindex dataset set.
280
+ """
281
+
282
+ def __str__(self):
283
+ # Not using ArkindexModel.__str__ as we do not retrieve the Set ID
284
+ return f"{self.__class__.__name__} ({self.name}) from {self.dataset}"
285
+
286
+
275
287
  class Artifact(ArkindexModel):
276
288
  """
277
289
  Describes an Arkindex artifact.
@@ -10,6 +10,19 @@ import zstandard as zstd
10
10
 
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
+ MANUAL_SOURCE = "manual"
14
+
15
+
16
+ def parse_source_id(value: str) -> bool | str | None:
17
+ """
18
+ Parse a UUID argument (Worker Version, Worker Run, ...) to use it directly in the API.
19
+ Arkindex API filters generally expect `False` to filter manual sources.
20
+ """
21
+ if value == MANUAL_SOURCE:
22
+ return False
23
+ return value or None
24
+
25
+
13
26
  CHUNK_SIZE = 1024
14
27
  """Chunk Size used for ZSTD compression"""
15
28
 
@@ -31,9 +44,10 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:
31
44
 
32
45
  logger.debug(f"Uncompressing file to {archive_path}")
33
46
  try:
34
- with compressed_archive.open("rb") as compressed, archive_path.open(
35
- "wb"
36
- ) as decompressed:
47
+ with (
48
+ compressed_archive.open("rb") as compressed,
49
+ archive_path.open("wb") as decompressed,
50
+ ):
37
51
  dctx.copy_stream(compressed, decompressed)
38
52
  logger.debug(f"Successfully uncompressed archive {compressed_archive}")
39
53
  except zstandard.ZstdError as e: