arkindex-base-worker 0.5.1b4__tar.gz → 0.5.1.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/PKG-INFO +5 -6
  2. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_base_worker.egg-info/PKG-INFO +5 -6
  3. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_base_worker.egg-info/requires.txt +4 -5
  4. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/cache.py +6 -1
  5. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/image.py +5 -1
  6. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/models.py +5 -0
  7. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/utils.py +27 -0
  8. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/base.py +21 -12
  9. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/metadata.py +3 -3
  10. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/pyproject.toml +5 -6
  11. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/conftest.py +0 -18
  12. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_base_worker.py +33 -123
  13. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_cache.py +1 -1
  14. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_element.py +52 -12
  15. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_image.py +19 -3
  16. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_utils.py +42 -0
  17. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/LICENSE +0 -0
  18. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/README.md +0 -0
  19. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
  20. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  21. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  22. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/__init__.py +0 -0
  23. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/__init__.py +0 -0
  24. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/classification.py +0 -0
  25. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/corpus.py +0 -0
  26. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/dataset.py +0 -0
  27. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/element.py +0 -0
  28. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/entity.py +0 -0
  29. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/image.py +0 -0
  30. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/process.py +0 -0
  31. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/task.py +0 -0
  32. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/training.py +0 -0
  33. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/arkindex_worker/worker/transcription.py +0 -0
  34. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/examples/standalone/python/worker.py +0 -0
  35. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/examples/tooled/python/worker.py +0 -0
  36. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/hooks/pre_gen_project.py +0 -0
  37. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/setup.cfg +0 -0
  38. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/__init__.py +0 -0
  39. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_dataset_worker.py +0 -0
  40. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/__init__.py +0 -0
  41. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_classification.py +0 -0
  42. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_cli.py +0 -0
  43. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_corpus.py +0 -0
  44. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_dataset.py +0 -0
  45. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_element.py +0 -0
  46. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_element_create_multiple.py +0 -0
  47. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_element_create_single.py +0 -0
  48. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_element_list_children.py +0 -0
  49. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_element_list_parents.py +0 -0
  50. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_entity.py +0 -0
  51. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_image.py +0 -0
  52. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_metadata.py +0 -0
  53. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_process.py +0 -0
  54. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_task.py +0 -0
  55. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_training.py +0 -0
  56. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_transcription_create.py +0 -0
  57. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_transcription_create_with_elements.py +0 -0
  58. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_transcription_list.py +0 -0
  59. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_elements_worker/test_worker.py +0 -0
  60. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_merge.py +0 -0
  61. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/tests/test_modern_config.py +0 -0
  62. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/worker-demo/tests/__init__.py +0 -0
  63. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/worker-demo/tests/conftest.py +0 -0
  64. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/worker-demo/tests/test_worker.py +0 -0
  65. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/worker-demo/worker_demo/__init__.py +0 -0
  66. {arkindex_base_worker-0.5.1b4 → arkindex_base_worker-0.5.1.post1}/worker-demo/worker_demo/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arkindex-base-worker
3
- Version: 0.5.1b4
3
+ Version: 0.5.1.post1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: humanize==4.12.3
44
+ Requires-Dist: humanize==4.14.0
45
45
  Requires-Dist: peewee~=3.17
46
46
  Requires-Dist: Pillow==11.3.0
47
- Requires-Dist: python-gnupg==0.5.4
47
+ Requires-Dist: python-gnupg==0.5.5
48
48
  Requires-Dist: shapely==2.0.6
49
49
  Requires-Dist: teklia-toolbox==0.1.11
50
- Requires-Dist: zstandard==0.23.0
50
+ Requires-Dist: zstandard==0.25.0
51
51
  Provides-Extra: tests
52
- Requires-Dist: pytest==8.3.5; extra == "tests"
53
- Requires-Dist: pytest-mock==3.14.0; extra == "tests"
52
+ Requires-Dist: pytest-mock==3.15.1; extra == "tests"
54
53
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
55
54
  Dynamic: license-file
56
55
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arkindex-base-worker
3
- Version: 0.5.1b4
3
+ Version: 0.5.1.post1
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -41,16 +41,15 @@ Classifier: Programming Language :: Python :: 3.12
41
41
  Requires-Python: >=3.10
42
42
  Description-Content-Type: text/markdown
43
43
  License-File: LICENSE
44
- Requires-Dist: humanize==4.12.3
44
+ Requires-Dist: humanize==4.14.0
45
45
  Requires-Dist: peewee~=3.17
46
46
  Requires-Dist: Pillow==11.3.0
47
- Requires-Dist: python-gnupg==0.5.4
47
+ Requires-Dist: python-gnupg==0.5.5
48
48
  Requires-Dist: shapely==2.0.6
49
49
  Requires-Dist: teklia-toolbox==0.1.11
50
- Requires-Dist: zstandard==0.23.0
50
+ Requires-Dist: zstandard==0.25.0
51
51
  Provides-Extra: tests
52
- Requires-Dist: pytest==8.3.5; extra == "tests"
53
- Requires-Dist: pytest-mock==3.14.0; extra == "tests"
52
+ Requires-Dist: pytest-mock==3.15.1; extra == "tests"
54
53
  Requires-Dist: pytest-responses==0.5.1; extra == "tests"
55
54
  Dynamic: license-file
56
55
 
@@ -1,12 +1,11 @@
1
- humanize==4.12.3
1
+ humanize==4.14.0
2
2
  peewee~=3.17
3
3
  Pillow==11.3.0
4
- python-gnupg==0.5.4
4
+ python-gnupg==0.5.5
5
5
  shapely==2.0.6
6
6
  teklia-toolbox==0.1.11
7
- zstandard==0.23.0
7
+ zstandard==0.25.0
8
8
 
9
9
  [tests]
10
- pytest==8.3.5
11
- pytest-mock==3.14.0
10
+ pytest-mock==3.15.1
12
11
  pytest-responses==0.5.1
@@ -73,6 +73,7 @@ class CachedImage(Model):
73
73
  width = IntegerField()
74
74
  height = IntegerField()
75
75
  url = TextField()
76
+ version = IntegerField(default=2)
76
77
 
77
78
  class Meta:
78
79
  database = db
@@ -157,6 +158,10 @@ class CachedElement(Model):
157
158
  else:
158
159
  resize = f"{max_width or ''},{max_height or ''}"
159
160
 
161
+ # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
162
+ if self.image.version == 3 and resize == "full":
163
+ resize = "max"
164
+
160
165
  url = self.image.url
161
166
  if not url.endswith("/"):
162
167
  url += "/"
@@ -259,7 +264,7 @@ MODELS = [
259
264
  CachedDataset,
260
265
  CachedDatasetElement,
261
266
  ]
262
- SQL_VERSION = 4
267
+ SQL_VERSION = 5
263
268
 
264
269
 
265
270
  def init_cache_db(path: Path):
@@ -366,6 +366,10 @@ def download_tiles(url: str) -> Image:
366
366
  logger.debug("Downloading image information")
367
367
  info = _retried_request(url + "info.json").json()
368
368
 
369
+ # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
370
+ # With IIIF 3, the image's ID will be at `id`, while IIIF 2 will use `@id`
371
+ resize = "max" if "id" in info else "full"
372
+
369
373
  image_width, image_height = info.get("width"), info.get("height")
370
374
  assert image_width and image_height, "Missing image dimensions in info.json"
371
375
  assert info.get("tiles"), (
@@ -391,7 +395,7 @@ def download_tiles(url: str) -> Image:
391
395
 
392
396
  logger.debug(f"Downloading tile {tile_x},{tile_y}")
393
397
  resp = _retried_request(
394
- f"{url}{region_x},{region_y},{region_width},{region_height}/full/0/default.jpg"
398
+ f"{url}{region_x},{region_y},{region_width},{region_height}/{resize}/0/default.jpg"
395
399
  )
396
400
 
397
401
  tile_img = Image.open(BytesIO(resp.content))
@@ -87,6 +87,11 @@ class Element(MagicDict):
87
87
  url = self.zone.image.get("s3_url")
88
88
  if url:
89
89
  return url
90
+
91
+ # Use `max` instead of `full` for IIIF 3, since `full` was deprecated in 2.1 then removed in 3.0
92
+ if self.zone.image.server.get("version", 2) == 3 and size == "full":
93
+ size = "max"
94
+
90
95
  url = self.zone.image.url
91
96
  if not url.endswith("/"):
92
97
  url += "/"
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import tarfile
6
6
  import tempfile
7
+ import zipfile
7
8
  from collections.abc import Callable, Generator
8
9
  from itertools import islice
9
10
  from pathlib import Path
@@ -225,6 +226,32 @@ def create_tar_zst_archive(
225
226
  return zst_fd, zst_archive, zst_hash, tar_hash
226
227
 
227
228
 
229
+ def create_zip_archive(source: Path, destination: Path | None = None) -> Path:
230
+ """Helper to create a ZIP archive from a source folder.
231
+
232
+ :param source: Path to the folder whose content should be archived.
233
+ :param destination: Path to the created archive, defaults to None. If unspecified, a temporary file will be created.
234
+ :return: The file descriptor of the created tempfile (if one was created), path to the archive.
235
+ """
236
+ # Parse destination and create a tmpfile if none was specified
237
+ file_d, destination = (
238
+ tempfile.mkstemp(prefix="teklia-", suffix=".zip")
239
+ if destination is None
240
+ else (None, destination)
241
+ )
242
+ destination = Path(destination)
243
+ logger.debug(f"Compressing file to {destination}")
244
+
245
+ with zipfile.ZipFile(
246
+ destination, mode="w", compression=zipfile.ZIP_DEFLATED
247
+ ) as archive:
248
+ for p in source.rglob("*"):
249
+ relpath = p.relative_to(source)
250
+ archive.write(p, arcname=relpath)
251
+
252
+ return archive, destination
253
+
254
+
228
255
  DEFAULT_BATCH_SIZE = 50
229
256
  """Batch size used for bulk publication to Arkindex"""
230
257
 
@@ -265,6 +265,11 @@ class BaseWorker:
265
265
  if not item["secret"]:
266
266
  return (item["key"], item["value"])
267
267
 
268
+ # The secret may not be picked by the user
269
+ if item["value"] is None:
270
+ logger.info(f"Optional secret `{item['key']}` is not set")
271
+ return (item["key"], None)
272
+
268
273
  # Load secret, only available in Arkindex EE
269
274
  try:
270
275
  secret = self.load_secret(Path(item["value"]))
@@ -276,6 +281,19 @@ class BaseWorker:
276
281
 
277
282
  return (item["key"], secret)
278
283
 
284
+ # Load model version configuration when available
285
+ # Workers will use model version ID and details to download the model
286
+ model_version = worker_run.get("model_version")
287
+ if model_version:
288
+ logger.info("Loaded model version configuration from WorkerRun")
289
+ self.model_configuration.update(model_version["configuration"])
290
+
291
+ # Set model_version ID as worker attribute
292
+ self.model_version_id = model_version["id"]
293
+
294
+ # Set model details as worker attribute
295
+ self.model_details = model_version["model"]
296
+
279
297
  # Load worker run information
280
298
  try:
281
299
  config = self.api_client.request(
@@ -295,6 +313,9 @@ class BaseWorker:
295
313
  }
296
314
  logger.info("Using modern configuration")
297
315
 
316
+ # Reset the model configuration to make sure workers rely on the single new source
317
+ self.model_configuration = {}
318
+
298
319
  return # Stop here once we have modern configuration
299
320
 
300
321
  except ErrorResponse as e:
@@ -303,18 +324,6 @@ class BaseWorker:
303
324
  logger.info("Modern configuration is not available")
304
325
 
305
326
  # Use old-style configuration with local merge
306
- # Load model version configuration when available
307
- model_version = worker_run.get("model_version")
308
- if model_version:
309
- logger.info("Loaded model version configuration from WorkerRun")
310
- self.model_configuration.update(model_version["configuration"])
311
-
312
- # Set model_version ID as worker attribute
313
- self.model_version_id = model_version["id"]
314
-
315
- # Set model details as worker attribute
316
- self.model_details = model_version["model"]
317
-
318
327
  # Retrieve initial configuration from API
319
328
  self.config = worker_version["configuration"].get("configuration", {})
320
329
  if "user_configuration" in worker_version["configuration"]:
@@ -20,10 +20,10 @@ class MetaType(Enum):
20
20
  A regular string with no special interpretation.
21
21
  """
22
22
 
23
- HTML = "html"
23
+ Markdown = "markdown"
24
24
  """
25
- A metadata with a string value that should be interpreted as HTML content.
26
- The allowed HTML tags are restricted for security reasons.
25
+ A metadata with a string value that should be interpreted as Markdown content.
26
+ HTML is allowed, but the allowed HTML tags are restricted for security reasons.
27
27
  """
28
28
 
29
29
  Date = "date"
@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arkindex-base-worker"
7
- version = "0.5.1b4"
7
+ version = "0.5.1post1"
8
8
  description = "Base Worker to easily build Arkindex ML workflows"
9
9
  license = { file = "LICENSE" }
10
10
  dependencies = [
11
- "humanize==4.12.3",
11
+ "humanize==4.14.0",
12
12
  "peewee~=3.17",
13
13
  "Pillow==11.3.0",
14
- "python-gnupg==0.5.4",
14
+ "python-gnupg==0.5.5",
15
15
  "shapely==2.0.6",
16
16
  "teklia-toolbox==0.1.11",
17
- "zstandard==0.23.0",
17
+ "zstandard==0.25.0",
18
18
  ]
19
19
  authors = [
20
20
  { name = "Teklia", email = "contact@teklia.com" },
@@ -44,8 +44,7 @@ Authors = "https://teklia.com"
44
44
 
45
45
  [project.optional-dependencies]
46
46
  tests = [
47
- "pytest==8.3.5",
48
- "pytest-mock==3.14.0",
47
+ "pytest-mock==3.15.1",
49
48
  "pytest-responses==0.5.1",
50
49
  ]
51
50
 
@@ -103,12 +103,6 @@ def _mock_worker_run_api(responses):
103
103
  payload = {
104
104
  "id": "56785678-5678-5678-5678-567856785678",
105
105
  "parents": [],
106
- "worker": {
107
- "id": "deadbeef-1234-5678-1234-worker",
108
- "name": "Fake worker",
109
- "slug": "fake_worker",
110
- "type": "classifier",
111
- },
112
106
  "worker_version": {
113
107
  "id": "12341234-1234-1234-1234-123412341234",
114
108
  "configuration": {
@@ -180,12 +174,6 @@ def _mock_worker_run_no_revision_api(responses):
180
174
  payload = {
181
175
  "id": "56785678-5678-5678-5678-567856785678",
182
176
  "parents": [],
183
- "worker": {
184
- "id": "deadbeef-1234-5678-1234-worker",
185
- "name": "Fake worker",
186
- "slug": "fake_worker",
187
- "type": "classifier",
188
- },
189
177
  "worker_version": {
190
178
  "id": "12341234-1234-1234-1234-123412341234",
191
179
  "configuration": {
@@ -355,12 +343,6 @@ def mock_elements_worker_consume_wa(monkeypatch, responses, mock_elements_worker
355
343
  json={
356
344
  "id": "56785678-5678-5678-5678-567856785678",
357
345
  "parents": [],
358
- "worker": {
359
- "id": "deadbeef-1234-5678-1234-worker",
360
- "name": "Fake worker",
361
- "slug": "fake_worker",
362
- "type": "classifier",
363
- },
364
346
  "worker_version": {
365
347
  "id": "12341234-1234-1234-1234-123412341234",
366
348
  "configuration": {
@@ -13,6 +13,29 @@ from arkindex_worker.worker import BaseWorker, ElementsWorker
13
13
  from arkindex_worker.worker.base import ExtrasDirNotFoundError
14
14
  from tests import CORPUS_ID, FIXTURES_DIR
15
15
 
16
+ SIMPLE_PAYLOAD = {
17
+ "id": "56785678-5678-5678-5678-567856785678",
18
+ "parents": [],
19
+ "worker_version": {
20
+ "id": "12341234-1234-1234-1234-123412341234",
21
+ "worker": {
22
+ "id": "deadbeef-1234-5678-1234-worker",
23
+ "name": "Fake worker",
24
+ "slug": "fake_worker",
25
+ "type": "classifier",
26
+ },
27
+ "revision": {"hash": "deadbeef1234"},
28
+ "configuration": {"configuration": {}},
29
+ },
30
+ "configuration": None,
31
+ "model_version": None,
32
+ "process": {
33
+ "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
34
+ "corpus": CORPUS_ID,
35
+ },
36
+ "summary": "Worker Fake worker @ 123412",
37
+ }
38
+
16
39
 
17
40
  def test_init_default_local_share():
18
41
  worker = BaseWorker()
@@ -149,38 +172,13 @@ def test_configure_worker_run(mocker, responses, caplog):
149
172
 
150
173
  worker = BaseWorker()
151
174
  mocker.patch.object(sys, "argv", ["worker"])
152
- user_configuration = {
153
- "id": "bbbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb",
154
- "name": "BBB",
155
- "configuration": {"a": "b"},
156
- }
157
175
  payload = {
158
- "id": "56785678-5678-5678-5678-567856785678",
159
- "parents": [],
160
- "worker": {
161
- "id": "deadbeef-1234-5678-1234-worker",
162
- "name": "Fake worker",
163
- "slug": "fake_worker",
164
- "type": "classifier",
165
- },
166
- "worker_version": {
167
- "id": "12341234-1234-1234-1234-123412341234",
168
- "worker": {
169
- "id": "deadbeef-1234-5678-1234-worker",
170
- "name": "Fake worker",
171
- "slug": "fake_worker",
172
- "type": "classifier",
173
- },
174
- "revision": {"hash": "deadbeef1234"},
175
- "configuration": {"configuration": {}},
176
- },
177
- "configuration": user_configuration,
178
- "model_version": None,
179
- "process": {
180
- "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
181
- "corpus": CORPUS_ID,
176
+ **SIMPLE_PAYLOAD,
177
+ "configuration": {
178
+ "id": "bbbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb",
179
+ "name": "BBB",
180
+ "configuration": {"a": "b"},
182
181
  },
183
- "summary": "Worker Fake worker @ 123412",
184
182
  }
185
183
 
186
184
  responses.add(
@@ -262,8 +260,7 @@ def test_configure_user_configuration_defaults(mocker, responses):
262
260
  worker.args = worker.parser.parse_args()
263
261
 
264
262
  payload = {
265
- "id": "56785678-5678-5678-5678-567856785678",
266
- "parents": [],
263
+ **SIMPLE_PAYLOAD,
267
264
  "worker_version": {
268
265
  "id": "12341234-1234-1234-1234-123412341234",
269
266
  "worker": {
@@ -293,12 +290,6 @@ def test_configure_user_configuration_defaults(mocker, responses):
293
290
  "param_5": True,
294
291
  },
295
292
  },
296
- "model_version": None,
297
- "process": {
298
- "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
299
- "corpus": CORPUS_ID,
300
- },
301
- "summary": "Worker Fake worker @ 123412",
302
293
  }
303
294
  responses.add(
304
295
  responses.GET,
@@ -340,30 +331,12 @@ def test_configure_user_config_debug(mocker, responses, debug):
340
331
  mocker.patch.object(sys, "argv", ["worker"])
341
332
  assert logger.level == logging.NOTSET
342
333
  payload = {
343
- "id": "56785678-5678-5678-5678-567856785678",
344
- "parents": [],
345
- "worker_version": {
346
- "id": "12341234-1234-1234-1234-123412341234",
347
- "worker": {
348
- "id": "deadbeef-1234-5678-1234-worker",
349
- "name": "Fake worker",
350
- "slug": "fake_worker",
351
- "type": "classifier",
352
- },
353
- "revision": {"hash": "deadbeef1234"},
354
- "configuration": {"configuration": {}},
355
- },
356
- "model_version": None,
334
+ **SIMPLE_PAYLOAD,
357
335
  "configuration": {
358
336
  "id": "af0daaf4-983e-4703-a7ed-a10f146d6684",
359
337
  "name": "BBB",
360
338
  "configuration": {"debug": debug},
361
339
  },
362
- "process": {
363
- "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
364
- "corpus": CORPUS_ID,
365
- },
366
- "summary": "Worker Fake worker @ 123412",
367
340
  }
368
341
  responses.add(
369
342
  responses.GET,
@@ -393,32 +366,8 @@ def test_configure_worker_run_missing_conf(mocker, responses):
393
366
  mocker.patch.object(sys, "argv", ["worker"])
394
367
 
395
368
  payload = {
396
- "id": "56785678-5678-5678-5678-567856785678",
397
- "parents": [],
398
- "worker": {
399
- "id": "deadbeef-1234-5678-1234-worker",
400
- "name": "Fake worker",
401
- "slug": "fake_worker",
402
- "type": "classifier",
403
- },
404
- "worker_version": {
405
- "id": "12341234-1234-1234-1234-123412341234",
406
- "worker": {
407
- "id": "deadbeef-1234-5678-1234-worker",
408
- "name": "Fake worker",
409
- "slug": "fake_worker",
410
- "type": "classifier",
411
- },
412
- "revision": {"hash": "deadbeef1234"},
413
- "configuration": {"configuration": {}},
414
- },
415
- "model_version": None,
369
+ **SIMPLE_PAYLOAD,
416
370
  "configuration": {"id": "bbbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", "name": "BBB"},
417
- "process": {
418
- "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
419
- "corpus": CORPUS_ID,
420
- },
421
- "summary": "Worker Fake worker @ 123412",
422
371
  }
423
372
  responses.add(
424
373
  responses.GET,
@@ -446,28 +395,7 @@ def test_configure_worker_run_no_worker_run_conf(mocker, responses):
446
395
  worker = BaseWorker()
447
396
  mocker.patch.object(sys, "argv", ["worker"])
448
397
 
449
- payload = {
450
- "id": "56785678-5678-5678-5678-567856785678",
451
- "parents": [],
452
- "worker_version": {
453
- "id": "12341234-1234-1234-1234-123412341234",
454
- "worker": {
455
- "id": "deadbeef-1234-5678-1234-worker",
456
- "name": "Fake worker",
457
- "slug": "fake_worker",
458
- "type": "classifier",
459
- },
460
- "revision": {"hash": "deadbeef1234"},
461
- "configuration": {},
462
- },
463
- "model_version": None,
464
- "configuration": None,
465
- "process": {
466
- "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
467
- "corpus": CORPUS_ID,
468
- },
469
- "summary": "Worker Fake worker @ 123412",
470
- }
398
+ payload = SIMPLE_PAYLOAD
471
399
  responses.add(
472
400
  responses.GET,
473
401
  "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
@@ -491,20 +419,7 @@ def test_configure_load_model_configuration(mocker, responses):
491
419
  worker = BaseWorker()
492
420
  mocker.patch.object(sys, "argv", ["worker"])
493
421
  payload = {
494
- "id": "56785678-5678-5678-5678-567856785678",
495
- "parents": [],
496
- "worker_version": {
497
- "id": "12341234-1234-1234-1234-123412341234",
498
- "worker": {
499
- "id": "deadbeef-1234-5678-1234-worker",
500
- "name": "Fake worker",
501
- "slug": "fake_worker",
502
- "type": "classifier",
503
- },
504
- "revision": {"hash": "deadbeef1234"},
505
- "configuration": {"configuration": {}},
506
- },
507
- "configuration": None,
422
+ **SIMPLE_PAYLOAD,
508
423
  "model_version": {
509
424
  "id": "12341234-1234-1234-1234-123412341234",
510
425
  "model": {
@@ -517,11 +432,6 @@ def test_configure_load_model_configuration(mocker, responses):
517
432
  "param3": None,
518
433
  },
519
434
  },
520
- "process": {
521
- "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
522
- "corpus": CORPUS_ID,
523
- },
524
- "summary": "Worker Fake worker @ 123412",
525
435
  }
526
436
 
527
437
  responses.add(
@@ -60,7 +60,7 @@ def test_create_tables(tmp_path):
60
60
  CREATE TABLE "dataset_elements" ("id" TEXT NOT NULL PRIMARY KEY, "element_id" TEXT NOT NULL, "dataset_id" TEXT NOT NULL, "set_name" VARCHAR(255) NOT NULL, FOREIGN KEY ("element_id") REFERENCES "elements" ("id"), FOREIGN KEY ("dataset_id") REFERENCES "datasets" ("id"))
61
61
  CREATE TABLE "datasets" ("id" TEXT NOT NULL PRIMARY KEY, "name" VARCHAR(255) NOT NULL, "state" VARCHAR(255) NOT NULL DEFAULT 'open', "sets" TEXT NOT NULL)
62
62
  CREATE TABLE "elements" ("id" TEXT NOT NULL PRIMARY KEY, "parent_id" TEXT, "type" VARCHAR(50) NOT NULL, "image_id" TEXT, "polygon" text, "rotation_angle" INTEGER NOT NULL, "mirrored" INTEGER NOT NULL, "initial" INTEGER NOT NULL, "worker_version_id" TEXT, "worker_run_id" TEXT, "confidence" REAL, FOREIGN KEY ("image_id") REFERENCES "images" ("id"))
63
- CREATE TABLE "images" ("id" TEXT NOT NULL PRIMARY KEY, "width" INTEGER NOT NULL, "height" INTEGER NOT NULL, "url" TEXT NOT NULL)
63
+ CREATE TABLE "images" ("id" TEXT NOT NULL PRIMARY KEY, "width" INTEGER NOT NULL, "height" INTEGER NOT NULL, "url" TEXT NOT NULL, "version" INTEGER NOT NULL)
64
64
  CREATE TABLE "transcription_entities" ("transcription_id" TEXT NOT NULL, "type" VARCHAR(50) NOT NULL, "offset" INTEGER NOT NULL CHECK (offset >= 0), "length" INTEGER NOT NULL CHECK (length > 0), "worker_run_id" TEXT, "confidence" REAL, PRIMARY KEY ("transcription_id", "type"), FOREIGN KEY ("transcription_id") REFERENCES "transcriptions" ("id"))
65
65
  CREATE TABLE "transcriptions" ("id" TEXT NOT NULL PRIMARY KEY, "element_id" TEXT NOT NULL, "text" TEXT NOT NULL, "confidence" REAL, "orientation" VARCHAR(50) NOT NULL, "worker_version_id" TEXT, "worker_run_id" TEXT, FOREIGN KEY ("element_id") REFERENCES "elements" ("id"))"""
66
66
 
@@ -5,26 +5,36 @@ from arkindex_worker.cache import CachedElement
5
5
  from arkindex_worker.models import Element
6
6
 
7
7
 
8
- def test_no_image_url():
8
+ @pytest.mark.parametrize(
9
+ ("zone", "expected_url"),
10
+ [
11
+ (None, None),
12
+ (
13
+ {"image": {"url": "http://something/", "server": {"version": 2}}},
14
+ "http://something/full/full/0/default.jpg",
15
+ ),
16
+ (
17
+ {"image": {"url": "http://something", "server": {"version": 2}}},
18
+ "http://something/full/full/0/default.jpg",
19
+ ),
20
+ (
21
+ {"image": {"url": "http://something/", "server": {"version": 3}}},
22
+ "http://something/full/max/0/default.jpg",
23
+ ),
24
+ ],
25
+ )
26
+ def test_image_url(zone, expected_url):
9
27
  url = Element({"zone": None}).image_url()
10
28
  assert not url
11
29
 
12
30
 
13
- def test_image_url_iiif():
14
- url = Element({"zone": {"image": {"url": "http://something/"}}}).image_url()
15
- assert url == "http://something/full/full/0/default.jpg"
16
-
17
-
18
31
  def test_image_url_iiif_resize():
19
- url = Element({"zone": {"image": {"url": "http://something/"}}}).image_url(500)
32
+ url = Element(
33
+ {"zone": {"image": {"url": "http://something/", "server": {"version": 2}}}}
34
+ ).image_url(500)
20
35
  assert url == "http://something/full/500/0/default.jpg"
21
36
 
22
37
 
23
- def test_image_url_iiif_append_slash():
24
- url = Element({"zone": {"image": {"url": "http://something"}}}).image_url()
25
- assert url == "http://something/full/full/0/default.jpg"
26
-
27
-
28
38
  def test_image_url_s3():
29
39
  url = Element(
30
40
  {
@@ -418,6 +428,36 @@ def test_open_image_rotation_mirror(mocker):
418
428
  )
419
429
 
420
430
 
431
+ def test_open_image_iiif_3(mocker):
432
+ open_mock = mocker.patch(
433
+ "arkindex_worker.image.open_image", return_value="an image!"
434
+ )
435
+ elt = Element(
436
+ {
437
+ "zone": {
438
+ "image": {
439
+ "url": "http://something",
440
+ "server": {
441
+ "max_width": None,
442
+ "max_height": None,
443
+ "version": 3,
444
+ },
445
+ },
446
+ "polygon": [[0, 0], [181, 0], [181, 240], [0, 240], [0, 0]],
447
+ },
448
+ "rotation_angle": 0,
449
+ "mirrored": False,
450
+ },
451
+ )
452
+ assert elt.open_image(use_full_image=True) == "an image!"
453
+ assert open_mock.call_count == 1
454
+ assert open_mock.call_args == mocker.call(
455
+ "http://something/full/max/0/default.jpg",
456
+ rotation_angle=0,
457
+ mirrored=False,
458
+ )
459
+
460
+
421
461
  def test_setattr_setitem():
422
462
  element = Element({"name": "something"})
423
463
  element.type = "page"
@@ -113,21 +113,37 @@ def test_update_pillow_image_size_limit(max_image_pixels, expected_image_pixels)
113
113
  assert Image.MAX_IMAGE_PIXELS == MAX_IMAGE_PIXELS
114
114
 
115
115
 
116
- def test_download_tiles(responses):
116
+ @pytest.mark.parametrize(
117
+ ("id_key", "resize"),
118
+ [
119
+ # IIIF version 2
120
+ ("@id", "full"),
121
+ # IIIF version 3
122
+ ("id", "max"),
123
+ ],
124
+ )
125
+ def test_download_tiles(responses, id_key, resize):
117
126
  expected = Image.open(FULL_IMAGE).convert("RGB")
118
127
  tile_bytes = TILE.read_bytes()
119
128
 
120
129
  responses.add(
121
130
  responses.GET,
122
131
  "http://nowhere/info.json",
123
- json={"width": 543, "height": 720, "tiles": [{"width": 181, "height": 240}]},
132
+ json={
133
+ id_key: "http://nowhere",
134
+ "width": 543,
135
+ "height": 720,
136
+ "tiles": [
137
+ {"width": 181, "height": 240},
138
+ ],
139
+ },
124
140
  )
125
141
 
126
142
  for x in (0, 181, 362):
127
143
  for y in (0, 240, 480):
128
144
  responses.add(
129
145
  responses.GET,
130
- f"http://nowhere/{x},{y},181,240/full/0/default.jpg",
146
+ f"http://nowhere/{x},{y},181,240/{resize}/0/default.jpg",
131
147
  body=tile_bytes,
132
148
  )
133
149
 
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import zipfile
2
3
 
3
4
  import pytest
4
5
 
@@ -7,6 +8,7 @@ from arkindex_worker.utils import (
7
8
  DEFAULT_BATCH_SIZE,
8
9
  batch_publication,
9
10
  close_delete_file,
11
+ create_zip_archive,
10
12
  extract_tar_zst_archive,
11
13
  parse_source_id,
12
14
  )
@@ -118,3 +120,43 @@ def test_batch_publication_decorator_alongside_unsupported_cache(caplog):
118
120
  "This API helper `custom_publication_in_batches_without_cache` did not update the cache database",
119
121
  ),
120
122
  ]
123
+
124
+
125
+ def test_zip_archive():
126
+ # Create zip archive from fixtures
127
+ _, archive = create_zip_archive(FIXTURES_DIR / "extract_parent_archives/expected")
128
+
129
+ # Check the files in the archive
130
+ with zipfile.ZipFile(archive, mode="r") as f:
131
+ assert sorted(f.namelist()) == [
132
+ "test/",
133
+ "test/images/",
134
+ "test/images/f2649ce7-333e-44d2-ae73-387f18aad1f6.png",
135
+ "test/labels/",
136
+ "test/labels/f2649ce7-333e-44d2-ae73-387f18aad1f6.png",
137
+ "test/labels_json/",
138
+ "test/labels_json/f2649ce7-333e-44d2-ae73-387f18aad1f6.json",
139
+ "train/",
140
+ "train/images/",
141
+ "train/images/98115546-df07-448c-a2f0-34aa24789b77.png",
142
+ "train/images/ebeaa451-9287-4df7-9c40-07eb25cadb78.png",
143
+ "train/labels/",
144
+ "train/labels/98115546-df07-448c-a2f0-34aa24789b77.png",
145
+ "train/labels/ebeaa451-9287-4df7-9c40-07eb25cadb78.png",
146
+ "train/labels_json/",
147
+ "train/labels_json/98115546-df07-448c-a2f0-34aa24789b77.json",
148
+ "train/labels_json/ebeaa451-9287-4df7-9c40-07eb25cadb78.json",
149
+ "val/",
150
+ "val/images/",
151
+ "val/images/2987176d-4338-40f2-90d9-6d2cb4fd4a00.png",
152
+ "val/images/e3f91312-9201-45b7-9c32-e04a97ff1334.png",
153
+ "val/labels/",
154
+ "val/labels/2987176d-4338-40f2-90d9-6d2cb4fd4a00.png",
155
+ "val/labels/e3f91312-9201-45b7-9c32-e04a97ff1334.png",
156
+ "val/labels_json/",
157
+ "val/labels_json/2987176d-4338-40f2-90d9-6d2cb4fd4a00.json",
158
+ "val/labels_json/e3f91312-9201-45b7-9c32-e04a97ff1334.json",
159
+ ]
160
+
161
+ # Cleanup
162
+ archive.unlink()