arkindex-base-worker 0.5.1rc3-py3-none-any.whl → 0.5.2a2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{arkindex_base_worker-0.5.1rc3.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/METADATA RENAMED
@@ -1,31 +1,9 @@
 Metadata-Version: 2.4
 Name: arkindex-base-worker
-Version: 0.5.1rc3
+Version: 0.5.2a2
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
-License: MIT License
-
-Copyright (c) 2023 Teklia
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
 Project-URL: Homepage, https://workers.arkindex.org
 Project-URL: Documentation, https://workers.arkindex.org
 Project-URL: Repository, https://gitlab.teklia.com/workers/base-worker
@@ -41,12 +19,13 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: humanize==4.14.0
+Requires-Dist: humanize==4.15.0
 Requires-Dist: peewee~=3.17
 Requires-Dist: Pillow==11.3.0
-Requires-Dist: python-gnupg==0.5.5
+Requires-Dist: python-gnupg==0.5.6
+Requires-Dist: python-magic==0.4.27
 Requires-Dist: shapely==2.0.6
-Requires-Dist: teklia-toolbox==0.1.11
+Requires-Dist: teklia-toolbox==0.1.12
 Requires-Dist: zstandard==0.25.0
 Provides-Extra: tests
 Requires-Dist: pytest-mock==3.15.1; extra == "tests"
{arkindex_base_worker-0.5.1rc3.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/RECORD RENAMED
@@ -1,10 +1,10 @@
-arkindex_base_worker-0.5.1rc3.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+arkindex_base_worker-0.5.2a2.dist-info/licenses/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
 arkindex_worker/__init__.py,sha256=Sdt5KXn8EgURb2MurYVrUWaHbH3iFA1XLRo0Lc5AJ44,250
 arkindex_worker/cache.py,sha256=XpEXMSnbhYCvrJquwA9XXqZo-ajMLpaCxKG5wH3Gp6Y,10959
-arkindex_worker/image.py,sha256=sGE8to5iykXv25bpkftOEWzlh5NzBZSKy4lSRoHYHPU,20929
-arkindex_worker/models.py,sha256=7GnKqpWPOSxyR_eKlDNVBe_r3TcE4ofK-1GzaonJEdM,10132
-arkindex_worker/utils.py,sha256=yq_LmRlqfWOzB09Aiz2XYx4xPZnoEXR3As48h2HxOVc,10974
-arkindex_worker/worker/__init__.py,sha256=SzD0s1_m6gMV02EUF-NeciqZdVPA4dpXI84tSj-g494,17869
+arkindex_worker/image.py,sha256=9KeZHWNIDkwNJZR0y-mbyD_pvKfrgdktMB32jZqSMYk,20927
+arkindex_worker/models.py,sha256=DgKvAB_2e1cPcuUavZkyTkV10jBK8y083oVklB9idSk,10855
+arkindex_worker/utils.py,sha256=Eqg5pGAuOmuwMT3EhKTQDMek7wHC1KzZL7XXqYVVfHY,10977
+arkindex_worker/worker/__init__.py,sha256=tM_ynAARmtuJw5YWb_jI0AD5KNXbWN1K-VDiixIp7O4,18009
 arkindex_worker/worker/base.py,sha256=-R_aLMJHbR6X1uM-U0zExsF_KLy5Wl3WJ_YMGO9We0I,22153
 arkindex_worker/worker/classification.py,sha256=qvykymkgd4nGywHCxL8obo4egstoGsmWNS4Ztc1qNWQ,11024
 arkindex_worker/worker/corpus.py,sha256=MeIMod7jkWyX0frtD0a37rhumnMV3p9ZOC1xwAoXrAA,2291
@@ -14,7 +14,7 @@ arkindex_worker/worker/entity.py,sha256=Aj6EOfzHEm7qQV-Egm0YKLZgCrLS_3ggOKTY81M2
 arkindex_worker/worker/image.py,sha256=L6Ikuf0Z0RxJk7JarY5PggJGrYSHLaPK0vn0dy0CIaQ,623
 arkindex_worker/worker/metadata.py,sha256=keZdOdUthSH2hAw9iet5pN7rzWihTUYjZHRGTEjaltw,6843
 arkindex_worker/worker/process.py,sha256=9TEHpMcBax1wc6PrWMMrdXe2uNfqyVj7n_dAYZRBGnY,1854
-arkindex_worker/worker/task.py,sha256=nYfMSFm_d-4t8y4PO4HjFBnLsZf7IsDjkS7-A2Pgnac,1525
+arkindex_worker/worker/task.py,sha256=HASQU5LYVtgvCnRCLFC6iH7h7v6q_usZNZ-r_Wkv9A8,3306
 arkindex_worker/worker/training.py,sha256=tyQOHcwv--_wdYz6CgLEe1YM7kwwwKN30LvGTsnWd78,10923
 arkindex_worker/worker/transcription.py,sha256=sw718R119tsLNY8inPMVeIilvFJo94fMbMtYgH0zTM8,21250
 examples/standalone/python/worker.py,sha256=Zr4s4pHvgexEjlkixLFYZp1UuwMLeoTxjyNG5_S2iYE,6672
@@ -24,7 +24,7 @@ tests/__init__.py,sha256=DG--S6IpGl399rzSAjDdHL76CkOIeZIjajCcyUSDhOQ,241
 tests/conftest.py,sha256=Tp7YFK17NATwF2yAcBwi0QFNyKSXtLS0VhZ-zZngsQI,24343
 tests/test_base_worker.py,sha256=lwS4X3atS2ktEKd1XdogmN3mbzq-tO206-k_0EDITlw,29302
 tests/test_cache.py,sha256=_wztzh94EwVrb8UvpFqgl2aa2_FLaCcJKaqunCYR5Dw,10435
-tests/test_dataset_worker.py,sha256=iDJM2C4PfQNH0r4_QqSWoPt8BcM0geUUdODtWY0Z9PA,22412
+tests/test_dataset_worker.py,sha256=LmL3ERF1__PUPkTLiAFC0IYglZTv5WQYA42Vm-uhe2w,22023
 tests/test_element.py,sha256=hlj5VSF4plwC7uz9R4LGOOXZJQcHZiYCIDZT5V6EIB8,14334
 tests/test_image.py,sha256=yAM5mMfpQcIurT1KLHmu0AhSX2Qm3YvCu7afyZ3XUdU,28314
 tests/test_merge.py,sha256=REpZ13jkq_qm_4L5URQgFy5lxvPZtXxQEiWfYLMdmF0,7956
@@ -44,7 +44,7 @@ tests/test_elements_worker/test_entity.py,sha256=SNAZEsVVLnqlliOmjkgv_cZhw0bAuJU
 tests/test_elements_worker/test_image.py,sha256=BljMNKgec_9a5bzNzFpYZIvSbuvwsWDfdqLHVJaTa7M,2079
 tests/test_elements_worker/test_metadata.py,sha256=qtTDtlp3VnBkfck7PAguK2dEgTLlr1i1EVnmNTeNf3A,20515
 tests/test_elements_worker/test_process.py,sha256=y4RoVhPfyHzR795fw7-_FXElBcKo3fy4Ew_HI-kxJic,3088
-tests/test_elements_worker/test_task.py,sha256=wTUWqN9UhfKmJn3IcFY75EW4I1ulRhisflmY1kmP47s,5574
+tests/test_elements_worker/test_task.py,sha256=oHwP1fbJftXFA2U4qA3Gb4vX-iJoV-sBvPHnfBBpRrc,8906
 tests/test_elements_worker/test_training.py,sha256=qgK7BLucddRzc8ePbQtY75x17QvGDEq5XCwgyyvmAJE,8717
 tests/test_elements_worker/test_transcription_create.py,sha256=yznO9B_BVsOR0Z_VY5ZL8gJp0ZPCz_4sPUs5dXtixAg,29281
 tests/test_elements_worker/test_transcription_create_with_elements.py,sha256=tmcyglgssEqMnt1Mdy_u6X1m2wgLWTo_HdWst3GrK2k,33056
@@ -55,7 +55,7 @@ worker-demo/tests/conftest.py,sha256=XzNMNeg6pmABUAH8jN6eZTlZSFGLYjS3-DTXjiRN6Yc
 worker-demo/tests/test_worker.py,sha256=3DLd4NRK4bfyatG5P_PK4k9P9tJHx9XQq5_ryFEEFVg,304
 worker-demo/worker_demo/__init__.py,sha256=2BPomV8ZMNf3YXJgloatKeHQCE6QOkwmsHGkO6MkQuM,125
 worker-demo/worker_demo/worker.py,sha256=Rt-DjWa5iBP08k58NDZMfeyPuFbtNcbX6nc5jFX7GNo,440
-arkindex_base_worker-0.5.1rc3.dist-info/METADATA,sha256=EhM_vIe59B-G10-l3mQUdEWXDIe4HmoSp8vbstcJ2Cs,3091
-arkindex_base_worker-0.5.1rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-arkindex_base_worker-0.5.1rc3.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
-arkindex_base_worker-0.5.1rc3.dist-info/RECORD,,
+arkindex_base_worker-0.5.2a2.dist-info/METADATA,sha256=LyPpeyvKIadAuqir1cymTwxoWm3XovhF-JmzQ1LW0MI,1885
+arkindex_base_worker-0.5.2a2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+arkindex_base_worker-0.5.2a2.dist-info/top_level.txt,sha256=-vNjP2VfROx0j83mdi9aIqRZ88eoJjxeWz-R_gPgyXU,49
+arkindex_base_worker-0.5.2a2.dist-info/RECORD,,
{arkindex_base_worker-0.5.1rc3.dist-info → arkindex_base_worker-0.5.2a2.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
arkindex_worker/image.py CHANGED
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
     from arkindex_worker.models import Element

 # See http://docs.python-requests.org/en/master/user/advanced/#timeouts
-DOWNLOAD_TIMEOUT = (30, 60)
+REQUEST_TIMEOUT = (30, 60)

 BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])

@@ -346,7 +346,7 @@ def _retried_request(url, *args, method=requests.get, **kwargs):
         url,
         *args,
         headers={"User-Agent": IIIF_USER_AGENT},
-        timeout=DOWNLOAD_TIMEOUT,
+        timeout=REQUEST_TIMEOUT,
         verify=should_verify_cert(url),
         **kwargs,
     )
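The only change in this file is the rename of the module-level timeout constant; the value keeps the `(connect, read)` tuple form that `requests` accepts, per the documentation link preserved in the hunk. A minimal sketch of what that tuple means (the URL is a placeholder):

```python
import requests

# (connect timeout, read timeout) in seconds: give up quickly if the IIIF
# server is unreachable, but allow a slow response body up to a minute.
REQUEST_TIMEOUT = (30, 60)

resp = requests.get(
    "https://iiif.example.com/image/full/max/0/default.jpg",  # placeholder URL
    timeout=REQUEST_TIMEOUT,
)
resp.raise_for_status()
```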
arkindex_worker/models.py CHANGED
@@ -9,6 +9,8 @@ from contextlib import contextmanager
 from PIL import Image
 from requests import HTTPError

+IMAGE_EXTENSION = "jpg"
+

 class MagicDict(dict):
     """
@@ -62,28 +64,38 @@ class Element(MagicDict):
     Describes an Arkindex element.
     """

-    def resize_zone_url(self, size: str = "full") -> str:
+    def resize_zone_url(
+        self, size: str = "full", extension: str = IMAGE_EXTENSION
+    ) -> str:
         """
         Compute the URL of the image corresponding to the size
         :param size: Requested size
+        :param extension: IIIF extension to download the image
         :return: The URL corresponding to the size
         """
+        # Removing the `jpg` default extension at the end of the URL to use the provided one
+        url = self.zone.url[:-3] + extension
+
         if size == "full":
-            return self.zone.url
+            return url
         else:
-            parts = self.zone.url.split("/")
+            parts = url.split("/")
             parts[-3] = size
             return "/".join(parts)

-    def image_url(self, size: str = "full") -> str | None:
+    def image_url(
+        self, size: str = "full", extension: str = IMAGE_EXTENSION
+    ) -> str | None:
         """
         Build a URL to access the image.
         When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
         :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
+        :param extension: IIIF extension to download the image.
         :returns: A URL to the image, or None if the element does not have an image.
         """
         if not self.get("zone"):
             return
+
         url = self.zone.image.get("s3_url")
         if url:
             return url
@@ -95,7 +107,7 @@
         url = self.zone.image.url
         if not url.endswith("/"):
             url += "/"
-        return f"{url}full/{size}/0/default.jpg"
+        return f"{url}full/{size}/0/default.{extension}"

     @property
     def polygon(self) -> list[float]:
@@ -131,6 +143,7 @@
         max_width: int | None = None,
         max_height: int | None = None,
         use_full_image: bool | None = False,
+        extension: str | None = IMAGE_EXTENSION,
         **kwargs,
     ) -> Image.Image:
         """
@@ -163,6 +176,7 @@
         :param max_height: The maximum height of the image.
         :param use_full_image: Ignore the ``zone.polygon`` and always
             retrieve the image without cropping.
+        :param extension: The extension to download the image.
         :param *args: Positional arguments passed to [arkindex_worker.image.open_image][].
         :param **kwargs: Keyword arguments passed to [arkindex_worker.image.open_image][].
         :raises ValueError: When the element does not have an image.
@@ -200,7 +214,11 @@
         else:
             resize = f"{max_width or ''},{max_height or ''}"

-        url = self.image_url(resize) if use_full_image else self.resize_zone_url(resize)
+        url = (
+            self.image_url(resize, extension)
+            if use_full_image
+            else self.resize_zone_url(resize, extension)
+        )

         try:
             return open_image(
@@ -224,7 +242,7 @@

     @contextmanager
     def open_image_tempfile(
-        self, format: str | None = "jpeg", *args, **kwargs
+        self, extension: str | None = IMAGE_EXTENSION, *args, **kwargs
     ) -> Generator[tempfile.NamedTemporaryFile, None, None]:
         """
         Get the element's image as a temporary file stored on the disk.
@@ -243,8 +261,12 @@
         :param **kwargs: Keyword arguments passed to [arkindex_worker.image.open_image][].

         """
-        with tempfile.NamedTemporaryFile() as f:
-            self.open_image(*args, **kwargs).save(f, format=format)
+        PIL_format = "jpeg" if extension == IMAGE_EXTENSION else extension
+
+        with tempfile.NamedTemporaryFile(suffix=f".{extension}") as f:
+            self.open_image(*args, extension=extension, **kwargs).save(
+                f, format=PIL_format
+            )
         yield f

     def __str__(self):
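Taken together, these hunks thread a new `extension` argument (defaulting to the new `IMAGE_EXTENSION = "jpg"` constant) through `resize_zone_url`, `image_url`, `open_image` and `open_image_tempfile`. Note that `resize_zone_url` swaps the extension by slicing the last three characters off the zone URL, so it relies on the default `.jpg` suffix being present. A hedged usage sketch (the zone payload is illustrative; real `Element` instances are built from Arkindex API responses):

```python
from arkindex_worker.models import Element

# Illustrative zone data, for demonstration only.
element = Element({
    "zone": {
        "url": "https://iiif.example.com/img/0,0,800,600/full/0/default.jpg",
        "image": {"url": "https://iiif.example.com/img"},
    }
})

element.image_url("800,")                         # ...0/default.jpg, as before
element.image_url("800,", extension="png")        # new: ...0/default.png
element.resize_zone_url("800,", extension="png")  # zone crop URL with a PNG extension
```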
arkindex_worker/utils.py CHANGED
@@ -243,7 +243,7 @@ def create_zip_archive(source: Path, destination: Path | None = None) -> Path:
     logger.debug(f"Compressing file to {destination}")

     with zipfile.ZipFile(
-        destination, mode="w", compression=zipfile.ZIP_BZIP2
+        destination, mode="w", compression=zipfile.ZIP_DEFLATED
     ) as archive:
         for p in source.rglob("*"):
             relpath = p.relative_to(source)
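`create_zip_archive` switches from bzip2-compressed entries to the standard deflate method: `ZIP_DEFLATED` is the codec that essentially every unzip implementation reads natively, whereas `ZIP_BZIP2` entries require explicit bzip2 support in the consumer. A minimal sketch of the new behaviour, mirroring the loop above (the directory name is a placeholder):

```python
import zipfile
from pathlib import Path

source = Path("my_data")          # placeholder input directory
destination = Path("my_data.zip")

with zipfile.ZipFile(
    destination, mode="w", compression=zipfile.ZIP_DEFLATED
) as archive:
    for p in source.rglob("*"):
        # Store entries relative to the source root, as create_zip_archive does
        archive.write(p, p.relative_to(source))
```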
arkindex_worker/worker/__init__.py CHANGED
@@ -424,12 +424,13 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
         failed = 0
         for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-                assert dataset_set.dataset.state == DatasetState.Complete.value, (
-                    "When processing a set, its dataset state should be Complete."
-                )
-
-                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
-                self.download_dataset_artifact(dataset_set.dataset)
+                if dataset_set.dataset.state == DatasetState.Complete.value:
+                    logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                    self.download_dataset_artifact(dataset_set.dataset)
+                else:
+                    logger.warning(
+                        f"The dataset {dataset_set.dataset} has its state set to `{dataset_set.dataset.state}`, its archive will not be downloaded"
+                    )

                 logger.info(f"Processing {dataset_set} ({i}/{count})")
                 self.process_set(dataset_set)
@@ -444,7 +445,7 @@
                 logger.warning(message, exc_info=e if self.args.verbose else None)

-            # Cleanup the latest downloaded dataset artifact
+            # Cleanup the latest downloaded dataset artifact (if needed)
             self.cleanup_downloaded_artifact()

         message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"
arkindex_worker/worker/task.py CHANGED
@@ -4,9 +4,16 @@ BaseWorker methods for tasks.

 import uuid
 from collections.abc import Iterator
+from http.client import REQUEST_TIMEOUT
+from pathlib import Path
+
+import magic
+import requests

 from arkindex.compat import DownloadedFile
+from arkindex_worker import logger
 from arkindex_worker.models import Artifact
+from teklia_toolbox.requests import should_verify_cert


 class TaskMixin:
@@ -45,3 +52,49 @@ class TaskMixin:
         return self.api_client.request(
             "DownloadArtifact", id=task_id, path=artifact.path
         )
+
+    def upload_artifact(self, path: Path) -> None:
+        """
+        Upload a single file as an Artifact of the current task.
+
+        :param path: Path of the single file to upload as an Artifact.
+        """
+        assert path and isinstance(path, Path) and path.exists(), (
+            "path shouldn't be null, should be a Path and should exist"
+        )
+
+        if self.is_read_only:
+            logger.warning("Cannot upload artifact as this worker is in read-only mode")
+            return
+
+        # Get path relative to task's data directory
+        relpath = str(path.relative_to(self.work_dir))
+
+        # Get file size
+        size = path.stat().st_size
+
+        # Detect content type
+        try:
+            content_type = magic.from_file(path, mime=True)
+        except Exception as e:
+            logger.warning(f"Failed to get a mime type for {path}: {e}")
+            content_type = "application/octet-stream"
+
+        # Create artifact on API to get an S3 url
+        artifact = self.api_client.request(
+            "CreateArtifact",
+            id=self.task_id,
+            body={"path": relpath, "content_type": content_type, "size": size},
+        )
+
+        # Upload the file content to S3
+        s3_put_url = artifact["s3_put_url"]
+        with path.open("rb") as content:
+            resp = requests.put(
+                s3_put_url,
+                data=content,
+                headers={"Content-Type": content_type},
+                timeout=REQUEST_TIMEOUT,
+                verify=should_verify_cert(s3_put_url),
+            )
+            resp.raise_for_status()
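One quirk worth flagging: `REQUEST_TIMEOUT` is imported here from `http.client`, where it is the integer HTTP status code 408, not the `(30, 60)` tuple of the same name in `arkindex_worker.image`, so the S3 PUT effectively runs with a 408-second timeout. For callers, a hedged usage sketch of the new method (the worker subclass and file name are illustrative); `upload_artifact` expects its file to live under the task's `work_dir`, since the artifact path is resolved relative to it:

```python
from pathlib import Path

from arkindex_worker.worker import DatasetWorker  # mixes in TaskMixin, per the hunk above


class DemoWorker(DatasetWorker):
    """Illustrative worker that publishes one output file as an artifact."""

    def process_set(self, dataset_set):
        # The file must live under self.work_dir: upload_artifact asserts the
        # path exists and resolves it relative to the task's data directory.
        output = Path(self.work_dir) / "predictions.json"  # hypothetical name
        output.write_text('{"status": "ok"}')

        # Creates the artifact through the CreateArtifact endpoint, detects the
        # mime type with python-magic, then PUTs the bytes to the returned S3 URL.
        self.upload_artifact(output)
```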
tests/test_dataset_worker.py CHANGED
@@ -435,34 +435,6 @@ def test_run_no_sets(mocker, caplog, mock_dataset_worker):
     ]


-def test_run_initial_dataset_state_error(
-    mocker, responses, caplog, mock_dataset_worker, default_dataset
-):
-    default_dataset.state = DatasetState.Building.value
-    mocker.patch(
-        "arkindex_worker.worker.DatasetWorker.list_sets",
-        return_value=[Set(name="train", dataset=default_dataset)],
-    )
-
-    with pytest.raises(SystemExit):
-        mock_dataset_worker.run()
-
-    assert len(responses.calls) == len(BASE_API_CALLS) * 2
-    assert [
-        (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2
-
-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
-        (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
-        (logging.INFO, "Modern configuration is not available"),
-        (
-            logging.WARNING,
-            "Failed running worker on Set (train) from Dataset (dataset_id): AssertionError('When processing a set, its dataset state should be Complete.')",
-        ),
-        (logging.ERROR, "Ran on 1 set: 0 completed, 1 failed"),
-    ]
-
-
 def test_run_download_dataset_artifact_api_error(
     mocker,
     tmp_path,
@@ -570,16 +542,18 @@ def test_run_no_downloaded_dataset_artifact_error(
     ]


+@pytest.mark.parametrize("dataset_state", DatasetState)
 def test_run(
     mocker,
     tmp_path,
     responses,
     caplog,
+    dataset_state,
     mock_dataset_worker,
     default_dataset,
     default_artifact,
 ):
-    default_dataset.state = DatasetState.Complete.value
+    default_dataset.state = dataset_state.value
     mocker.patch(
         "arkindex_worker.worker.DatasetWorker.list_sets",
         return_value=[Set(name="train", dataset=default_dataset)],
@@ -590,55 +564,68 @@
     )
     mock_process = mocker.patch("arkindex_worker.worker.DatasetWorker.process_set")

-    archive_path = (
-        FIXTURES_DIR
-        / "extract_parent_archives"
-        / "first_parent"
-        / "arkindex_data.tar.zst"
-    )
-    responses.add(
-        responses.GET,
-        f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        status=200,
-        json=[default_artifact],
-    )
-    responses.add(
-        responses.GET,
-        f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        status=200,
-        body=archive_path.read_bytes(),
-        content_type="application/zstd",
-    )
+    if dataset_state == DatasetState.Complete:
+        archive_path = (
+            FIXTURES_DIR
+            / "extract_parent_archives"
+            / "first_parent"
+            / "arkindex_data.tar.zst"
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            status=200,
+            json=[default_artifact],
+        )
+        responses.add(
+            responses.GET,
+            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            status=200,
+            body=archive_path.read_bytes(),
+            content_type="application/zstd",
+        )

     mock_dataset_worker.run()

     assert mock_process.call_count == 1

-    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + 2
+    # We only download the dataset archive when it is Complete
+    extra_calls = []
+    if dataset_state == DatasetState.Complete:
+        extra_calls = [
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
+            ),
+            (
+                "GET",
+                f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
+            ),
+        ]
+
+    assert len(responses.calls) == len(BASE_API_CALLS) * 2 + len(extra_calls)
     assert [
         (call.request.method, call.request.url) for call in responses.calls
-    ] == BASE_API_CALLS * 2 + [
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
-        ),
-        (
-            "GET",
-            f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
-        ),
-    ]
+    ] == BASE_API_CALLS * 2 + extra_calls

-    assert [(level, message) for _, level, message in caplog.record_tuples] == [
+    logs = [
         (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
         (logging.INFO, "Modern configuration is not available"),
         (
-            logging.INFO,
-            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+            logging.WARNING,
+            f"The dataset Dataset (dataset_id) has its state set to `{dataset_state.value}`, its archive will not be downloaded",
         ),
-        (logging.INFO, "Downloading artifact for Dataset (dataset_id)"),
         (logging.INFO, "Processing Set (train) from Dataset (dataset_id) (1/1)"),
         (logging.INFO, "Ran on 1 set: 1 completed, 0 failed"),
     ]
+    if dataset_state == DatasetState.Complete:
+        logs[2] = (
+            logging.INFO,
+            "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
+        )
+        logs.insert(3, (logging.INFO, "Downloading artifact for Dataset (dataset_id)"))
+
+    assert [(level, message) for _, level, message in caplog.record_tuples] == logs


 def test_run_read_only(
tests/test_elements_worker/test_task.py CHANGED
@@ -1,6 +1,9 @@
+import tempfile
 import uuid
+from pathlib import Path

 import pytest
+from requests import HTTPError

 from arkindex.exceptions import ErrorResponse
 from arkindex_worker.models import Artifact
@@ -196,3 +199,112 @@ def test_download_artifact(
     ] == BASE_API_CALLS + [
         ("GET", f"http://testserver/api/v1/task/{TASK_ID}/artifact/dataset_id.tar.zst"),
     ]
+
+
+@pytest.mark.parametrize(
+    ("payload", "error"),
+    [
+        # Path
+        (
+            {"path": None},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": "not path type"},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+        (
+            {"path": Path("i_do_no_exist.oops")},
+            "path shouldn't be null, should be a Path and should exist",
+        ),
+    ],
+)
+def test_upload_artifact_wrong_param_path(mock_dataset_worker, payload, error):
+    with pytest.raises(AssertionError, match=error):
+        mock_dataset_worker.upload_artifact(**payload)
+
+
+@pytest.fixture
+def tmp_file(mock_dataset_worker):
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".txt", dir=mock_dataset_worker.work_dir
+    ) as file:
+        file.write("Some content...")
+        file.seek(0)
+
+        yield Path(file.name)
+
+
+def test_upload_artifact_api_error(responses, mock_dataset_worker, tmp_file):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        status=418,
+    )
+
+    with pytest.raises(ErrorResponse):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/task/my_task/artifacts/")]
+
+
+def test_upload_artifact_s3_upload_error(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/oops.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/oops.txt", status=500)
+
+    with pytest.raises(HTTPError):
+        mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/oops.txt"),
+    ]
+
+
+def test_upload_artifact(
+    responses,
+    mock_dataset_worker,
+    tmp_file,
+):
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/task/my_task/artifacts/",
+        json={
+            "id": "11111111-1111-1111-1111-111111111111",
+            "path": tmp_file.name,
+            "size": 15,
+            "content_type": "text/plain",
+            "s3_put_url": "http://example.com/test.txt",
+        },
+    )
+    responses.add(responses.PUT, "http://example.com/test.txt")
+
+    mock_dataset_worker.upload_artifact(path=tmp_file)
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 2
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
+        ("PUT", "http://example.com/test.txt"),
+    ]