arkindex-base-worker 0.5.2a1__tar.gz → 0.5.2a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/PKG-INFO +2 -1
  2. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_base_worker.egg-info/PKG-INFO +2 -1
  3. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_base_worker.egg-info/requires.txt +1 -0
  4. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/image.py +2 -2
  5. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/__init__.py +8 -7
  6. arkindex_base_worker-0.5.2a2/arkindex_worker/worker/task.py +100 -0
  7. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/pyproject.toml +2 -1
  8. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_dataset_worker.py +50 -63
  9. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_task.py +112 -0
  10. arkindex_base_worker-0.5.2a1/arkindex_worker/worker/task.py +0 -47
  11. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/LICENSE +0 -0
  12. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/README.md +0 -0
  13. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_base_worker.egg-info/SOURCES.txt +0 -0
  14. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
  15. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_base_worker.egg-info/top_level.txt +0 -0
  16. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/__init__.py +0 -0
  17. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/cache.py +0 -0
  18. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/models.py +0 -0
  19. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/utils.py +0 -0
  20. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/base.py +0 -0
  21. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/classification.py +0 -0
  22. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/corpus.py +0 -0
  23. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/dataset.py +0 -0
  24. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/element.py +0 -0
  25. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/entity.py +0 -0
  26. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/image.py +0 -0
  27. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/metadata.py +0 -0
  28. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/process.py +0 -0
  29. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/training.py +0 -0
  30. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/arkindex_worker/worker/transcription.py +0 -0
  31. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/examples/standalone/python/worker.py +0 -0
  32. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/examples/tooled/python/worker.py +0 -0
  33. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/hooks/pre_gen_project.py +0 -0
  34. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/setup.cfg +0 -0
  35. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/__init__.py +0 -0
  36. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/conftest.py +0 -0
  37. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_base_worker.py +0 -0
  38. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_cache.py +0 -0
  39. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_element.py +0 -0
  40. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/__init__.py +0 -0
  41. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_classification.py +0 -0
  42. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_cli.py +0 -0
  43. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_corpus.py +0 -0
  44. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_dataset.py +0 -0
  45. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_element.py +0 -0
  46. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_element_create_multiple.py +0 -0
  47. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_element_create_single.py +0 -0
  48. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_element_list_children.py +0 -0
  49. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_element_list_parents.py +0 -0
  50. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_entity.py +0 -0
  51. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_image.py +0 -0
  52. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_metadata.py +0 -0
  53. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_process.py +0 -0
  54. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_training.py +0 -0
  55. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_transcription_create.py +0 -0
  56. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_transcription_create_with_elements.py +0 -0
  57. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_transcription_list.py +0 -0
  58. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_elements_worker/test_worker.py +0 -0
  59. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_image.py +0 -0
  60. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_merge.py +0 -0
  61. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_modern_config.py +0 -0
  62. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/tests/test_utils.py +0 -0
  63. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/worker-demo/tests/__init__.py +0 -0
  64. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/worker-demo/tests/conftest.py +0 -0
  65. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/worker-demo/tests/test_worker.py +0 -0
  66. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/worker-demo/worker_demo/__init__.py +0 -0
  67. {arkindex_base_worker-0.5.2a1 → arkindex_base_worker-0.5.2a2}/worker-demo/worker_demo/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arkindex-base-worker
3
- Version: 0.5.2a1
3
+ Version: 0.5.2a2
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -23,6 +23,7 @@ Requires-Dist: humanize==4.15.0
23
23
  Requires-Dist: peewee~=3.17
24
24
  Requires-Dist: Pillow==11.3.0
25
25
  Requires-Dist: python-gnupg==0.5.6
26
+ Requires-Dist: python-magic==0.4.27
26
27
  Requires-Dist: shapely==2.0.6
27
28
  Requires-Dist: teklia-toolbox==0.1.12
28
29
  Requires-Dist: zstandard==0.25.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arkindex-base-worker
3
- Version: 0.5.2a1
3
+ Version: 0.5.2a2
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -23,6 +23,7 @@ Requires-Dist: humanize==4.15.0
23
23
  Requires-Dist: peewee~=3.17
24
24
  Requires-Dist: Pillow==11.3.0
25
25
  Requires-Dist: python-gnupg==0.5.6
26
+ Requires-Dist: python-magic==0.4.27
26
27
  Requires-Dist: shapely==2.0.6
27
28
  Requires-Dist: teklia-toolbox==0.1.12
28
29
  Requires-Dist: zstandard==0.25.0
@@ -2,6 +2,7 @@ humanize==4.15.0
2
2
  peewee~=3.17
3
3
  Pillow==11.3.0
4
4
  python-gnupg==0.5.6
5
+ python-magic==0.4.27
5
6
  shapely==2.0.6
6
7
  teklia-toolbox==0.1.12
7
8
  zstandard==0.25.0
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
38
38
  from arkindex_worker.models import Element
39
39
 
40
40
  # See http://docs.python-requests.org/en/master/user/advanced/#timeouts
41
- DOWNLOAD_TIMEOUT = (30, 60)
41
+ REQUEST_TIMEOUT = (30, 60)
42
42
 
43
43
  BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])
44
44
 
@@ -346,7 +346,7 @@ def _retried_request(url, *args, method=requests.get, **kwargs):
346
346
  url,
347
347
  *args,
348
348
  headers={"User-Agent": IIIF_USER_AGENT},
349
- timeout=DOWNLOAD_TIMEOUT,
349
+ timeout=REQUEST_TIMEOUT,
350
350
  verify=should_verify_cert(url),
351
351
  **kwargs,
352
352
  )
@@ -424,12 +424,13 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
424
424
  failed = 0
425
425
  for i, dataset_set in enumerate(dataset_sets, start=1):
426
426
  try:
427
- assert dataset_set.dataset.state == DatasetState.Complete.value, (
428
- "When processing a set, its dataset state should be Complete."
429
- )
430
-
431
- logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
432
- self.download_dataset_artifact(dataset_set.dataset)
427
+ if dataset_set.dataset.state == DatasetState.Complete.value:
428
+ logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
429
+ self.download_dataset_artifact(dataset_set.dataset)
430
+ else:
431
+ logger.warning(
432
+ f"The dataset {dataset_set.dataset} has its state set to `{dataset_set.dataset.state}`, its archive will not be downloaded"
433
+ )
433
434
 
434
435
  logger.info(f"Processing {dataset_set} ({i}/{count})")
435
436
  self.process_set(dataset_set)
@@ -444,7 +445,7 @@ class DatasetWorker(DatasetMixin, BaseWorker, TaskMixin):
444
445
 
445
446
  logger.warning(message, exc_info=e if self.args.verbose else None)
446
447
 
447
- # Cleanup the latest downloaded dataset artifact
448
+ # Cleanup the latest downloaded dataset artifact (if needed)
448
449
  self.cleanup_downloaded_artifact()
449
450
 
450
451
  message = f"Ran on {count} {pluralize('set', count)}: {count - failed} completed, {failed} failed"
"""
BaseWorker methods for tasks.
"""

import uuid
from collections.abc import Iterator
from pathlib import Path

import magic
import requests

from arkindex.compat import DownloadedFile
from arkindex_worker import logger
from arkindex_worker.models import Artifact
from teklia_toolbox.requests import should_verify_cert

# See https://docs.python-requests.org/en/master/user/advanced/#timeouts
# (connect timeout, read timeout), mirroring arkindex_worker.image.REQUEST_TIMEOUT.
# NOTE: do NOT import http.client.REQUEST_TIMEOUT here — that name is the
# HTTP 408 status code (an int), not a requests timeout value, and would
# silently set a 408-second timeout on uploads.
REQUEST_TIMEOUT = (30, 60)


class TaskMixin:
    def list_artifacts(self, task_id: uuid.UUID) -> Iterator[Artifact]:
        """
        List artifacts associated to a task.

        :param task_id: Task ID to find artifacts from.
        :returns: An iterator of ``Artifact`` objects built from the ``ListArtifacts`` API endpoint.
        """
        assert task_id and isinstance(task_id, uuid.UUID), (
            "task_id shouldn't be null and should be an UUID"
        )

        results = self.api_client.request("ListArtifacts", id=task_id)

        return map(Artifact, results)

    def download_artifact(
        self, task_id: uuid.UUID, artifact: Artifact
    ) -> DownloadedFile:
        """
        Download an artifact content.

        :param task_id: Task ID the Artifact is from.
        :param artifact: Artifact to download content from.
        :returns: A temporary file containing the ``Artifact`` downloaded from the ``DownloadArtifact`` API endpoint.
        """
        assert task_id and isinstance(task_id, uuid.UUID), (
            "task_id shouldn't be null and should be an UUID"
        )
        assert artifact and isinstance(artifact, Artifact), (
            "artifact shouldn't be null and should be an Artifact"
        )

        return self.api_client.request(
            "DownloadArtifact", id=task_id, path=artifact.path
        )

    def upload_artifact(self, path: Path) -> None:
        """
        Upload a single file as an Artifact of the current task.

        Skipped (with a warning) when the worker runs in read-only mode.

        :param path: Path of the single file to upload as an Artifact.
            Must live under the task's working directory (``self.work_dir``).
        """
        assert path and isinstance(path, Path) and path.exists(), (
            "path shouldn't be null, should be a Path and should exist"
        )

        if self.is_read_only:
            logger.warning("Cannot upload artifact as this worker is in read-only mode")
            return

        # Get path relative to task's data directory
        relpath = str(path.relative_to(self.work_dir))

        # Get file size
        size = path.stat().st_size

        # Detect content type; fall back to a generic binary type so the
        # upload is best-effort rather than failing on detection errors
        try:
            content_type = magic.from_file(path, mime=True)
        except Exception as e:
            logger.warning(f"Failed to get a mime type for {path}: {e}")
            content_type = "application/octet-stream"

        # Create artifact on API to get an S3 url
        artifact = self.api_client.request(
            "CreateArtifact",
            id=self.task_id,
            body={"path": relpath, "content_type": content_type, "size": size},
        )

        # Upload the file content to S3
        s3_put_url = artifact["s3_put_url"]
        with path.open("rb") as content:
            resp = requests.put(
                s3_put_url,
                data=content,
                headers={"Content-Type": content_type},
                timeout=REQUEST_TIMEOUT,
                verify=should_verify_cert(s3_put_url),
            )
        # Raise requests.HTTPError on a failed S3 upload
        resp.raise_for_status()
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arkindex-base-worker"
7
- version = "0.5.2a1"
7
+ version = "0.5.2a2"
8
8
  description = "Base Worker to easily build Arkindex ML workflows"
9
9
  license-files = ["LICENSE"]
10
10
  dependencies = [
@@ -12,6 +12,7 @@ dependencies = [
12
12
  "peewee~=3.17",
13
13
  "Pillow==11.3.0",
14
14
  "python-gnupg==0.5.6",
15
+ "python-magic==0.4.27",
15
16
  "shapely==2.0.6",
16
17
  "teklia-toolbox==0.1.12",
17
18
  "zstandard==0.25.0",
@@ -435,34 +435,6 @@ def test_run_no_sets(mocker, caplog, mock_dataset_worker):
435
435
  ]
436
436
 
437
437
 
438
- def test_run_initial_dataset_state_error(
439
- mocker, responses, caplog, mock_dataset_worker, default_dataset
440
- ):
441
- default_dataset.state = DatasetState.Building.value
442
- mocker.patch(
443
- "arkindex_worker.worker.DatasetWorker.list_sets",
444
- return_value=[Set(name="train", dataset=default_dataset)],
445
- )
446
-
447
- with pytest.raises(SystemExit):
448
- mock_dataset_worker.run()
449
-
450
- assert len(responses.calls) == len(BASE_API_CALLS) * 2
451
- assert [
452
- (call.request.method, call.request.url) for call in responses.calls
453
- ] == BASE_API_CALLS * 2
454
-
455
- assert [(level, message) for _, level, message in caplog.record_tuples] == [
456
- (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
457
- (logging.INFO, "Modern configuration is not available"),
458
- (
459
- logging.WARNING,
460
- "Failed running worker on Set (train) from Dataset (dataset_id): AssertionError('When processing a set, its dataset state should be Complete.')",
461
- ),
462
- (logging.ERROR, "Ran on 1 set: 0 completed, 1 failed"),
463
- ]
464
-
465
-
466
438
  def test_run_download_dataset_artifact_api_error(
467
439
  mocker,
468
440
  tmp_path,
@@ -570,16 +542,18 @@ def test_run_no_downloaded_dataset_artifact_error(
570
542
  ]
571
543
 
572
544
 
545
+ @pytest.mark.parametrize("dataset_state", DatasetState)
573
546
  def test_run(
574
547
  mocker,
575
548
  tmp_path,
576
549
  responses,
577
550
  caplog,
551
+ dataset_state,
578
552
  mock_dataset_worker,
579
553
  default_dataset,
580
554
  default_artifact,
581
555
  ):
582
- default_dataset.state = DatasetState.Complete.value
556
+ default_dataset.state = dataset_state.value
583
557
  mocker.patch(
584
558
  "arkindex_worker.worker.DatasetWorker.list_sets",
585
559
  return_value=[Set(name="train", dataset=default_dataset)],
@@ -590,55 +564,68 @@ def test_run(
590
564
  )
591
565
  mock_process = mocker.patch("arkindex_worker.worker.DatasetWorker.process_set")
592
566
 
593
- archive_path = (
594
- FIXTURES_DIR
595
- / "extract_parent_archives"
596
- / "first_parent"
597
- / "arkindex_data.tar.zst"
598
- )
599
- responses.add(
600
- responses.GET,
601
- f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
602
- status=200,
603
- json=[default_artifact],
604
- )
605
- responses.add(
606
- responses.GET,
607
- f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
608
- status=200,
609
- body=archive_path.read_bytes(),
610
- content_type="application/zstd",
611
- )
567
+ if dataset_state == DatasetState.Complete:
568
+ archive_path = (
569
+ FIXTURES_DIR
570
+ / "extract_parent_archives"
571
+ / "first_parent"
572
+ / "arkindex_data.tar.zst"
573
+ )
574
+ responses.add(
575
+ responses.GET,
576
+ f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
577
+ status=200,
578
+ json=[default_artifact],
579
+ )
580
+ responses.add(
581
+ responses.GET,
582
+ f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
583
+ status=200,
584
+ body=archive_path.read_bytes(),
585
+ content_type="application/zstd",
586
+ )
612
587
 
613
588
  mock_dataset_worker.run()
614
589
 
615
590
  assert mock_process.call_count == 1
616
591
 
617
- assert len(responses.calls) == len(BASE_API_CALLS) * 2 + 2
592
+ # We only download the dataset archive when it is Complete
593
+ extra_calls = []
594
+ if dataset_state == DatasetState.Complete:
595
+ extra_calls = [
596
+ (
597
+ "GET",
598
+ f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
599
+ ),
600
+ (
601
+ "GET",
602
+ f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
603
+ ),
604
+ ]
605
+
606
+ assert len(responses.calls) == len(BASE_API_CALLS) * 2 + len(extra_calls)
618
607
  assert [
619
608
  (call.request.method, call.request.url) for call in responses.calls
620
- ] == BASE_API_CALLS * 2 + [
621
- (
622
- "GET",
623
- f"http://testserver/api/v1/task/{default_dataset.task_id}/artifacts/",
624
- ),
625
- (
626
- "GET",
627
- f"http://testserver/api/v1/task/{default_dataset.task_id}/artifact/dataset_id.tar.zst",
628
- ),
629
- ]
609
+ ] == BASE_API_CALLS * 2 + extra_calls
630
610
 
631
- assert [(level, message) for _, level, message in caplog.record_tuples] == [
611
+ logs = [
632
612
  (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
633
613
  (logging.INFO, "Modern configuration is not available"),
634
614
  (
635
- logging.INFO,
636
- "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
615
+ logging.WARNING,
616
+ f"The dataset Dataset (dataset_id) has its state set to `{dataset_state.value}`, its archive will not be downloaded",
637
617
  ),
638
- (logging.INFO, "Downloading artifact for Dataset (dataset_id)"),
639
618
  (logging.INFO, "Processing Set (train) from Dataset (dataset_id) (1/1)"),
640
619
  (logging.INFO, "Ran on 1 set: 1 completed, 0 failed"),
641
620
  ]
621
+ if dataset_state == DatasetState.Complete:
622
+ logs[2] = (
623
+ logging.INFO,
624
+ "Retrieving data for Set (train) from Dataset (dataset_id) (1/1)",
625
+ )
626
+ logs.insert(3, (logging.INFO, "Downloading artifact for Dataset (dataset_id)"))
627
+
628
+ assert [(level, message) for _, level, message in caplog.record_tuples] == logs
642
629
 
643
630
 
644
631
  def test_run_read_only(
@@ -1,6 +1,9 @@
1
+ import tempfile
1
2
  import uuid
3
+ from pathlib import Path
2
4
 
3
5
  import pytest
6
+ from requests import HTTPError
4
7
 
5
8
  from arkindex.exceptions import ErrorResponse
6
9
  from arkindex_worker.models import Artifact
@@ -196,3 +199,112 @@ def test_download_artifact(
196
199
  ] == BASE_API_CALLS + [
197
200
  ("GET", f"http://testserver/api/v1/task/{TASK_ID}/artifact/dataset_id.tar.zst"),
198
201
  ]
202
+
203
+
204
+ @pytest.mark.parametrize(
205
+ ("payload", "error"),
206
+ [
207
+ # Path
208
+ (
209
+ {"path": None},
210
+ "path shouldn't be null, should be a Path and should exist",
211
+ ),
212
+ (
213
+ {"path": "not path type"},
214
+ "path shouldn't be null, should be a Path and should exist",
215
+ ),
216
+ (
217
+ {"path": Path("i_do_no_exist.oops")},
218
+ "path shouldn't be null, should be a Path and should exist",
219
+ ),
220
+ ],
221
+ )
222
+ def test_upload_artifact_wrong_param_path(mock_dataset_worker, payload, error):
223
+ with pytest.raises(AssertionError, match=error):
224
+ mock_dataset_worker.upload_artifact(**payload)
225
+
226
+
227
+ @pytest.fixture
228
+ def tmp_file(mock_dataset_worker):
229
+ with tempfile.NamedTemporaryFile(
230
+ mode="w", suffix=".txt", dir=mock_dataset_worker.work_dir
231
+ ) as file:
232
+ file.write("Some content...")
233
+ file.seek(0)
234
+
235
+ yield Path(file.name)
236
+
237
+
238
+ def test_upload_artifact_api_error(responses, mock_dataset_worker, tmp_file):
239
+ responses.add(
240
+ responses.POST,
241
+ "http://testserver/api/v1/task/my_task/artifacts/",
242
+ status=418,
243
+ )
244
+
245
+ with pytest.raises(ErrorResponse):
246
+ mock_dataset_worker.upload_artifact(path=tmp_file)
247
+
248
+ assert len(responses.calls) == len(BASE_API_CALLS) + 1
249
+ assert [
250
+ (call.request.method, call.request.url) for call in responses.calls
251
+ ] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/task/my_task/artifacts/")]
252
+
253
+
254
+ def test_upload_artifact_s3_upload_error(
255
+ responses,
256
+ mock_dataset_worker,
257
+ tmp_file,
258
+ ):
259
+ responses.add(
260
+ responses.POST,
261
+ "http://testserver/api/v1/task/my_task/artifacts/",
262
+ json={
263
+ "id": "11111111-1111-1111-1111-111111111111",
264
+ "path": tmp_file.name,
265
+ "size": 15,
266
+ "content_type": "text/plain",
267
+ "s3_put_url": "http://example.com/oops.txt",
268
+ },
269
+ )
270
+ responses.add(responses.PUT, "http://example.com/oops.txt", status=500)
271
+
272
+ with pytest.raises(HTTPError):
273
+ mock_dataset_worker.upload_artifact(path=tmp_file)
274
+
275
+ assert len(responses.calls) == len(BASE_API_CALLS) + 2
276
+ assert [
277
+ (call.request.method, call.request.url) for call in responses.calls
278
+ ] == BASE_API_CALLS + [
279
+ ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
280
+ ("PUT", "http://example.com/oops.txt"),
281
+ ]
282
+
283
+
284
+ def test_upload_artifact(
285
+ responses,
286
+ mock_dataset_worker,
287
+ tmp_file,
288
+ ):
289
+ responses.add(
290
+ responses.POST,
291
+ "http://testserver/api/v1/task/my_task/artifacts/",
292
+ json={
293
+ "id": "11111111-1111-1111-1111-111111111111",
294
+ "path": tmp_file.name,
295
+ "size": 15,
296
+ "content_type": "text/plain",
297
+ "s3_put_url": "http://example.com/test.txt",
298
+ },
299
+ )
300
+ responses.add(responses.PUT, "http://example.com/test.txt")
301
+
302
+ mock_dataset_worker.upload_artifact(path=tmp_file)
303
+
304
+ assert len(responses.calls) == len(BASE_API_CALLS) + 2
305
+ assert [
306
+ (call.request.method, call.request.url) for call in responses.calls
307
+ ] == BASE_API_CALLS + [
308
+ ("POST", "http://testserver/api/v1/task/my_task/artifacts/"),
309
+ ("PUT", "http://example.com/test.txt"),
310
+ ]
@@ -1,47 +0,0 @@
1
- """
2
- BaseWorker methods for tasks.
3
- """
4
-
5
- import uuid
6
- from collections.abc import Iterator
7
-
8
- from arkindex.compat import DownloadedFile
9
- from arkindex_worker.models import Artifact
10
-
11
-
12
- class TaskMixin:
13
- def list_artifacts(self, task_id: uuid.UUID) -> Iterator[Artifact]:
14
- """
15
- List artifacts associated to a task.
16
-
17
- :param task_id: Task ID to find artifacts from.
18
- :returns: An iterator of ``Artifact`` objects built from the ``ListArtifacts`` API endpoint.
19
- """
20
- assert task_id and isinstance(task_id, uuid.UUID), (
21
- "task_id shouldn't be null and should be an UUID"
22
- )
23
-
24
- results = self.api_client.request("ListArtifacts", id=task_id)
25
-
26
- return map(Artifact, results)
27
-
28
- def download_artifact(
29
- self, task_id: uuid.UUID, artifact: Artifact
30
- ) -> DownloadedFile:
31
- """
32
- Download an artifact content.
33
-
34
- :param task_id: Task ID the Artifact is from.
35
- :param artifact: Artifact to download content from.
36
- :returns: A temporary file containing the ``Artifact`` downloaded from the ``DownloadArtifact`` API endpoint.
37
- """
38
- assert task_id and isinstance(task_id, uuid.UUID), (
39
- "task_id shouldn't be null and should be an UUID"
40
- )
41
- assert artifact and isinstance(artifact, Artifact), (
42
- "artifact shouldn't be null and should be an Artifact"
43
- )
44
-
45
- return self.api_client.request(
46
- "DownloadArtifact", id=task_id, path=artifact.path
47
- )