arkindex-base-worker 0.3.7rc3__py3-none-any.whl → 0.3.7rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{arkindex_base_worker-0.3.7rc3.dist-info → arkindex_base_worker-0.3.7rc4.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: arkindex-base-worker
- Version: 0.3.7rc3
+ Version: 0.3.7rc4
  Summary: Base Worker to easily build Arkindex ML workflows
  Author-email: Teklia <contact@teklia.com>
  Maintainer-email: Teklia <contact@teklia.com>
{arkindex_base_worker-0.3.7rc3.dist-info → arkindex_base_worker-0.3.7rc4.dist-info}/RECORD RENAMED
@@ -1,41 +1,41 @@
  arkindex_worker/__init__.py,sha256=OlgCtTC9MaWeejviY0a3iQpALcRQGMVArFVVYwTF6I8,162
  arkindex_worker/cache.py,sha256=FTlB0coXofn5zTNRTcVIvh709mcw4a1bPGqkwWjKs3w,11248
- arkindex_worker/image.py,sha256=uwfUE9hy0Iw-e3vU7OHmLSqouxbznWq08SykXmPD1Cs,14107
- arkindex_worker/models.py,sha256=DVrZPIurSiOoHvj3t_Szwd0j1t6pnwBx_dqwhNakzN0,9528
- arkindex_worker/utils.py,sha256=_lC1-RYvNWXEkK-AuF4_FraoggP1tYPdalNFSj4jDb4,6885
- arkindex_worker/worker/__init__.py,sha256=ev_EI7AA4qrQhH4B5WJf2LClcMVWPcxmaz_6L6Fp1V8,19383
- arkindex_worker/worker/base.py,sha256=Xd4IjtrQWNcmQzWhhEcE2X7njjTuLTP-31RgJWxoztc,19933
+ arkindex_worker/image.py,sha256=9-k_Wojk-sLbgvBSi7tWiiDc9YApWauJpHGKRay_nmo,14166
+ arkindex_worker/models.py,sha256=HdKFw3qk4WIWC-DrHDkhsw0mHP3OILuCLFf7aTjruZU,9526
+ arkindex_worker/utils.py,sha256=VSO8c21nsSaUCkyJaFX8wOwDQ0tztLOBFtiGvqlT0zU,6900
+ arkindex_worker/worker/__init__.py,sha256=I8QmdAs659SalxNjtCu2K2ItdyUlXYm3mK_WhZdjgBs,19498
+ arkindex_worker/worker/base.py,sha256=7ii3rZai6IB0-eB0TJ6pg-IhxMmW4izoJAKJKczbyZ4,19934
  arkindex_worker/worker/classification.py,sha256=0OiwxV9lb97Zs3kODm3hzyk0V7IxBTiW5SL6AYgRH1M,10351
- arkindex_worker/worker/dataset.py,sha256=IaXQXO9Zj-u7H0fvuDXOzf9FRga22eub_X-LYQHEpYY,3026
- arkindex_worker/worker/element.py,sha256=4f893hTSbk8QBD48xUKgHIzpr4wMxyxy634MqSwqebg,33847
+ arkindex_worker/worker/dataset.py,sha256=qzjaXJtfeNCP2acsHbqp5tjQk-KpLHwVzjDAExeAmVg,3228
+ arkindex_worker/worker/element.py,sha256=AWK3YJSHWy3j4ajntJloi_2X4zxsgXZ6c6dzphgq3OI,33848
  arkindex_worker/worker/entity.py,sha256=YT2Ttdn-L5TRoDdhOI3Z4GE1vtkWl7tKZqbYrtxZ2Ug,14630
- arkindex_worker/worker/metadata.py,sha256=XEKkwFeOflw0EsNwJhcCfv32iCCZwvQqYIyaqSEMNQc,6670
+ arkindex_worker/worker/metadata.py,sha256=SC6apVaOjFrmYw5b-njhqIlH-_r0ExbNpZeQZzlUjBE,6669
  arkindex_worker/worker/task.py,sha256=cz3wJNPgogZv1lm_3lm7WScitQtYQtL6H6I7Xokq208,1475
  arkindex_worker/worker/training.py,sha256=SOs3YKGikTr3rdWYp9H-jbtgRnZxQAoqtwB26ztx9j8,10235
  arkindex_worker/worker/transcription.py,sha256=6R7ofcGnNqX4rjT0kRKIE-G9FHq2TJ1tfztNM5sTqYE,20464
- arkindex_worker/worker/version.py,sha256=ogEpGdJ9GVA-eer6GMt6tiIvnuwfad6lDjdLNa6fiUQ,1926
+ arkindex_worker/worker/version.py,sha256=cs2pdlDxpKRO2Oldvcu54w-D_DQhf1cdeEt4tKX_QYs,1927
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/conftest.py,sha256=wzKXRnS7OKQCNHrlDaQhMC8EXlsQTY_S4L9U_hXbjpM,22004
  tests/test_base_worker.py,sha256=Uq6_MpLW23gmKFXkU-SyDUaA_4dlViLBGG4e3gpBBz0,24512
  tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
- tests/test_dataset_worker.py,sha256=Y-g354MroQ1aYAZnqM5dhqgZZoDgpbHxpEZrcaN7SJY,29305
+ tests/test_dataset_worker.py,sha256=Q-3gVu2FNa6mJVkUW-PUVgyUAvRkxSzLCJrPhwoJlxQ,28273
  tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
- tests/test_image.py,sha256=nIT0NhmuHtD9I1rcMSvqSfjQqvTE5Ko8tAQGLIkm_zo,15232
+ tests/test_image.py,sha256=FZv8njLxh45sVgmY71UFHt0lv1cHr0cK4rrtPhQleX8,16262
  tests/test_merge.py,sha256=Q4zCbtZbe0wBfqE56gvAD06c6pDuhqnjKaioFqIgAQw,8331
  tests/test_utils.py,sha256=pFXegcBvIuy1tJDDSgQtCbC_tRaoLjd2055R5lu3hS0,1236
  tests/test_elements_worker/__init__.py,sha256=Fh4nkbbyJSMv_VtjQxnWrOqTnxXaaWI8S9WU0VrzCHs,179
- tests/test_elements_worker/test_classifications.py,sha256=0_6vryoQc2-s3CQWANsEvajkyC3aub34cxb3r97pRsk,32027
+ tests/test_elements_worker/test_classifications.py,sha256=PE88fsdra8QsWcKjSyao-pTHlaIWNxlbfF0CrLe9LBA,26517
  tests/test_elements_worker/test_cli.py,sha256=BsFTswLti63WAZ2pf6ipiZKWJJyCQuSfuKnSlESuK8g,2878
- tests/test_elements_worker/test_dataset.py,sha256=uqgu5Jpazp2mH8p26jCn7RkJ7jdWpYKdJimWpeqaK1o,13251
- tests/test_elements_worker/test_elements.py,sha256=DOw1LKgJeCCFtvtQssBytIvE21mgiikhbkPglcpqRtA,84823
+ tests/test_elements_worker/test_dataset.py,sha256=-kVll1NcMPWkIx8D7r-Z5neEGkFiZ9YQfC4eTMIfjg0,13475
+ tests/test_elements_worker/test_elements.py,sha256=6XKtgXSVQJnTSgTHWwEVsAtIwLBapjYjUYPUdjxcHsY,84971
  tests/test_elements_worker/test_entities.py,sha256=yi1mXzvKvNwUNMzo0UZ56YOIJstYHcLyeepPJ8f10MQ,34557
  tests/test_elements_worker/test_metadata.py,sha256=b9CNv4W31TRJqYauvX_pRIN2SvnybaLqF-FWoFwa2Vc,18672
  tests/test_elements_worker/test_task.py,sha256=FCpxE9UpouKXgjGvWgNHEai_Hiy2d1YmqRG-_v2s27s,6312
  tests/test_elements_worker/test_training.py,sha256=WeG-cDuJ-YhPgfKH47TtXBxyargtLuk7c8tsik2WnL8,8414
- tests/test_elements_worker/test_transcriptions.py,sha256=w-Fs4dS8RNDChp-9BgC087LiKGJYlIWyI8OreWEj-E4,73373
- tests/test_elements_worker/test_worker.py,sha256=mKNTsLODsFMywOXpMcP9VSTckHcgzK5y6eE75nqO1Og,17153
- arkindex_base_worker-0.3.7rc3.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
- arkindex_base_worker-0.3.7rc3.dist-info/METADATA,sha256=WY-CD-XJFlOsDQRp-5-0czzWZKxYNI2v0plv7UALSkU,3411
- arkindex_base_worker-0.3.7rc3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- arkindex_base_worker-0.3.7rc3.dist-info/top_level.txt,sha256=TtagLI8LSv7GE7nG8MQqDFAJ5bNDPJn7Z5vizOgrWkA,22
- arkindex_base_worker-0.3.7rc3.dist-info/RECORD,,
+ tests/test_elements_worker/test_transcriptions.py,sha256=WVJG26sZyY66fu-Eka9A1_WWIeNI2scogjypzURnp8A,73468
+ tests/test_elements_worker/test_worker.py,sha256=7-jGJVT3yMGpIyN96Uafz5eIUrO4ieNLgw0k1D8BhGc,17163
+ arkindex_base_worker-0.3.7rc4.dist-info/LICENSE,sha256=NVshRi1efwVezMfW7xXYLrdDr2Li1AfwfGOd5WuH1kQ,1063
+ arkindex_base_worker-0.3.7rc4.dist-info/METADATA,sha256=ilh4IdFYSXepgr0imEMH3ZbewlFJlbg97VKnvhKXMVQ,3411
+ arkindex_base_worker-0.3.7rc4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ arkindex_base_worker-0.3.7rc4.dist-info/top_level.txt,sha256=TtagLI8LSv7GE7nG8MQqDFAJ5bNDPJn7Z5vizOgrWkA,22
+ arkindex_base_worker-0.3.7rc4.dist-info/RECORD,,
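
For readers unfamiliar with the format: each RECORD row above follows the wheel convention `path,sha256=<digest>,size`, where the digest is the URL-safe, unpadded base64 encoding of the file's SHA-256 hash, so any of the new hashes can be checked locally. A minimal sketch (the `record_entry` helper name is ours):

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: Path) -> str:
    """Rebuild a RECORD-style row (path,sha256=<digest>,size) for one file."""
    data = path.read_bytes()
    # Wheel RECORD files use URL-safe base64 with the "=" padding stripped
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


# e.g. record_entry(Path("arkindex_worker/image.py")) should match the row above
```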
arkindex_worker/image.py CHANGED
@@ -1,6 +1,7 @@
  """
  Helper methods to download and open IIIF images, and manage polygons.
  """
+
  import re
  from collections import namedtuple
  from io import BytesIO
@@ -114,32 +115,38 @@ def download_image(url: str) -> Image:
              )
          else:
              raise e
-     except requests.exceptions.SSLError:
-         logger.warning(
-             "An SSLError occurred during image download, retrying with a weaker and unsafe SSL configuration"
-         )
-
-         # Saving current ciphers
-         previous_ciphers = requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS
-
-         # Downgrading ciphers to download the image
-         requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = "ALL:@SECLEVEL=1"
-         resp = _retried_request(url)
-
-         # Restoring previous ciphers
-         requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = previous_ciphers

      # Preprocess the image and prepare it for classification
      image = Image.open(BytesIO(resp.content))
      logger.info(
-         "Downloaded image {} - size={}x{} in {}".format(
-             url, image.size[0], image.size[1], resp.elapsed
-         )
+         f"Downloaded image {url} - size={image.size[0]}x{image.size[1]} in {resp.elapsed}"
      )

      return image


+ def upload_image(image: Image, url: str) -> requests.Response:
+     """
+     Upload a Pillow image to a URL.
+
+     :param image: Pillow image to upload.
+     :param url: Destination URL.
+     :returns: The upload response.
+     """
+     assert url.startswith("http"), "Destination URL for the image must be HTTP(S)"
+
+     # Retrieve a binarized version of the image
+     image_bytes = BytesIO()
+     image.save(image_bytes, format="jpeg")
+     image_bytes.seek(0)
+
+     # Upload the image
+     resp = _retried_request(url, method=requests.put, data=image_bytes)
+     logger.info(f"Uploaded image to {url} in {resp.elapsed}")
+
+     return resp
+
+
  def polygon_bounding_box(polygon: list[list[int | float]]) -> BoundingBox:
      """
      Compute the rectangle bounding box of a polygon.
@@ -167,8 +174,8 @@ def _retry_log(retry_state, *args, **kwargs):
      before_sleep=_retry_log,
      reraise=True,
  )
- def _retried_request(url):
-     resp = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
+ def _retried_request(url, *args, method=requests.get, **kwargs):
+     resp = method(url, *args, timeout=DOWNLOAD_TIMEOUT, **kwargs)
      resp.raise_for_status()
      return resp

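The retry helper change is what enables the new upload path: `_retried_request` keeps GET as its default, so existing download call sites are untouched, while `upload_image` passes `method=requests.put` to reuse the same retry and backoff policy for uploads. A minimal usage sketch (the destination URL is made up, and an actual PUT would only succeed against a server accepting it):

```python
from PIL import Image

from arkindex_worker.image import upload_image

# Hypothetical pre-signed destination URL accepting HTTP PUT
destination = "https://bucket.example.com/images/page.jpg?signature=abc"

# Build a small image; any Pillow image works
image = Image.new("RGB", (640, 480), color="white")

# Serialised to JPEG in memory, then PUT with the shared retry policy
resp = upload_image(image, destination)
print(resp.status_code, resp.elapsed)
```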
arkindex_worker/models.py CHANGED
@@ -75,10 +75,10 @@ class Element(MagicDict):

      def image_url(self, size: str = "full") -> str | None:
          """
-         Build an URL to access the image.
+         Build a URL to access the image.
          When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
          :param size: Subresolution of the image, following the syntax of the IIIF resize parameter.
-         :returns: An URL to the image, or None if the element does not have an image.
+         :returns: A URL to the image, or None if the element does not have an image.
          """
          if not self.get("zone"):
              return
arkindex_worker/utils.py CHANGED
@@ -31,9 +31,10 @@ def decompress_zst_archive(compressed_archive: Path) -> tuple[int, Path]:

      logger.debug(f"Uncompressing file to {archive_path}")
      try:
-         with compressed_archive.open("rb") as compressed, archive_path.open(
-             "wb"
-         ) as decompressed:
+         with (
+             compressed_archive.open("rb") as compressed,
+             archive_path.open("wb") as decompressed,
+         ):
              dctx.copy_stream(compressed, decompressed)
              logger.debug(f"Successfully uncompressed archive {compressed_archive}")
      except zstandard.ZstdError as e:
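
This hunk is purely syntactic: it adopts the parenthesized context-manager groups introduced in Python 3.10, which read better than breaking the line inside a call's parentheses. A standalone sketch of the construct:

```python
from pathlib import Path

src, dst = Path("in.bin"), Path("out.bin")
src.write_bytes(b"example payload")

# Python 3.10+: several context managers grouped in one pair of parentheses,
# trailing comma allowed, instead of wrapping the line break inside a call
with (
    src.open("rb") as compressed,
    dst.open("wb") as decompressed,
):
    decompressed.write(compressed.read())

assert dst.read_bytes() == b"example payload"
```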
arkindex_worker/worker/__init__.py CHANGED
@@ -1,6 +1,7 @@
  """
  Base classes to implement Arkindex workers.
  """
+
  import contextlib
  import json
  import os
@@ -229,12 +230,13 @@ class ElementsWorker(
                      with contextlib.suppress(Exception):
                          self.update_activity(element.id, ActivityState.Error)

+         message = f'Ran on {count} element{"s"[:count>1]}: {count - failed} completed, {failed} failed'
          if failed:
-             logger.error(
-                 f"Ran on {count} elements: {count - failed} completed, {failed} failed"
-             )
+             logger.error(message)
              if failed >= count:  # Everything failed!
                  sys.exit(1)
+         else:
+             logger.info(message)

      def process_element(self, element: Element | CachedElement):
          """
@@ -504,9 +506,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
              if dataset_artifact:
                  dataset_artifact.unlink(missing_ok=True)

+         message = f'Ran on {count} dataset{"s"[:count>1]}: {count - failed} completed, {failed} failed'
          if failed:
-             logger.error(
-                 f"Ran on {count} datasets: {count - failed} completed, {failed} failed"
-             )
+             logger.error(message)
              if failed >= count:  # Everything failed!
                  sys.exit(1)
+         else:
+             logger.info(message)
arkindex_worker/worker/base.py CHANGED
@@ -1,6 +1,7 @@
  """
  The base class for all Arkindex workers.
  """
+
  import argparse
  import json
  import logging
arkindex_worker/worker/dataset.py CHANGED
@@ -51,7 +51,7 @@ class DatasetMixin:

          return map(
              lambda result: Dataset(**result["dataset"], selected_sets=result["sets"]),
-             list(results),
+             results,
          )

      def list_dataset_elements(self, dataset: Dataset) -> Iterator[tuple[str, Element]]:
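
Dropping the `list(...)` call matters for memory and latency: previously every page of results was fetched up front when the method was called, whereas `map` over the raw paginator stays lazy, so pages are fetched only as the caller iterates. Illustrated with a stand-in paginator (not the real API client):

```python
def fake_paginate():
    """Stand-in for the API paginator; each yield is one fetched result."""
    for page in range(3):
        print(f"fetching page {page}")
        yield {"dataset": {"id": page}, "sets": ["train", "dev"]}

lazy = map(lambda result: result["dataset"]["id"], fake_paginate())
# Nothing has been fetched yet; the first fetch happens on consumption:
first = next(lazy)  # prints "fetching page 0"
```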
@@ -65,14 +65,20 @@
              dataset, Dataset
          ), "dataset shouldn't be null and should be a Dataset"

-         results = self.api_client.paginate("ListDatasetElements", id=dataset.id)
+         if dataset.sets == dataset.selected_sets:
+             results = self.api_client.paginate("ListDatasetElements", id=dataset.id)
+         else:
+             results = iter(
+                 element
+                 for selected_set in dataset.selected_sets
+                 for element in self.api_client.paginate(
+                     "ListDatasetElements", id=dataset.id, set=selected_set
+                 )
+             )

-         def format_result(result):
-             if result["set"] not in dataset.selected_sets:
-                 return
-             return (result["set"], Element(**result["element"]))
-
-         return filter(None, map(format_result, list(results)))
+         return map(
+             lambda result: (result["set"], Element(**result["element"])), results
+         )

      @unsupported_cache
      def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
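
The rewrite trades client-side filtering for server-side filtering: instead of listing every dataset element and discarding sets outside `selected_sets`, it issues one `ListDatasetElements` call per selected set (keeping the single unfiltered call when all sets are selected) and flattens the paginated calls with a nested generator expression. The flattening pattern, sketched against a stub client of our own:

```python
class StubClient:
    """Stand-in for the Arkindex API client used by DatasetMixin."""

    def paginate(self, endpoint, **params):
        # Pretend each set holds a single element
        yield {"set": params["set"], "element": {"id": f'{params["set"]}-0'}}

client = StubClient()
selected_sets = ["set_1", "set_2", "set_3"]

# One paginated call per set, chained lazily: no call is made until iteration
results = iter(
    element
    for selected_set in selected_sets
    for element in client.paginate("ListDatasetElements", set=selected_set)
)
print([result["element"]["id"] for result in results])
# ['set_1-0', 'set_2-0', 'set_3-0']
```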
arkindex_worker/worker/element.py CHANGED
@@ -1,6 +1,7 @@
  """
  ElementsWorker methods for elements and element types.
  """
+
  from collections.abc import Iterable
  from typing import NamedTuple
  from uuid import UUID
arkindex_worker/worker/metadata.py CHANGED
@@ -50,7 +50,7 @@ class MetaType(Enum):

      URL = "url"
      """
-     A metadata with a string value that should be interpreted as an URL.
+     A metadata with a string value that should be interpreted as a URL.
      Only the ``http`` and ``https`` schemes are allowed.
      """

arkindex_worker/worker/version.py CHANGED
@@ -1,6 +1,7 @@
  """
  ElementsWorker methods for worker versions.
  """
+
  import functools
  from warnings import warn

tests/test_dataset_worker.py CHANGED
@@ -195,7 +195,7 @@ def test_list_dataset_elements_per_split_api_error(
  ):
      responses.add(
          responses.GET,
-         f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
+         f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          status=500,
      )

@@ -211,23 +211,23 @@
          # The API call is retried 5 times
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
          ),
      ]

@@ -235,110 +235,60 @@
  def test_list_dataset_elements_per_split(
      responses, mock_dataset_worker, default_dataset
  ):
-     expected_results = [
-         {
-             "set": "set_1",
-             "element": {
-                 "id": "0000",
-                 "type": "page",
-                 "name": "Test",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         {
-             "set": "set_1",
-             "element": {
-                 "id": "1111",
-                 "type": "page",
-                 "name": "Test 2",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         {
-             "set": "set_2",
-             "element": {
-                 "id": "2222",
-                 "type": "page",
-                 "name": "Test 3",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         {
-             "set": "set_3",
-             "element": {
-                 "id": "3333",
-                 "type": "page",
-                 "name": "Test 4",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
-             },
-         },
-         # `set_4` is not in `default_dataset.selected_sets`
-         {
-             "set": "set_4",
-             "element": {
-                 "id": "4444",
-                 "type": "page",
-                 "name": "Test 5",
-                 "corpus": {},
-                 "thumbnail_url": None,
-                 "zone": {},
-                 "best_classes": None,
-                 "has_children": None,
-                 "worker_version_id": None,
-                 "worker_run_id": None,
+     expected_results = []
+     for selected_set in default_dataset.selected_sets:
+         index = selected_set[-1]
+         expected_results.append(
+             {
+                 "set": selected_set,
+                 "element": {
+                     "id": str(index) * 4,
+                     "type": "page",
+                     "name": f"Test {index}",
+                     "corpus": {},
+                     "thumbnail_url": None,
+                     "zone": {},
+                     "best_classes": None,
+                     "has_children": None,
+                     "worker_version_id": None,
+                     "worker_run_id": None,
+                 },
+             }
+         )
+         responses.add(
+             responses.GET,
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set={selected_set}&with_count=true",
+             status=200,
+             json={
+                 "count": 1,
+                 "next": None,
+                 "results": [expected_results[-1]],
              },
-         },
-     ]
-     responses.add(
-         responses.GET,
-         f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/",
-         status=200,
-         json={
-             "count": 4,
-             "next": None,
-             "results": expected_results,
-         },
-     )
+         )

      assert list(
          mock_dataset_worker.list_dataset_elements_per_split(default_dataset)
      ) == [
-         ("set_1", [expected_results[0]["element"], expected_results[1]["element"]]),
-         ("set_2", [expected_results[2]["element"]]),
-         ("set_3", [expected_results[3]["element"]]),
+         ("set_1", [expected_results[0]["element"]]),
+         ("set_2", [expected_results[1]["element"]]),
+         ("set_3", [expected_results[2]["element"]]),
      ]

-     assert len(responses.calls) == len(BASE_API_CALLS) + 1
+     assert len(responses.calls) == len(BASE_API_CALLS) + 3
      assert [
          (call.request.method, call.request.url) for call in responses.calls
      ] == BASE_API_CALLS + [
          (
              "GET",
-             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?with_count=true",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_1&with_count=true",
+         ),
+         (
+             "GET",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_2&with_count=true",
+         ),
+         (
+             "GET",
+             f"http://testserver/api/v1/datasets/{default_dataset.id}/elements/?set=set_3&with_count=true",
          ),
      ]

@@ -360,7 +310,7 @@ def test_list_datasets_api_error(responses, mock_dataset_worker):
      with pytest.raises(
          Exception, match="Stopping pagination as data will be incomplete"
      ):
-         mock_dataset_worker.list_datasets()
+         next(mock_dataset_worker.list_datasets())

      assert len(responses.calls) == len(BASE_API_CALLS) + 5
      assert [
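
The `next(...)` wrapper is needed because `list_datasets` no longer materializes pages during the call: with the eager `list(results)` gone, the pagination error is only raised once the returned `map` is consumed, so calling the method alone would no longer trigger it. In miniature:

```python
def failing_pagination():
    raise Exception("Stopping pagination as data will be incomplete")
    yield  # unreachable, but makes this a generator function

results = failing_pagination()  # no error yet: generators run lazily
# next(results)                 # raises; hence next(...) inside pytest.raises
```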
@@ -512,7 +462,7 @@ def test_run_initial_dataset_state_error(
          if generator
          else []
      ) + [
-         (logging.ERROR, "Ran on 1 datasets: 0 completed, 1 failed"),
+         (logging.ERROR, "Ran on 1 dataset: 0 completed, 1 failed"),
      ]


@@ -577,7 +527,7 @@ def test_run_update_dataset_state_api_error(
          ],
          (
              logging.ERROR,
-             "Ran on 1 datasets: 0 completed, 1 failed",
+             "Ran on 1 dataset: 0 completed, 1 failed",
          ),
      ]

@@ -639,7 +589,7 @@ def test_run_download_dataset_artifact_api_error(
          ),
          (
              logging.ERROR,
-             "Ran on 1 datasets: 0 completed, 1 failed",
+             "Ran on 1 dataset: 0 completed, 1 failed",
          ),
      ]

@@ -690,7 +640,7 @@ def test_run_no_downloaded_artifact_error(
          ),
          (
              logging.ERROR,
-             "Ran on 1 datasets: 0 completed, 1 failed",
+             "Ran on 1 dataset: 0 completed, 1 failed",
          ),
      ]

@@ -792,7 +742,9 @@ def test_run(
      assert [(level, message) for _, level, message in caplog.record_tuples] == [
          (logging.INFO, "Loaded Worker Fake worker @ 123412 from API"),
          (logging.INFO, "Processing Dataset (dataset_id) (1/1)"),
-     ] + extra_logs
+         *extra_logs,
+         (logging.INFO, "Ran on 1 dataset: 1 completed, 0 failed"),
+     ]


  @pytest.mark.parametrize(
@@ -890,4 +842,6 @@ def test_run_read_only(
      assert [(level, message) for _, level, message in caplog.record_tuples] == [
          (logging.WARNING, "Running without any extra configuration"),
          (logging.INFO, "Processing Dataset (dataset_id) (1/1)"),
-     ] + extra_logs
+         *extra_logs,
+         (logging.INFO, "Ran on 1 dataset: 1 completed, 0 failed"),
+     ]