datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,13 +1,31 @@
1
+ """Multiple download resources (legacy).
2
+
3
+ Note: This module uses a legacy API pattern and needs deeper refactoring.
4
+ The List and Datasets classes use an older constructor signature that
5
+ differs from the modern Resource interface.
6
+ """
7
+
1
8
  import logging
2
- from pathlib import Path
3
9
  import os
10
+ import warnings
11
+ from pathlib import Path
4
12
 
5
- from datamaestro import AbstractDataset
13
+ from datamaestro.definitions import AbstractDataset
6
14
  from datamaestro.download import Download
7
15
 
16
+ warnings.warn(
17
+ "datamaestro.download.multiple uses a legacy API. "
18
+ "Consider migrating to class-attribute resource definitions.",
19
+ DeprecationWarning,
20
+ stacklevel=2,
21
+ )
22
+
8
23
 
9
24
  class List(Download):
10
- """Download multiple files or directories given by a list"""
25
+ """Download multiple files or directories given by a list.
26
+
27
+ Legacy: uses old-style constructor API.
28
+ """
11
29
 
12
30
  def __init__(self, dataset: AbstractDataset, definition: object):
13
31
  super().__init__(dataset, definition)
@@ -32,7 +50,10 @@ class List(Download):
32
50
 
33
51
 
34
52
  class Datasets(Download):
35
- """Use links to dataset files"""
53
+ """Use links to dataset files.
54
+
55
+ Legacy: uses old-style constructor API.
56
+ """
36
57
 
37
58
  def __init__(self, dataset: AbstractDataset, definition: object):
38
59
  super().__init__(dataset, definition)
@@ -48,7 +69,8 @@ class Datasets(Download):
48
69
  if isinstance(files, Path):
49
70
  if not files.is_dir():
50
71
  raise AssertionError(
51
- "Dataset path is not a directory: %s", files
72
+ "Dataset path is not a directory: %s",
73
+ files,
52
74
  )
53
75
  path = destination / key
54
76
  if not path.exists():
@@ -1,58 +1,83 @@
1
+ """Single file download resources.
2
+
3
+ Provides FileResource subclasses for downloading individual files
4
+ from URLs, with optional transforms and integrity checking.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import io
10
+ import gzip
1
11
  import logging
12
+ import os
13
+ import os.path as op
2
14
  import shutil
3
15
  import tarfile
4
- import io
5
- import tempfile
6
- import gzip
7
- import os.path as op, os
8
- import urllib3
9
16
  from pathlib import Path
10
- from tempfile import NamedTemporaryFile
11
- import re
12
- from docstring_parser import parse
13
- from datamaestro.utils import copyfileobjs
17
+
18
+ import urllib3
19
+
20
+ from datamaestro.download import FileResource
14
21
  from datamaestro.stream import Transform
15
- from datamaestro.download import Download
22
+ from datamaestro.utils import copyfileobjs
23
+
24
+ logger = logging.getLogger(__name__)
16
25
 
17
26
 
def open_ext(*args, **kwargs):
    """Open a file, transparently handling gzip files by extension.

    Args:
        *args: Positional arguments forwarded to the opener. The first one
            is the file name and must be a string.
        **kwargs: Keyword arguments forwarded to the opener (for instance
            ``mode`` or ``encoding``).

    Returns:
        The object returned by ``gzip.open`` when the name ends with
        ``.gz``, otherwise the one returned by ``io.open``.
    """
    name = args[0]
    if name.endswith(".gz"):
        # Keyword arguments have to be expanded with ``**``: a single ``*``
        # would pass the dictionary *keys* as extra positional arguments.
        return gzip.open(*args, **kwargs)
    return io.open(*args, **kwargs)
24
33
 
25
34
 
26
- class SingleDownload(Download):
27
- def __init__(self, filename: str):
28
- super().__init__(re.sub(r"\..*$", "", filename))
29
- self.name = filename
35
+ class FileDownloader(FileResource):
36
+ """Downloads a single file from a URL.
30
37
 
31
- @property
32
- def path(self):
33
- return self.definition.datapath / self.name
38
+ Supports optional transforms (e.g., gzip decompression)
39
+ and integrity checking.
34
40
 
35
- def prepare(self):
36
- return self.path
41
+ Usage as class attribute (preferred)::
37
42
 
38
- def download(self, force=False):
39
- if not self.path.is_file():
40
- self._download(self.path)
43
+ @dataset(url="...")
44
+ class MyDataset(Base):
45
+ DATA = FileDownloader.apply(
46
+ "data.csv", "http://example.com/data.csv.gz"
47
+ )
41
48
 
49
+ Usage as decorator (deprecated)::
50
+
51
+ @filedownloader("data.csv", "http://example.com/data.csv.gz")
52
+ @dataset(Base)
53
+ def my_dataset(data): ...
54
+ """
42
55
 
43
- class filedownloader(SingleDownload):
44
56
  def __init__(
45
- self, filename: str, url: str, size: int = None, transforms=None, checker=None
57
+ self,
58
+ filename: str,
59
+ url: str,
60
+ size: int | None = None,
61
+ transforms: Transform | None = None,
62
+ checker=None,
63
+ *,
64
+ varname: str | None = None,
65
+ transient: bool = False,
46
66
  ):
47
- """Downloads a file given by a URL
48
-
67
+ """
49
68
  Args:
50
- filename: The filename within the data folder; the variable name corresponds to the filename without the extension
51
- url: The URL to download
52
- transforms: Transform the file before storing it
53
- size: size in bytes (or None)
69
+ filename: The filename within the data folder; the variable
70
+ name corresponds to the filename without the extension.
71
+ url: The URL to download.
72
+ size: Expected size in bytes (or None).
73
+ transforms: Transform the file before storing it.
74
+ Auto-detected from URL path if None.
75
+ checker: File integrity checker.
76
+ varname: Explicit resource name.
77
+ transient: If True, data can be deleted after dependents
78
+ complete.
54
79
  """
55
- super().__init__(filename)
80
+ super().__init__(filename, varname=varname, transient=transient)
56
81
  self.url = url
57
82
  self.checker = checker
58
83
  self.size = size
@@ -61,8 +86,8 @@ class filedownloader(SingleDownload):
61
86
  path = Path(Path(p.path).name)
62
87
  self.transforms = transforms if transforms else Transform.createFromPath(path)
63
88
 
64
- def _download(self, destination):
65
- logging.info("Downloading %s into %s", self.url, destination)
89
+ def _download(self, destination: Path) -> None:
90
+ logger.info("Downloading %s into %s", self.url, destination)
66
91
 
67
92
  # Creates directory if needed
68
93
  dir = op.dirname(destination)
@@ -72,41 +97,69 @@ class filedownloader(SingleDownload):
72
97
  with self.context.downloadURL(self.url, size=self.size) as file:
73
98
  # Transform if need be
74
99
  if self.transforms:
75
- logging.info("Transforming file")
76
- with self.transforms(file.path.open("rb")) as stream, destination.open(
77
- "wb"
78
- ) as out:
100
+ logger.info("Transforming file")
101
+ with (
102
+ self.transforms(file.path.open("rb")) as stream,
103
+ destination.open("wb") as out,
104
+ ):
79
105
  if self.checker:
80
106
  copyfileobjs(stream, [out, self.checker])
81
107
  self.checker.close()
82
108
  else:
83
109
  shutil.copyfileobj(stream, out)
84
110
  else:
85
- logging.info("Keeping original downloaded file %s", file.path)
111
+ logger.info("Keeping original downloaded file %s", file.path)
86
112
  if self.checker:
87
113
  self.checker.check(file.path)
88
114
  (shutil.copy if file.keep else shutil.move)(file.path, destination)
89
115
 
90
- logging.info("Created file %s" % destination)
116
+ logger.info("Created file %s", destination)
117
+
91
118
 
119
+ # Factory alias for backward compat and convenient usage
120
+ filedownloader = FileDownloader.apply
92
121
 
93
- class concatdownload(SingleDownload):
94
- """Concatenate all files in an archive"""
95
122
 
96
- def __init__(self, filename: str, url: str, transforms=None):
97
- """Concat the files in an archive
123
+ class ConcatDownloader(FileResource):
124
+ """Concatenate all files from an archive into a single file.
98
125
 
126
+ Usage as class attribute (preferred)::
127
+
128
+ @dataset(url="...")
129
+ class MyDataset(Base):
130
+ DATA = ConcatDownloader.apply(
131
+ "data.txt", "http://example.com/data.tar.gz"
132
+ )
133
+ """
134
+
135
+ def __init__(
136
+ self,
137
+ filename: str,
138
+ url: str,
139
+ transforms=None,
140
+ *,
141
+ varname: str | None = None,
142
+ transient: bool = False,
143
+ ):
144
+ """
99
145
  Args:
100
- filename: The filename within the data folder; the variable name corresponds to the filename without the extension
101
- url: The URL to download
102
- transforms: Transform the file before storing it
146
+ filename: The filename within the data folder; the variable
147
+ name corresponds to the filename without the extension.
148
+ url: The URL to download.
149
+ transforms: Transform the file before storing it.
150
+ varname: Explicit resource name.
151
+ transient: If True, data can be deleted after dependents
152
+ complete.
103
153
  """
104
- super().__init__(filename)
154
+ super().__init__(filename, varname=varname, transient=transient)
105
155
  self.url = url
106
156
  self.transforms = transforms
107
157
 
108
- def _download(self, destination):
109
- with self.context.downloadURL(self.url) as dl, tarfile.open(dl.path) as archive:
158
+ def _download(self, destination: Path) -> None:
159
+ with (
160
+ self.context.downloadURL(self.url) as dl,
161
+ tarfile.open(dl.path) as archive,
162
+ ):
110
163
  destination.parent.mkdir(parents=True, exist_ok=True)
111
164
 
112
165
  with open(destination, "wb") as out:
@@ -115,6 +168,16 @@ class concatdownload(SingleDownload):
115
168
  transforms = self.transforms or Transform.createFromPath(
116
169
  Path(tarinfo.name)
117
170
  )
118
- logging.debug("Processing file %s", tarinfo.name)
171
+ logger.debug("Processing file %s", tarinfo.name)
119
172
  with transforms(archive.fileobject(archive, tarinfo)) as fp:
120
173
  shutil.copyfileobj(fp, out)
174
+
175
+
176
+ # Factory alias for backward compat
177
+ concatdownload = ConcatDownloader.apply
178
+
179
+
180
+ # --- Backward compat aliases ---
181
+ # Keep old class names importable but they now point to new classes
182
+
183
+ SingleDownload = FileDownloader
@@ -2,7 +2,6 @@ import logging
2
2
  from pathlib import Path
3
3
 
4
4
  from datamaestro.download import Download
5
- from datamaestro.definitions import AbstractDataset
6
5
 
7
6
  from subprocess import run
8
7
 
@@ -1,10 +1,15 @@
1
- from pathlib import Path
1
+ from datamaestro.download import Resource
2
2
 
3
- from datamaestro.download import Download
4
3
 
4
+ class Todo(Resource):
5
+ """Placeholder resource indicating download is not yet implemented."""
5
6
 
6
- class Todo(Download):
7
- def download(self, destination: Path):
7
+ def download(self, force=False):
8
8
  raise NotImplementedError(
9
9
  "Download method not defined - please edit the definition file"
10
10
  )
11
+
12
+ def prepare(self):
13
+ raise NotImplementedError(
14
+ "Prepare method not defined - please edit the definition file"
15
+ )
@@ -0,0 +1,164 @@
1
+ import logging
2
+ import json
3
+ from datamaestro.download import Resource
4
+ from typing import Callable, Iterator
5
+ from pathlib import Path
6
+ import requests
7
+ import random
8
+ import re
9
+ from requests.exceptions import HTTPError
10
+ from tqdm.auto import tqdm
11
+ import time
12
+ import urllib.parse
13
+ import uuid
14
+
15
+
16
# Matches links that already point at a Wayback Machine snapshot. Both the
# http and https schemes are accepted so that such links are used directly
# instead of being looked up again through the availability API.
wayback_prefix = re.compile(r"^https?://web\.archive\.org/web")
# Captures the "web.archive.org/web/<digits>" part of a snapshot URL.
replace_pattern = re.compile(r"(web\.archive\.org/web/\d+)")
18
+
19
+
20
def download_with_retry(url: str, max_retries: int = 10) -> "requests.Response":
    """Download a URL, retrying with exponential backoff on transient errors.

    Only HTTP 429 and 5xx responses are retried; any other HTTP error is
    re-raised immediately.

    Args:
        url: The URL to fetch with a GET request.
        max_retries: Maximum number of retries after the initial attempt.

    Returns:
        The response of the first successful request.

    Raises:
        requests.exceptions.HTTPError: For a status code that is not
            retried, or when the request still fails after ``max_retries``
            retries.
    """
    retry_num = 0
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if status_code != 429 and status_code < 500:
                # This is not an error we should retry on
                raise

            # ``>=`` (not ``>``) so that at most ``max_retries`` retries
            # are performed after the initial attempt.
            if retry_num >= max_retries:
                logging.error(
                    "Failed to perform GET request on %s after %d retries.",
                    url,
                    max_retries,
                )
                raise

            # Exponential backoff with up to one second of random jitter;
            # a 429 response waits five extra seconds.
            delay = 2**retry_num + random.randint(0, 1000) / 1000
            if status_code == 429:
                delay += 5
            time.sleep(delay)
            retry_num += 1
46
+
47
+
48
def download_link(link: str, timestamp: str):
    """Fetch the archived content of a link from the Wayback Machine.

    When ``link`` is not already a Wayback Machine URL, the availability API
    is queried (up to three times) for the snapshot closest to ``timestamp``.
    The snapshot URL is then rewritten by appending ``id_`` to its
    ``web.archive.org/web/<digits>`` part before being downloaded (the
    variable name suggests this requests the page without the Wayback
    header - TODO confirm).

    Args:
        link: The original URL, or a Wayback Machine snapshot URL.
        timestamp: Timestamp passed to the availability API.

    Returns:
        On success, a dictionary with the keys ``link``, ``id`` (the URL
        that was downloaded) and ``contents`` (the response text). On
        failure, a dictionary with ``link``, ``page_id`` and
        ``available: False``, plus ``status_code`` and ``wayback_url`` when
        the failure comes from an exception. Exceptions are logged and
        reported through the returned dictionary rather than raised.
    """
    page_id = str(uuid.uuid4())
    url_no_header = None
    # Bound before the ``try`` so that the error handlers can always use it
    response = None

    try:
        # Find the Wayback Machine link
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)

            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived
            # snapshots shows page is unavailable when it actually is. Give it a
            # total of three tries.
            while not available and availability_attempt < 3:
                response = download_with_retry(
                    "http://archive.org/wayback/available?"
                    f"url={link_encoded}&timestamp={timestamp}"
                )
                json_response = response.json()
                available = "closest" in json_response["archived_snapshots"]
                availability_attempt += 1

            if not available:
                logging.warning(
                    "Not available on Wayback Machine: %s, HTTP code %s, %s",
                    link,
                    response.status_code,
                    json_response,
                )
                return {"link": link, "page_id": page_id, "available": False}

            url = json_response["archived_snapshots"]["closest"]["url"]
        else:
            url = link

        match = replace_pattern.search(url)
        if match is None:
            # Explicit check instead of ``assert``, which is stripped when
            # Python runs with -O; handled by the generic handler below.
            raise ValueError(f"Not a Wayback Machine snapshot URL: {url}")
        url_no_header = replace_pattern.sub(f"{match.group(1)}id_", url)

        response = download_with_retry(url_no_header)
        html_page = response.text

        return {
            "link": link,
            "id": url_no_header,
            "contents": html_page,
        }

    except HTTPError as http_err:
        logging.warning("HTTP error occurred: %s for %s", http_err, link)
        # A requests ``Response`` is falsy for 4xx/5xx status codes, so the
        # presence of the response must be tested with ``is not None``.
        failed = http_err.response
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": failed.status_code if failed is not None else None,
            "wayback_url": url_no_header,
        }
    except UnicodeDecodeError as e:
        logging.warning("Unicode decode error occurred: %s for %s", e, link)
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": response.status_code if response is not None else None,
            "wayback_url": url_no_header,
        }
    except Exception as e:
        # Deliberate best effort: one failing link must not abort the caller
        logging.warning("Exception occurred: %s for %s", e, link)
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": None,
            "wayback_url": url_no_header,
        }
121
+
122
+
123
class wayback_documents(Resource):
    """Collect documents from the Wayback Machine.

    The results of ``download_link`` are stored as one JSON object per line
    in ``<datapath>/<name>``. A sibling file with the ``.done`` suffix marks
    a completed download.
    """

    def __init__(
        self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None
    ):
        """
        Args:
            timestamp: Timestamp passed to ``download_link`` for each URL.
            urls_fn: Callable returning an iterator over the URLs to fetch.
            name: Resource name, forwarded to the parent constructor.
        """
        super().__init__(name)
        self.timestamp = timestamp
        self.urls_fn = urls_fn

    def prepare(self):
        """Return the path of the file holding the collected documents."""
        return self.dataset.datapath / self.name

    def download(self, force=False):
        """Download the documents that are not yet in the output file.

        Args:
            force: When true, run even if the ``.done`` marker exists.

        Returns:
            ``True`` when the marker exists and ``force`` is false,
            ``None`` otherwise.
        """
        # Creates directory (and missing parents) if needed
        destination: Path = self.dataset.datapath / self.name
        self.dataset.datapath.mkdir(parents=True, exist_ok=True)

        # Early exit
        done_path = destination.with_suffix(".done")
        if done_path.is_file() and not force:
            return True

        # Reads the URLs that were already processed
        logging.info("Retrieving URLs from wayback into %s", destination)
        pos = 0
        urls = set()
        with destination.open("at+") as fp:
            fp.seek(0)
            try:
                while line := fp.readline():
                    # Records are keyed by "link" (see ``download_link``)
                    urls.add(json.loads(line)["link"])
                    # Only advance once the line is known to be valid
                    pos = fp.tell()
            except json.JSONDecodeError:
                logging.warning(f"JSON decoding error: getting back to position {pos}")
                # In append mode writes always go to the end of the file,
                # so the broken tail has to be removed explicitly.
                fp.seek(pos)
                fp.truncate()

            # Get the remaining ones
            for url in tqdm(self.urls_fn()):
                if url not in urls:
                    # One record per line so that the file can be re-read
                    record = download_link(url, self.timestamp)
                    fp.write(json.dumps(record) + "\n")
                    # Avoid fetching the same URL twice within one run
                    urls.add(url)

        # Everything is fine
        done_path.touch()