datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/download/multiple.py
CHANGED
@@ -1,13 +1,31 @@
+"""Multiple download resources (legacy).
+
+Note: This module uses a legacy API pattern and needs deeper refactoring.
+The List and Datasets classes use an older constructor signature that
+differs from the modern Resource interface.
+"""
+
 import logging
-from pathlib import Path
 import os
+import warnings
+from pathlib import Path
 
-from datamaestro import AbstractDataset
+from datamaestro.definitions import AbstractDataset
 from datamaestro.download import Download
 
+warnings.warn(
+    "datamaestro.download.multiple uses a legacy API. "
+    "Consider migrating to class-attribute resource definitions.",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
 
 class List(Download):
-    """Download multiple files or directories given by a list"""
+    """Download multiple files or directories given by a list.
+
+    Legacy: uses old-style constructor API.
+    """
 
     def __init__(self, dataset: AbstractDataset, definition: object):
         super().__init__(dataset, definition)
@@ -32,7 +50,10 @@ class List(Download):
 
 
 class Datasets(Download):
-    """Use links to dataset files"""
+    """Use links to dataset files.
+
+    Legacy: uses old-style constructor API.
+    """
 
     def __init__(self, dataset: AbstractDataset, definition: object):
         super().__init__(dataset, definition)
@@ -48,7 +69,8 @@ class Datasets(Download):
         if isinstance(files, Path):
             if not files.is_dir():
                 raise AssertionError(
-                    "Dataset path is not a directory: %s",
+                    "Dataset path is not a directory: %s",
+                    files,
                 )
             path = destination / key
             if not path.exists():
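The DeprecationWarning above fires once, at first import of the module. Downstream code that must keep importing the legacy module can scope the warning away at the import site using the standard library alone (a minimal sketch; nothing here is datamaestro-specific):

    import warnings

    with warnings.catch_warnings():
        # The module-level warnings.warn(...) runs on first import only
        warnings.simplefilter("ignore", DeprecationWarning)
        from datamaestro.download import multiple  # noqa: F401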
datamaestro/download/single.py
CHANGED
@@ -1,58 +1,83 @@
+"""Single file download resources.
+
+Provides FileResource subclasses for downloading individual files
+from URLs, with optional transforms and integrity checking.
+"""
+
+from __future__ import annotations
+
+import io
+import gzip
 import logging
+import os
+import os.path as op
 import shutil
 import tarfile
-import io
-import tempfile
-import gzip
-import os.path as op, os
-import urllib3
 from pathlib import Path
-
-import re
-
-from datamaestro.download import Download
+
+import urllib3
+
+from datamaestro.download import FileResource
 from datamaestro.stream import Transform
-from datamaestro.utils import copyfileobjs
+from datamaestro.utils import copyfileobjs
+
+logger = logging.getLogger(__name__)
 
 
 def open_ext(*args, **kwargs):
-    """Opens a file according to its extension"""
+    """Opens a file according to its extension."""
     name = args[0]
     if name.endswith(".gz"):
         return gzip.open(*args, *kwargs)
     return io.open(*args, **kwargs)
 
 
-class SingleDownload(Download):
-    def __init__(self, filename: str):
-        super().__init__(re.sub(r"\..*$", "", filename))
-        self.name = filename
+class FileDownloader(FileResource):
+    """Downloads a single file from a URL.
 
-    @property
-    def path(self):
-        return self.definition.datapath / self.name
+    Supports optional transforms (e.g., gzip decompression)
+    and integrity checking.
 
-    def prepare(self):
-        return self.path
+    Usage as class attribute (preferred)::
 
-    def download(self, force=False):
-        if not self.path.is_file():
-            self._download(self.path)
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = FileDownloader.apply(
+                "data.csv", "http://example.com/data.csv.gz"
+            )
 
+    Usage as decorator (deprecated)::
+
+        @filedownloader("data.csv", "http://example.com/data.csv.gz")
+        @dataset(Base)
+        def my_dataset(data): ...
+    """
 
-class filedownloader(SingleDownload):
     def __init__(
-        self, filename: str, url: str, size: int = None, transforms: Transform = None, checker=None,
+        self,
+        filename: str,
+        url: str,
+        size: int | None = None,
+        transforms: Transform | None = None,
+        checker=None,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
     ):
-        """
-
+        """
         Args:
-            filename: The filename within the data folder; the variable
-                name corresponds to the filename without the extension
-            url: The URL to download
-            size: size in bytes (or None)
+            filename: The filename within the data folder; the variable
+                name corresponds to the filename without the extension.
+            url: The URL to download.
+            size: Expected size in bytes (or None).
+            transforms: Transform the file before storing it.
+                Auto-detected from URL path if None.
+            checker: File integrity checker.
+            varname: Explicit resource name.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(filename)
+        super().__init__(filename, varname=varname, transient=transient)
         self.url = url
         self.checker = checker
         self.size = size
@@ -61,8 +86,8 @@ class filedownloader(SingleDownload):
         path = Path(Path(p.path).name)
         self.transforms = transforms if transforms else Transform.createFromPath(path)
 
-    def _download(self, destination):
-        logging.info("Downloading %s into %s", self.url, destination)
+    def _download(self, destination: Path) -> None:
+        logger.info("Downloading %s into %s", self.url, destination)
 
         # Creates directory if needed
         dir = op.dirname(destination)
@@ -72,41 +97,69 @@ class filedownloader(SingleDownload):
         with self.context.downloadURL(self.url, size=self.size) as file:
             # Transform if need be
             if self.transforms:
-                logging.info("Transforming file")
-                with self.transforms(file.path.open("rb")) as stream, destination.open(
-                    "wb"
-                ) as out:
+                logger.info("Transforming file")
+                with (
+                    self.transforms(file.path.open("rb")) as stream,
+                    destination.open("wb") as out,
+                ):
                     if self.checker:
                         copyfileobjs(stream, [out, self.checker])
                         self.checker.close()
                     else:
                         shutil.copyfileobj(stream, out)
             else:
-                logging.info("Keeping original downloaded file %s", file.path)
+                logger.info("Keeping original downloaded file %s", file.path)
                 if self.checker:
                     self.checker.check(file.path)
                 (shutil.copy if file.keep else shutil.move)(file.path, destination)
 
-        logging.info("Created file %s", destination)
+        logger.info("Created file %s", destination)
+
 
+# Factory alias for backward compat and convenient usage
+filedownloader = FileDownloader.apply
 
-class concatdownload(SingleDownload):
-    """Concatenate all files in an archive"""
+class ConcatDownloader(FileResource):
+    """Concatenate all files from an archive into a single file.
 
-    def __init__(self, filename: str, url: str, transforms=None):
-        """
+    Usage as class attribute (preferred)::
+
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = ConcatDownloader.apply(
+                "data.txt", "http://example.com/data.tar.gz"
+            )
+    """
+
+    def __init__(
+        self,
+        filename: str,
+        url: str,
+        transforms=None,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
+    ):
+        """
         Args:
-            filename: The filename within the data folder; the variable
-                name corresponds to the filename without the extension
-            url: The URL to download
+            filename: The filename within the data folder; the variable
+                name corresponds to the filename without the extension.
+            url: The URL to download.
+            transforms: Transform the file before storing it.
+            varname: Explicit resource name.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(filename)
+        super().__init__(filename, varname=varname, transient=transient)
         self.url = url
         self.transforms = transforms
 
-    def _download(self, destination):
-        with self.context.downloadURL(self.url) as dl, tarfile.open(dl.path) as archive:
+    def _download(self, destination: Path) -> None:
+        with (
+            self.context.downloadURL(self.url) as dl,
+            tarfile.open(dl.path) as archive,
+        ):
             destination.parent.mkdir(parents=True, exist_ok=True)
 
             with open(destination, "wb") as out:
@@ -115,6 +168,16 @@ class concatdownload(SingleDownload):
                     transforms = self.transforms or Transform.createFromPath(
                         Path(tarinfo.name)
                     )
-                    logging.debug("Processing file %s", tarinfo.name)
+                    logger.debug("Processing file %s", tarinfo.name)
                     with transforms(archive.fileobject(archive, tarinfo)) as fp:
                         shutil.copyfileobj(fp, out)
+
+
+# Factory alias for backward compat
+concatdownload = ConcatDownloader.apply
+
+
+# --- Backward compat aliases ---
+# Keep old class names importable but they now point to new classes
+
+SingleDownload = FileDownloader
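In the integrity-checking branch of FileDownloader._download, copyfileobjs fans a single input stream out to several sinks at once: the destination file plus the checker, which receives the same bytes through its write() method before close() finalizes the check. A minimal sketch of what such a helper plausibly looks like (the real signature in datamaestro.utils may differ):

    def copyfileobjs(src, dsts, length=64 * 1024):
        # Read once, write the same chunk to every destination
        while chunk := src.read(length):
            for dst in dsts:
                dst.write(chunk)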
datamaestro/download/sync.py
CHANGED
datamaestro/download/todo.py
CHANGED
@@ -1,10 +1,15 @@
-from pathlib import Path
+from datamaestro.download import Resource
 
-from datamaestro.download import Download
 
+class Todo(Resource):
+    """Placeholder resource indicating download is not yet implemented."""
 
-class Todo(Download):
-    def download(self, destination: Path):
+    def download(self, force=False):
         raise NotImplementedError(
             "Download method not defined - please edit the definition file"
         )
+
+    def prepare(self):
+        raise NotImplementedError(
+            "Prepare method not defined - please edit the definition file"
+        )
datamaestro/download/wayback.py
ADDED
@@ -0,0 +1,164 @@
+import logging
+import json
+from datamaestro.download import Resource
+from typing import Callable, Iterator
+from pathlib import Path
+import requests
+import random
+import re
+from requests.exceptions import HTTPError
+from tqdm.auto import tqdm
+import time
+import urllib.parse
+import uuid
+
+
+wayback_prefix = re.compile(r"^https:\/\/web\.archive\.org\/web")
+replace_pattern = re.compile(r"(web\.archive\.org\/web\/\d+)")
+
+
+def download_with_retry(url: str, max_retries: int = 10) -> requests.Response:
+    """Download a URL with exponential backoff, until max_retries is reached."""
+    retry_num = 0
+    while True:
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+            return response
+        except HTTPError as e:
+            status_code = e.response.status_code
+            if not (status_code == 429 or status_code >= 500):
+                # This is not an error we should retry on
+                raise e
+
+            if retry_num > max_retries:
+                logging.error(
+                    f"Failed to perform GET request on {url}"
+                    f"after {max_retries} retries."
+                )
+                raise e
+
+            if status_code == 429:
+                time.sleep(5 + 2**retry_num + random.randint(0, 1000) / 1000)
+            else:
+                time.sleep(2**retry_num + random.randint(0, 1000) / 1000)
+            retry_num += 1
+
+
+def download_link(link: str, timestamp: str):
+    page_id = str(uuid.uuid4())
+    url_no_header = None
+
+    try:
+        # Find the Wayback Machine link
+        if not wayback_prefix.match(link):
+            link_encoded = urllib.parse.quote(link)
+
+            available, availability_attempt = False, 0
+            # Sometimes the API returns HTTP success code 200, but archived
+            # snapshots shows page is unavailable when it actually is. Give it a
+            # total of three tries.
+            while not available and availability_attempt < 3:
+                response = download_with_retry(
+                    "http://archive.org/wayback/available?"
+                    f"url={link_encoded}&timestamp={timestamp}"
+                )
+                json_response = response.json()
+                available = "closest" in json_response["archived_snapshots"]
+                availability_attempt += 1
+
+            if not available:
+                logging.warning(
+                    f"Not available on Wayback Machine: {link}, "
+                    f"HTTP code {response.status_code}, {json_response}"
+                )
+                return {"link": link, "page_id": page_id, "available": False}
+
+            url = json_response["archived_snapshots"]["closest"]["url"]
+        else:
+            url = link
+
+        match = replace_pattern.search(url)
+        assert match
+        url_no_header = replace_pattern.sub(f"{match.group(1)}id_", url)
+
+        response = download_with_retry(url_no_header)
+        html_page = response.text
+
+        return {
+            "link": link,
+            "id": url_no_header,
+            "contents": html_page,
+        }
+
+    except HTTPError as http_err:
+        logging.warning(f"HTTP error occurred: {http_err} for {link}")
+        return {
+            "link": link,
+            "page_id": page_id,
+            "available": False,
+            "status_code": http_err.response.status_code if http_err.response else None,
+            "wayback_url": url_no_header,
+        }
+    except UnicodeDecodeError as e:
+        logging.warning(f"Unicode decode error occurred: {e} for {link}")
+        return {
+            "link": link,
+            "page_id": page_id,
+            "available": False,
+            "status_code": response.status_code,
+            "wayback_url": url_no_header,
+        }
+    except Exception as e:
+        logging.warning(f"Exception occurred: {e} for {link}")
+        return {
+            "link": link,
+            "page_id": page_id,
+            "available": False,
+            "status_code": None,
+            "wayback_url": url_no_header,
+        }
+
+
+class wayback_documents(Resource):
+    """Collect documents from wayback"""
+
+    def __init__(self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None):
+        super().__init__(name)
+        self.timestamp = timestamp
+        self.urls_fn = urls_fn
+
+    def prepare(self):
+        return self.dataset.datapath / self.name
+
+    def download(self, force=False):
+        # Creates directory if needed
+        destination: Path = self.dataset.datapath / self.name
+        self.dataset.datapath.mkdir(exist_ok=True)
+
+        # Early exit
+        done_path = destination.with_suffix(".done")
+        if done_path.is_file() and not force:
+            return True
+
+        # Reads the URLs
+        logging.info("Retrieving URLs from wayback into %s", destination)
+        pos = 0
+        urls = set()
+        with destination.open("at+") as fp:
+            fp.seek(0)
+            try:
+                while line := fp.readline():
+                    pos = fp.tell()
+                    urls.add(json.loads(line)["url"])
+            except json.JSONDecodeError:
+                logging.warning(f"JSON decoding error: getting back to position {pos}")
+                fp.seek(pos)
+
+            # Get the remaining ones
+            for url in tqdm(self.urls_fn()):
+                if url not in urls:
+                    fp.write(json.dumps(download_link(url, self.timestamp)))
+
+            # Everything is fine
+            done_path.touch()
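download_with_retry sleeps for an exponentially growing, jittered interval between attempts, with a flat 5-second penalty when the server answers HTTP 429. A standalone sketch of that delay schedule (it mirrors the sleep computation above; not part of the package API):

    import random

    def backoff_delay(retry_num: int, throttled: bool) -> float:
        # Up to one second of random jitter avoids synchronized retries
        jitter = random.randint(0, 1000) / 1000
        delay = 2**retry_num + jitter
        # Rate-limited responses (HTTP 429) get an extra flat 5 seconds
        return 5 + delay if throttled else delay

    # First delays without throttling: ~1s, ~2s, ~4s, ~8s (plus jitter)
    for n in range(4):
        print(n, round(backoff_delay(n, throttled=False), 3))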