datamaestro 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/download/wayback.py +163 -0
- datamaestro/version.py +2 -2
- {datamaestro-1.2.0.dist-info → datamaestro-1.2.1.dist-info}/METADATA +1 -1
- {datamaestro-1.2.0.dist-info → datamaestro-1.2.1.dist-info}/RECORD +8 -7
- {datamaestro-1.2.0.dist-info → datamaestro-1.2.1.dist-info}/LICENSE +0 -0
- {datamaestro-1.2.0.dist-info → datamaestro-1.2.1.dist-info}/WHEEL +0 -0
- {datamaestro-1.2.0.dist-info → datamaestro-1.2.1.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.2.0.dist-info → datamaestro-1.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import json
|
|
3
|
+
from datamaestro.download import Resource
|
|
4
|
+
from typing import Callable, Iterator
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import requests
|
|
7
|
+
import random
|
|
8
|
+
import re
|
|
9
|
+
from requests.exceptions import HTTPError
|
|
10
|
+
from tqdm.auto import tqdm
|
|
11
|
+
import time
|
|
12
|
+
import urllib.parse
|
|
13
|
+
import uuid
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Matches URLs that already point at a Wayback Machine snapshot
wayback_prefix = re.compile(r"^https:\/\/web\.archive\.org\/web")
# Captures the "web.archive.org/web/<timestamp>" portion of a snapshot URL,
# used to append the "id_" flag (raw page, no Wayback toolbar/rewriting)
replace_pattern = re.compile(r"(web\.archive\.org\/web\/\d+)")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def download_with_retry(url: str, max_retries: int = 10) -> requests.Response:
    """Download a URL with exponential backoff, until max_retries is reached.

    :param url: the URL to GET
    :param max_retries: maximum number of retries before giving up
    :return: the successful :class:`requests.Response`
    :raises HTTPError: immediately for non-retryable statuses, or the last
        error once retries are exhausted
    """
    retry_num = 0
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            # Only rate limiting (429) and server errors (5xx) are transient;
            # anything else (e.g. 404) will not improve with retries.
            if not (status_code == 429 or status_code >= 500):
                # This is not an error we should retry on
                raise e

            if retry_num > max_retries:
                # FIX: the two f-string fragments previously joined without a
                # space, producing "...{url}after {max_retries} retries."
                logging.error(
                    f"Failed to perform GET request on {url} "
                    f"after {max_retries} retries."
                )
                raise e

            # Exponential backoff with up to 1s of random jitter; when rate
            # limited, wait an extra 5 seconds to let the quota recover.
            if status_code == 429:
                time.sleep(5 + 2**retry_num + random.randint(0, 1000) / 1000)
            else:
                time.sleep(2**retry_num + random.randint(0, 1000) / 1000)
            retry_num += 1
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def download_link(link: str, timestamp: str):
    """Fetch one page from the Wayback Machine.

    :param link: the original URL, or an existing web.archive.org snapshot URL
    :param timestamp: the target Wayback timestamp (for the availability API)
    :return: on success, a dict with keys ``link``, ``id`` (the raw snapshot
        URL) and ``contents`` (the HTML); on failure, a dict with keys
        ``link``, ``page_id``, ``available``/``status_code``/``wayback_url``.
        NOTE(review): the success and failure records use different key sets
        (``id`` vs ``page_id``, no ``available`` on success) — presumably
        intentional, but worth confirming against consumers.
    """
    # Random identifier attached to failure records
    page_id = str(uuid.uuid4())
    url_no_header = None

    try:
        # Find the Wayback Machine link
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)

            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived
            # snapshots shows page is unavailable when it actually is. Give it a
            # total of three tries.
            while not available and availability_attempt < 3:
                # NOTE(review): "&timestamp" restored here — the extracted
                # source showed the mojibake "×tamp" for this query parameter.
                response = download_with_retry(
                    "http://archive.org/wayback/available?"
                    f"url={link_encoded}&timestamp={timestamp}"
                )
                json_response = response.json()
                available = "closest" in json_response["archived_snapshots"]
                availability_attempt += 1

            if not available:
                logging.warning(
                    f"Not available on Wayback Machine: {link}, "
                    f"HTTP code {response.status_code}, {json_response}"
                )
                return {"link": link, "page_id": page_id, "available": False}

            url = json_response["archived_snapshots"]["closest"]["url"]
        else:
            url = link

        # Append "id_" to the timestamp segment so the archive serves the raw
        # page without the Wayback toolbar injected.
        match = replace_pattern.search(url)
        assert match
        url_no_header = replace_pattern.sub(f"{match.group(1)}id_", url)

        response = download_with_retry(url_no_header)
        html_page = response.text

        return {
            "link": link,
            "id": url_no_header,
            "contents": html_page,
        }

    except HTTPError as http_err:
        logging.warning(f"HTTP error occurred: {http_err} for {link}")
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": http_err.response.status_code if http_err.response else None,
            "wayback_url": url_no_header,
        }
    except UnicodeDecodeError as e:
        # Raised while decoding response.text, so `response` is bound here
        logging.warning(f"Unicode decode error occurred: {e} for {link}")
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": response.status_code,
            "wayback_url": url_no_header,
        }
    except Exception as e:
        # Catch-all boundary: record the failure instead of aborting the batch
        logging.warning(f"Exception occurred: {e} for {link}")
        return {
            "link": link,
            "page_id": page_id,
            "available": False,
            "status_code": None,
            "wayback_url": url_no_header,
        }
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class wayback_documents(Resource):
    """Collect documents from wayback.

    Downloaded pages are stored as one JSON record per line in the file
    returned by :meth:`prepare`; a ``.done`` marker file signals completion.
    """

    def __init__(self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None):
        """
        :param timestamp: target Wayback Machine timestamp
        :param urls_fn: callable returning an iterator over the URLs to fetch
        :param name: optional resource name (passed to Resource)
        """
        super().__init__(name)
        self.timestamp = timestamp
        self.urls_fn = urls_fn

    def prepare(self):
        # Path of the JSON-lines file holding the downloaded documents
        return self.definition.datapath / self.varname

    def download(self, force=False):
        """Download all documents, resuming from a partial file if present.

        :param force: re-download even if the ``.done`` marker exists
        """
        # Creates directory if needed
        destination: Path = self.definition.datapath / self.varname
        self.definition.datapath.mkdir(exist_ok=True)

        # Early exit
        done_path = destination.with_suffix(".done")
        if done_path.is_file() and not force:
            return True

        # Reads the URLs already downloaded so we can resume
        logging.info("Retrieving URLs from wayback")
        pos = 0
        urls = set()
        with destination.open("at+") as fp:
            fp.seek(0)
            try:
                for line in fp:
                    pos = fp.tell()
                    # FIX: download_link stores the original URL under the
                    # "link" key; reading "url" raised KeyError on resume
                    urls.add(json.loads(line)["link"])
            except json.JSONDecodeError:
                # Truncated trailing record (e.g. interrupted run)
                logging.warning(f"JSON decoding error: getting back to position {pos}")
                fp.seek(pos)

            # Get the remaining ones
            for url in tqdm(self.urls_fn()):
                # FIX: skip URLs already recorded (the set was previously
                # built but never consulted), and terminate each record with
                # a newline so the file stays valid JSON-lines — the resume
                # loop above reads it line by line.
                if url not in urls:
                    fp.write(json.dumps(download_link(url, self.timestamp)) + "\n")

        # Everything is fine
        done_path.touch()
|
datamaestro/version.py
CHANGED
|
@@ -8,7 +8,7 @@ datamaestro/search.py,sha256=PMceNp5hcp0dlzs4cLb6LJT7XHrdXo58oO7oTucawbE,2887
|
|
|
8
8
|
datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
|
|
9
9
|
datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
|
|
10
10
|
datamaestro/utils.py,sha256=Y3_aqeOHW8vuifwggGWJfgONyDG1FLX7ONAnX85jENI,6511
|
|
11
|
-
datamaestro/version.py,sha256=
|
|
11
|
+
datamaestro/version.py,sha256=2U0Gn26fYI3Vgj5hgkLM8I3wI6YEVdffJGllaVW-sSc,411
|
|
12
12
|
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
13
13
|
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
14
14
|
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -28,6 +28,7 @@ datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0
|
|
|
28
28
|
datamaestro/download/single.py,sha256=QSEviTP9lHLh3ZGyo_KoW3ro8UvWCGNPHeZiNj-9rLA,4134
|
|
29
29
|
datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
|
|
30
30
|
datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
|
|
31
|
+
datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
|
|
31
32
|
datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
|
|
32
33
|
datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
|
|
33
34
|
datamaestro/stream/lines.py,sha256=UNGcyZlZxN0Q7kw717jbhZFdDVmtfJfkJZCgK7xzF9A,1996
|
|
@@ -38,9 +39,9 @@ datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,
|
|
|
38
39
|
datamaestro/test/test_annotations.py,sha256=kRPUmS_UAN6JSSVPUwV4OM_LEuEUHF1OcLSiYXjsKjw,246
|
|
39
40
|
datamaestro/test/test_download_handlers.py,sha256=Qqm-fML1KVp6dPwAUcH6xzi_dpQIshvROzviSYCUzc0,603
|
|
40
41
|
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
41
|
-
datamaestro-1.2.
|
|
42
|
-
datamaestro-1.2.
|
|
43
|
-
datamaestro-1.2.
|
|
44
|
-
datamaestro-1.2.
|
|
45
|
-
datamaestro-1.2.
|
|
46
|
-
datamaestro-1.2.
|
|
42
|
+
datamaestro-1.2.1.dist-info/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
43
|
+
datamaestro-1.2.1.dist-info/METADATA,sha256=2_TL_ysMtfV2a84_0Uu3UQloCHCvetGZWo5tcjdhNCA,8999
|
|
44
|
+
datamaestro-1.2.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
45
|
+
datamaestro-1.2.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
46
|
+
datamaestro-1.2.1.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
|
|
47
|
+
datamaestro-1.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|