datamaestro 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
1
+ import logging
2
+ import json
3
+ from datamaestro.download import Resource
4
+ from typing import Callable, Iterator
5
+ from pathlib import Path
6
+ import requests
7
+ import random
8
+ import re
9
+ from requests.exceptions import HTTPError
10
+ from tqdm.auto import tqdm
11
+ import time
12
+ import urllib.parse
13
+ import uuid
14
+
15
+
16
+ wayback_prefix = re.compile(r"^https:\/\/web\.archive\.org\/web")
17
+ replace_pattern = re.compile(r"(web\.archive\.org\/web\/\d+)")
18
+
19
+
20
def download_with_retry(
    url: str, max_retries: int = 10, timeout: float = 60.0
) -> requests.Response:
    """Download a URL with exponential backoff, until max_retries is reached.

    Only HTTP 429 (too many requests) and 5xx (server error) responses are
    retried; any other HTTP error is re-raised immediately. Exceptions that
    are not ``HTTPError`` (connection errors, timeouts, ...) are not retried
    and propagate to the caller.

    :param url: The URL to fetch with a GET request
    :param max_retries: Maximum number of retries after the first attempt
    :param timeout: Seconds to wait on the server for a single attempt, so
        that a stalled connection cannot block the caller forever
    :return: The response of the first successful attempt
    :raises HTTPError: If the status code is not retryable, or if the request
        still fails once ``max_retries`` retries have been performed
    """
    retry_num = 0
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if status_code != 429 and status_code < 500:
                # This is not an error we should retry on
                raise

            # retry_num counts the retries already performed: stop as soon as
            # the budget is used up (">" would allow one extra retry)
            if retry_num >= max_retries:
                logging.error(
                    f"Failed to perform GET request on {url} "
                    f"after {max_retries} retries."
                )
                raise

            # Rate limiting gets an extra fixed 5 seconds; the random part
            # (up to one second) spreads the retries over time
            base_delay = 5 if status_code == 429 else 0
            time.sleep(base_delay + 2**retry_num + random.randint(0, 1000) / 1000)
            retry_num += 1
46
+
47
+
48
def _unavailable_record(link, page_id, status_code, wayback_url):
    """Build the record describing a page that could not be retrieved."""
    return {
        "link": link,
        "page_id": page_id,
        "available": False,
        "status_code": status_code,
        "wayback_url": wayback_url,
    }


def download_link(link: str, timestamp: str):
    """Retrieve the archived version of a page from the Wayback Machine.

    When ``link`` is not already a ``https://web.archive.org/web`` URL, the
    availability API is queried (up to three times) to find the snapshot
    closest to ``timestamp``. The snapshot is then downloaded through its
    ``id_`` URL variant.

    No exception escapes this function: every failure is logged and reported
    through the returned dictionary.

    :param link: The URL of the page (original or Wayback Machine URL)
    :param timestamp: Value of the ``timestamp`` parameter sent to the
        availability API
    :return: On success, a dictionary with keys ``link``, ``id`` (the URL
        that was downloaded) and ``contents`` (the page text). Otherwise, a
        dictionary with ``link``, ``page_id`` and ``available`` set to False;
        when an exception was raised it also holds ``status_code`` and
        ``wayback_url`` (either of which may be None)
    """
    page_id = str(uuid.uuid4())
    url_no_header = None
    # Bound up front so that the error handlers can always refer to it
    response = None

    try:
        # Find the Wayback Machine link
        if not wayback_prefix.match(link):
            link_encoded = urllib.parse.quote(link)

            available, availability_attempt = False, 0
            # Sometimes the API returns HTTP success code 200, but archived
            # snapshots shows page is unavailable when it actually is. Give it a
            # total of three tries.
            while not available and availability_attempt < 3:
                response = download_with_retry(
                    "http://archive.org/wayback/available?"
                    f"url={link_encoded}&timestamp={timestamp}"
                )
                json_response = response.json()
                available = "closest" in json_response["archived_snapshots"]
                availability_attempt += 1

            if not available:
                logging.warning(
                    f"Not available on Wayback Machine: {link}, "
                    f"HTTP code {response.status_code}, {json_response}"
                )
                return {"link": link, "page_id": page_id, "available": False}

            url = json_response["archived_snapshots"]["closest"]["url"]
        else:
            url = link

        # Appending "id_" to the snapshot timestamp presumably selects the
        # capture as archived, without the Wayback Machine additions
        match = replace_pattern.search(url)
        if match is None:
            # Explicit check rather than an assert (asserts vanish under -O);
            # reported through the generic handler below
            raise ValueError(f"Not a Wayback Machine snapshot URL: {url}")
        url_no_header = replace_pattern.sub(f"{match.group(1)}id_", url)

        response = download_with_retry(url_no_header)
        html_page = response.text

        return {
            "link": link,
            "id": url_no_header,
            "contents": html_page,
        }

    except HTTPError as http_err:
        logging.warning(f"HTTP error occurred: {http_err} for {link}")
        # A requests Response is falsy for 4xx/5xx status codes, so it must
        # be compared with None explicitly to keep the status code
        error_response = http_err.response
        return _unavailable_record(
            link,
            page_id,
            error_response.status_code if error_response is not None else None,
            url_no_header,
        )
    except UnicodeDecodeError as e:
        logging.warning(f"Unicode decode error occurred: {e} for {link}")
        return _unavailable_record(
            link,
            page_id,
            response.status_code if response is not None else None,
            url_no_header,
        )
    except Exception as e:
        logging.warning(f"Exception occurred: {e} for {link}")
        return _unavailable_record(link, page_id, None, url_no_header)
121
+
122
+
123
class wayback_documents(Resource):
    """Collect documents from wayback

    The result of ``download_link`` for each URL is stored as one JSON object
    per line in the destination file. A sibling ``.done`` file marks a
    complete download.
    """

    def __init__(self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None):
        """
        :param timestamp: Snapshot timestamp handed to ``download_link``
        :param urls_fn: Callable returning the URLs to retrieve
        :param name: Name of the resource, forwarded to ``Resource``
        """
        super().__init__(name)
        self.timestamp = timestamp
        self.urls_fn = urls_fn

    def prepare(self):
        """Return the path of the file that holds the downloaded documents."""
        return self.definition.datapath / self.varname

    def download(self, force=False):
        """Download the documents, resuming an interrupted download.

        URLs whose record is already stored in the destination file are
        skipped; an incomplete or corrupted trailing record is discarded and
        downloaded again.

        :param force: Discard any previous (complete or partial) download and
            start again from an empty file
        :return: True if the download was already complete, None otherwise
        """
        # Creates directory if needed
        destination: Path = self.definition.datapath / self.varname
        self.definition.datapath.mkdir(parents=True, exist_ok=True)

        # Early exit
        done_path = destination.with_suffix(".done")
        if done_path.is_file():
            if not force:
                return True
            # The marker must not survive an interrupted forced download
            done_path.unlink()

        # Reads the URLs that were already processed. The file is read as
        # bytes so that `pos` is an exact offset usable for truncation
        logging.info("Retrieving URLs from wayback")
        pos = 0
        urls = set()
        if destination.is_file() and not force:
            with destination.open("rb") as fp:
                for line in fp:
                    try:
                        if not line.endswith(b"\n"):
                            # The write of this record was interrupted
                            raise ValueError("incomplete record")
                        # Records are keyed by "link" (see download_link)
                        url = json.loads(line)["link"]
                    except (ValueError, KeyError, TypeError):
                        logging.warning(
                            f"JSON decoding error: getting back to position {pos}"
                        )
                        break
                    urls.add(url)
                    pos += len(line)

        # Get the remaining ones
        with destination.open("r+b" if destination.is_file() else "wb") as fp:
            # Drop everything that follows the last valid record
            fp.seek(pos)
            fp.truncate()

            for url in tqdm(self.urls_fn()):
                if url in urls:
                    continue
                record = download_link(url, self.timestamp)
                # One record per line, so that the file can be read back
                fp.write(json.dumps(record).encode("utf-8") + b"\n")
                urls.add(url)

        # Everything is fine
        done_path.touch()
datamaestro/version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '1.2.0'
16
- __version_tuple__ = version_tuple = (1, 2, 0)
15
+ __version__ = version = '1.2.1'
16
+ __version_tuple__ = version_tuple = (1, 2, 1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -8,7 +8,7 @@ datamaestro/search.py,sha256=PMceNp5hcp0dlzs4cLb6LJT7XHrdXo58oO7oTucawbE,2887
8
8
  datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
9
9
  datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
10
10
  datamaestro/utils.py,sha256=Y3_aqeOHW8vuifwggGWJfgONyDG1FLX7ONAnX85jENI,6511
11
- datamaestro/version.py,sha256=zMnMemknXglcJs59xkicNzeEJTVgYd1omSfLWj76yWw,411
11
+ datamaestro/version.py,sha256=2U0Gn26fYI3Vgj5hgkLM8I3wI6YEVdffJGllaVW-sSc,411
12
12
  datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
13
13
  datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
14
14
  datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -28,6 +28,7 @@ datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0
28
28
  datamaestro/download/single.py,sha256=QSEviTP9lHLh3ZGyo_KoW3ro8UvWCGNPHeZiNj-9rLA,4134
29
29
  datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
30
30
  datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
31
+ datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
31
32
  datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
32
33
  datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
33
34
  datamaestro/stream/lines.py,sha256=UNGcyZlZxN0Q7kw717jbhZFdDVmtfJfkJZCgK7xzF9A,1996
@@ -38,9 +39,9 @@ datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,
38
39
  datamaestro/test/test_annotations.py,sha256=kRPUmS_UAN6JSSVPUwV4OM_LEuEUHF1OcLSiYXjsKjw,246
39
40
  datamaestro/test/test_download_handlers.py,sha256=Qqm-fML1KVp6dPwAUcH6xzi_dpQIshvROzviSYCUzc0,603
40
41
  datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
41
- datamaestro-1.2.0.dist-info/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
42
- datamaestro-1.2.0.dist-info/METADATA,sha256=RM87g0kAI517bjmcEFlvRiGC6Dxy1xmiWt0gYG1eZEs,8999
43
- datamaestro-1.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
44
- datamaestro-1.2.0.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
45
- datamaestro-1.2.0.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
46
- datamaestro-1.2.0.dist-info/RECORD,,
42
+ datamaestro-1.2.1.dist-info/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
43
+ datamaestro-1.2.1.dist-info/METADATA,sha256=2_TL_ysMtfV2a84_0Uu3UQloCHCvetGZWo5tcjdhNCA,8999
44
+ datamaestro-1.2.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
45
+ datamaestro-1.2.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
46
+ datamaestro-1.2.1.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
47
+ datamaestro-1.2.1.dist-info/RECORD,,