pywaybackup 4.1.0__tar.gz → 4.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-4.1.0/pywaybackup.egg-info → pywaybackup-4.1.2}/PKG-INFO +3 -2
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pyproject.toml +3 -2
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/Exception.py +2 -5
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/PyWayBackup.py +5 -1
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/Snapshot.py +3 -1
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/SnapshotCollection.py +36 -9
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/Verbosity.py +21 -5
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/archive_download.py +53 -23
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/archive_save.py +9 -3
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/db.py +26 -9
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/files.py +3 -1
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/helper.py +3 -6
- {pywaybackup-4.1.0 → pywaybackup-4.1.2/pywaybackup.egg-info}/PKG-INFO +3 -2
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup.egg-info/requires.txt +2 -1
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/LICENSE +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/README.md +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/Arguments.py +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/Worker.py +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/__init__.py +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup/main.py +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup.egg-info/SOURCES.txt +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-4.1.0 → pywaybackup-4.1.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.0
|
|
3
|
+
Version: 4.1.2
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -32,8 +32,9 @@ License-File: LICENSE
|
|
|
32
32
|
Requires-Dist: SQLAlchemy==2.0.43
|
|
33
33
|
Requires-Dist: requests==2.32.3
|
|
34
34
|
Requires-Dist: tqdm==4.67.1
|
|
35
|
+
Requires-Dist: pylibmagic==0.5.0
|
|
35
36
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
36
|
-
Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
|
|
37
|
+
Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32" or sys_platform == "darwin"
|
|
37
38
|
|
|
38
39
|
# python wayback machine downloader
|
|
39
40
|
|
|
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "pywaybackup"
|
|
10
|
-
version = "4.1.0"
|
|
10
|
+
version = "4.1.2"
|
|
11
11
|
description = "Query and download archive.org as simple as possible."
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
@@ -19,8 +19,9 @@ dependencies = [
|
|
|
19
19
|
"SQLAlchemy==2.0.43",
|
|
20
20
|
"requests==2.32.3",
|
|
21
21
|
"tqdm==4.67.1",
|
|
22
|
+
"pylibmagic==0.5.0",
|
|
22
23
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
23
|
-
"python-magic-bin==0.4.14; sys_platform == 'win32'",
|
|
24
|
+
"python-magic-bin==0.4.14; sys_platform == 'win32' or sys_platform == 'darwin'",
|
|
24
25
|
]
|
|
25
26
|
|
|
26
27
|
[project.scripts]
|
|
@@ -36,10 +36,7 @@ class Exception:
|
|
|
36
36
|
codeline = linecache.getline(filename, tb_line).strip()
|
|
37
37
|
local_vars = tb_frame.f_locals
|
|
38
38
|
exception_message += (
|
|
39
|
-
f"!-- File: {filename}\n"
|
|
40
|
-
f"!-- Function: {func_name}\n"
|
|
41
|
-
f"!-- Line: {tb_line}\n"
|
|
42
|
-
f"!-- Segment: {codeline}\n"
|
|
39
|
+
f"!-- File: {filename}\n!-- Function: {func_name}\n!-- Line: {tb_line}\n!-- Segment: {codeline}\n"
|
|
43
40
|
)
|
|
44
41
|
else:
|
|
45
42
|
exception_message += "!-- Traceback is None\n"
|
|
@@ -96,4 +93,4 @@ class Exception:
|
|
|
96
93
|
if issubclass(exception_type, KeyboardInterrupt):
|
|
97
94
|
sys.__excepthook__(exception_type, exception, traceback)
|
|
98
95
|
return
|
|
99
|
-
Exception.exception(
|
|
96
|
+
Exception.exception("UNCAUGHT EXCEPTION", exception, traceback) # uncaught exceptions also with custom scheme
|
|
@@ -407,7 +407,11 @@ class PyWayBackup:
|
|
|
407
407
|
"log": self._log,
|
|
408
408
|
"debug": self._debug,
|
|
409
409
|
}
|
|
410
|
-
return {
|
|
410
|
+
return {
|
|
411
|
+
key: (os.path.relpath(path) if rel else path)
|
|
412
|
+
for key, path in files.items()
|
|
413
|
+
if path and os.path.exists(path)
|
|
414
|
+
}
|
|
411
415
|
|
|
412
416
|
def status(self) -> dict:
|
|
413
417
|
"""
|
|
@@ -101,7 +101,9 @@ class Snapshot:
|
|
|
101
101
|
value: New value to set for the column.
|
|
102
102
|
"""
|
|
103
103
|
column = getattr(waybackup_snapshots, column)
|
|
104
|
-
self._db.session.execute(
|
|
104
|
+
self._db.session.execute(
|
|
105
|
+
update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value})
|
|
106
|
+
)
|
|
105
107
|
self._db.session.commit()
|
|
106
108
|
|
|
107
109
|
def create_output(self):
|
|
@@ -50,7 +50,9 @@ class SnapshotCollection:
|
|
|
50
50
|
|
|
51
51
|
def _reset_locked_snapshots(self):
|
|
52
52
|
"""Reset locked snapshots to unprocessed in the database."""
|
|
53
|
-
self.db.session.execute(
|
|
53
|
+
self.db.session.execute(
|
|
54
|
+
update(waybackup_snapshots).where(waybackup_snapshots.response == "LOCK").values(response=None)
|
|
55
|
+
)
|
|
54
56
|
self.db.session.commit()
|
|
55
57
|
|
|
56
58
|
def _finalize_db(self):
|
|
@@ -140,11 +142,19 @@ class SnapshotCollection:
|
|
|
140
142
|
waybackup_snapshots.url_origin,
|
|
141
143
|
waybackup_snapshots.url_archive,
|
|
142
144
|
)
|
|
143
|
-
.filter(
|
|
145
|
+
.filter(
|
|
146
|
+
tuple_(
|
|
147
|
+
waybackup_snapshots.timestamp, waybackup_snapshots.url_origin, waybackup_snapshots.url_archive
|
|
148
|
+
).in_(keys)
|
|
149
|
+
)
|
|
144
150
|
.all()
|
|
145
151
|
)
|
|
146
152
|
existing_rows = set(existing)
|
|
147
|
-
new_rows = [
|
|
153
|
+
new_rows = [
|
|
154
|
+
row
|
|
155
|
+
for row in unique_batch
|
|
156
|
+
if (row["timestamp"], row["url_origin"], row["url_archive"]) not in existing_rows
|
|
157
|
+
]
|
|
148
158
|
if new_rows:
|
|
149
159
|
self.db.session.bulk_insert_mappings(waybackup_snapshots, new_rows)
|
|
150
160
|
self.db.session.commit()
|
|
@@ -200,17 +210,25 @@ class SnapshotCollection:
|
|
|
200
210
|
# index for filtering last snapshots
|
|
201
211
|
if self._mode_last:
|
|
202
212
|
idx1 = Index(
|
|
203
|
-
"idx_waybackup_snapshots_url_origin_timestamp_desc",
|
|
213
|
+
"idx_waybackup_snapshots_url_origin_timestamp_desc",
|
|
214
|
+
waybackup_snapshots.url_origin,
|
|
215
|
+
waybackup_snapshots.timestamp.desc(),
|
|
204
216
|
)
|
|
205
217
|
idx1.create(self.db.session.bind, checkfirst=True)
|
|
206
218
|
# index for filtering first snapshots
|
|
207
219
|
if self._mode_first:
|
|
208
220
|
idx2 = Index(
|
|
209
|
-
"idx_waybackup_snapshots_url_origin_timestamp_asc",
|
|
221
|
+
"idx_waybackup_snapshots_url_origin_timestamp_asc",
|
|
222
|
+
waybackup_snapshots.url_origin,
|
|
223
|
+
waybackup_snapshots.timestamp.asc(),
|
|
210
224
|
)
|
|
211
225
|
idx2.create(self.db.session.bind, checkfirst=True)
|
|
212
226
|
# index for skippable snapshots
|
|
213
|
-
idx3 = Index(
|
|
227
|
+
idx3 = Index(
|
|
228
|
+
"idx_waybackup_snapshots_timestamp_url_origin_response",
|
|
229
|
+
waybackup_snapshots.timestamp,
|
|
230
|
+
waybackup_snapshots.url_origin,
|
|
231
|
+
)
|
|
214
232
|
idx3.create(self.db.session.bind, checkfirst=True)
|
|
215
233
|
|
|
216
234
|
def _filter_snapshots(self):
|
|
@@ -224,7 +242,9 @@ class SnapshotCollection:
|
|
|
224
242
|
def _filter_mode():
|
|
225
243
|
self._filter_mode = 0
|
|
226
244
|
if self._mode_last or self._mode_first:
|
|
227
|
-
ordering =
|
|
245
|
+
ordering = (
|
|
246
|
+
waybackup_snapshots.timestamp.desc() if self._mode_last else waybackup_snapshots.timestamp.asc()
|
|
247
|
+
)
|
|
228
248
|
# assign row numbers per url_origin
|
|
229
249
|
rownum = (
|
|
230
250
|
func.row_number()
|
|
@@ -266,7 +286,9 @@ class SnapshotCollection:
|
|
|
266
286
|
|
|
267
287
|
_filter_mode()
|
|
268
288
|
_enumerate_counter()
|
|
269
|
-
self._filter_response =
|
|
289
|
+
self._filter_response = (
|
|
290
|
+
self.db.session.query(waybackup_snapshots).where(waybackup_snapshots.response.in_(["404", "301"])).count()
|
|
291
|
+
)
|
|
270
292
|
self.db.session.commit()
|
|
271
293
|
|
|
272
294
|
def _skip_set(self):
|
|
@@ -280,7 +302,12 @@ class SnapshotCollection:
|
|
|
280
302
|
for row in f:
|
|
281
303
|
self.db.session.execute(
|
|
282
304
|
update(waybackup_snapshots)
|
|
283
|
-
.where(
|
|
305
|
+
.where(
|
|
306
|
+
and_(
|
|
307
|
+
waybackup_snapshots.timestamp == row["timestamp"],
|
|
308
|
+
waybackup_snapshots.url_origin == row["url_origin"],
|
|
309
|
+
)
|
|
310
|
+
)
|
|
284
311
|
.values(
|
|
285
312
|
url_archive=row["url_archive"],
|
|
286
313
|
redirect_url=row["redirect_url"],
|
|
@@ -68,9 +68,10 @@ class Verbosity:
|
|
|
68
68
|
cls.pbar = Progressbar(
|
|
69
69
|
unit=" snapshot",
|
|
70
70
|
desc="download file".ljust(15),
|
|
71
|
-
total=maxval,
|
|
72
|
-
|
|
73
|
-
|
|
71
|
+
total=maxval,
|
|
72
|
+
ascii="░▒█",
|
|
73
|
+
bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
|
|
74
|
+
)
|
|
74
75
|
if cls.pbar is not None and progress is not None and progress > 0:
|
|
75
76
|
cls.pbar.update(progress)
|
|
76
77
|
|
|
@@ -93,7 +94,15 @@ class Verbosity:
|
|
|
93
94
|
|
|
94
95
|
|
|
95
96
|
class Progressbar(Verbosity):
|
|
96
|
-
def __init__(
|
|
97
|
+
def __init__(
|
|
98
|
+
self,
|
|
99
|
+
unit: str,
|
|
100
|
+
desc: str,
|
|
101
|
+
unit_scale: bool = False,
|
|
102
|
+
total: int = None,
|
|
103
|
+
ascii: str = None,
|
|
104
|
+
bar_format: str = None,
|
|
105
|
+
):
|
|
97
106
|
if not super().silent:
|
|
98
107
|
self.unit = unit
|
|
99
108
|
self.desc = desc
|
|
@@ -101,7 +110,14 @@ class Progressbar(Verbosity):
|
|
|
101
110
|
self.total = total
|
|
102
111
|
self.ascii = ascii
|
|
103
112
|
self.bar_format = bar_format
|
|
104
|
-
self.pbar = tqdm(
|
|
113
|
+
self.pbar = tqdm(
|
|
114
|
+
unit=self.unit,
|
|
115
|
+
desc=self.desc,
|
|
116
|
+
unit_scale=self.unit_scale,
|
|
117
|
+
total=self.total,
|
|
118
|
+
ascii=self.ascii,
|
|
119
|
+
bar_format=self.bar_format,
|
|
120
|
+
)
|
|
105
121
|
|
|
106
122
|
def update(self, progress: int):
|
|
107
123
|
"""
|
|
@@ -4,6 +4,7 @@ import os
|
|
|
4
4
|
import threading
|
|
5
5
|
import time
|
|
6
6
|
import urllib.parse
|
|
7
|
+
from gzip import BadGzipFile
|
|
7
8
|
from http import HTTPStatus
|
|
8
9
|
from importlib.metadata import version
|
|
9
10
|
from socket import timeout
|
|
@@ -168,22 +169,32 @@ class DownloadArchive:
|
|
|
168
169
|
try:
|
|
169
170
|
download_status = self._download(worker=worker)
|
|
170
171
|
|
|
171
|
-
except (
|
|
172
|
+
except (
|
|
173
|
+
timeout,
|
|
174
|
+
ConnectionRefusedError,
|
|
175
|
+
ConnectionResetError,
|
|
176
|
+
http.client.HTTPException,
|
|
177
|
+
Exception,
|
|
178
|
+
) as e:
|
|
172
179
|
if isinstance(e, (timeout, ConnectionRefusedError, ConnectionResetError)):
|
|
173
180
|
if download_attempt < download_max_attempt:
|
|
174
181
|
download_attempt += 1 # try again 2x with same connection
|
|
175
182
|
vb.write(
|
|
176
183
|
verbose=True,
|
|
177
|
-
content=
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
184
|
+
content=(
|
|
185
|
+
f"\n-----> Worker: {worker.id}"
|
|
186
|
+
f" - Attempt: [{worker.attempt}/{retry_max_attempt}]"
|
|
187
|
+
f" Snapshot ID: [{worker.snapshot.counter}/{self.sc._snapshot_total}]"
|
|
188
|
+
f" - {e.__class__.__name__} - requesting again in 50 seconds..."
|
|
189
|
+
),
|
|
181
190
|
)
|
|
182
191
|
vb.write(
|
|
183
192
|
verbose=False,
|
|
184
|
-
content=
|
|
185
|
-
|
|
186
|
-
-
|
|
193
|
+
content=(
|
|
194
|
+
f"Worker: {worker.id}"
|
|
195
|
+
f" - Snapshot {worker.snapshot.counter}/{self.sc._snapshot_total}"
|
|
196
|
+
f" - requesting again in 50 seconds..."
|
|
197
|
+
),
|
|
187
198
|
)
|
|
188
199
|
time.sleep(50)
|
|
189
200
|
continue
|
|
@@ -193,26 +204,32 @@ class DownloadArchive:
|
|
|
193
204
|
download_attempt = download_max_attempt # try again 1x with new connection
|
|
194
205
|
vb.write(
|
|
195
206
|
verbose=True,
|
|
196
|
-
content=
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
207
|
+
content=(
|
|
208
|
+
f"\n-----> Worker: {worker.id}"
|
|
209
|
+
f" - Attempt: [{worker.attempt}/{retry_max_attempt}]"
|
|
210
|
+
f" Snapshot ID: [{worker.snapshot.counter}/{self.sc._snapshot_total}]"
|
|
211
|
+
f" - {e.__class__.__name__} - renewing connection in 15 seconds..."
|
|
212
|
+
),
|
|
200
213
|
)
|
|
201
214
|
vb.write(
|
|
202
215
|
verbose=False,
|
|
203
|
-
content=
|
|
204
|
-
|
|
205
|
-
-
|
|
216
|
+
content=(
|
|
217
|
+
f"Worker: {worker.id}"
|
|
218
|
+
f" - Snapshot {worker.snapshot.counter}/{self.sc._snapshot_total}"
|
|
219
|
+
f" - renewing connection in 15 seconds..."
|
|
220
|
+
),
|
|
206
221
|
)
|
|
207
222
|
time.sleep(15)
|
|
208
223
|
worker.refresh_connection()
|
|
209
224
|
continue
|
|
210
225
|
else:
|
|
211
226
|
ex.exception(
|
|
212
|
-
message=
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
227
|
+
message=(
|
|
228
|
+
f"\n-----> Worker: {worker.id}"
|
|
229
|
+
f" - Attempt: [{worker.attempt}/{retry_max_attempt}]"
|
|
230
|
+
f" Snapshot ID: [{worker.snapshot.counter}/{self.sc._snapshot_total}]"
|
|
231
|
+
f" - EXCEPTION - {e}"
|
|
232
|
+
),
|
|
216
233
|
e=e,
|
|
217
234
|
)
|
|
218
235
|
worker.attempt = retry_max_attempt
|
|
@@ -279,7 +296,14 @@ class DownloadArchive:
|
|
|
279
296
|
if not os.path.isfile(context.output_file):
|
|
280
297
|
with open(context.output_file, "wb") as file:
|
|
281
298
|
if context.response.getheader("Content-Encoding") == "gzip":
|
|
282
|
-
|
|
299
|
+
try:
|
|
300
|
+
context.response_data = gzip.decompress(context.response_data)
|
|
301
|
+
except BadGzipFile:
|
|
302
|
+
vb.write(
|
|
303
|
+
verbose=None,
|
|
304
|
+
content=f"Worker: {worker.id} - GZIP DECOMPRESS SKIPPED - {context.snapshot_url}",
|
|
305
|
+
)
|
|
306
|
+
pass
|
|
283
307
|
file.write(context.response_data)
|
|
284
308
|
|
|
285
309
|
# check if file is downloaded
|
|
@@ -298,7 +322,9 @@ class DownloadArchive:
|
|
|
298
322
|
context (DownloadContext): The download context.
|
|
299
323
|
worker (Worker): The worker instance.
|
|
300
324
|
"""
|
|
301
|
-
worker.message.store(
|
|
325
|
+
worker.message.store(
|
|
326
|
+
verbose=True, result="REDIRECT", content=f"{context.response_status} {context.response_status_message}"
|
|
327
|
+
)
|
|
302
328
|
worker.message.store(verbose=True, result="", info="FROM", content=context.snapshot_url)
|
|
303
329
|
for _ in range(5):
|
|
304
330
|
self.__download_response(context=context, worker=worker)
|
|
@@ -354,7 +380,9 @@ class DownloadArchive:
|
|
|
354
380
|
Returns:
|
|
355
381
|
bool: Always True (indicates result was processed).
|
|
356
382
|
"""
|
|
357
|
-
worker.message.store(
|
|
383
|
+
worker.message.store(
|
|
384
|
+
verbose=True, result=result, content=f"{context.response_status} {context.response_status_message}"
|
|
385
|
+
)
|
|
358
386
|
worker.message.store(verbose=False, result=result)
|
|
359
387
|
worker.message.store(verbose=True, result="", info="URL", content=context.snapshot_url)
|
|
360
388
|
worker.message.store(verbose=True, result="", info="FILE", content=context.output_file)
|
|
@@ -371,7 +399,9 @@ class DownloadArchive:
|
|
|
371
399
|
Returns:
|
|
372
400
|
bool: Always False (indicates failure was processed).
|
|
373
401
|
"""
|
|
374
|
-
worker.message.store(
|
|
402
|
+
worker.message.store(
|
|
403
|
+
verbose=None, result="UNKNOWN", content=f"{context.response_status} {context.response_status_message}"
|
|
404
|
+
)
|
|
375
405
|
worker.message.store(verbose=True, result="", info="URL", content=context.snapshot_url)
|
|
376
406
|
return False
|
|
377
407
|
|
|
@@ -34,10 +34,13 @@ def save_page(url: str):
|
|
|
34
34
|
|
|
35
35
|
if response_status == 302:
|
|
36
36
|
location = response.getheader("Location")
|
|
37
|
-
snapshot_timestamp = datetime.strptime(url_get_timestamp(location), "%Y%m%d%H%M%S").strftime(
|
|
37
|
+
snapshot_timestamp = datetime.strptime(url_get_timestamp(location), "%Y%m%d%H%M%S").strftime(
|
|
38
|
+
"%Y-%m-%d %H:%M:%S"
|
|
39
|
+
)
|
|
38
40
|
current_timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
|
|
39
41
|
timestamp_difference = (
|
|
40
|
-
datetime.strptime(current_timestamp, "%Y-%m-%d %H:%M:%S")
|
|
42
|
+
datetime.strptime(current_timestamp, "%Y-%m-%d %H:%M:%S")
|
|
43
|
+
- datetime.strptime(snapshot_timestamp, "%Y-%m-%d %H:%M:%S")
|
|
41
44
|
).seconds / 60
|
|
42
45
|
timestamp_difference = int(round(timestamp_difference, 0))
|
|
43
46
|
|
|
@@ -45,7 +48,10 @@ def save_page(url: str):
|
|
|
45
48
|
vb.write(verbose=None, content="\n-----> Response: 302 (new snapshot)")
|
|
46
49
|
vb.write(verbose=None, content=f"SNAPSHOT URL: {location}")
|
|
47
50
|
elif timestamp_difference >= 1:
|
|
48
|
-
vb.write(
|
|
51
|
+
vb.write(
|
|
52
|
+
verbose=None,
|
|
53
|
+
content=f"\n-----> Response: 302 (existing snapshot - wait for {60 - timestamp_difference} minutes)",
|
|
54
|
+
)
|
|
49
55
|
vb.write(verbose=None, content=f"SNAPSHOT URL: {location}")
|
|
50
56
|
vb.write(verbose=None, content=f"WAYBACK TIME: {snapshot_timestamp}")
|
|
51
57
|
vb.write(verbose=None, content=f"REQUEST TIME: {current_timestamp}")
|
|
@@ -17,6 +17,7 @@ from sqlalchemy import (
|
|
|
17
17
|
)
|
|
18
18
|
from sqlalchemy.ext.declarative import declarative_base
|
|
19
19
|
from sqlalchemy.orm import sessionmaker
|
|
20
|
+
from typing import Optional # python 3.8
|
|
20
21
|
|
|
21
22
|
Base = declarative_base()
|
|
22
23
|
|
|
@@ -112,7 +113,9 @@ class Database:
|
|
|
112
113
|
Base.metadata.create_all(engine)
|
|
113
114
|
|
|
114
115
|
db = Database()
|
|
115
|
-
if db.session.execute(
|
|
116
|
+
if db.session.execute(
|
|
117
|
+
select(waybackup_job.query_identifier).where(query_identifier == query_identifier)
|
|
118
|
+
).fetchone():
|
|
116
119
|
cls.query_exist = True
|
|
117
120
|
cls.query_progress = db.get_progress()
|
|
118
121
|
else:
|
|
@@ -139,11 +142,13 @@ class Database:
|
|
|
139
142
|
"""
|
|
140
143
|
progress = f"{(done):,} / {(total):,}"
|
|
141
144
|
self.session.execute(
|
|
142
|
-
update(waybackup_job)
|
|
145
|
+
update(waybackup_job)
|
|
146
|
+
.where(waybackup_job.query_identifier == self.query_identifier)
|
|
147
|
+
.values(query_progress=progress)
|
|
143
148
|
)
|
|
144
149
|
self.session.commit()
|
|
145
150
|
|
|
146
|
-
def get_progress(self) -> str
|
|
151
|
+
def get_progress(self) -> Optional[str]:
|
|
147
152
|
"""
|
|
148
153
|
str or None: Progress string (e.g., '5 / 10') or None if not found.
|
|
149
154
|
"""
|
|
@@ -151,7 +156,7 @@ class Database:
|
|
|
151
156
|
select(waybackup_job.query_progress).where(waybackup_job.query_identifier == self.query_identifier)
|
|
152
157
|
).scalar_one_or_none()
|
|
153
158
|
|
|
154
|
-
def get_insert_complete(self) -> int
|
|
159
|
+
def get_insert_complete(self) -> Optional[int]:
|
|
155
160
|
"""
|
|
156
161
|
int or None: 1 if complete, 0 if not, or None if not found.
|
|
157
162
|
"""
|
|
@@ -159,7 +164,7 @@ class Database:
|
|
|
159
164
|
select(waybackup_job.insert_complete).where(waybackup_job.query_identifier == self.query_identifier)
|
|
160
165
|
).scalar_one_or_none()
|
|
161
166
|
|
|
162
|
-
def get_index_complete(self) -> int
|
|
167
|
+
def get_index_complete(self) -> Optional[int]:
|
|
163
168
|
"""
|
|
164
169
|
int or None: 1 if complete, 0 if not, or None if not found.
|
|
165
170
|
"""
|
|
@@ -167,7 +172,7 @@ class Database:
|
|
|
167
172
|
select(waybackup_job.index_complete).where(waybackup_job.query_identifier == self.query_identifier)
|
|
168
173
|
).scalar_one_or_none()
|
|
169
174
|
|
|
170
|
-
def get_filter_complete(self) -> int
|
|
175
|
+
def get_filter_complete(self) -> Optional[int]:
|
|
171
176
|
"""
|
|
172
177
|
int or None: 1 if complete, 0 if not, or None if not found.
|
|
173
178
|
"""
|
|
@@ -179,19 +184,31 @@ class Database:
|
|
|
179
184
|
"""
|
|
180
185
|
Mark the job's insertion phase as complete in the database.
|
|
181
186
|
"""
|
|
182
|
-
self.session.execute(
|
|
187
|
+
self.session.execute(
|
|
188
|
+
update(waybackup_job)
|
|
189
|
+
.where(waybackup_job.query_identifier == self.query_identifier)
|
|
190
|
+
.values(insert_complete=1)
|
|
191
|
+
)
|
|
183
192
|
self.session.commit()
|
|
184
193
|
|
|
185
194
|
def set_index_complete(self):
|
|
186
195
|
"""
|
|
187
196
|
Mark the job's indexing phase as complete in the database.
|
|
188
197
|
"""
|
|
189
|
-
self.session.execute(
|
|
198
|
+
self.session.execute(
|
|
199
|
+
update(waybackup_job)
|
|
200
|
+
.where(waybackup_job.query_identifier == self.query_identifier)
|
|
201
|
+
.values(index_complete=1)
|
|
202
|
+
)
|
|
190
203
|
self.session.commit()
|
|
191
204
|
|
|
192
205
|
def set_filter_complete(self):
|
|
193
206
|
"""
|
|
194
207
|
Mark the job's filtering phase as complete in the database.
|
|
195
208
|
"""
|
|
196
|
-
self.session.execute(
|
|
209
|
+
self.session.execute(
|
|
210
|
+
update(waybackup_job)
|
|
211
|
+
.where(waybackup_job.query_identifier == self.query_identifier)
|
|
212
|
+
.values(filter_complete=1)
|
|
213
|
+
)
|
|
197
214
|
self.session.commit()
|
|
@@ -54,7 +54,9 @@ class CDXquery:
|
|
|
54
54
|
|
|
55
55
|
limit = f"&limit={self.limit}" if self.limit else ""
|
|
56
56
|
|
|
57
|
-
filter_statuscode =
|
|
57
|
+
filter_statuscode = (
|
|
58
|
+
f"&filter=statuscode:({'|'.join(self.filter_statuscode)})$" if self.filter_statuscode else ""
|
|
59
|
+
)
|
|
58
60
|
filter_filetype = f"&filter=original:.*\\.({'|'.join(self.filter_filetype)})$" if self.filter_filetype else ""
|
|
59
61
|
|
|
60
62
|
return f"https://web.archive.org/cdx/search/cdx?output=json&url={cdx_url}{period}&fl=timestamp,digest,mimetype,statuscode,original{limit}{filter_filetype}{filter_statuscode}"
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import shutil
|
|
3
|
+
import pylibmagic
|
|
3
4
|
import magic
|
|
4
5
|
|
|
5
6
|
|
|
@@ -88,18 +89,14 @@ def move_index(existpath: str = None, existfile: str = None, filebuffer: bytes =
|
|
|
88
89
|
shutil.move(existpath, existpath + "_exist")
|
|
89
90
|
os.makedirs(existpath, exist_ok=True)
|
|
90
91
|
if not check_index_mime(existpath):
|
|
91
|
-
new_file = os.path.join(
|
|
92
|
-
existpath, os.path.basename(os.path.normpath(existpath))
|
|
93
|
-
)
|
|
92
|
+
new_file = os.path.join(existpath, os.path.basename(os.path.normpath(existpath)))
|
|
94
93
|
else:
|
|
95
94
|
new_file = os.path.join(existpath, "index.html")
|
|
96
95
|
shutil.move(existpath + "_exist", new_file)
|
|
97
96
|
elif existfile:
|
|
98
97
|
if filebuffer:
|
|
99
98
|
if not check_index_mime(filebuffer):
|
|
100
|
-
return os.path.join(
|
|
101
|
-
existfile, os.path.basename(os.path.normpath(existfile))
|
|
102
|
-
)
|
|
99
|
+
return os.path.join(existfile, os.path.basename(os.path.normpath(existfile)))
|
|
103
100
|
else:
|
|
104
101
|
return os.path.join(existfile, "index.html")
|
|
105
102
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.0
|
|
3
|
+
Version: 4.1.2
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -32,8 +32,9 @@ License-File: LICENSE
|
|
|
32
32
|
Requires-Dist: SQLAlchemy==2.0.43
|
|
33
33
|
Requires-Dist: requests==2.32.3
|
|
34
34
|
Requires-Dist: tqdm==4.67.1
|
|
35
|
+
Requires-Dist: pylibmagic==0.5.0
|
|
35
36
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
36
|
-
Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
|
|
37
|
+
Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32" or sys_platform == "darwin"
|
|
37
38
|
|
|
38
39
|
# python wayback machine downloader
|
|
39
40
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|