pywaybackup 3.0.4__tar.gz → 3.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-3.0.4/pywaybackup.egg-info → pywaybackup-3.1.0}/PKG-INFO +3 -2
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/README.md +1 -1
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pyproject.toml +2 -1
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Arguments.py +2 -1
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Exception.py +16 -19
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/SnapshotCollection.py +24 -22
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Verbosity.py +7 -5
- pywaybackup-3.0.4/pywaybackup/archive.py → pywaybackup-3.1.0/pywaybackup/archive_download.py +21 -81
- pywaybackup-3.1.0/pywaybackup/archive_save.py +81 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/db.py +1 -2
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/helper.py +20 -15
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/main.py +13 -11
- {pywaybackup-3.0.4 → pywaybackup-3.1.0/pywaybackup.egg-info}/PKG-INFO +3 -2
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/SOURCES.txt +2 -1
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/requires.txt +1 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/LICENSE +0 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Converter.py +0 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/__init__.py +0 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-3.0.4 → pywaybackup-3.1.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 3.0.4
|
|
3
|
+
Version: 3.1.0
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
+
Requires-Dist: pysqlite3-binary==0.5.4
|
|
32
33
|
Requires-Dist: requests==2.31.0
|
|
33
34
|
Requires-Dist: tqdm==4.66.2
|
|
34
35
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
@@ -39,7 +40,7 @@ Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
|
|
|
39
40
|
[](https://pypi.org/project/pywaybackup/)
|
|
40
41
|
[](https://pypi.org/project/pywaybackup/)
|
|
41
42
|

|
|
42
|
-
 -->
|
|
43
44
|
[](https://opensource.org/licenses/MIT)
|
|
44
45
|
|
|
45
46
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
[](https://pypi.org/project/pywaybackup/)
|
|
4
4
|
[](https://pypi.org/project/pywaybackup/)
|
|
5
5
|

|
|
6
|
-
 -->
|
|
7
7
|
[](https://opensource.org/licenses/MIT)
|
|
8
8
|
|
|
9
9
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "pywaybackup"
|
|
10
|
-
version = "3.0.4"
|
|
10
|
+
version = "3.1.0"
|
|
11
11
|
description = "Query and download archive.org as simple as possible."
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
@@ -16,6 +16,7 @@ license = { file = "LICENSE" }
|
|
|
16
16
|
readme = "README.md"
|
|
17
17
|
requires-python = ">=3.8"
|
|
18
18
|
dependencies = [
|
|
19
|
+
"pysqlite3-binary==0.5.4",
|
|
19
20
|
"requests==2.31.0",
|
|
20
21
|
"tqdm==4.66.2",
|
|
21
22
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import sys
|
|
3
3
|
import os
|
|
4
4
|
import argparse
|
|
5
|
+
|
|
5
6
|
from importlib.metadata import version
|
|
6
7
|
|
|
7
8
|
from pywaybackup.helper import url_split, sanitize_filename
|
|
@@ -74,7 +75,7 @@ class Configuration:
|
|
|
74
75
|
|
|
75
76
|
if cls.output is None:
|
|
76
77
|
cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
|
|
77
|
-
os.makedirs(cls.output, exist_ok=True)
|
|
78
|
+
os.makedirs(cls.output, exist_ok=True) if not cls.save else None
|
|
78
79
|
|
|
79
80
|
if cls.log is True:
|
|
80
81
|
cls.log = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.log")
|
|
@@ -1,34 +1,33 @@
|
|
|
1
|
-
|
|
2
1
|
import sys
|
|
3
2
|
import os
|
|
4
|
-
|
|
3
|
+
import re
|
|
5
4
|
import linecache
|
|
6
5
|
import traceback
|
|
7
|
-
|
|
8
|
-
import re
|
|
6
|
+
from datetime import datetime
|
|
9
7
|
|
|
10
8
|
from importlib.metadata import version
|
|
11
9
|
|
|
12
|
-
class Exception:
|
|
13
10
|
|
|
11
|
+
class Exception:
|
|
14
12
|
new_debug = True
|
|
15
13
|
output = None
|
|
16
14
|
command = None
|
|
17
15
|
|
|
18
16
|
@classmethod
|
|
19
17
|
def init(cls, output=None, command=None):
|
|
20
|
-
sys.excepthook =
|
|
18
|
+
sys.excepthook = (
|
|
19
|
+
cls.exception_handler
|
|
20
|
+
) # set custom exception handler (uncaught exceptions)
|
|
21
21
|
cls.output = output
|
|
22
22
|
cls.command = command
|
|
23
23
|
|
|
24
24
|
@classmethod
|
|
25
25
|
def exception(cls, message: str, e: Exception, tb=None):
|
|
26
26
|
custom_tb = sys.exc_info()[-1] if tb is None else tb
|
|
27
|
-
original_tb = cls.relativate_path(
|
|
28
|
-
|
|
29
|
-
"-------------------------\n"
|
|
30
|
-
f"!-- Exception: {message}\n"
|
|
27
|
+
original_tb = cls.relativate_path(
|
|
28
|
+
"".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
31
29
|
)
|
|
30
|
+
exception_message = f"-------------------------\n!-- Exception: {message}\n"
|
|
32
31
|
if custom_tb is not None:
|
|
33
32
|
while custom_tb.tb_next: # loop to last traceback frame
|
|
34
33
|
custom_tb = custom_tb.tb_next
|
|
@@ -46,10 +45,7 @@ class Exception:
|
|
|
46
45
|
)
|
|
47
46
|
else:
|
|
48
47
|
exception_message += "!-- Traceback is None\n"
|
|
49
|
-
exception_message +=
|
|
50
|
-
f"!-- Description: {e}\n"
|
|
51
|
-
"-------------------------"
|
|
52
|
-
)
|
|
48
|
+
exception_message += f"!-- Description: {e}\n-------------------------"
|
|
53
49
|
print(exception_message)
|
|
54
50
|
debug_file = os.path.join(cls.output, "waybackup_error.log")
|
|
55
51
|
print(f"Exception log: {debug_file}")
|
|
@@ -85,10 +81,10 @@ class Exception:
|
|
|
85
81
|
if os.path.isfile(input): # case single path
|
|
86
82
|
return os.path.relpath(input, os.getcwd())
|
|
87
83
|
input_modified = ""
|
|
88
|
-
input_lines = input.split(
|
|
89
|
-
if len(input_lines) == 1:
|
|
84
|
+
input_lines = input.split("\n")
|
|
85
|
+
if len(input_lines) == 1: # case single line
|
|
90
86
|
return input
|
|
91
|
-
for line in input.split(
|
|
87
|
+
for line in input.split("\n"): # case multiple lines
|
|
92
88
|
match = path_pattern.search(line)
|
|
93
89
|
if match:
|
|
94
90
|
original_path = match.group(1)
|
|
@@ -104,5 +100,6 @@ class Exception:
|
|
|
104
100
|
if issubclass(exception_type, KeyboardInterrupt):
|
|
105
101
|
sys.__excepthook__(exception_type, exception, traceback)
|
|
106
102
|
return
|
|
107
|
-
Exception.exception(
|
|
108
|
-
|
|
103
|
+
Exception.exception(
|
|
104
|
+
"UNCAUGHT EXCEPTION", exception, traceback
|
|
105
|
+
) # uncaught exceptions also with custom scheme
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import csv
|
|
3
3
|
import os
|
|
4
|
-
import threading
|
|
5
4
|
|
|
6
5
|
from tqdm import tqdm
|
|
7
6
|
|
|
@@ -9,8 +8,6 @@ from pywaybackup.Verbosity import Verbosity as vb
|
|
|
9
8
|
from pywaybackup.helper import url_split
|
|
10
9
|
from pywaybackup.db import Database
|
|
11
10
|
|
|
12
|
-
LOCK = threading.Lock() # thread safe lock
|
|
13
|
-
|
|
14
11
|
class SnapshotCollection:
|
|
15
12
|
"""
|
|
16
13
|
Represents the interaction with the snapshot-collection contained in the snapshot database.
|
|
@@ -283,31 +280,36 @@ class SnapshotCollection:
|
|
|
283
280
|
"""
|
|
284
281
|
Modify a snapshot-row in the snapshot table.
|
|
285
282
|
"""
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
)
|
|
296
|
-
connection.conn.commit()
|
|
283
|
+
connection.cursor.execute(
|
|
284
|
+
f"""
|
|
285
|
+
UPDATE snapshot_tbl
|
|
286
|
+
SET {column} = ?
|
|
287
|
+
WHERE rowid = ?
|
|
288
|
+
""",
|
|
289
|
+
(value, snapshot_id)
|
|
290
|
+
)
|
|
291
|
+
connection.conn.commit()
|
|
297
292
|
|
|
298
293
|
def get_snapshot(connection):
|
|
299
294
|
"""
|
|
300
295
|
Get a snapshot-row from the snapshot table with response NULL. (not processed)
|
|
301
296
|
"""
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
297
|
+
# mark as locked for other workers // only visual because get_snapshot fetches by NULL
|
|
298
|
+
connection.cursor.execute(
|
|
299
|
+
"""
|
|
300
|
+
UPDATE snapshot_tbl
|
|
301
|
+
SET response = 'LOCK'
|
|
302
|
+
WHERE rowid = (
|
|
303
|
+
SELECT rowid FROM snapshot_tbl
|
|
304
|
+
WHERE response IS NULL
|
|
305
|
+
LIMIT 1
|
|
308
306
|
)
|
|
309
|
-
|
|
310
|
-
|
|
307
|
+
RETURNING rowid, *;
|
|
308
|
+
"""
|
|
309
|
+
)
|
|
310
|
+
row = connection.cursor.fetchone()
|
|
311
|
+
connection.conn.commit()
|
|
312
|
+
return row
|
|
311
313
|
|
|
312
314
|
@classmethod
|
|
313
315
|
def create_output(cls, url: str, timestamp: str, output: str):
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import sys
|
|
2
1
|
from tqdm import tqdm
|
|
3
2
|
|
|
4
3
|
class Verbosity:
|
|
@@ -63,21 +62,24 @@ class Verbosity:
|
|
|
63
62
|
cls.pbar.refresh()
|
|
64
63
|
|
|
65
64
|
@classmethod
|
|
66
|
-
def generate_logline(cls, status: str
|
|
65
|
+
def generate_logline(cls, status: str, type: str, message: str):
|
|
67
66
|
"""
|
|
68
|
-
STATUS
|
|
67
|
+
STATUS ➔ TYPE: MESSAGE
|
|
69
68
|
"""
|
|
70
69
|
|
|
71
70
|
if not status and not type:
|
|
72
71
|
return message
|
|
73
72
|
|
|
74
|
-
status_length =
|
|
73
|
+
status_length = 10
|
|
75
74
|
type_length = 5
|
|
76
75
|
|
|
77
76
|
status = status.ljust(status_length)
|
|
77
|
+
status = f"{status} -> "
|
|
78
|
+
|
|
78
79
|
type = type.ljust(type_length)
|
|
80
|
+
type = f"{type}: " if type.strip() else ""
|
|
79
81
|
|
|
80
|
-
log_entry = f"{status}
|
|
82
|
+
log_entry = f"{status}{type}{message}"
|
|
81
83
|
|
|
82
84
|
return log_entry
|
|
83
85
|
|
pywaybackup-3.0.4/pywaybackup/archive.py → pywaybackup-3.1.0/pywaybackup/archive_download.py
RENAMED
|
@@ -1,84 +1,26 @@
|
|
|
1
|
-
import requests
|
|
2
|
-
import os
|
|
3
1
|
import gzip
|
|
2
|
+
import http.client
|
|
3
|
+
import os
|
|
4
4
|
import threading
|
|
5
5
|
import time
|
|
6
6
|
import urllib.parse
|
|
7
|
-
import
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from socket import timeout
|
|
8
9
|
from urllib.parse import urljoin
|
|
9
|
-
from datetime import datetime, timezone
|
|
10
|
-
|
|
11
|
-
from tqdm import tqdm
|
|
12
10
|
|
|
13
|
-
from
|
|
11
|
+
from importlib.metadata import version
|
|
14
12
|
|
|
15
|
-
|
|
13
|
+
import requests
|
|
14
|
+
from tqdm import tqdm
|
|
16
15
|
|
|
17
|
-
from pywaybackup.SnapshotCollection import SnapshotCollection as sc
|
|
18
16
|
from pywaybackup.Arguments import Configuration as config
|
|
19
|
-
from pywaybackup.
|
|
20
|
-
|
|
21
|
-
from importlib.metadata import version
|
|
22
|
-
|
|
17
|
+
from pywaybackup.Exception import Exception as ex
|
|
18
|
+
from pywaybackup.SnapshotCollection import SnapshotCollection as sc
|
|
23
19
|
from pywaybackup.Verbosity import Message
|
|
24
20
|
from pywaybackup.Verbosity import Verbosity as vb
|
|
25
|
-
from pywaybackup.
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
# GET: store page to wayback machine and response with redirect to snapshot
|
|
32
|
-
# POST: store page to wayback machine and response with wayback machine status-page
|
|
33
|
-
# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
|
|
34
|
-
# tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
|
|
35
|
-
# tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
|
|
36
|
-
def save_page(url: str):
|
|
37
|
-
"""
|
|
38
|
-
Saves a webpage to the Wayback Machine.
|
|
39
|
-
|
|
40
|
-
Args:
|
|
41
|
-
url (str): The URL of the webpage to be saved.
|
|
42
|
-
|
|
43
|
-
Returns:
|
|
44
|
-
None: The function does not return any value. It only prints messages to the console.
|
|
45
|
-
"""
|
|
46
|
-
vb.write(message="\nSaving page to the Wayback Machine...")
|
|
47
|
-
connection = http.client.HTTPSConnection("web.archive.org")
|
|
48
|
-
headers = {
|
|
49
|
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
|
|
50
|
-
}
|
|
51
|
-
connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
|
|
52
|
-
vb.write(message="\n-----> Request sent")
|
|
53
|
-
response = connection.getresponse()
|
|
54
|
-
response_status = response.status
|
|
55
|
-
|
|
56
|
-
if response_status == 302:
|
|
57
|
-
location = response.getheader("Location")
|
|
58
|
-
vb.write(message="\n-----> Response: 302 (redirect to snapshot)")
|
|
59
|
-
snapshot_timestamp = datetime.strptime(url_get_timestamp(location), '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
|
|
60
|
-
current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
|
|
61
|
-
timestamp_difference = (datetime.strptime(current_timestamp, '%Y-%m-%d %H:%M:%S') - datetime.strptime(snapshot_timestamp, '%Y-%m-%d %H:%M:%S')).seconds / 60
|
|
62
|
-
timestamp_difference = int(round(timestamp_difference, 0))
|
|
63
|
-
|
|
64
|
-
if timestamp_difference < 1:
|
|
65
|
-
vb.write(message="\n-----> New snapshot created")
|
|
66
|
-
elif timestamp_difference > 1:
|
|
67
|
-
vb.write(message=f"\n-----> Snapshot already exists. (1 hour limit) - wait for {60 - timestamp_difference} minutes")
|
|
68
|
-
vb.write(message=f"TIMESTAMP SNAPSHOT: {snapshot_timestamp}")
|
|
69
|
-
vb.write(message=f"TIMESTAMP REQUEST : {current_timestamp}")
|
|
70
|
-
vb.write(message=f"\nLAST SNAPSHOT BACK: {timestamp_difference} minutes")
|
|
71
|
-
|
|
72
|
-
vb.write(message=f"\nURL: {location}")
|
|
73
|
-
|
|
74
|
-
elif response_status == 404:
|
|
75
|
-
vb.write(message="\n-----> Response: 404 (not found)")
|
|
76
|
-
vb.write(message=f"\nFAILED -> URL: {url}")
|
|
77
|
-
else:
|
|
78
|
-
vb.write(message="\n-----> Response: unexpected")
|
|
79
|
-
vb.write(message=f"\nFAILED -> URL: {url}")
|
|
21
|
+
from pywaybackup.db import Database
|
|
22
|
+
from pywaybackup.helper import check_nt, move_index, url_get_timestamp
|
|
80
23
|
|
|
81
|
-
connection.close()
|
|
82
24
|
|
|
83
25
|
|
|
84
26
|
|
|
@@ -89,7 +31,7 @@ def startup():
|
|
|
89
31
|
vb.write(message=f"\n<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>")
|
|
90
32
|
|
|
91
33
|
if Database.QUERY_EXIST:
|
|
92
|
-
vb.write(message=f"\
|
|
34
|
+
vb.write(message=f"\nDOWNLOAD job exist - processed: {Database.QUERY_PROGRESS}\nResuming download... (to reset the job use '--reset')\n")
|
|
93
35
|
|
|
94
36
|
for i in range(5, -1, -1):
|
|
95
37
|
vb.write(message=f"\r{i}...")
|
|
@@ -224,8 +166,6 @@ def download_loop(output, worker, retry, no_redirect, delay):
|
|
|
224
166
|
|
|
225
167
|
snapshot = sc.get_snapshot(db)
|
|
226
168
|
if not snapshot: break
|
|
227
|
-
# mark as locked for other workers // only visual because get_snapshot fetches by NULL
|
|
228
|
-
sc.modify_snapshot(db, snapshot["rowid"], "response", "LOCK")
|
|
229
169
|
SNAPSHOT_CURRENT = snapshot["rowid"]
|
|
230
170
|
|
|
231
171
|
retry_attempt = 1
|
|
@@ -273,18 +213,18 @@ def download_loop(output, worker, retry, no_redirect, delay):
|
|
|
273
213
|
|
|
274
214
|
# depends on user - retries after timeout or proceed to next snapshot
|
|
275
215
|
if retry > 0:
|
|
276
|
-
status_message.store(message=
|
|
216
|
+
status_message.store(status="FAILED", message="retry timeout: 15 seconds...")
|
|
277
217
|
status_message.write()
|
|
278
218
|
time.sleep(15)
|
|
279
219
|
else:
|
|
280
|
-
status_message.store(message=
|
|
220
|
+
status_message.store(status="FAILED", message="no attempt left")
|
|
281
221
|
status_message.write()
|
|
282
222
|
sc.SNAPSHOT_HANDLED += 1
|
|
283
223
|
break # break all loops and do a user-defined retry
|
|
284
224
|
|
|
285
225
|
retry_attempt += 1
|
|
286
226
|
# if retry_attempt > retry_max_attempt:
|
|
287
|
-
# status_message.store(status="FAILED",
|
|
227
|
+
# status_message.store(status="FAILED", message="Max retries exceeded")
|
|
288
228
|
# status_message.store(status="", type="URL", message=snapshot["url_archive"])
|
|
289
229
|
# status_message.write()
|
|
290
230
|
# vb.progress(1)
|
|
@@ -309,7 +249,7 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
|
|
|
309
249
|
response, response_data, response_status, response_status_message = download_response(connection, encoded_download_url, headers)
|
|
310
250
|
sc.modify_snapshot(db, snapshot_entry["rowid"], "response", response_status)
|
|
311
251
|
if not no_redirect and response_status == 302:
|
|
312
|
-
status_message.store(status="REDIRECT",
|
|
252
|
+
status_message.store(status="REDIRECT", message=f"{response.status} - {response_status_message}")
|
|
313
253
|
status_message.store(status="", type="FROM", message=download_url)
|
|
314
254
|
for _ in range(5):
|
|
315
255
|
response, response_data, response_status, response_status_message = download_response(connection, encoded_download_url, headers)
|
|
@@ -327,7 +267,7 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
|
|
|
327
267
|
|
|
328
268
|
# if output_file is too long for windows, skip download
|
|
329
269
|
if check_nt() and len(output_file) > 255:
|
|
330
|
-
status_message.store(status="PATH > 255",
|
|
270
|
+
status_message.store(status="PATH > 255", message=f"{response.status} - {response_status_message}")
|
|
331
271
|
status_message.store(status="", type="URL", message=download_url)
|
|
332
272
|
sc.entry_modify(snapshot_entry, "file", "PATH TOO LONG TO SAVE FILE")
|
|
333
273
|
#status_message.write()
|
|
@@ -348,9 +288,9 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
|
|
|
348
288
|
file.write(response_data)
|
|
349
289
|
# check if file is downloaded
|
|
350
290
|
if os.path.isfile(output_file):
|
|
351
|
-
status_message.store(status="SUCCESS",
|
|
291
|
+
status_message.store(status="SUCCESS", message=f"{response.status} - {response_status_message}")
|
|
352
292
|
else:
|
|
353
|
-
status_message.store(status="EXISTING",
|
|
293
|
+
status_message.store(status="EXISTING", message=f"{response.status} - {response_status_message}")
|
|
354
294
|
status_message.store(status="", type="URL", message=download_url)
|
|
355
295
|
status_message.store(status="", type="FILE", message=output_file)
|
|
356
296
|
sc.modify_snapshot(db, snapshot_entry["rowid"], "file", output_file)
|
|
@@ -359,7 +299,7 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
|
|
|
359
299
|
#status_message.write()
|
|
360
300
|
return True
|
|
361
301
|
else:
|
|
362
|
-
status_message.store(status="UNEXPECTED",
|
|
302
|
+
status_message.store(status="UNEXPECTED", message=f"{response.status} - {response_status_message}")
|
|
363
303
|
status_message.store(status="", type="URL", message=download_url)
|
|
364
304
|
#status_message.write()
|
|
365
305
|
return False
|
|
@@ -375,7 +315,7 @@ def download_response(connection, encoded_download_url, headers):
|
|
|
375
315
|
RESPONSE_CODE_DICT = {
|
|
376
316
|
200: "OK",
|
|
377
317
|
301: "Moved Permanently",
|
|
378
|
-
302: "
|
|
318
|
+
302: "Redirect",
|
|
379
319
|
400: "Bad Request",
|
|
380
320
|
403: "Forbidden",
|
|
381
321
|
404: "Not Found",
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import http.client
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
|
|
4
|
+
from importlib.metadata import version
|
|
5
|
+
|
|
6
|
+
from pywaybackup.helper import url_get_timestamp
|
|
7
|
+
from pywaybackup.Verbosity import Verbosity as vb
|
|
8
|
+
|
|
9
|
+
# def startup():
|
|
10
|
+
# try:
|
|
11
|
+
# vb.write(message=f"\n<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>")
|
|
12
|
+
|
|
13
|
+
# if Database.QUERY_EXIST:
|
|
14
|
+
# vb.write(message=f"\nSAVE job exist - processed {Database.QUERY_PROGRESS}\nResuming save... (to reset the job use '--reset')\n")
|
|
15
|
+
|
|
16
|
+
# for i in range(5, -1, -1):
|
|
17
|
+
# vb.write(message=f"\r{i}...")
|
|
18
|
+
# print("\033[F", end="")
|
|
19
|
+
# print("\033[K", end="")
|
|
20
|
+
|
|
21
|
+
# time.sleep(1)
|
|
22
|
+
|
|
23
|
+
# #vb.write(message="\n")
|
|
24
|
+
# except KeyboardInterrupt:
|
|
25
|
+
# os._exit(1)
|
|
26
|
+
|
|
27
|
+
# GET: store page to wayback machine and response with redirect to snapshot
|
|
28
|
+
# POST: store page to wayback machine and response with wayback machine status-page
|
|
29
|
+
# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
|
|
30
|
+
# tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
|
|
31
|
+
# tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
|
|
32
|
+
def save_page(url: str):
|
|
33
|
+
"""
|
|
34
|
+
Saves a webpage to the Wayback Machine.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
url (str): The URL of the webpage to be saved.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
None: The function does not return any value. It only prints messages to the console.
|
|
41
|
+
"""
|
|
42
|
+
try:
|
|
43
|
+
connection = http.client.HTTPSConnection("web.archive.org")
|
|
44
|
+
headers = {"User-Agent": f"bitdruid-python-wayback-downloader/{version('pywaybackup')}"}
|
|
45
|
+
vb.write(message="\nSaving page to the Wayback Machine...")
|
|
46
|
+
connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
|
|
47
|
+
vb.write(message=f"\n-----> Request sent -> URL: {url}")
|
|
48
|
+
response = connection.getresponse()
|
|
49
|
+
response_status = response.status
|
|
50
|
+
|
|
51
|
+
if response_status == 302:
|
|
52
|
+
location = response.getheader("Location")
|
|
53
|
+
snapshot_timestamp = datetime.strptime(url_get_timestamp(location), '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
|
|
54
|
+
current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
|
|
55
|
+
timestamp_difference = (datetime.strptime(current_timestamp, '%Y-%m-%d %H:%M:%S') - datetime.strptime(snapshot_timestamp, '%Y-%m-%d %H:%M:%S')).seconds / 60
|
|
56
|
+
timestamp_difference = int(round(timestamp_difference, 0))
|
|
57
|
+
|
|
58
|
+
if timestamp_difference < 1:
|
|
59
|
+
vb.write(message="\n-----> Response: 302 (new snapshot)")
|
|
60
|
+
vb.write(status="SNAPSHOT", type="URL", message=f"{location}")
|
|
61
|
+
elif timestamp_difference >= 1:
|
|
62
|
+
vb.write(message=f"\n-----> Response: 302 (existing snapshot - wait for {60 - timestamp_difference} minutes)")
|
|
63
|
+
vb.write(status="SNAPSHOT", type="URL", message=f"{location}")
|
|
64
|
+
vb.write(status="WAYBACK", type="TIME", message=f"{snapshot_timestamp}")
|
|
65
|
+
vb.write(status="REQUEST", type="TIME", message=f"{current_timestamp}")
|
|
66
|
+
|
|
67
|
+
elif response_status == 429:
|
|
68
|
+
vb.write(message="\n-----> Response: 429 (too many requests)")
|
|
69
|
+
vb.write(message="- no simultaneous allowed")
|
|
70
|
+
vb.write(message="- 15 per 5 minutes\n")
|
|
71
|
+
elif response_status == 520:
|
|
72
|
+
vb.write(message="\n-----> Response: 520 (job failed)")
|
|
73
|
+
elif response_status == 404:
|
|
74
|
+
vb.write(message="\n-----> Response: 404 (not found)")
|
|
75
|
+
else:
|
|
76
|
+
vb.write(message=f"\n-----> Response: {response_status} - UNHANDLED")
|
|
77
|
+
|
|
78
|
+
connection.close()
|
|
79
|
+
except ConnectionRefusedError:
|
|
80
|
+
vb.write(message="\nCONNECTION REFUSED -> could not connect to wayback machine")
|
|
81
|
+
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import sqlite3
|
|
1
|
+
import pysqlite3 as sqlite3
|
|
2
2
|
|
|
3
3
|
class Database:
|
|
4
4
|
|
|
@@ -40,7 +40,6 @@ class Database:
|
|
|
40
40
|
db.cursor.execute(cls.snapshot_table)
|
|
41
41
|
db.cursor.execute("SELECT query_identifier FROM waybackup_table WHERE query_identifier = ?", (query_identifier,))
|
|
42
42
|
if db.cursor.fetchone():
|
|
43
|
-
print("found")
|
|
44
43
|
cls.QUERY_EXIST = True
|
|
45
44
|
cls.QUERY_PROGRESS = db.get_progress()
|
|
46
45
|
else:
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
import os
|
|
3
2
|
import shutil
|
|
4
3
|
import magic
|
|
@@ -15,12 +14,13 @@ def sanitize_filename(input: str) -> str:
|
|
|
15
14
|
"""
|
|
16
15
|
Sanitize a string to be used as (part of) a filename.
|
|
17
16
|
"""
|
|
18
|
-
disallowed = [
|
|
17
|
+
disallowed = ["<", ">", ":", '"', "/", "\\", "|", "?", "*"]
|
|
19
18
|
for char in disallowed:
|
|
20
19
|
input = input.replace(char, ".")
|
|
21
|
-
input =
|
|
20
|
+
input = ".".join(filter(None, input.split(".")))
|
|
22
21
|
return input
|
|
23
22
|
|
|
23
|
+
|
|
24
24
|
def sanitize_url(input: str) -> str:
|
|
25
25
|
"""
|
|
26
26
|
Sanitize a url by encoding special characters.
|
|
@@ -32,12 +32,13 @@ def sanitize_url(input: str) -> str:
|
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def url_get_timestamp(url):
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
35
|
+
"""
|
|
36
|
+
Extract the timestamp from a wayback machine URL.
|
|
37
|
+
"""
|
|
38
|
+
timestamp = url.split("web/")[1].split("/")[0]
|
|
39
|
+
if "id_" in url:
|
|
40
|
+
timestamp = timestamp.split("id_")[0]
|
|
41
|
+
return timestamp
|
|
41
42
|
|
|
42
43
|
|
|
43
44
|
def url_split(url, index=False):
|
|
@@ -52,8 +53,8 @@ def url_split(url, index=False):
|
|
|
52
53
|
if "://" in url:
|
|
53
54
|
url = url.split("://")[1]
|
|
54
55
|
domain = url.split("/")[0]
|
|
55
|
-
path = url[len(domain):]
|
|
56
|
-
domain = domain.split("@")[-1].split(":")[0]
|
|
56
|
+
path = url[len(domain) :]
|
|
57
|
+
domain = domain.split("@")[-1].split(":")[0] # remove mailto and port
|
|
57
58
|
path_parts = path.split("/")
|
|
58
59
|
path_end = path_parts[-1]
|
|
59
60
|
if not url.endswith("/") or "." in path_end:
|
|
@@ -87,21 +88,25 @@ def move_index(existpath: str = None, existfile: str = None, filebuffer: bytes =
|
|
|
87
88
|
shutil.move(existpath, existpath + "_exist")
|
|
88
89
|
os.makedirs(existpath, exist_ok=True)
|
|
89
90
|
if not check_index_mime(existpath):
|
|
90
|
-
new_file = os.path.join(
|
|
91
|
+
new_file = os.path.join(
|
|
92
|
+
existpath, os.path.basename(os.path.normpath(existpath))
|
|
93
|
+
)
|
|
91
94
|
else:
|
|
92
95
|
new_file = os.path.join(existpath, "index.html")
|
|
93
96
|
shutil.move(existpath + "_exist", new_file)
|
|
94
97
|
elif existfile:
|
|
95
98
|
if filebuffer:
|
|
96
99
|
if not check_index_mime(filebuffer):
|
|
97
|
-
return os.path.join(
|
|
100
|
+
return os.path.join(
|
|
101
|
+
existfile, os.path.basename(os.path.normpath(existfile))
|
|
102
|
+
)
|
|
98
103
|
else:
|
|
99
104
|
return os.path.join(existfile, "index.html")
|
|
100
|
-
|
|
105
|
+
|
|
101
106
|
|
|
102
107
|
def check_index_mime(filebuffer: bytes) -> bool:
|
|
103
108
|
mime = magic.Magic(mime=True)
|
|
104
109
|
mime_type = mime.from_buffer(filebuffer)
|
|
105
110
|
if mime_type != "text/html":
|
|
106
111
|
return False
|
|
107
|
-
return True
|
|
112
|
+
return True
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import os
|
|
2
|
-
|
|
3
2
|
import signal
|
|
4
3
|
|
|
5
|
-
import pywaybackup.archive as archive
|
|
4
|
+
import pywaybackup.archive_download as archive_download
|
|
5
|
+
import pywaybackup.archive_save as archive_save
|
|
6
6
|
|
|
7
7
|
from pywaybackup.SnapshotCollection import SnapshotCollection as sc
|
|
8
|
-
|
|
9
8
|
from pywaybackup.Arguments import Configuration as config
|
|
10
9
|
from pywaybackup.db import Database as db
|
|
11
10
|
from pywaybackup.Verbosity import Verbosity as vb
|
|
@@ -16,19 +15,22 @@ def main():
|
|
|
16
15
|
config.init()
|
|
17
16
|
ex.init(config.output, config.command)
|
|
18
17
|
vb.init(config.progress, config.log)
|
|
18
|
+
|
|
19
|
+
if config.save:
|
|
20
|
+
archive_save.save_page(config.url)
|
|
21
|
+
os._exit(1)
|
|
22
|
+
|
|
19
23
|
db.init(config.dbfile, config.query_identifier)
|
|
20
24
|
sc.init(config.mode)
|
|
21
25
|
|
|
22
|
-
if config.save:
|
|
23
|
-
archive.save_page(config.url)
|
|
24
26
|
|
|
25
|
-
|
|
27
|
+
if not config.save:
|
|
26
28
|
|
|
27
|
-
|
|
29
|
+
archive_download.startup()
|
|
28
30
|
|
|
29
31
|
try:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
+
archive_download.query_list(config.csvfile, config.cdxfile, config.range, config.limit, config.start, config.end, config.explicit, config.filetype)
|
|
33
|
+
archive_download.download_list(config.output, config.retry, config.no_redirect, config.delay, config.workers)
|
|
32
34
|
except KeyboardInterrupt:
|
|
33
35
|
print("\nInterrupted by user\n")
|
|
34
36
|
config.keep = True
|
|
@@ -44,8 +46,8 @@ def main():
|
|
|
44
46
|
vb.fini()
|
|
45
47
|
|
|
46
48
|
if not config.keep:
|
|
47
|
-
os.remove(config.dbfile)
|
|
48
|
-
os.remove(config.cdxfile)
|
|
49
|
+
os.remove(config.dbfile) if os.path.exists(config.dbfile) else None
|
|
50
|
+
os.remove(config.cdxfile) if os.path.exists(config.cdxfile) else None
|
|
49
51
|
|
|
50
52
|
os._exit(1)
|
|
51
53
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 3.0.4
|
|
3
|
+
Version: 3.1.0
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
+
Requires-Dist: pysqlite3-binary==0.5.4
|
|
32
33
|
Requires-Dist: requests==2.31.0
|
|
33
34
|
Requires-Dist: tqdm==4.66.2
|
|
34
35
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
@@ -39,7 +40,7 @@ Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
|
|
|
39
40
|
[](https://pypi.org/project/pywaybackup/)
|
|
40
41
|
[](https://pypi.org/project/pywaybackup/)
|
|
41
42
|

|
|
42
|
-
 -->
|
|
43
44
|
[](https://opensource.org/licenses/MIT)
|
|
44
45
|
|
|
45
46
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -7,7 +7,8 @@ pywaybackup/Exception.py
|
|
|
7
7
|
pywaybackup/SnapshotCollection.py
|
|
8
8
|
pywaybackup/Verbosity.py
|
|
9
9
|
pywaybackup/__init__.py
|
|
10
|
-
pywaybackup/archive.py
|
|
10
|
+
pywaybackup/archive_download.py
|
|
11
|
+
pywaybackup/archive_save.py
|
|
11
12
|
pywaybackup/db.py
|
|
12
13
|
pywaybackup/helper.py
|
|
13
14
|
pywaybackup/main.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|