pywaybackup 4.1.2__tar.gz → 4.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-4.1.2/pywaybackup.egg-info → pywaybackup-4.1.3}/PKG-INFO +2 -1
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pyproject.toml +18 -10
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/Arguments.py +1 -1
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/PyWayBackup.py +6 -2
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/archive_download.py +11 -7
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/files.py +9 -1
- {pywaybackup-4.1.2 → pywaybackup-4.1.3/pywaybackup.egg-info}/PKG-INFO +2 -1
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup.egg-info/requires.txt +1 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/LICENSE +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/README.md +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/Exception.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/Snapshot.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/SnapshotCollection.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/Verbosity.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/Worker.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/__init__.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/archive_save.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/db.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/helper.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup/main.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup.egg-info/SOURCES.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.2
|
|
3
|
+
Version: 4.1.3
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
+
Requires-Dist: ruff
|
|
32
33
|
Requires-Dist: SQLAlchemy==2.0.43
|
|
33
34
|
Requires-Dist: requests==2.32.3
|
|
34
35
|
Requires-Dist: tqdm==4.67.1
|
|
@@ -2,20 +2,14 @@
|
|
|
2
2
|
requires = ["setuptools", "wheel"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
|
+
|
|
5
6
|
[tool.setuptools]
|
|
6
7
|
packages = ["pywaybackup"]
|
|
7
8
|
|
|
9
|
+
|
|
8
10
|
[project]
|
|
9
|
-
name = "pywaybackup"
|
|
10
|
-
version = "4.1.2"
|
|
11
|
-
description = "Query and download archive.org as simple as possible."
|
|
12
|
-
authors = [
|
|
13
|
-
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
14
|
-
]
|
|
15
|
-
license = { file = "LICENSE" }
|
|
16
|
-
readme = "README.md"
|
|
17
|
-
requires-python = ">=3.8"
|
|
18
11
|
dependencies = [
|
|
12
|
+
"ruff",
|
|
19
13
|
"SQLAlchemy==2.0.43",
|
|
20
14
|
"requests==2.32.3",
|
|
21
15
|
"tqdm==4.67.1",
|
|
@@ -23,9 +17,23 @@ dependencies = [
|
|
|
23
17
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
24
18
|
"python-magic-bin==0.4.14; sys_platform == 'win32' or sys_platform == 'darwin'",
|
|
25
19
|
]
|
|
20
|
+
name = "pywaybackup"
|
|
21
|
+
version = "4.1.3"
|
|
22
|
+
description = "Query and download archive.org as simple as possible."
|
|
23
|
+
authors = [{ name = "bitdruid", email = "bitdruid@outlook.com" }]
|
|
24
|
+
license = { file = "LICENSE" }
|
|
25
|
+
readme = "README.md"
|
|
26
|
+
requires-python = ">=3.8"
|
|
27
|
+
|
|
26
28
|
|
|
27
29
|
[project.scripts]
|
|
28
30
|
waybackup = "pywaybackup.main:cli"
|
|
29
31
|
|
|
32
|
+
|
|
30
33
|
[project.urls]
|
|
31
|
-
homepage = "https://github.com/bitdruid/python-wayback-machine-downloader"
|
|
34
|
+
homepage = "https://github.com/bitdruid/python-wayback-machine-downloader"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
[tool.ruff]
|
|
38
|
+
line-length = 120
|
|
39
|
+
exclude = ["pywaybackup/Arguments.py"]
|
|
@@ -40,6 +40,7 @@ class Arguments:
|
|
|
40
40
|
behavior.add_argument("--retry", type=int, default=0, metavar="", help="retry failed downloads (opt tries as int, else infinite)")
|
|
41
41
|
behavior.add_argument("--workers", type=int, default=1, metavar="", help="number of workers (simultaneous downloads)")
|
|
42
42
|
behavior.add_argument("--delay", type=int, default=0, metavar="", help="delay between each download in seconds")
|
|
43
|
+
behavior.add_argument("--wait", type=int, default=15, metavar="", help="seconds to wait before renewing connection after HTTP errors or snapshot download errors (default: 15)",)
|
|
43
44
|
|
|
44
45
|
special = parser.add_argument_group("special")
|
|
45
46
|
special.add_argument("--reset", action="store_true", help="reset the job and ignore existing cdx/db/csv files")
|
|
@@ -55,4 +56,3 @@ class Arguments:
|
|
|
55
56
|
def get_args(self) -> dict:
|
|
56
57
|
"""Returns the parsed arguments as a dictionary."""
|
|
57
58
|
return vars(self.args)
|
|
58
|
-
|
|
@@ -9,7 +9,7 @@ import pywaybackup.archive_save as archive_save
|
|
|
9
9
|
from pywaybackup.archive_download import DownloadArchive
|
|
10
10
|
from pywaybackup.db import Database as db
|
|
11
11
|
from pywaybackup.Exception import Exception as ex
|
|
12
|
-
from pywaybackup.files import CDXfile, CDXquery, CSVfile
|
|
12
|
+
from pywaybackup.files import CDXfile, CDXquery, CSVfile
|
|
13
13
|
from pywaybackup.helper import sanitize_filename, url_split
|
|
14
14
|
from pywaybackup.SnapshotCollection import SnapshotCollection
|
|
15
15
|
from pywaybackup.Verbosity import Verbosity as vb
|
|
@@ -23,7 +23,7 @@ class _Status:
|
|
|
23
23
|
|
|
24
24
|
Attributes:
|
|
25
25
|
sc (SnapshotCollection): The current snapshot collection being processed.
|
|
26
|
-
task (str): The current task being performed (e.g., 'initializing', 'downloading cdx',
|
|
26
|
+
task (str): The current task being performed (e.g., 'initializing', 'downloading cdx', ...).
|
|
27
27
|
handled (int): The number of snapshots that have been processed so far.
|
|
28
28
|
total (int): The total number of snapshots to be processed.
|
|
29
29
|
progress (float): The progress of the backup process as a percentage.
|
|
@@ -129,6 +129,7 @@ class PyWayBackup:
|
|
|
129
129
|
retry: int = 0,
|
|
130
130
|
workers: int = 1,
|
|
131
131
|
delay: int = 0,
|
|
132
|
+
wait: int = 15,
|
|
132
133
|
reset: bool = False,
|
|
133
134
|
keep: bool = False,
|
|
134
135
|
silent: bool = True,
|
|
@@ -156,6 +157,8 @@ class PyWayBackup:
|
|
|
156
157
|
self._retry = retry
|
|
157
158
|
self._workers = workers
|
|
158
159
|
self._delay = delay
|
|
160
|
+
self._wait = wait
|
|
161
|
+
|
|
159
162
|
self._reset = reset
|
|
160
163
|
self._keep = keep
|
|
161
164
|
|
|
@@ -344,6 +347,7 @@ class PyWayBackup:
|
|
|
344
347
|
retry=self._retry,
|
|
345
348
|
no_redirect=self._no_redirect,
|
|
346
349
|
delay=self._delay,
|
|
350
|
+
wait=self._wait,
|
|
347
351
|
workers=self._workers,
|
|
348
352
|
)
|
|
349
353
|
downloader.run(SnapshotCollection=collection)
|
|
@@ -81,7 +81,7 @@ class DownloadArchive:
|
|
|
81
81
|
sc (SnapshotCollection): The snapshot collection being processed.
|
|
82
82
|
"""
|
|
83
83
|
|
|
84
|
-
def __init__(self, mode: str, output: str, retry: int, no_redirect: bool, delay: int, workers: int):
|
|
84
|
+
def __init__(self, mode: str, output: str, retry: int, no_redirect: bool, delay: int, wait: int, workers: int):
|
|
85
85
|
"""
|
|
86
86
|
Initialize the download manager with configuration options.
|
|
87
87
|
|
|
@@ -98,8 +98,8 @@ class DownloadArchive:
|
|
|
98
98
|
self.retry = retry
|
|
99
99
|
self.no_redirect = no_redirect
|
|
100
100
|
self.delay = delay
|
|
101
|
+
self.wait = wait
|
|
101
102
|
self.workers = workers
|
|
102
|
-
self.no_redirect = no_redirect
|
|
103
103
|
self.sc = None
|
|
104
104
|
|
|
105
105
|
def run(self, SnapshotCollection: SnapshotCollection):
|
|
@@ -208,7 +208,7 @@ class DownloadArchive:
|
|
|
208
208
|
f"\n-----> Worker: {worker.id}"
|
|
209
209
|
f" - Attempt: [{worker.attempt}/{retry_max_attempt}]"
|
|
210
210
|
f" Snapshot ID: [{worker.snapshot.counter}/{self.sc._snapshot_total}]"
|
|
211
|
-
f" - {e.__class__.__name__} - renewing connection in
|
|
211
|
+
f" - {e.__class__.__name__} - renewing connection in {self.wait * download_attempt} seconds..."
|
|
212
212
|
),
|
|
213
213
|
)
|
|
214
214
|
vb.write(
|
|
@@ -216,10 +216,10 @@ class DownloadArchive:
|
|
|
216
216
|
content=(
|
|
217
217
|
f"Worker: {worker.id}"
|
|
218
218
|
f" - Snapshot {worker.snapshot.counter}/{self.sc._snapshot_total}"
|
|
219
|
-
f" - renewing connection in
|
|
219
|
+
f" - renewing connection in {self.wait * download_attempt} seconds..."
|
|
220
220
|
),
|
|
221
221
|
)
|
|
222
|
-
time.sleep(
|
|
222
|
+
time.sleep(self.wait * download_attempt)
|
|
223
223
|
worker.refresh_connection()
|
|
224
224
|
continue
|
|
225
225
|
else:
|
|
@@ -244,9 +244,13 @@ class DownloadArchive:
|
|
|
244
244
|
|
|
245
245
|
# depends on user - retries after timeout or proceed to next snapshot
|
|
246
246
|
if self.retry > 0:
|
|
247
|
-
worker.message.store(
|
|
247
|
+
worker.message.store(
|
|
248
|
+
verbose=True,
|
|
249
|
+
result="FAILED",
|
|
250
|
+
content=f"retry timeout: {self.wait * worker.attempt} seconds...",
|
|
251
|
+
)
|
|
248
252
|
worker.message.write()
|
|
249
|
-
time.sleep(
|
|
253
|
+
time.sleep(self.wait * worker.attempt)
|
|
250
254
|
else:
|
|
251
255
|
worker.message.store(verbose=None, result="FAILED", content="no attempt left")
|
|
252
256
|
worker.message.write()
|
|
@@ -59,7 +59,15 @@ class CDXquery:
|
|
|
59
59
|
)
|
|
60
60
|
filter_filetype = f"&filter=original:.*\\.({'|'.join(self.filter_filetype)})$" if self.filter_filetype else ""
|
|
61
61
|
|
|
62
|
-
return
|
|
62
|
+
return (
|
|
63
|
+
f"https://web.archive.org/cdx/search/cdx?"
|
|
64
|
+
f"output=json"
|
|
65
|
+
f"&url={cdx_url}{period}"
|
|
66
|
+
f"&fl=timestamp,digest,mimetype,statuscode,original"
|
|
67
|
+
f"{limit}"
|
|
68
|
+
f"{filter_filetype}"
|
|
69
|
+
f"{filter_statuscode}"
|
|
70
|
+
)
|
|
63
71
|
|
|
64
72
|
|
|
65
73
|
class File:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.2
|
|
3
|
+
Version: 4.1.3
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
+
Requires-Dist: ruff
|
|
32
33
|
Requires-Dist: SQLAlchemy==2.0.43
|
|
33
34
|
Requires-Dist: requests==2.32.3
|
|
34
35
|
Requires-Dist: tqdm==4.67.1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|