pywaybackup 4.1.2__tar.gz → 4.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-4.1.2/pywaybackup.egg-info → pywaybackup-4.1.4}/PKG-INFO +12 -3
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/README.md +10 -2
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pyproject.toml +18 -10
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/Arguments.py +15 -2
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/PyWayBackup.py +8 -3
- pywaybackup-4.1.4/pywaybackup/Verbosity.py +221 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/archive_download.py +16 -10
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/files.py +9 -1
- {pywaybackup-4.1.2 → pywaybackup-4.1.4/pywaybackup.egg-info}/PKG-INFO +12 -3
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup.egg-info/requires.txt +1 -0
- pywaybackup-4.1.2/pywaybackup/Verbosity.py +0 -136
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/LICENSE +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/Exception.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/Snapshot.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/SnapshotCollection.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/Worker.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/__init__.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/archive_save.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/db.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/helper.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup/main.py +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup.egg-info/SOURCES.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-4.1.2 → pywaybackup-4.1.4}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.2
|
|
3
|
+
Version: 4.1.4
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
+
Requires-Dist: ruff
|
|
32
33
|
Requires-Dist: SQLAlchemy==2.0.43
|
|
33
34
|
Requires-Dist: requests==2.32.3
|
|
34
35
|
Requires-Dist: tqdm==4.67.1
|
|
@@ -216,8 +217,13 @@ Parameters will change the download behavior for snapshots.
|
|
|
216
217
|
- **`-m`**, **`--metadata`**<br>
|
|
217
218
|
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
218
219
|
|
|
219
|
-
- **`--verbose`**:
|
|
220
|
-
|
|
220
|
+
- **`-v`**, **`--verbose`** `[level]`:<br>
|
|
221
|
+
Set verbosity level. Available levels:
|
|
222
|
+
- `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
|
|
223
|
+
- `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
|
|
224
|
+
- `high` (or `debug`, `detailed`, `max`): Detailed verbose output
|
|
225
|
+
|
|
226
|
+
Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
|
|
221
227
|
|
|
222
228
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
223
229
|
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
@@ -237,6 +243,9 @@ Parameters will change the download behavior for snapshots.
|
|
|
237
243
|
- **`--delay`** `<seconds>`:<br>
|
|
238
244
|
Delay between download requests in seconds. Default is no delay (0).
|
|
239
245
|
|
|
246
|
+
- **`--wait`** `<seconds>`:<br>
|
|
247
|
+
Seconds to wait before renewing connection after HTTP errors or snapshot download errors. Default is 15 seconds.
|
|
248
|
+
|
|
240
249
|
#### Job Handling:
|
|
241
250
|
|
|
242
251
|
- **`--reset`**:
|
|
@@ -178,8 +178,13 @@ Parameters will change the download behavior for snapshots.
|
|
|
178
178
|
- **`-m`**, **`--metadata`**<br>
|
|
179
179
|
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
180
180
|
|
|
181
|
-
- **`--verbose`**:
|
|
182
|
-
|
|
181
|
+
- **`-v`**, **`--verbose`** `[level]`:<br>
|
|
182
|
+
Set verbosity level. Available levels:
|
|
183
|
+
- `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
|
|
184
|
+
- `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
|
|
185
|
+
- `high` (or `debug`, `detailed`, `max`): Detailed verbose output
|
|
186
|
+
|
|
187
|
+
Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
|
|
183
188
|
|
|
184
189
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
185
190
|
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
@@ -199,6 +204,9 @@ Parameters will change the download behavior for snapshots.
|
|
|
199
204
|
- **`--delay`** `<seconds>`:<br>
|
|
200
205
|
Delay between download requests in seconds. Default is no delay (0).
|
|
201
206
|
|
|
207
|
+
- **`--wait`** `<seconds>`:<br>
|
|
208
|
+
Seconds to wait before renewing connection after HTTP errors or snapshot download errors. Default is 15 seconds.
|
|
209
|
+
|
|
202
210
|
#### Job Handling:
|
|
203
211
|
|
|
204
212
|
- **`--reset`**:
|
|
@@ -2,20 +2,14 @@
|
|
|
2
2
|
requires = ["setuptools", "wheel"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
|
+
|
|
5
6
|
[tool.setuptools]
|
|
6
7
|
packages = ["pywaybackup"]
|
|
7
8
|
|
|
9
|
+
|
|
8
10
|
[project]
|
|
9
|
-
name = "pywaybackup"
|
|
10
|
-
version = "4.1.2"
|
|
11
|
-
description = "Query and download archive.org as simple as possible."
|
|
12
|
-
authors = [
|
|
13
|
-
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
14
|
-
]
|
|
15
|
-
license = { file = "LICENSE" }
|
|
16
|
-
readme = "README.md"
|
|
17
|
-
requires-python = ">=3.8"
|
|
18
11
|
dependencies = [
|
|
12
|
+
"ruff",
|
|
19
13
|
"SQLAlchemy==2.0.43",
|
|
20
14
|
"requests==2.32.3",
|
|
21
15
|
"tqdm==4.67.1",
|
|
@@ -23,9 +17,23 @@ dependencies = [
|
|
|
23
17
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
24
18
|
"python-magic-bin==0.4.14; sys_platform == 'win32' or sys_platform == 'darwin'",
|
|
25
19
|
]
|
|
20
|
+
name = "pywaybackup"
|
|
21
|
+
version = "4.1.4"
|
|
22
|
+
description = "Query and download archive.org as simple as possible."
|
|
23
|
+
authors = [{ name = "bitdruid", email = "bitdruid@outlook.com" }]
|
|
24
|
+
license = { file = "LICENSE" }
|
|
25
|
+
readme = "README.md"
|
|
26
|
+
requires-python = ">=3.8"
|
|
27
|
+
|
|
26
28
|
|
|
27
29
|
[project.scripts]
|
|
28
30
|
waybackup = "pywaybackup.main:cli"
|
|
29
31
|
|
|
32
|
+
|
|
30
33
|
[project.urls]
|
|
31
|
-
homepage = "https://github.com/bitdruid/python-wayback-machine-downloader"
|
|
34
|
+
homepage = "https://github.com/bitdruid/python-wayback-machine-downloader"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
[tool.ruff]
|
|
38
|
+
line-length = 120
|
|
39
|
+
exclude = ["pywaybackup/Arguments.py"]
|
|
@@ -33,13 +33,27 @@ class Arguments:
|
|
|
33
33
|
behavior = parser.add_argument_group("manipulate behavior")
|
|
34
34
|
behavior.add_argument("-o", "--output", type=str, metavar="", help="output for all files - defaults to current directory")
|
|
35
35
|
behavior.add_argument("-m", "--metadata", type=str, metavar="", help="change directory for db/cdx/csv/log files")
|
|
36
|
-
behavior.add_argument(
|
|
36
|
+
behavior.add_argument(
|
|
37
|
+
"-v", "--verbose",
|
|
38
|
+
type=str,
|
|
39
|
+
nargs="?",
|
|
40
|
+
const="default",
|
|
41
|
+
metavar="",
|
|
42
|
+
help="verbosity level: low, default, high (default if flag set without value)",
|
|
43
|
+
)
|
|
37
44
|
behavior.add_argument("--log", action="store_true", help="save a log file into the output folder")
|
|
38
45
|
behavior.add_argument("--progress", action="store_true", help="show a progress bar")
|
|
39
46
|
behavior.add_argument("--no-redirect", action="store_true", help="do not follow redirects by archive.org")
|
|
40
47
|
behavior.add_argument("--retry", type=int, default=0, metavar="", help="retry failed downloads (opt tries as int, else infinite)")
|
|
41
48
|
behavior.add_argument("--workers", type=int, default=1, metavar="", help="number of workers (simultaneous downloads)")
|
|
42
49
|
behavior.add_argument("--delay", type=int, default=0, metavar="", help="delay between each download in seconds")
|
|
50
|
+
behavior.add_argument(
|
|
51
|
+
"--wait",
|
|
52
|
+
type=int,
|
|
53
|
+
default=15,
|
|
54
|
+
metavar="",
|
|
55
|
+
help="seconds to wait before renewing connection after HTTP errors or snapshot download errors (default: 15)",
|
|
56
|
+
)
|
|
43
57
|
|
|
44
58
|
special = parser.add_argument_group("special")
|
|
45
59
|
special.add_argument("--reset", action="store_true", help="reset the job and ignore existing cdx/db/csv files")
|
|
@@ -55,4 +69,3 @@ class Arguments:
|
|
|
55
69
|
def get_args(self) -> dict:
|
|
56
70
|
"""Returns the parsed arguments as a dictionary."""
|
|
57
71
|
return vars(self.args)
|
|
58
|
-
|
|
@@ -4,12 +4,13 @@ import signal
|
|
|
4
4
|
import sys
|
|
5
5
|
import time
|
|
6
6
|
from importlib.metadata import version
|
|
7
|
+
from typing import Union
|
|
7
8
|
|
|
8
9
|
import pywaybackup.archive_save as archive_save
|
|
9
10
|
from pywaybackup.archive_download import DownloadArchive
|
|
10
11
|
from pywaybackup.db import Database as db
|
|
11
12
|
from pywaybackup.Exception import Exception as ex
|
|
12
|
-
from pywaybackup.files import CDXfile, CDXquery, CSVfile
|
|
13
|
+
from pywaybackup.files import CDXfile, CDXquery, CSVfile
|
|
13
14
|
from pywaybackup.helper import sanitize_filename, url_split
|
|
14
15
|
from pywaybackup.SnapshotCollection import SnapshotCollection
|
|
15
16
|
from pywaybackup.Verbosity import Verbosity as vb
|
|
@@ -23,7 +24,7 @@ class _Status:
|
|
|
23
24
|
|
|
24
25
|
Attributes:
|
|
25
26
|
sc (SnapshotCollection): The current snapshot collection being processed.
|
|
26
|
-
task (str): The current task being performed (e.g., 'initializing', 'downloading cdx',
|
|
27
|
+
task (str): The current task being performed (e.g., 'initializing', 'downloading cdx', ...).
|
|
27
28
|
handled (int): The number of snapshots that have been processed so far.
|
|
28
29
|
total (int): The total number of snapshots to be processed.
|
|
29
30
|
progress (float): The progress of the backup process as a percentage.
|
|
@@ -122,13 +123,14 @@ class PyWayBackup:
|
|
|
122
123
|
statuscode: str = None,
|
|
123
124
|
output: str = None,
|
|
124
125
|
metadata: str = None,
|
|
125
|
-
verbose: bool =
|
|
126
|
+
verbose: Union[bool, str, int] = None,
|
|
126
127
|
log: bool = False,
|
|
127
128
|
progress: bool = False,
|
|
128
129
|
no_redirect: bool = False,
|
|
129
130
|
retry: int = 0,
|
|
130
131
|
workers: int = 1,
|
|
131
132
|
delay: int = 0,
|
|
133
|
+
wait: int = 15,
|
|
132
134
|
reset: bool = False,
|
|
133
135
|
keep: bool = False,
|
|
134
136
|
silent: bool = True,
|
|
@@ -156,6 +158,8 @@ class PyWayBackup:
|
|
|
156
158
|
self._retry = retry
|
|
157
159
|
self._workers = workers
|
|
158
160
|
self._delay = delay
|
|
161
|
+
self._wait = wait
|
|
162
|
+
|
|
159
163
|
self._reset = reset
|
|
160
164
|
self._keep = keep
|
|
161
165
|
|
|
@@ -344,6 +348,7 @@ class PyWayBackup:
|
|
|
344
348
|
retry=self._retry,
|
|
345
349
|
no_redirect=self._no_redirect,
|
|
346
350
|
delay=self._delay,
|
|
351
|
+
wait=self._wait,
|
|
347
352
|
workers=self._workers,
|
|
348
353
|
)
|
|
349
354
|
downloader.run(SnapshotCollection=collection)
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from enum import IntEnum
|
|
2
|
+
from tqdm import tqdm
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Kept outside the enum body so the mapping does not become an enum member.
# Keys are upper-cased alias names, values are canonical VerbosityLevel names.
_VERBOSITY_ALIASES = {
    "NORMAL": "DEFAULT",
    "VERBOSE": "DEFAULT",
    "INFO": "DEFAULT",
    "DEBUG": "HIGH",
    "DETAIL": "HIGH",
    "DETAILED": "HIGH",
    "MAX": "HIGH",
    "QUIET": "LOW",
    "MINIMAL": "LOW",
    "MIN": "LOW",
}


class VerbosityLevel(IntEnum):
    """
    Verbosity levels for output control.

    - LOW: Essential output only (no verbose flag)
    - DEFAULT: Standard verbose output (--verbose or --verbose default)
    - HIGH: Detailed verbose output (--verbose high)
    """

    LOW = 0
    DEFAULT = 1
    HIGH = 2

    @classmethod
    def from_value(cls, value) -> "VerbosityLevel":
        """
        Convert various input types to VerbosityLevel.

        Args:
            value: Can be:
                - None/False: LOW
                - True: DEFAULT
                - str: "low", "default", "high" or any alias listed in
                  _VERBOSITY_ALIASES (case-insensitive, surrounding
                  whitespace ignored)
                - int: 0, 1, 2
                - VerbosityLevel: returned as-is
                - any other type: LOW

        Returns:
            VerbosityLevel enum value

        Raises:
            ValueError: If a string is not a valid level or alias, or an
                integer is outside 0..2.
        """
        # Booleans are checked by identity before the int branch because
        # bool is a subclass of int.
        if value is None or value is False:
            return cls.LOW
        if value is True:
            return cls.DEFAULT
        if isinstance(value, cls):
            return value
        if isinstance(value, int):
            try:
                return cls(value)
            except ValueError:
                # "from None" hides the enum's own lookup error from the traceback.
                raise ValueError(
                    f"Invalid verbosity level: {value}. Valid levels: 0 (low), 1 (default), 2 (high)"
                ) from None
        if isinstance(value, str):
            key = value.strip().upper()
            # Resolve an alias to its canonical member name; canonical names pass through.
            key = _VERBOSITY_ALIASES.get(key, key)
            try:
                return cls[key]
            except KeyError:
                # Sorted so the message is stable between runs.
                names = [member.name.lower() for member in cls]
                names += sorted(alias.lower() for alias in _VERBOSITY_ALIASES)
                valid = ", ".join(names)
                raise ValueError(f"Invalid verbosity level: '{value}'. Valid levels: {valid}") from None
        # Unsupported input types fall back to the quietest level.
        return cls.LOW
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class Verbosity:
    """
    A class to manage verbosity levels, logging, progress and output.

    All state lives on the class itself; the methods are classmethods and
    no instance is required.

    Verbosity tiers:
    - LOW (0): Essential output only - no verbose flag set
    - DEFAULT (1): Standard verbose - --verbose or --verbose default
    - HIGH (2): Detailed verbose - --verbose high
    """

    level = VerbosityLevel.LOW

    # Class-level defaults so write()/progress()/fini() are safe to call
    # before init() has run.
    silent = False
    logfile = None

    PROGRESS = None
    pbar = None

    log = None

    @classmethod
    def init(cls, logfile=None, silent: bool = False, verbose: Union[bool, str, int] = False, progress=None):
        """
        Configure output handling.

        Args:
            logfile: Path of a log file to (over)write, or None for no log file.
            silent: If True, write() and progress() produce no output at all.
            verbose: Desired level; anything VerbosityLevel.from_value accepts.
            progress: Truthy to show a progress bar instead of printing to stdout.

        Raises:
            ValueError: If ``verbose`` is not a valid level.
        """
        # Validate before touching any state so a bad level changes nothing.
        level = VerbosityLevel.from_value(verbose)
        # Release a handle left over from an earlier init() call.
        if cls.logfile:
            cls.logfile.close()
        cls.silent = silent
        cls.level = level
        cls.logfile = open(logfile, "w", encoding="utf-8") if logfile else None
        cls.PROGRESS = progress

    @classmethod
    def fini(cls):
        """
        Close the progress bar and the log file, if any, and forget them.

        Resetting both to None lets a later init()/progress() cycle in the
        same process start with a fresh bar and file.
        """
        if cls.pbar is not None:
            cls.pbar.close()
            cls.pbar = None
        if cls.logfile:
            cls.logfile.close()
            cls.logfile = None

    @classmethod
    def write(cls, verbose: Union[bool, str, int, None] = None, content: Union[str, list] = None):
        """
        Writes log entries to stdout or logfile based on verbosity level and progress-bar status.

        Lines that pass the verbosity filter go to the log file when one is
        open, and to stdout only when no progress bar is active.

        Args:
            verbose: The required verbosity level for this message; only
                used when ``content`` is a plain string:
                - None: Always printed (essential output)
                - False/0/"low": Printed at LOW level and above
                - True/1/"default": Printed at DEFAULT level and above
                - 2/"high": Printed at HIGH level only
            content: The message string, or a list of dicts with the keys
                'verbose' and 'content'. None is ignored.
        """
        if cls.silent or content is None:
            return
        if isinstance(content, str):
            content = [{"verbose": verbose, "content": content}]
        logline = cls.filter_verbosity(content)
        if not logline:
            return
        if cls.logfile:
            cls.logfile.write(logline + "\n")
            cls.logfile.flush()
        if not cls.PROGRESS:
            print(logline)

    @classmethod
    def progress(cls, progress: int, maxval: int = None):
        """
        Updates the progress bar.

        Does nothing when silent or when no progress bar was requested.

        - bar is initialized if calling with progress=0
        - bar is updated if calling with progress > 0

        Args:
            progress: 0 to create the bar, a positive step to advance it.
            maxval: Total passed to the bar when it is created.
        """
        if cls.silent or not cls.PROGRESS:
            return
        if cls.pbar is None and progress == 0:
            cls.pbar = Progressbar(
                unit=" snapshot",
                desc="download file".ljust(15),
                total=maxval,
                ascii="░▒█",
                bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
            )
        if cls.pbar is not None and progress is not None and progress > 0:
            cls.pbar.update(progress)

    @classmethod
    def filter_verbosity(cls, message: list):
        """
        Removes messages from the list that do not match the verbosity level.

        Messages are kept if:
        - verbose is None or missing (always print - essential output)
        - The message's required level <= configured level

        Args:
            message: List of dicts with a 'content' key and an optional
                'verbose' key.

        Returns:
            A string containing the kept messages, joined by newlines.

        Raises:
            ValueError: If a message carries an invalid verbosity value.
        """
        filtered_message = []
        for msg in message:
            msg_verbose = msg.get("verbose", None)
            if msg_verbose is None:
                filtered_message.append(msg["content"])
                continue
            # Normalise bool/str/int to a level before comparing.
            if VerbosityLevel.from_value(msg_verbose) <= cls.level:
                filtered_message.append(msg["content"])
        return "\n".join(filtered_message)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class Progressbar(Verbosity):
    """
    Thin wrapper around a tqdm bar that honours the ``silent`` setting
    inherited from Verbosity.
    """

    def __init__(
        self,
        unit: str,
        desc: str,
        unit_scale: bool = False,
        total: int = None,
        ascii: str = None,
        bar_format: str = None,
    ):
        """
        Store the bar settings and create the tqdm bar unless silent.

        The arguments are forwarded unchanged to ``tqdm``.
        """
        # Always define the instance attribute: otherwise ``self.pbar``
        # falls through to the inherited class attribute Verbosity.pbar,
        # which can be this very wrapper and would make close() recurse.
        self.pbar = None
        self.unit = unit
        self.desc = desc
        self.unit_scale = unit_scale
        self.total = total
        self.ascii = ascii
        self.bar_format = bar_format
        if not getattr(self, "silent", False):
            self.pbar = tqdm(
                unit=self.unit,
                desc=self.desc,
                unit_scale=self.unit_scale,
                total=self.total,
                ascii=self.ascii,
                bar_format=self.bar_format,
            )

    def update(self, progress: int):
        """
        Updates the progress bar with the given progress value.

        No-op when silent or when no bar was created.
        """
        if getattr(self, "silent", False) or self.pbar is None:
            return
        self.pbar.update(progress)
        self.pbar.refresh()

    def close(self):
        """
        Close the progress bar.
        """
        if self.pbar is not None:
            self.pbar.close()
|
|
@@ -81,7 +81,7 @@ class DownloadArchive:
|
|
|
81
81
|
sc (SnapshotCollection): The snapshot collection being processed.
|
|
82
82
|
"""
|
|
83
83
|
|
|
84
|
-
def __init__(self, mode: str, output: str, retry: int, no_redirect: bool, delay: int, workers: int):
|
|
84
|
+
def __init__(self, mode: str, output: str, retry: int, no_redirect: bool, delay: int, wait: int, workers: int):
|
|
85
85
|
"""
|
|
86
86
|
Initialize the download manager with configuration options.
|
|
87
87
|
|
|
@@ -98,8 +98,8 @@ class DownloadArchive:
|
|
|
98
98
|
self.retry = retry
|
|
99
99
|
self.no_redirect = no_redirect
|
|
100
100
|
self.delay = delay
|
|
101
|
+
self.wait = wait
|
|
101
102
|
self.workers = workers
|
|
102
|
-
self.no_redirect = no_redirect
|
|
103
103
|
self.sc = None
|
|
104
104
|
|
|
105
105
|
def run(self, SnapshotCollection: SnapshotCollection):
|
|
@@ -156,9 +156,11 @@ class DownloadArchive:
|
|
|
156
156
|
while worker.attempt <= retry_max_attempt: # retry as given by user
|
|
157
157
|
worker.message.store(
|
|
158
158
|
verbose=True,
|
|
159
|
-
content=
|
|
160
|
-
|
|
161
|
-
|
|
159
|
+
content=(
|
|
160
|
+
f"\n-----> Worker: {worker.id}"
|
|
161
|
+
f" - Attempt: [{worker.attempt}/{retry_max_attempt}]"
|
|
162
|
+
f" Snapshot ID: [{worker.snapshot.counter}/{self.sc._snapshot_total}]"
|
|
163
|
+
),
|
|
162
164
|
)
|
|
163
165
|
download_attempt = 1
|
|
164
166
|
download_max_attempt = 3
|
|
@@ -208,7 +210,7 @@ class DownloadArchive:
|
|
|
208
210
|
f"\n-----> Worker: {worker.id}"
|
|
209
211
|
f" - Attempt: [{worker.attempt}/{retry_max_attempt}]"
|
|
210
212
|
f" Snapshot ID: [{worker.snapshot.counter}/{self.sc._snapshot_total}]"
|
|
211
|
-
f" - {e.__class__.__name__} - renewing connection in
|
|
213
|
+
f" - {e.__class__.__name__} - renewing connection in {self.wait * download_attempt} seconds..."
|
|
212
214
|
),
|
|
213
215
|
)
|
|
214
216
|
vb.write(
|
|
@@ -216,10 +218,10 @@ class DownloadArchive:
|
|
|
216
218
|
content=(
|
|
217
219
|
f"Worker: {worker.id}"
|
|
218
220
|
f" - Snapshot {worker.snapshot.counter}/{self.sc._snapshot_total}"
|
|
219
|
-
f" - renewing connection in
|
|
221
|
+
f" - renewing connection in {self.wait * download_attempt} seconds..."
|
|
220
222
|
),
|
|
221
223
|
)
|
|
222
|
-
time.sleep(
|
|
224
|
+
time.sleep(self.wait * download_attempt)
|
|
223
225
|
worker.refresh_connection()
|
|
224
226
|
continue
|
|
225
227
|
else:
|
|
@@ -244,9 +246,13 @@ class DownloadArchive:
|
|
|
244
246
|
|
|
245
247
|
# depends on user - retries after timeout or proceed to next snapshot
|
|
246
248
|
if self.retry > 0:
|
|
247
|
-
worker.message.store(
|
|
249
|
+
worker.message.store(
|
|
250
|
+
verbose=True,
|
|
251
|
+
result="FAILED",
|
|
252
|
+
content=f"retry timeout: {self.wait * worker.attempt} seconds...",
|
|
253
|
+
)
|
|
248
254
|
worker.message.write()
|
|
249
|
-
time.sleep(
|
|
255
|
+
time.sleep(self.wait * worker.attempt)
|
|
250
256
|
else:
|
|
251
257
|
worker.message.store(verbose=None, result="FAILED", content="no attempt left")
|
|
252
258
|
worker.message.write()
|
|
@@ -59,7 +59,15 @@ class CDXquery:
|
|
|
59
59
|
)
|
|
60
60
|
filter_filetype = f"&filter=original:.*\\.({'|'.join(self.filter_filetype)})$" if self.filter_filetype else ""
|
|
61
61
|
|
|
62
|
-
return
|
|
62
|
+
return (
|
|
63
|
+
f"https://web.archive.org/cdx/search/cdx?"
|
|
64
|
+
f"output=json"
|
|
65
|
+
f"&url={cdx_url}{period}"
|
|
66
|
+
f"&fl=timestamp,digest,mimetype,statuscode,original"
|
|
67
|
+
f"{limit}"
|
|
68
|
+
f"{filter_filetype}"
|
|
69
|
+
f"{filter_statuscode}"
|
|
70
|
+
)
|
|
63
71
|
|
|
64
72
|
|
|
65
73
|
class File:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.2
|
|
3
|
+
Version: 4.1.4
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
+
Requires-Dist: ruff
|
|
32
33
|
Requires-Dist: SQLAlchemy==2.0.43
|
|
33
34
|
Requires-Dist: requests==2.32.3
|
|
34
35
|
Requires-Dist: tqdm==4.67.1
|
|
@@ -216,8 +217,13 @@ Parameters will change the download behavior for snapshots.
|
|
|
216
217
|
- **`-m`**, **`--metadata`**<br>
|
|
217
218
|
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
218
219
|
|
|
219
|
-
- **`--verbose`**:
|
|
220
|
-
|
|
220
|
+
- **`-v`**, **`--verbose`** `[level]`:<br>
|
|
221
|
+
Set verbosity level. Available levels:
|
|
222
|
+
- `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
|
|
223
|
+
- `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
|
|
224
|
+
- `high` (or `debug`, `detailed`, `max`): Detailed verbose output
|
|
225
|
+
|
|
226
|
+
Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
|
|
221
227
|
|
|
222
228
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
223
229
|
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
@@ -237,6 +243,9 @@ Parameters will change the download behavior for snapshots.
|
|
|
237
243
|
- **`--delay`** `<seconds>`:<br>
|
|
238
244
|
Delay between download requests in seconds. Default is no delay (0).
|
|
239
245
|
|
|
246
|
+
- **`--wait`** `<seconds>`:<br>
|
|
247
|
+
Seconds to wait before renewing connection after HTTP errors or snapshot download errors. Default is 15 seconds.
|
|
248
|
+
|
|
240
249
|
#### Job Handling:
|
|
241
250
|
|
|
242
251
|
- **`--reset`**:
|
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
from tqdm import tqdm
|
|
2
|
-
from typing import Union
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class Verbosity:
|
|
6
|
-
"""
|
|
7
|
-
A class to manage verbosity levels, logging, progress and output.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
verbose = False
|
|
11
|
-
|
|
12
|
-
PROGRESS = None
|
|
13
|
-
pbar = None
|
|
14
|
-
|
|
15
|
-
log = None
|
|
16
|
-
|
|
17
|
-
@classmethod
|
|
18
|
-
def init(cls, logfile=None, silent: bool = False, verbose: bool = False, progress=None):
|
|
19
|
-
cls.silent = silent
|
|
20
|
-
cls.verbose = verbose
|
|
21
|
-
cls.logfile = open(logfile, "w", encoding="utf-8") if logfile else None
|
|
22
|
-
cls.PROGRESS = progress
|
|
23
|
-
|
|
24
|
-
@classmethod
|
|
25
|
-
def fini(cls):
|
|
26
|
-
if cls.PROGRESS:
|
|
27
|
-
if cls.pbar is not None:
|
|
28
|
-
cls.pbar.close()
|
|
29
|
-
if cls.logfile:
|
|
30
|
-
cls.logfile.close()
|
|
31
|
-
|
|
32
|
-
@classmethod
|
|
33
|
-
def write(cls, verbose: bool = None, content: Union[str, list] = None):
|
|
34
|
-
"""
|
|
35
|
-
Writes log entries to stdout or logfile based on verbosity level and progress-bar status.
|
|
36
|
-
|
|
37
|
-
Determines if the message should be printed based on verbosity level.
|
|
38
|
-
- If None, the message is always printed.
|
|
39
|
-
|
|
40
|
-
Content is a list and is filtered and concatenated to a single block of loglines.
|
|
41
|
-
It should contain dictionaries with keys:
|
|
42
|
-
- 'verbose': The verbosity level of the message (True/False).
|
|
43
|
-
- 'content': The actual message to be logged.
|
|
44
|
-
"""
|
|
45
|
-
if not cls.silent:
|
|
46
|
-
if isinstance(content, str):
|
|
47
|
-
content = [{"verbose": verbose, "content": content}]
|
|
48
|
-
logline = cls.filter_verbosity(content)
|
|
49
|
-
if logline:
|
|
50
|
-
if cls.logfile:
|
|
51
|
-
cls.logfile.write(logline + "\n")
|
|
52
|
-
cls.logfile.flush()
|
|
53
|
-
if not cls.PROGRESS:
|
|
54
|
-
print(logline)
|
|
55
|
-
|
|
56
|
-
@classmethod
|
|
57
|
-
def progress(cls, progress: int, maxval: int = None):
|
|
58
|
-
"""
|
|
59
|
-
Updates the progress bar.
|
|
60
|
-
|
|
61
|
-
- bar is initialized if calling with progress=0
|
|
62
|
-
- bar is updated if calling with progress > 0
|
|
63
|
-
|
|
64
|
-
"""
|
|
65
|
-
if not cls.silent:
|
|
66
|
-
if cls.PROGRESS:
|
|
67
|
-
if cls.pbar is None and progress == 0:
|
|
68
|
-
cls.pbar = Progressbar(
|
|
69
|
-
unit=" snapshot",
|
|
70
|
-
desc="download file".ljust(15),
|
|
71
|
-
total=maxval,
|
|
72
|
-
ascii="░▒█",
|
|
73
|
-
bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
|
|
74
|
-
)
|
|
75
|
-
if cls.pbar is not None and progress is not None and progress > 0:
|
|
76
|
-
cls.pbar.update(progress)
|
|
77
|
-
|
|
78
|
-
@classmethod
|
|
79
|
-
def filter_verbosity(cls, message: list):
|
|
80
|
-
"""
|
|
81
|
-
Removes messages from the list that do not match the verbosity level.
|
|
82
|
-
|
|
83
|
-
- True if message is verbose None (print always)
|
|
84
|
-
- True if message has same verbosity as configured
|
|
85
|
-
|
|
86
|
-
Returns a string containing the filtered messages, joined by newlines.
|
|
87
|
-
"""
|
|
88
|
-
filtered_message = []
|
|
89
|
-
for msg in message:
|
|
90
|
-
verbose = msg.get("verbose", None)
|
|
91
|
-
if verbose is None or verbose == cls.verbose:
|
|
92
|
-
filtered_message.append(msg["content"])
|
|
93
|
-
return "\n".join(filtered_message)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class Progressbar(Verbosity):
|
|
97
|
-
def __init__(
|
|
98
|
-
self,
|
|
99
|
-
unit: str,
|
|
100
|
-
desc: str,
|
|
101
|
-
unit_scale: bool = False,
|
|
102
|
-
total: int = None,
|
|
103
|
-
ascii: str = None,
|
|
104
|
-
bar_format: str = None,
|
|
105
|
-
):
|
|
106
|
-
if not super().silent:
|
|
107
|
-
self.unit = unit
|
|
108
|
-
self.desc = desc
|
|
109
|
-
self.unit_scale = unit_scale
|
|
110
|
-
self.total = total
|
|
111
|
-
self.ascii = ascii
|
|
112
|
-
self.bar_format = bar_format
|
|
113
|
-
self.pbar = tqdm(
|
|
114
|
-
unit=self.unit,
|
|
115
|
-
desc=self.desc,
|
|
116
|
-
unit_scale=self.unit_scale,
|
|
117
|
-
total=self.total,
|
|
118
|
-
ascii=self.ascii,
|
|
119
|
-
bar_format=self.bar_format,
|
|
120
|
-
)
|
|
121
|
-
|
|
122
|
-
def update(self, progress: int):
|
|
123
|
-
"""
|
|
124
|
-
Updates the progress bar with the given progress value.
|
|
125
|
-
"""
|
|
126
|
-
if not super().silent:
|
|
127
|
-
if self.pbar is not None:
|
|
128
|
-
self.pbar.update(progress)
|
|
129
|
-
self.pbar.refresh()
|
|
130
|
-
|
|
131
|
-
def close(self):
|
|
132
|
-
"""
|
|
133
|
-
Close the progress bar.
|
|
134
|
-
"""
|
|
135
|
-
if self.pbar is not None:
|
|
136
|
-
self.pbar.close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|