pywaybackup 3.3.0__tar.gz → 3.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-3.3.0/pywaybackup.egg-info → pywaybackup-3.4.0}/PKG-INFO +45 -18
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/README.md +44 -16
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pyproject.toml +2 -2
- pywaybackup-3.4.0/pywaybackup/Arguments.py +57 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/Exception.py +25 -26
- pywaybackup-3.4.0/pywaybackup/PyWaybackup.py +235 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/SnapshotCollection.py +39 -24
- pywaybackup-3.4.0/pywaybackup/Verbosity.py +120 -0
- pywaybackup-3.4.0/pywaybackup/__init__.py +1 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/archive_download.py +43 -21
- pywaybackup-3.4.0/pywaybackup/main.py +13 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0/pywaybackup.egg-info}/PKG-INFO +45 -18
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/SOURCES.txt +1 -0
- pywaybackup-3.4.0/pywaybackup.egg-info/entry_points.txt +2 -0
- pywaybackup-3.3.0/pywaybackup/Arguments.py +0 -157
- pywaybackup-3.3.0/pywaybackup/Verbosity.py +0 -93
- pywaybackup-3.3.0/pywaybackup/__init__.py +0 -0
- pywaybackup-3.3.0/pywaybackup/main.py +0 -55
- pywaybackup-3.3.0/pywaybackup.egg-info/entry_points.txt +0 -2
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/LICENSE +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/Converter.py +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/Worker.py +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/archive_save.py +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/db.py +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/helper.py +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/requires.txt +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-3.3.0 → pywaybackup-3.4.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 3.3.0
|
|
3
|
+
Version: 3.4.0
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -35,7 +35,6 @@ Requires-Dist: requests==2.32.3
|
|
|
35
35
|
Requires-Dist: tqdm==4.67.1
|
|
36
36
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
37
37
|
Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
|
|
38
|
-
Dynamic: license-file
|
|
39
38
|
|
|
40
39
|
# python wayback machine downloader
|
|
41
40
|
|
|
@@ -77,12 +76,50 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
77
76
|
<br>
|
|
78
77
|
<br>
|
|
79
78
|
|
|
80
|
-
##
|
|
79
|
+
## import
|
|
80
|
+
|
|
81
|
+
You can import pywaybackup into your own scripts and run it. Args are the same as cli.
|
|
82
|
+
|
|
83
|
+
Additional args:
|
|
84
|
+
- `silent` (default True): If True, suppresses all output to the console.
|
|
85
|
+
- `debug` (default False): If True, writes errors to the error log file (`waybackup_error.log`).
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pywaybackup import PyWayBackup
|
|
89
|
+
|
|
90
|
+
backup = PyWayBackup(
|
|
91
|
+
url="https://example.com",
|
|
92
|
+
all=True,
|
|
93
|
+
start="20200101",
|
|
94
|
+
end="20201231",
|
|
95
|
+
silent=False,
|
|
96
|
+
debug=True,
|
|
97
|
+
log=True,
|
|
98
|
+
keep=True
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
backup.run()
|
|
102
|
+
backup_paths = backup.paths(rel=True)
|
|
103
|
+
print(backup_paths)
|
|
104
|
+
```
|
|
105
|
+
output:
|
|
106
|
+
```bash
|
|
107
|
+
{
|
|
108
|
+
'snapshots': 'output/example.com',
|
|
109
|
+
'cdxfile': 'output/waybackup_example.cdx',
|
|
110
|
+
'dbfile': 'output/waybackup_example.com.db',
|
|
111
|
+
'csvfile': 'output/waybackup_https.example.com.csv',
|
|
112
|
+
'log': 'output/waybackup_example.com.log',
|
|
113
|
+
'debug': 'output/waybackup_error.log'
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## cli
|
|
81
118
|
|
|
82
119
|
- `-h`, `--help`: Show the help message and exit.
|
|
83
120
|
- `-v`, `--version`: Show information about the tool and exit.
|
|
84
121
|
|
|
85
|
-
|
|
122
|
+
#### Required
|
|
86
123
|
|
|
87
124
|
- **`-u`**, **`--url`**:<br>
|
|
88
125
|
The URL of the web page to download. This argument is required.
|
|
@@ -107,8 +144,8 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
107
144
|
Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
|
|
108
145
|
|
|
109
146
|
- **Range Selection:**<br>
|
|
110
|
-
Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range
|
|
111
|
-
(year 2019, year
|
|
147
|
+
Specify the range in years, or a specific timestamp for start, end, or both. If you specify the `range`, the `start` and `end` will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can give just a year, or increase specificity by adding further timestamp components from left to right.<br>
|
|
148
|
+
(year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
|
|
112
149
|
|
|
113
150
|
- **`-r`**, **`--range`**:<br>
|
|
114
151
|
Specify the range in years for which to search and download snapshots.
|
|
@@ -144,9 +181,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
144
181
|
- **`--verbose`**:<br>
|
|
145
182
|
Increase output verbosity.
|
|
146
183
|
|
|
147
|
-
<!-- - **`--verbosity`** `<level>`:<br>
|
|
148
|
-
Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
|
|
149
|
-
|
|
150
184
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
151
185
|
Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
|
|
152
186
|
|
|
@@ -165,9 +199,6 @@ Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
|
|
|
165
199
|
- **`--delay`** `<seconds>`:<br>
|
|
166
200
|
Specifies delay between download requests in seconds. Default is no delay (0).
|
|
167
201
|
|
|
168
|
-
<!-- - **`--convert-links`**:<br>
|
|
169
|
-
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
|
|
170
|
-
|
|
171
202
|
#### Job Handling:
|
|
172
203
|
|
|
173
204
|
- **`--reset`**:
|
|
@@ -265,9 +296,7 @@ your/path/waybackup_snapshots/
|
|
|
265
296
|
|
|
266
297
|
### CSV
|
|
267
298
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
For download queries:
|
|
299
|
+
The CSV contains a snapshot per row:
|
|
271
300
|
|
|
272
301
|
```
|
|
273
302
|
[
|
|
@@ -313,5 +342,3 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
|
|
|
313
342
|
|
|
314
343
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
315
344
|
Feel free to give suggestions and report issues. Project is still far from being perfect.
|
|
316
|
-
|
|
317
|
-
> Please PR from dev into dev.
|
|
@@ -38,12 +38,50 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
38
38
|
<br>
|
|
39
39
|
<br>
|
|
40
40
|
|
|
41
|
-
##
|
|
41
|
+
## import
|
|
42
|
+
|
|
43
|
+
You can import pywaybackup into your own scripts and run it. Args are the same as cli.
|
|
44
|
+
|
|
45
|
+
Additional args:
|
|
46
|
+
- `silent` (default True): If True, suppresses all output to the console.
|
|
47
|
+
- `debug` (default False): If True, writes errors to the error log file (`waybackup_error.log`).
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pywaybackup import PyWayBackup
|
|
51
|
+
|
|
52
|
+
backup = PyWayBackup(
|
|
53
|
+
url="https://example.com",
|
|
54
|
+
all=True,
|
|
55
|
+
start="20200101",
|
|
56
|
+
end="20201231",
|
|
57
|
+
silent=False,
|
|
58
|
+
debug=True,
|
|
59
|
+
log=True,
|
|
60
|
+
keep=True
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
backup.run()
|
|
64
|
+
backup_paths = backup.paths(rel=True)
|
|
65
|
+
print(backup_paths)
|
|
66
|
+
```
|
|
67
|
+
output:
|
|
68
|
+
```bash
|
|
69
|
+
{
|
|
70
|
+
'snapshots': 'output/example.com',
|
|
71
|
+
'cdxfile': 'output/waybackup_example.cdx',
|
|
72
|
+
'dbfile': 'output/waybackup_example.com.db',
|
|
73
|
+
'csvfile': 'output/waybackup_https.example.com.csv',
|
|
74
|
+
'log': 'output/waybackup_example.com.log',
|
|
75
|
+
'debug': 'output/waybackup_error.log'
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## cli
|
|
42
80
|
|
|
43
81
|
- `-h`, `--help`: Show the help message and exit.
|
|
44
82
|
- `-v`, `--version`: Show information about the tool and exit.
|
|
45
83
|
|
|
46
|
-
|
|
84
|
+
#### Required
|
|
47
85
|
|
|
48
86
|
- **`-u`**, **`--url`**:<br>
|
|
49
87
|
The URL of the web page to download. This argument is required.
|
|
@@ -68,8 +106,8 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
68
106
|
Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
|
|
69
107
|
|
|
70
108
|
- **Range Selection:**<br>
|
|
71
|
-
Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range
|
|
72
|
-
(year 2019, year
|
|
109
|
+
Specify the range in years, or a specific timestamp for start, end, or both. If you specify the `range`, the `start` and `end` will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can give just a year, or increase specificity by adding further timestamp components from left to right.<br>
|
|
110
|
+
(year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
|
|
73
111
|
|
|
74
112
|
- **`-r`**, **`--range`**:<br>
|
|
75
113
|
Specify the range in years for which to search and download snapshots.
|
|
@@ -105,9 +143,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
105
143
|
- **`--verbose`**:<br>
|
|
106
144
|
Increase output verbosity.
|
|
107
145
|
|
|
108
|
-
<!-- - **`--verbosity`** `<level>`:<br>
|
|
109
|
-
Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
|
|
110
|
-
|
|
111
146
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
112
147
|
Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
|
|
113
148
|
|
|
@@ -126,9 +161,6 @@ Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
|
|
|
126
161
|
- **`--delay`** `<seconds>`:<br>
|
|
127
162
|
Specifies delay between download requests in seconds. Default is no delay (0).
|
|
128
163
|
|
|
129
|
-
<!-- - **`--convert-links`**:<br>
|
|
130
|
-
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
|
|
131
|
-
|
|
132
164
|
#### Job Handling:
|
|
133
165
|
|
|
134
166
|
- **`--reset`**:
|
|
@@ -226,9 +258,7 @@ your/path/waybackup_snapshots/
|
|
|
226
258
|
|
|
227
259
|
### CSV
|
|
228
260
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
For download queries:
|
|
261
|
+
The CSV contains a snapshot per row:
|
|
232
262
|
|
|
233
263
|
```
|
|
234
264
|
[
|
|
@@ -273,6 +303,4 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
|
|
|
273
303
|
## Contributing
|
|
274
304
|
|
|
275
305
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
276
|
-
Feel free to give suggestions and report issues. Project is still far from being perfect.
|
|
277
|
-
|
|
278
|
-
> Please PR from dev into dev.
|
|
306
|
+
Feel free to give suggestions and report issues. Project is still far from being perfect.
|
|
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "pywaybackup"
|
|
10
|
-
version = "3.3.0"
|
|
10
|
+
version = "3.4.0"
|
|
11
11
|
description = "Query and download archive.org as simple as possible."
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
@@ -25,7 +25,7 @@ dependencies = [
|
|
|
25
25
|
]
|
|
26
26
|
|
|
27
27
|
[project.scripts]
|
|
28
|
-
waybackup = "pywaybackup.main:
|
|
28
|
+
waybackup = "pywaybackup.main:cli"
|
|
29
29
|
|
|
30
30
|
[project.urls]
|
|
31
31
|
homepage = "https://github.com/bitdruid/python-wayback-machine-downloader"
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import argparse
|
|
3
|
+
|
|
4
|
+
from argparse import RawTextHelpFormatter
|
|
5
|
+
|
|
6
|
+
from importlib.metadata import version
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Arguments:
    """Parse the ``waybackup`` command line.

    The parser is built and executed in the constructor. When the process was
    started without any arguments, ``--help`` is parsed instead, so argparse
    prints the help text and exits. The parsed values are exposed as a plain
    dictionary through :meth:`get_args`.
    """

    def __init__(self):
        parser = argparse.ArgumentParser(
            description=f"<<< python-wayback-machine-downloader v{self._tool_version()} >>>\nby @bitdruid -> https://github.com/bitdruid",
            formatter_class=RawTextHelpFormatter,
        )

        required = parser.add_argument_group("required (one exclusive)")
        required.add_argument("-u", "--url", type=str, metavar="", help="url (with subdir/subdomain) to download")
        exclusive_required = required.add_mutually_exclusive_group(required=True)
        exclusive_required.add_argument("-a", "--all", action="store_true", help="download snapshots of all timestamps")
        exclusive_required.add_argument("-l", "--last", action="store_true", help="download the last version of each file snapshot")
        exclusive_required.add_argument("-f", "--first", action="store_true", help="download the first version of each file snapshot")
        exclusive_required.add_argument("-s", "--save", action="store_true", help="save a page to the wayback machine")

        optional = parser.add_argument_group("optional query parameters")
        optional.add_argument("-e", "--explicit", action="store_true", help="search only for the explicit given url")
        optional.add_argument("-r", "--range", type=int, metavar="", help="range in years to search")
        optional.add_argument("--start", type=int, metavar="", help="start timestamp format: YYYYMMDDhhmmss")
        optional.add_argument("--end", type=int, metavar="", help="end timestamp format: YYYYMMDDhhmmss")
        # NOTE(review): a bare `--limit` (no value) yields the constant True, not an int -
        # confirm that the consumer of `limit` expects this.
        optional.add_argument("--limit", type=int, nargs="?", const=True, metavar="int", help="limit the number of snapshots to download")
        optional.add_argument("--filetype", type=str, metavar="", help="filetypes to download comma separated (js,css,...)")
        optional.add_argument("--statuscode", type=str, metavar="", help="statuscodes to download comma separated (200,404,...)")

        behavior = parser.add_argument_group("manipulate behavior")
        behavior.add_argument("-o", "--output", type=str, metavar="", help="output for all files - defaults to current directory")
        behavior.add_argument("-m", "--metadata", type=str, metavar="", help="change directory for db/cdx/csv/log files")
        behavior.add_argument("-v", "--verbose", action="store_true", help="overwritten by progress - gives detailed output")
        behavior.add_argument("--log", action="store_true", help="save a log file into the output folder")
        behavior.add_argument("--progress", action="store_true", help="show a progress bar")
        behavior.add_argument("--no-redirect", action="store_true", help="do not follow redirects by archive.org")
        behavior.add_argument("--retry", type=int, default=0, metavar="", help="retry failed downloads (opt tries as int, else infinite)")
        behavior.add_argument("--workers", type=int, default=1, metavar="", help="number of workers (simultaneous downloads)")
        behavior.add_argument("--delay", type=int, default=0, metavar="", help="delay between each download in seconds")

        special = parser.add_argument_group("special")
        special.add_argument("--reset", action="store_true", help="reset the job and ignore existing cdx/db/csv files")
        special.add_argument("--keep", action="store_true", help="keep all files after the job finished")

        # if no arguments are given, print help
        args = parser.parse_args(args=None if sys.argv[1:] else ["--help"])

        # Values that have no command line switch; the CLI always sets them this way.
        args.silent = False
        args.debug = True

        self.args = args

    @staticmethod
    def _tool_version() -> str:
        """Return the installed pywaybackup version, or "unknown" without package metadata."""
        from importlib.metadata import PackageNotFoundError

        try:
            return version("pywaybackup")
        except PackageNotFoundError:
            # e.g. running from a source checkout that was never installed
            return "unknown"

    def get_args(self) -> dict:
        """Returns the parsed arguments as a dictionary."""
        return vars(self.args)
|
|
@@ -14,8 +14,9 @@ class Exception:
|
|
|
14
14
|
command = None
|
|
15
15
|
|
|
16
16
|
@classmethod
|
|
17
|
-
def init(cls, output=None, command=None):
|
|
17
|
+
def init(cls, debug=None, output=None, command=None):
|
|
18
18
|
sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
|
|
19
|
+
cls.debug = debug
|
|
19
20
|
cls.output = output
|
|
20
21
|
cls.command = command
|
|
21
22
|
|
|
@@ -44,32 +45,30 @@ class Exception:
|
|
|
44
45
|
exception_message += "!-- Traceback is None\n"
|
|
45
46
|
exception_message += f"!-- Description: {e}\n-------------------------"
|
|
46
47
|
print(exception_message)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
48
|
+
if cls.debug:
|
|
49
|
+
print(f"Exception log: {cls.debug}")
|
|
50
|
+
if cls.new_debug: # new run, overwrite file
|
|
51
|
+
cls.new_debug = False
|
|
52
|
+
f = open(cls.debug, "w", encoding="utf-8")
|
|
53
|
+
f.write("-------------------------\n")
|
|
54
|
+
f.write(f"Version: {version('pywaybackup')}\n")
|
|
55
|
+
f.write("-------------------------\n")
|
|
56
|
+
f.write(f"Command: {cls.command}\n")
|
|
57
|
+
f.write("-------------------------\n\n")
|
|
58
|
+
else: # current run, append to file
|
|
59
|
+
f = open(cls.debug, "a", encoding="utf-8")
|
|
60
|
+
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
|
|
61
|
+
f.write(exception_message + "\n")
|
|
62
|
+
f.write("!-- Local Variables:\n")
|
|
63
|
+
for var_name, value in local_vars.items():
|
|
64
|
+
if var_name in ["status_message", "headers"]:
|
|
65
|
+
continue
|
|
66
|
+
value = cls.relativate_path(str(value))
|
|
67
|
+
value = value[:666] + " ... " if len(value) > 666 else value
|
|
68
|
+
f.write(f" -- {var_name} = {value}\n")
|
|
54
69
|
f.write("-------------------------\n")
|
|
55
|
-
f.write(
|
|
56
|
-
f.
|
|
57
|
-
f.write(f"Command: {cls.command}\n")
|
|
58
|
-
f.write("-------------------------\n\n")
|
|
59
|
-
else: # current run, append to file
|
|
60
|
-
f = open(debug_file, "a", encoding="utf-8")
|
|
61
|
-
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
|
|
62
|
-
f.write(exception_message + "\n")
|
|
63
|
-
f.write("!-- Local Variables:\n")
|
|
64
|
-
for var_name, value in local_vars.items():
|
|
65
|
-
if var_name in ["status_message", "headers"]:
|
|
66
|
-
continue
|
|
67
|
-
value = cls.relativate_path(str(value))
|
|
68
|
-
value = value[:666] + " ... " if len(value) > 666 else value
|
|
69
|
-
f.write(f" -- {var_name} = {value}\n")
|
|
70
|
-
f.write("-------------------------\n")
|
|
71
|
-
f.write(original_tb + "\n")
|
|
72
|
-
f.close()
|
|
70
|
+
f.write(original_tb + "\n")
|
|
71
|
+
f.close()
|
|
73
72
|
|
|
74
73
|
@classmethod
|
|
75
74
|
def relativate_path(cls, input_str: str) -> str:
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import signal
|
|
4
|
+
from pywaybackup.helper import url_split, sanitize_filename
|
|
5
|
+
|
|
6
|
+
import pywaybackup.archive_download as archive_download
|
|
7
|
+
import pywaybackup.archive_save as archive_save
|
|
8
|
+
from pywaybackup.db import Database as db
|
|
9
|
+
from pywaybackup.Verbosity import Verbosity as vb
|
|
10
|
+
from pywaybackup.Exception import Exception as ex
|
|
11
|
+
from pywaybackup.SnapshotCollection import SnapshotCollection as sc
|
|
12
|
+
|
|
13
|
+
class PyWayBackup:
    """
    PyWayBackup: A Python interface for downloading or saving archived web pages from the Wayback Machine (archive.org).

    Supported Modes (exactly one must be selected):
        - all    : Download all snapshots for a URL within a time range.
        - last   : Download the latest version of each file in the range.
        - first  : Download the earliest version of each file in the range.
        - save   : Save a snapshot to the Wayback Machine (beta).

    Args:
        url (str): Target URL to download snapshots for. (Required)
        all (bool): If True, downloads all snapshots within the range.
        last (bool): If True, downloads the last version of each file.
        first (bool): If True, downloads the first version of each file.
        save (bool): If True, saves a new snapshot to archive.org (beta).
        explicit (bool): Only use the explicitly provided URL without wildcards.
        range (str): Range to search; passed through unchanged to the query.
        start (str): Start timestamp (YYYYMMDDhhmmss).
        end (str): End timestamp (YYYYMMDDhhmmss).
        limit (int): Limit the number of snapshots queried from the CDX API.
        filetype (str): Comma-separated list of filetypes to include (e.g., 'jpg,css,js').
        statuscode (str): Comma-separated list of HTTP status codes to include (e.g., '200,301').
        output (str): Output path for downloaded files. Defaults to `./waybackup_snapshots`.
        metadata (str): Path to store metadata files (`cdx`, `db`, `csv`, etc.). Defaults to `output`.
        verbose (bool): Enable verbose logging.
        log (bool): Enable writing logs to a file.
        progress (bool): Show a progress bar.
        no_redirect (bool): Disable handling redirects.
        retry (int): Retry attempts for failed downloads.
        workers (int): Number of download workers (default: 1).
        delay (int): Delay between download requests in seconds.
        reset (bool): Reset job metadata (deletes `.cdx`/`.db`/`.csv` files).
        keep (bool): Retain all job metadata after completion.
        silent (bool): Suppress all output (for programmatic use).
        debug (bool): If True, exceptions are written to `waybackup_error.log` in the metadata directory.
        **kwargs: Catch-all for future expansion or external integration (currently unused).

    Raises:
        ValueError: If no URL is given or if not exactly one mode is selected.

    Methods:
        run(): Executes the full download or save operation based on initialized parameters.
        paths(rel): Returns the existing files/directories that belong to the job.

    Example:
        >>> from pywaybackup import PyWayBackup
        >>> backup = PyWayBackup(url="https://example.com", all=True, start="20200101", end="20201231")
        >>> backup.run()
        >>> backup_paths = backup.paths(rel=True)
        >>> print(backup_paths)
    """

    def __init__(self, url: str = None, all: bool = False, last: bool = False, first: bool = False,
                 save: bool = False, explicit: bool = False, range: str = None, start: str = None,
                 end: str = None, limit: int = None, filetype: str = None, statuscode: str = None,
                 output: str = None, metadata: str = None, verbose: bool = False, log: bool = False,
                 progress: bool = False, no_redirect: bool = False, retry: int = 0, workers: int = 1,
                 delay: int = 0, reset: bool = False, keep: bool = False,
                 silent: bool = True, debug: bool = False, **kwargs: dict):

        # restrictions
        # url must be given
        # all, last, first, save are mutually exclusive
        if not url:
            raise ValueError("URL must be provided")
        if sum([all, last, first, save]) != 1:
            raise ValueError("Exactly one of --all, --last, --first, or --save is allowed")

        self.url = url
        self.all = all
        self.last = last
        self.first = first
        self.save = save
        self.explicit = explicit
        self.range = range
        self.start = start
        self.end = end
        self.limit = limit
        self.filetype = filetype
        self.statuscode = statuscode
        self.output = output
        self.metadata = metadata
        self.verbose = verbose
        self.log = log
        self.progress = progress
        self.no_redirect = no_redirect
        self.retry = retry
        self.workers = workers
        self.delay = delay
        self.reset = reset
        self.keep = keep

        self.silent = silent
        self.debug = debug

        # Built from the url, the mode flags and the query options (in that order);
        # handed to the database in run(). It is computed before _init() normalizes
        # filetype/statuscode, i.e. from the raw argument values.
        self.query_identifier = (
            str(self.url) +
            # required_args
            str(self.all) + str(self.last) + str(self.first) + str(self.save) +
            # optional_args
            str(self.explicit) + str(self.range) + str(self.start) + str(self.end) +
            str(self.limit) + str(self.filetype) + str(self.statuscode)
        )

        # if sys.argv is empty, we assume this is being run as a module
        if not sys.argv[1:]:
            self.command = "pywaybackup_module"
        else:
            # otherwise, we take the command line arguments
            self.command = ' '.join(sys.argv[1:])

        self._init()

    @staticmethod
    def _split_list(raw: str) -> list:
        """Split a comma separated string into lower-cased, stripped, non-empty tokens."""
        tokens = (token.lower().strip() for token in raw.split(","))
        # drop empty tokens so that input like "js,css," does not create an empty filter entry
        return [token for token in tokens if token]

    @staticmethod
    def _remove_file(path: str) -> None:
        """Delete `path` if it is an existing regular file."""
        if os.path.isfile(path):
            os.remove(path)

    def _init(self):
        """Derive directories, mode, filters and metadata file paths from the raw arguments."""
        self.domain, self.subdir, self.filename = url_split(self.url)

        if self.output is None:
            self.output = os.path.join(os.getcwd(), "waybackup_snapshots")
        if self.metadata is None:
            self.metadata = self.output
        # directories are only created for download modes, not in save mode
        if not self.save:
            os.makedirs(self.output, exist_ok=True)
            os.makedirs(self.metadata, exist_ok=True)

        # exactly one flag is set (validated in __init__)
        if self.all:
            self.mode = "all"
        elif self.last:
            self.mode = "last"
        elif self.first:
            self.mode = "first"
        elif self.save:
            self.mode = "save"

        if self.filetype:
            self.filetype = self._split_list(self.filetype)
        if self.statuscode:
            self.statuscode = self._split_list(self.statuscode)

        base_path = self.metadata
        base_name = f"waybackup_{sanitize_filename(self.url)}"
        self.cdxfile = os.path.join(base_path, f"{base_name}.cdx")
        self.dbfile = os.path.join(base_path, f"{base_name}.db")
        self.csvfile = os.path.join(base_path, f"{base_name}.csv")
        # from here on `log` and `debug` hold a file path (or None) instead of the boolean flag
        self.log = os.path.join(base_path, f"{base_name}.log") if self.log else None
        self.debug = os.path.join(base_path, "waybackup_error.log") if self.debug else None

        if self.reset:
            self._remove_file(self.cdxfile)
            self._remove_file(self.dbfile)
            self._remove_file(self.csvfile)

    def paths(self, rel: bool = False) -> dict:
        """
        Return a dictionary of existing file paths associated to the backup process:
        {'snapshots':, 'cdxfile':, 'dbfile':, 'csvfile':, 'log':, 'debug':}

        Parameters:
            rel (bool): If True, return paths relative to the current working directory;
                otherwise, return the paths as stored on the instance.

        Returns:
            dict: Mapping of file types to their corresponding paths, including only entries
            that are set and exist on disk.
        """
        files = {
            "snapshots": os.path.join(self.output, self.domain),
            "cdxfile": self.cdxfile,
            "dbfile": self.dbfile,
            "csvfile": self.csvfile,
            "log": self.log,
            "debug": self.debug
        }
        return {
            key: (os.path.relpath(path) if rel else path)
            for key, path in files.items()
            if path and os.path.exists(path)
        }

    def run(self):
        """Run the PyWayBackup process with the given configuration.

        In save mode the page is saved and the process exits. Otherwise the
        snapshot list is queried and downloaded; afterwards the CSV is written
        and, unless `keep` is set (it is forced on after an interrupt or an
        error), the db and cdx files are removed.
        """
        ex.init(self.debug, self.output, self.command)
        vb.init(self.silent, self.verbose, self.progress, self.log)

        if self.save:
            archive_save.save_page(self.url)
            os._exit(1)

        db.init(self.dbfile, self.query_identifier)
        sc.init(self.mode)

        archive_download.startup()

        try:
            archive_download.query_list(
                self.csvfile,
                self.cdxfile,
                self.range,
                self.limit,
                self.start,
                self.end,
                self.explicit,
                self.filetype,
                self.statuscode,
                self.domain,
                self.subdir,
                self.filename,
            )
            archive_download.download_list(self.output, self.retry, self.no_redirect, self.delay, self.workers)
        except KeyboardInterrupt:
            print("\nInterrupted by user\n")
            # keep db/cdx after an interrupted run
            self.keep = True
            # ignore further Ctrl+C while cleaning up
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        except Exception as e:
            # keep db/cdx after a failed run
            self.keep = True
            ex.exception(message="", e=e)

        finally:
            sc.csv_create(self.csvfile)
            sc.fini()
            vb.fini()

            if not self.keep:
                self._remove_file(self.dbfile)
                self._remove_file(self.cdxfile)

            # NOTE(review): os._exit() ends the interpreter immediately with status 1, so
            # statements after run() in a calling script (e.g. paths()) are never reached.
            # Kept as in the original - confirm whether this is intended for library use
            # and whether the exact nesting of this call matches the original file.
            os._exit(1)
|