pywaybackup 3.4.1__tar.gz → 4.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-3.4.1/pywaybackup.egg-info → pywaybackup-4.1.0}/PKG-INFO +73 -43
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/README.md +71 -40
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pyproject.toml +2 -3
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Arguments.py +3 -2
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Exception.py +7 -7
- pywaybackup-4.1.0/pywaybackup/PyWayBackup.py +499 -0
- pywaybackup-4.1.0/pywaybackup/Snapshot.py +206 -0
- pywaybackup-4.1.0/pywaybackup/SnapshotCollection.py +340 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Verbosity.py +7 -7
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Worker.py +11 -74
- pywaybackup-4.1.0/pywaybackup/archive_download.py +389 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/archive_save.py +7 -22
- pywaybackup-4.1.0/pywaybackup/db.py +197 -0
- pywaybackup-4.1.0/pywaybackup/files.py +191 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/main.py +2 -2
- {pywaybackup-3.4.1 → pywaybackup-4.1.0/pywaybackup.egg-info}/PKG-INFO +73 -43
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/SOURCES.txt +2 -1
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/requires.txt +1 -2
- pywaybackup-3.4.1/pywaybackup/Converter.py +0 -181
- pywaybackup-3.4.1/pywaybackup/PyWayBackup.py +0 -234
- pywaybackup-3.4.1/pywaybackup/SnapshotCollection.py +0 -416
- pywaybackup-3.4.1/pywaybackup/archive_download.py +0 -358
- pywaybackup-3.4.1/pywaybackup/db.py +0 -94
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/LICENSE +0 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/__init__.py +0 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/helper.py +0 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-3.4.1 → pywaybackup-4.1.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 3.4.1
|
|
3
|
+
Version: 4.1.0
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,8 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
-
Requires-Dist:
|
|
33
|
-
Requires-Dist: pysqlite-binary; sys_platform == "win32"
|
|
32
|
+
Requires-Dist: SQLAlchemy==2.0.43
|
|
34
33
|
Requires-Dist: requests==2.32.3
|
|
35
34
|
Requires-Dist: tqdm==4.67.1
|
|
36
35
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
@@ -49,6 +48,17 @@ Internet-archive is a nice source for several OSINT-information. This tool is a
|
|
|
49
48
|
|
|
50
49
|
This tool allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
|
|
51
50
|
|
|
51
|
+
# Content
|
|
52
|
+
|
|
53
|
+
➡️ [Installation](#installation) <br>
|
|
54
|
+
➡️ [notes / issues / hints](#notes--issues--hints) <br>
|
|
55
|
+
➡️ [import](#import) <br>
|
|
56
|
+
➡️ [cli](#cli) <br>
|
|
57
|
+
➡️ [Usage](#usage) <br>
|
|
58
|
+
➡️ [Examples](#examples) <br>
|
|
59
|
+
➡️ [Output](#output) <br>
|
|
60
|
+
➡️ [Contributing](#contributing) <br>
|
|
61
|
+
|
|
52
62
|
## Installation
|
|
53
63
|
|
|
54
64
|
### Pip
|
|
@@ -81,8 +91,14 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
81
91
|
You can import pywaybackup into your own scripts and run it. Args are the same as cli.
|
|
82
92
|
|
|
83
93
|
Additional args:
|
|
84
|
-
- `silent` (default
|
|
85
|
-
- `debug` (default
|
|
94
|
+
- `silent` (default False): If True, suppresses all output to the console.
|
|
95
|
+
- `debug` (default True): If False, disables writing errors to the error log file.
|
|
96
|
+
|
|
97
|
+
Use:
|
|
98
|
+
- `run()`
|
|
99
|
+
- `status()`
|
|
100
|
+
- `paths()`
|
|
101
|
+
- `stop()`
|
|
86
102
|
|
|
87
103
|
```python
|
|
88
104
|
from pywaybackup import PyWayBackup
|
|
@@ -114,6 +130,29 @@ output:
|
|
|
114
130
|
}
|
|
115
131
|
```
|
|
116
132
|
|
|
133
|
+
... or run it asynchronously and print the current status or stop it whenever needed.
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import time
|
|
137
|
+
from pywaybackup import PyWayBackup
|
|
138
|
+
|
|
139
|
+
backup = PyWayBackup( ... )
|
|
140
|
+
backup.run(daemon=True)
|
|
141
|
+
print(backup.status())
|
|
142
|
+
time.sleep(10)
|
|
143
|
+
print(backup.status())
|
|
144
|
+
backup.stop()
|
|
145
|
+
```
|
|
146
|
+
output:
|
|
147
|
+
```bash
|
|
148
|
+
{
|
|
149
|
+
'task': 'downloading snapshots',
|
|
150
|
+
'current': 15,
|
|
151
|
+
'total': 84,
|
|
152
|
+
'progress': '18%'
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
117
156
|
## cli
|
|
118
157
|
|
|
119
158
|
- `-h`, `--help`: Show the help message and exit.
|
|
@@ -127,25 +166,24 @@ output:
|
|
|
127
166
|
#### Mode Selection (Choose One)
|
|
128
167
|
|
|
129
168
|
- **`-a`**, **`--all`**:<br>
|
|
130
|
-
|
|
169
|
+
All timestamps. Gives one folder per timestamp.
|
|
131
170
|
- **`-l`**, **`--last`**:<br>
|
|
132
|
-
|
|
171
|
+
Last Version. Gives one folder containing the last version of each file of specified `--range`.
|
|
133
172
|
- **`-f`**, **`--first`**:<br>
|
|
134
|
-
|
|
135
|
-
- **`-s`**, **`--save`**:<br>
|
|
136
|
-
Save a page to the Wayback Machine. (beta)
|
|
173
|
+
First Version. Gives one folder containing the first version of each file of specified `--range`.
|
|
137
174
|
|
|
138
175
|
#### Optional query parameters
|
|
139
176
|
|
|
177
|
+
Parameters for archive.org CDX query. No effect on snapshot download itself.
|
|
178
|
+
|
|
140
179
|
- **`-e`**, **`--explicit`**:<br>
|
|
141
|
-
Only
|
|
180
|
+
Only the explicit URL. No wildcard subdomains or paths. For example get: root-only (`https://example.com`) or specific file (`login.html`, `?query=this`).
|
|
142
181
|
|
|
143
182
|
- **`--limit`** `<count>`:<br>
|
|
144
|
-
Limits the
|
|
183
|
+
Limits the snapshots fetched from archive.org CDX. (Will have no effect on existing CDX files)
|
|
145
184
|
|
|
146
185
|
- **Range Selection:**<br>
|
|
147
|
-
|
|
148
|
-
(year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
|
|
186
|
+
Set the query range in years (`range`) or a timestamp (`start` and/or `end`). If `range` is set, `start` and `end` are ignored. Format for timestamps: YYYYMMDDhhmmss. Timestamps can be as specific as needed (year 2019, year+month+day 20190101, ...).
|
|
149
187
|
|
|
150
188
|
- **`-r`**, **`--range`**:<br>
|
|
151
189
|
Specify the range in years for which to search and download snapshots.
|
|
@@ -155,57 +193,56 @@ output:
|
|
|
155
193
|
Timestamp to end searching.
|
|
156
194
|
|
|
157
195
|
- **Filtering:**<br>
|
|
158
|
-
A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter.
|
|
159
196
|
|
|
160
197
|
- **`--filetype`** `<filetype>`:<br>
|
|
161
|
-
Specify filetypes to download.
|
|
198
|
+
Specify filetypes to download. Example: `--filetype jpg,css,js`. You can only filter filetypes which are stored by archive.org (.html mostly not)
|
|
162
199
|
|
|
163
200
|
- **`--statuscode`** `<statuscode>`:<br>
|
|
164
|
-
Specify HTTP status codes to download.
|
|
201
|
+
Specify HTTP status codes to download. Example: `--statuscode 200,301`. PyWayBackup will always skip `404` and `301`.<br>
|
|
165
202
|
Common status codes you may want to handle/filter:
|
|
166
203
|
- `200` (OK)
|
|
167
|
-
- `301` (Moved Permanently
|
|
204
|
+
- `301` (Moved Permanently)
|
|
168
205
|
- `404` (Not Found - snapshot seems to be empty)
|
|
169
206
|
- `500` (Internal Server Error - snapshot is at least for now not available)
|
|
170
207
|
|
|
171
|
-
|
|
208
|
+
#### Optional Behavior Manipulation
|
|
172
209
|
|
|
173
|
-
|
|
210
|
+
Parameters will change the download behavior for snapshots.
|
|
174
211
|
|
|
175
212
|
- **`-o`**, **`--output`**:<br>
|
|
176
213
|
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
177
214
|
|
|
178
215
|
- **`-m`**, **`--metadata`**<br>
|
|
179
|
-
|
|
216
|
+
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
180
217
|
|
|
181
218
|
- **`--verbose`**:<br>
|
|
182
219
|
Increase output verbosity.
|
|
183
220
|
|
|
184
221
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
185
|
-
Saves a log file into the output-dir.
|
|
222
|
+
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
186
223
|
|
|
187
224
|
- **`--progress`**:<br>
|
|
188
225
|
Shows a progress bar instead of the default output.
|
|
189
226
|
|
|
190
227
|
- **`--workers`** `<count>`:<br>
|
|
191
|
-
|
|
228
|
+
Number of simultaneous download workers. Default is 1; up to about 10 is considered safe. Too many workers may lead to refused connections by archive.org.
|
|
192
229
|
|
|
193
230
|
- **`--no-redirect`**:<br>
|
|
194
|
-
Disables following redirects of snapshots.
|
|
231
|
+
Disables following redirects of snapshots. Can prevent timestamp-folder mismatches caused by redirects.
|
|
195
232
|
|
|
196
233
|
- **`--retry`** `<attempts>`:<br>
|
|
197
|
-
|
|
234
|
+
Retry attempts for failed downloads.
|
|
198
235
|
|
|
199
236
|
- **`--delay`** `<seconds>`:<br>
|
|
200
|
-
|
|
237
|
+
Delay between download requests in seconds. Default is no delay (0).
|
|
201
238
|
|
|
202
239
|
#### Job Handling:
|
|
203
240
|
|
|
204
241
|
- **`--reset`**:
|
|
205
|
-
If set, the job will be reset, and
|
|
242
|
+
If set, the job will be reset, and `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch.
|
|
206
243
|
|
|
207
244
|
- **`--keep`**:
|
|
208
|
-
If set,
|
|
245
|
+
If set, `cdx` and `db` files will be kept after the job is finished. Otherwise they will be deleted.
|
|
209
246
|
|
|
210
247
|
<br>
|
|
211
248
|
<br>
|
|
@@ -216,23 +253,11 @@ output:
|
|
|
216
253
|
|
|
217
254
|
`pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
|
|
218
255
|
|
|
219
|
-
|
|
220
|
-
-
|
|
221
|
-
-
|
|
256
|
+
Only resumes queries if:
|
|
257
|
+
- `.cdx` and `.db` files exist in the `output dir`
|
|
258
|
+
- command is identical by `URL`, `mode`, and `optional query parameters`
|
|
222
259
|
> **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
|
|
223
260
|
|
|
224
|
-
#### Resetting a Job (`--reset`)
|
|
225
|
-
|
|
226
|
-
- Deletes `.cdx` and `.db` files and restarts the process from scratch.
|
|
227
|
-
- Does **not** remove already downloaded files.
|
|
228
|
-
- `waybackup -u https://example.com -a --reset`
|
|
229
|
-
|
|
230
|
-
#### Keeping Job Data (`--keep`)
|
|
231
|
-
|
|
232
|
-
- Normally, `.cdx` and `.db` files are deleted after a successful job.
|
|
233
|
-
- `--keep` preserves them for future re-analysis or extending the query.
|
|
234
|
-
- `waybackup -u https://example.com -a --keep`
|
|
235
|
-
|
|
236
261
|
<br>
|
|
237
262
|
<br>
|
|
238
263
|
|
|
@@ -338,6 +363,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
|
|
|
338
363
|
<br>
|
|
339
364
|
<br>
|
|
340
365
|
|
|
366
|
+
## Future ideas (long run)
|
|
367
|
+
|
|
368
|
+
- More module functionality
|
|
369
|
+
- Docker UI
|
|
370
|
+
|
|
341
371
|
## Contributing
|
|
342
372
|
|
|
343
373
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
@@ -11,6 +11,17 @@ Internet-archive is a nice source for several OSINT-information. This tool is a
|
|
|
11
11
|
|
|
12
12
|
This tool allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
|
|
13
13
|
|
|
14
|
+
# Content
|
|
15
|
+
|
|
16
|
+
➡️ [Installation](#installation) <br>
|
|
17
|
+
➡️ [notes / issues / hints](#notes--issues--hints) <br>
|
|
18
|
+
➡️ [import](#import) <br>
|
|
19
|
+
➡️ [cli](#cli) <br>
|
|
20
|
+
➡️ [Usage](#usage) <br>
|
|
21
|
+
➡️ [Examples](#examples) <br>
|
|
22
|
+
➡️ [Output](#output) <br>
|
|
23
|
+
➡️ [Contributing](#contributing) <br>
|
|
24
|
+
|
|
14
25
|
## Installation
|
|
15
26
|
|
|
16
27
|
### Pip
|
|
@@ -43,8 +54,14 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
43
54
|
You can import pywaybackup into your own scripts and run it. Args are the same as cli.
|
|
44
55
|
|
|
45
56
|
Additional args:
|
|
46
|
-
- `silent` (default
|
|
47
|
-
- `debug` (default
|
|
57
|
+
- `silent` (default False): If True, suppresses all output to the console.
|
|
58
|
+
- `debug` (default True): If False, disables writing errors to the error log file.
|
|
59
|
+
|
|
60
|
+
Use:
|
|
61
|
+
- `run()`
|
|
62
|
+
- `status()`
|
|
63
|
+
- `paths()`
|
|
64
|
+
- `stop()`
|
|
48
65
|
|
|
49
66
|
```python
|
|
50
67
|
from pywaybackup import PyWayBackup
|
|
@@ -76,6 +93,29 @@ output:
|
|
|
76
93
|
}
|
|
77
94
|
```
|
|
78
95
|
|
|
96
|
+
... or run it asynchronously and print the current status or stop it whenever needed.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import time
|
|
100
|
+
from pywaybackup import PyWayBackup
|
|
101
|
+
|
|
102
|
+
backup = PyWayBackup( ... )
|
|
103
|
+
backup.run(daemon=True)
|
|
104
|
+
print(backup.status())
|
|
105
|
+
time.sleep(10)
|
|
106
|
+
print(backup.status())
|
|
107
|
+
backup.stop()
|
|
108
|
+
```
|
|
109
|
+
output:
|
|
110
|
+
```bash
|
|
111
|
+
{
|
|
112
|
+
'task': 'downloading snapshots',
|
|
113
|
+
'current': 15,
|
|
114
|
+
'total': 84,
|
|
115
|
+
'progress': '18%'
|
|
116
|
+
}
|
|
117
|
+
```
|
|
118
|
+
|
|
79
119
|
## cli
|
|
80
120
|
|
|
81
121
|
- `-h`, `--help`: Show the help message and exit.
|
|
@@ -89,25 +129,24 @@ output:
|
|
|
89
129
|
#### Mode Selection (Choose One)
|
|
90
130
|
|
|
91
131
|
- **`-a`**, **`--all`**:<br>
|
|
92
|
-
|
|
132
|
+
All timestamps. Gives one folder per timestamp.
|
|
93
133
|
- **`-l`**, **`--last`**:<br>
|
|
94
|
-
|
|
134
|
+
Last Version. Gives one folder containing the last version of each file of specified `--range`.
|
|
95
135
|
- **`-f`**, **`--first`**:<br>
|
|
96
|
-
|
|
97
|
-
- **`-s`**, **`--save`**:<br>
|
|
98
|
-
Save a page to the Wayback Machine. (beta)
|
|
136
|
+
First Version. Gives one folder containing the first version of each file of specified `--range`.
|
|
99
137
|
|
|
100
138
|
#### Optional query parameters
|
|
101
139
|
|
|
140
|
+
Parameters for archive.org CDX query. No effect on snapshot download itself.
|
|
141
|
+
|
|
102
142
|
- **`-e`**, **`--explicit`**:<br>
|
|
103
|
-
Only
|
|
143
|
+
Only the explicit URL. No wildcard subdomains or paths. For example get: root-only (`https://example.com`) or specific file (`login.html`, `?query=this`).
|
|
104
144
|
|
|
105
145
|
- **`--limit`** `<count>`:<br>
|
|
106
|
-
Limits the
|
|
146
|
+
Limits the snapshots fetched from archive.org CDX. (Will have no effect on existing CDX files)
|
|
107
147
|
|
|
108
148
|
- **Range Selection:**<br>
|
|
109
|
-
|
|
110
|
-
(year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
|
|
149
|
+
Set the query range in years (`range`) or a timestamp (`start` and/or `end`). If `range` is set, `start` and `end` are ignored. Format for timestamps: YYYYMMDDhhmmss. Timestamps can be as specific as needed (year 2019, year+month+day 20190101, ...).
|
|
111
150
|
|
|
112
151
|
- **`-r`**, **`--range`**:<br>
|
|
113
152
|
Specify the range in years for which to search and download snapshots.
|
|
@@ -117,57 +156,56 @@ output:
|
|
|
117
156
|
Timestamp to end searching.
|
|
118
157
|
|
|
119
158
|
- **Filtering:**<br>
|
|
120
|
-
A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter.
|
|
121
159
|
|
|
122
160
|
- **`--filetype`** `<filetype>`:<br>
|
|
123
|
-
Specify filetypes to download.
|
|
161
|
+
Specify filetypes to download. Example: `--filetype jpg,css,js`. You can only filter filetypes which are stored by archive.org (.html mostly not)
|
|
124
162
|
|
|
125
163
|
- **`--statuscode`** `<statuscode>`:<br>
|
|
126
|
-
Specify HTTP status codes to download.
|
|
164
|
+
Specify HTTP status codes to download. Example: `--statuscode 200,301`. PyWayBackup will always skip `404` and `301`.<br>
|
|
127
165
|
Common status codes you may want to handle/filter:
|
|
128
166
|
- `200` (OK)
|
|
129
|
-
- `301` (Moved Permanently
|
|
167
|
+
- `301` (Moved Permanently)
|
|
130
168
|
- `404` (Not Found - snapshot seems to be empty)
|
|
131
169
|
- `500` (Internal Server Error - snapshot is at least for now not available)
|
|
132
170
|
|
|
133
|
-
|
|
171
|
+
#### Optional Behavior Manipulation
|
|
134
172
|
|
|
135
|
-
|
|
173
|
+
Parameters will change the download behavior for snapshots.
|
|
136
174
|
|
|
137
175
|
- **`-o`**, **`--output`**:<br>
|
|
138
176
|
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
139
177
|
|
|
140
178
|
- **`-m`**, **`--metadata`**<br>
|
|
141
|
-
|
|
179
|
+
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
142
180
|
|
|
143
181
|
- **`--verbose`**:<br>
|
|
144
182
|
Increase output verbosity.
|
|
145
183
|
|
|
146
184
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
147
|
-
Saves a log file into the output-dir.
|
|
185
|
+
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
148
186
|
|
|
149
187
|
- **`--progress`**:<br>
|
|
150
188
|
Shows a progress bar instead of the default output.
|
|
151
189
|
|
|
152
190
|
- **`--workers`** `<count>`:<br>
|
|
153
|
-
|
|
191
|
+
Number of simultaneous download workers. Default is 1; up to about 10 is considered safe. Too many workers may lead to refused connections by archive.org.
|
|
154
192
|
|
|
155
193
|
- **`--no-redirect`**:<br>
|
|
156
|
-
Disables following redirects of snapshots.
|
|
194
|
+
Disables following redirects of snapshots. Can prevent timestamp-folder mismatches caused by redirects.
|
|
157
195
|
|
|
158
196
|
- **`--retry`** `<attempts>`:<br>
|
|
159
|
-
|
|
197
|
+
Retry attempts for failed downloads.
|
|
160
198
|
|
|
161
199
|
- **`--delay`** `<seconds>`:<br>
|
|
162
|
-
|
|
200
|
+
Delay between download requests in seconds. Default is no delay (0).
|
|
163
201
|
|
|
164
202
|
#### Job Handling:
|
|
165
203
|
|
|
166
204
|
- **`--reset`**:
|
|
167
|
-
If set, the job will be reset, and
|
|
205
|
+
If set, the job will be reset, and `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch.
|
|
168
206
|
|
|
169
207
|
- **`--keep`**:
|
|
170
|
-
If set,
|
|
208
|
+
If set, `cdx` and `db` files will be kept after the job is finished. Otherwise they will be deleted.
|
|
171
209
|
|
|
172
210
|
<br>
|
|
173
211
|
<br>
|
|
@@ -178,23 +216,11 @@ output:
|
|
|
178
216
|
|
|
179
217
|
`pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
|
|
180
218
|
|
|
181
|
-
|
|
182
|
-
-
|
|
183
|
-
-
|
|
219
|
+
Only resumes queries if:
|
|
220
|
+
- `.cdx` and `.db` files exist in the `output dir`
|
|
221
|
+
- command is identical by `URL`, `mode`, and `optional query parameters`
|
|
184
222
|
> **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
|
|
185
223
|
|
|
186
|
-
#### Resetting a Job (`--reset`)
|
|
187
|
-
|
|
188
|
-
- Deletes `.cdx` and `.db` files and restarts the process from scratch.
|
|
189
|
-
- Does **not** remove already downloaded files.
|
|
190
|
-
- `waybackup -u https://example.com -a --reset`
|
|
191
|
-
|
|
192
|
-
#### Keeping Job Data (`--keep`)
|
|
193
|
-
|
|
194
|
-
- Normally, `.cdx` and `.db` files are deleted after a successful job.
|
|
195
|
-
- `--keep` preserves them for future re-analysis or extending the query.
|
|
196
|
-
- `waybackup -u https://example.com -a --keep`
|
|
197
|
-
|
|
198
224
|
<br>
|
|
199
225
|
<br>
|
|
200
226
|
|
|
@@ -300,6 +326,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
|
|
|
300
326
|
<br>
|
|
301
327
|
<br>
|
|
302
328
|
|
|
329
|
+
## Future ideas (long run)
|
|
330
|
+
|
|
331
|
+
- More module functionality
|
|
332
|
+
- Docker UI
|
|
333
|
+
|
|
303
334
|
## Contributing
|
|
304
335
|
|
|
305
336
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "pywaybackup"
|
|
10
|
-
version = "3.4.1"
|
|
10
|
+
version = "4.1.0"
|
|
11
11
|
description = "Query and download archive.org as simple as possible."
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
@@ -16,8 +16,7 @@ license = { file = "LICENSE" }
|
|
|
16
16
|
readme = "README.md"
|
|
17
17
|
requires-python = ">=3.8"
|
|
18
18
|
dependencies = [
|
|
19
|
-
"
|
|
20
|
-
"pysqlite-binary; sys_platform == 'win32'",
|
|
19
|
+
"SQLAlchemy==2.0.43",
|
|
21
20
|
"requests==2.32.3",
|
|
22
21
|
"tqdm==4.67.1",
|
|
23
22
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
@@ -24,8 +24,8 @@ class Arguments:
|
|
|
24
24
|
optional = parser.add_argument_group("optional query parameters")
|
|
25
25
|
optional.add_argument("-e", "--explicit", action="store_true", help="search only for the explicit given url")
|
|
26
26
|
optional.add_argument("-r", "--range", type=int, metavar="", help="range in years to search")
|
|
27
|
-
optional.add_argument("--start", type=int, metavar="", help="start timestamp format:
|
|
28
|
-
optional.add_argument("--end", type=int, metavar="", help="end timestamp format:
|
|
27
|
+
optional.add_argument("--start", type=int, metavar="", help="start timestamp format: YYYYMMDDHHMMSS")
|
|
28
|
+
optional.add_argument("--end", type=int, metavar="", help="end timestamp format: YYYYMMDDHHMMSS")
|
|
29
29
|
optional.add_argument("--limit", type=int, nargs="?", const=True, metavar="int", help="limit the number of snapshots to download")
|
|
30
30
|
optional.add_argument("--filetype", type=str, metavar="", help="filetypes to download comma separated (js,css,...)")
|
|
31
31
|
optional.add_argument("--statuscode", type=str, metavar="", help="statuscodes to download comma separated (200,404,...)")
|
|
@@ -55,3 +55,4 @@ class Arguments:
|
|
|
55
55
|
def get_args(self) -> dict:
|
|
56
56
|
"""Returns the parsed arguments as a dictionary."""
|
|
57
57
|
return vars(self.args)
|
|
58
|
+
|
|
@@ -14,9 +14,9 @@ class Exception:
|
|
|
14
14
|
command = None
|
|
15
15
|
|
|
16
16
|
@classmethod
|
|
17
|
-
def init(cls,
|
|
17
|
+
def init(cls, debugfile=None, output=None, command=None):
|
|
18
18
|
sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
|
|
19
|
-
cls.
|
|
19
|
+
cls.debugfile = debugfile
|
|
20
20
|
cls.output = output
|
|
21
21
|
cls.command = command
|
|
22
22
|
|
|
@@ -45,18 +45,18 @@ class Exception:
|
|
|
45
45
|
exception_message += "!-- Traceback is None\n"
|
|
46
46
|
exception_message += f"!-- Description: {e}\n-------------------------"
|
|
47
47
|
print(exception_message)
|
|
48
|
-
if cls.
|
|
49
|
-
print(f"Exception log: {cls.
|
|
48
|
+
if cls.debugfile:
|
|
49
|
+
print(f"Exception log: {cls.debugfile}")
|
|
50
50
|
if cls.new_debug: # new run, overwrite file
|
|
51
51
|
cls.new_debug = False
|
|
52
|
-
f = open(cls.
|
|
52
|
+
f = open(cls.debugfile, "w", encoding="utf-8")
|
|
53
53
|
f.write("-------------------------\n")
|
|
54
54
|
f.write(f"Version: {version('pywaybackup')}\n")
|
|
55
55
|
f.write("-------------------------\n")
|
|
56
56
|
f.write(f"Command: {cls.command}\n")
|
|
57
57
|
f.write("-------------------------\n\n")
|
|
58
58
|
else: # current run, append to file
|
|
59
|
-
f = open(cls.
|
|
59
|
+
f = open(cls.debugfile, "a", encoding="utf-8")
|
|
60
60
|
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
|
|
61
61
|
f.write(exception_message + "\n")
|
|
62
62
|
f.write("!-- Local Variables:\n")
|
|
@@ -96,4 +96,4 @@ class Exception:
|
|
|
96
96
|
if issubclass(exception_type, KeyboardInterrupt):
|
|
97
97
|
sys.__excepthook__(exception_type, exception, traceback)
|
|
98
98
|
return
|
|
99
|
-
Exception.exception(
|
|
99
|
+
Exception.exception('UNCAUGHT EXCEPTION', exception, traceback) # uncaught exceptions also with custom scheme
|