pywaybackup 3.1.0__tar.gz → 3.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-3.1.0/pywaybackup.egg-info → pywaybackup-3.2.1}/PKG-INFO +85 -53
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/README.md +79 -49
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pyproject.toml +5 -4
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/Arguments.py +18 -15
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/Converter.py +10 -10
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/Exception.py +13 -18
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/SnapshotCollection.py +118 -60
- pywaybackup-3.2.1/pywaybackup/Verbosity.py +92 -0
- pywaybackup-3.2.1/pywaybackup/Worker.py +158 -0
- pywaybackup-3.2.1/pywaybackup/archive_download.py +335 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/archive_save.py +19 -19
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/helper.py +7 -7
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/main.py +2 -2
- {pywaybackup-3.1.0 → pywaybackup-3.2.1/pywaybackup.egg-info}/PKG-INFO +85 -53
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/SOURCES.txt +1 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/requires.txt +4 -3
- pywaybackup-3.1.0/pywaybackup/Verbosity.py +0 -121
- pywaybackup-3.1.0/pywaybackup/archive_download.py +0 -332
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/LICENSE +0 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/__init__.py +0 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/db.py +0 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-3.1.0 → pywaybackup-3.2.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 3.1.0
|
|
3
|
+
Version: 3.2.1
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,18 +29,19 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
-
Requires-Dist: pysqlite3-binary==0.5.4
|
|
33
|
-
Requires-Dist:
|
|
34
|
-
Requires-Dist:
|
|
32
|
+
Requires-Dist: pysqlite3-binary==0.5.4; sys_platform == "linux"
|
|
33
|
+
Requires-Dist: pysqlite-binary; sys_platform == "win32"
|
|
34
|
+
Requires-Dist: requests==2.32.3
|
|
35
|
+
Requires-Dist: tqdm==4.67.1
|
|
35
36
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
36
37
|
Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
|
|
38
|
+
Dynamic: license-file
|
|
37
39
|
|
|
38
40
|
# python wayback machine downloader
|
|
39
41
|
|
|
40
42
|
[](https://pypi.org/project/pywaybackup/)
|
|
41
43
|
[](https://pypi.org/project/pywaybackup/)
|
|
42
44
|

|
|
43
|
-
<!--  -->
|
|
44
45
|
[](https://opensource.org/licenses/MIT)
|
|
45
46
|
|
|
46
47
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -66,11 +67,15 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
66
67
|
```pip install .```
|
|
67
68
|
- in a virtual env or use `--break-system-packages`
|
|
68
69
|
|
|
69
|
-
##
|
|
70
|
+
## notes / issues / hints
|
|
70
71
|
|
|
71
|
-
- Linux recommended: On Windows machines, the path length is limited.
|
|
72
|
-
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
|
|
72
|
+
- Linux recommended: On Windows machines, the path length is limited. Files that exceed the path length will not be downloaded.
|
|
73
73
|
- The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
|
|
74
|
+
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
|
|
75
|
+
- Downloading directly into a network share is not recommended. The sqlite locking mechanism may cause issues. If you need to download into a network share, set the `--metadata` argument to a local path.
|
|
76
|
+
|
|
77
|
+
<br>
|
|
78
|
+
<br>
|
|
74
79
|
|
|
75
80
|
## Arguments
|
|
76
81
|
|
|
@@ -92,7 +97,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
92
97
|
- **`-s`**, **`--save`**:<br>
|
|
93
98
|
Save a page to the Wayback Machine. (beta)
|
|
94
99
|
|
|
95
|
-
|
|
100
|
+
#### Optional query parameters
|
|
96
101
|
|
|
97
102
|
- **`-e`**, **`--explicit`**:<br>
|
|
98
103
|
Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
|
|
@@ -113,11 +118,16 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
|
|
|
113
118
|
- **`--end`**:<br>
|
|
114
119
|
Timestamp to end searching.
|
|
115
120
|
|
|
116
|
-
###
|
|
121
|
+
### Optional
|
|
122
|
+
|
|
123
|
+
#### Behavior Manipulation
|
|
117
124
|
|
|
118
125
|
- **`-o`**, **`--output`**:<br>
|
|
119
126
|
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
120
127
|
|
|
128
|
+
- **`-m`**, **`--metadata`**<br>
|
|
129
|
+
Change the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Especially if you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
130
|
+
|
|
121
131
|
<!-- - **`--verbosity`** `<level>`:<br>
|
|
122
132
|
Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
|
|
123
133
|
|
|
@@ -132,17 +142,31 @@ Sets the number of simultaneous download workers. Default is 1, safe range is ab
|
|
|
132
142
|
|
|
133
143
|
- **`--no-redirect`**:<br>
|
|
134
144
|
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
|
|
135
|
-
|
|
145
|
+
|
|
136
146
|
- **`--retry`** `<attempts>`:<br>
|
|
137
147
|
Specifies number of retry attempts for failed downloads.
|
|
138
148
|
|
|
139
149
|
- **`--delay`** `<seconds>`:<br>
|
|
140
150
|
Specifies delay between download requests in seconds. Default is no delay (0).
|
|
141
151
|
|
|
152
|
+
- **`--verbose`**:<br>
|
|
153
|
+
Increase output verbosity.
|
|
154
|
+
- verbose:
|
|
155
|
+
```
|
|
156
|
+
-----> Worker: 2 - Attempt: [1/1] Snapshot ID: [23/81]
|
|
157
|
+
SUCCESS -> 200 OK
|
|
158
|
+
-> URL: https://web.archive.org/web/20240225193302id_/https://example.com/assets/css/custom-styles.css
|
|
159
|
+
-> FILE: /home/manjaro/Stuff/python-wayback-machine-downloader/waybackup_snapshots/example.com/20240225193302id_/assets/css/custom-styles.css
|
|
160
|
+
```
|
|
161
|
+
- non-verbose:
|
|
162
|
+
```
|
|
163
|
+
55/81 - W:2 - SUCCESS - 20240225193302 - https://example.com/assets/css/custom-styles.css
|
|
164
|
+
```
|
|
165
|
+
|
|
142
166
|
<!-- - **`--convert-links`**:<br>
|
|
143
167
|
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
|
|
144
168
|
|
|
145
|
-
|
|
169
|
+
#### Job Handling:
|
|
146
170
|
|
|
147
171
|
- **`--reset`**:
|
|
148
172
|
If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
|
|
@@ -150,47 +174,56 @@ If set, all links in the downloaded files will be converted to local links. This
|
|
|
150
174
|
- **`--keep`**:
|
|
151
175
|
If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
|
|
152
176
|
|
|
153
|
-
|
|
177
|
+
<br>
|
|
178
|
+
<br>
|
|
179
|
+
|
|
180
|
+
## Usage
|
|
154
181
|
|
|
155
182
|
### Handling Interrupted Jobs
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
>
|
|
175
|
-
>
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
2.
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
183
|
+
|
|
184
|
+
`pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
|
|
185
|
+
|
|
186
|
+
- Detects existing `.cdx` and `.db` files in an `output dir` to resume downloading from the last successful point.
|
|
187
|
+
- Compares `URL`, `mode`, and `optional query parameters` to ensure automatic resumption.
|
|
188
|
+
- Skips previously downloaded files to save time.
|
|
189
|
+
> **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
|
|
190
|
+
|
|
191
|
+
#### Resetting a Job (`--reset`)
|
|
192
|
+
- Deletes `.cdx` and `.db` files and restarts the process from scratch.
|
|
193
|
+
- Does **not** remove already downloaded files.
|
|
194
|
+
- `waybackup -u https://example.com -a --reset`
|
|
195
|
+
|
|
196
|
+
#### Keeping Job Data (`--keep`)
|
|
197
|
+
- Normally, `.cdx` and `.db` files are deleted after a successful job.
|
|
198
|
+
- `--keep` preserves them for future re-analysis or extending the query.
|
|
199
|
+
- `waybackup -u https://example.com -a --keep`
|
|
200
|
+
|
|
201
|
+
<br>
|
|
202
|
+
<br>
|
|
203
|
+
|
|
204
|
+
## Examples
|
|
205
|
+
|
|
206
|
+
1. Download a specific single snapshot of all available files (starting from root):<br>
|
|
207
|
+
`waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000`
|
|
208
|
+
2. Download a specific single snapshot of all available files (starting from a subdirectory):<br>
|
|
209
|
+
`waybackup -u https://example.com/subdir1/subdir2/assets/ -a --start 20210101000000 --end 20210101000000`
|
|
210
|
+
3. Download a specific single snapshot of the exact given URL (no subdirs):<br>
|
|
211
|
+
`waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000 --explicit`
|
|
212
|
+
4. Download all snapshots of all available files in the given range:<br>
|
|
213
|
+
`waybackup -u https://example.com -a --start 20210101000000 --end 20231122000000`
|
|
214
|
+
|
|
215
|
+
<br>
|
|
216
|
+
<br>
|
|
217
|
+
|
|
218
|
+
## Output
|
|
219
|
+
|
|
220
|
+
### Path Structure
|
|
188
221
|
|
|
189
222
|
The output path is currently structured as follows by an example for the query:<br>
|
|
190
|
-
`http://example.com/subdir1/subdir2/assets
|
|
223
|
+
`http://example.com/subdir1/subdir2/assets/`
|
|
191
224
|
<br><br>
|
|
192
225
|
For the first and last version (`-f` or `-l`):
|
|
193
|
-
-
|
|
226
|
+
- Will only include all files/folders starting from your query-path.
|
|
194
227
|
```
|
|
195
228
|
your/path/waybackup_snapshots/
|
|
196
229
|
└── the_root_of_your_query/ (example.com/)
|
|
@@ -202,7 +235,7 @@ your/path/waybackup_snapshots/
|
|
|
202
235
|
...
|
|
203
236
|
```
|
|
204
237
|
For all versions (`-a`):
|
|
205
|
-
- Will
|
|
238
|
+
- Will create a folder named as the root of your query. Inside this folder, you will find all timestamps and per timestamp the path you requested.
|
|
206
239
|
```
|
|
207
240
|
your/path/waybackup_snapshots/
|
|
208
241
|
└── the_root_of_your_query/ (example.com/)
|
|
@@ -221,7 +254,7 @@ your/path/waybackup_snapshots/
|
|
|
221
254
|
...
|
|
222
255
|
```
|
|
223
256
|
|
|
224
|
-
|
|
257
|
+
### CSV
|
|
225
258
|
|
|
226
259
|
Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
|
|
227
260
|
|
|
@@ -247,9 +280,8 @@ For download queries:
|
|
|
247
280
|
|
|
248
281
|
Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
|
|
249
282
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
- [ ] currently there is no logic to handle if both a http and https version of a page is available
|
|
283
|
+
<br>
|
|
284
|
+
<br>
|
|
253
285
|
|
|
254
286
|
## Contributing
|
|
255
287
|
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
[](https://pypi.org/project/pywaybackup/)
|
|
4
4
|
[](https://pypi.org/project/pywaybackup/)
|
|
5
5
|

|
|
6
|
-
<!--  -->
|
|
7
6
|
[](https://opensource.org/licenses/MIT)
|
|
8
7
|
|
|
9
8
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -29,11 +28,15 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
29
28
|
```pip install .```
|
|
30
29
|
- in a virtual env or use `--break-system-packages`
|
|
31
30
|
|
|
32
|
-
##
|
|
31
|
+
## notes / issues / hints
|
|
33
32
|
|
|
34
|
-
- Linux recommended: On Windows machines, the path length is limited.
|
|
35
|
-
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
|
|
33
|
+
- Linux recommended: On Windows machines, the path length is limited. Files that exceed the path length will not be downloaded.
|
|
36
34
|
- The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
|
|
35
|
+
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
|
|
36
|
+
- Downloading directly into a network share is not recommended. The sqlite locking mechanism may cause issues. If you need to download into a network share, set the `--metadata` argument to a local path.
|
|
37
|
+
|
|
38
|
+
<br>
|
|
39
|
+
<br>
|
|
37
40
|
|
|
38
41
|
## Arguments
|
|
39
42
|
|
|
@@ -55,7 +58,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
55
58
|
- **`-s`**, **`--save`**:<br>
|
|
56
59
|
Save a page to the Wayback Machine. (beta)
|
|
57
60
|
|
|
58
|
-
|
|
61
|
+
#### Optional query parameters
|
|
59
62
|
|
|
60
63
|
- **`-e`**, **`--explicit`**:<br>
|
|
61
64
|
Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
|
|
@@ -76,11 +79,16 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
|
|
|
76
79
|
- **`--end`**:<br>
|
|
77
80
|
Timestamp to end searching.
|
|
78
81
|
|
|
79
|
-
###
|
|
82
|
+
### Optional
|
|
83
|
+
|
|
84
|
+
#### Behavior Manipulation
|
|
80
85
|
|
|
81
86
|
- **`-o`**, **`--output`**:<br>
|
|
82
87
|
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
83
88
|
|
|
89
|
+
- **`-m`**, **`--metadata`**<br>
|
|
90
|
+
Change the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Especially if you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
91
|
+
|
|
84
92
|
<!-- - **`--verbosity`** `<level>`:<br>
|
|
85
93
|
Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
|
|
86
94
|
|
|
@@ -95,17 +103,31 @@ Sets the number of simultaneous download workers. Default is 1, safe range is ab
|
|
|
95
103
|
|
|
96
104
|
- **`--no-redirect`**:<br>
|
|
97
105
|
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
|
|
98
|
-
|
|
106
|
+
|
|
99
107
|
- **`--retry`** `<attempts>`:<br>
|
|
100
108
|
Specifies number of retry attempts for failed downloads.
|
|
101
109
|
|
|
102
110
|
- **`--delay`** `<seconds>`:<br>
|
|
103
111
|
Specifies delay between download requests in seconds. Default is no delay (0).
|
|
104
112
|
|
|
113
|
+
- **`--verbose`**:<br>
|
|
114
|
+
Increase output verbosity.
|
|
115
|
+
- verbose:
|
|
116
|
+
```
|
|
117
|
+
-----> Worker: 2 - Attempt: [1/1] Snapshot ID: [23/81]
|
|
118
|
+
SUCCESS -> 200 OK
|
|
119
|
+
-> URL: https://web.archive.org/web/20240225193302id_/https://example.com/assets/css/custom-styles.css
|
|
120
|
+
-> FILE: /home/manjaro/Stuff/python-wayback-machine-downloader/waybackup_snapshots/example.com/20240225193302id_/assets/css/custom-styles.css
|
|
121
|
+
```
|
|
122
|
+
- non-verbose:
|
|
123
|
+
```
|
|
124
|
+
55/81 - W:2 - SUCCESS - 20240225193302 - https://example.com/assets/css/custom-styles.css
|
|
125
|
+
```
|
|
126
|
+
|
|
105
127
|
<!-- - **`--convert-links`**:<br>
|
|
106
128
|
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
|
|
107
129
|
|
|
108
|
-
|
|
130
|
+
#### Job Handling:
|
|
109
131
|
|
|
110
132
|
- **`--reset`**:
|
|
111
133
|
If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
|
|
@@ -113,47 +135,56 @@ If set, all links in the downloaded files will be converted to local links. This
|
|
|
113
135
|
- **`--keep`**:
|
|
114
136
|
If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
|
|
115
137
|
|
|
116
|
-
|
|
138
|
+
<br>
|
|
139
|
+
<br>
|
|
140
|
+
|
|
141
|
+
## Usage
|
|
117
142
|
|
|
118
143
|
### Handling Interrupted Jobs
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
>
|
|
138
|
-
>
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
2.
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
144
|
+
|
|
145
|
+
`pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
|
|
146
|
+
|
|
147
|
+
- Detects existing `.cdx` and `.db` files in an `output dir` to resume downloading from the last successful point.
|
|
148
|
+
- Compares `URL`, `mode`, and `optional query parameters` to ensure automatic resumption.
|
|
149
|
+
- Skips previously downloaded files to save time.
|
|
150
|
+
> **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
|
|
151
|
+
|
|
152
|
+
#### Resetting a Job (`--reset`)
|
|
153
|
+
- Deletes `.cdx` and `.db` files and restarts the process from scratch.
|
|
154
|
+
- Does **not** remove already downloaded files.
|
|
155
|
+
- `waybackup -u https://example.com -a --reset`
|
|
156
|
+
|
|
157
|
+
#### Keeping Job Data (`--keep`)
|
|
158
|
+
- Normally, `.cdx` and `.db` files are deleted after a successful job.
|
|
159
|
+
- `--keep` preserves them for future re-analysis or extending the query.
|
|
160
|
+
- `waybackup -u https://example.com -a --keep`
|
|
161
|
+
|
|
162
|
+
<br>
|
|
163
|
+
<br>
|
|
164
|
+
|
|
165
|
+
## Examples
|
|
166
|
+
|
|
167
|
+
1. Download a specific single snapshot of all available files (starting from root):<br>
|
|
168
|
+
`waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000`
|
|
169
|
+
2. Download a specific single snapshot of all available files (starting from a subdirectory):<br>
|
|
170
|
+
`waybackup -u https://example.com/subdir1/subdir2/assets/ -a --start 20210101000000 --end 20210101000000`
|
|
171
|
+
3. Download a specific single snapshot of the exact given URL (no subdirs):<br>
|
|
172
|
+
`waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000 --explicit`
|
|
173
|
+
4. Download all snapshots of all available files in the given range:<br>
|
|
174
|
+
`waybackup -u https://example.com -a --start 20210101000000 --end 20231122000000`
|
|
175
|
+
|
|
176
|
+
<br>
|
|
177
|
+
<br>
|
|
178
|
+
|
|
179
|
+
## Output
|
|
180
|
+
|
|
181
|
+
### Path Structure
|
|
151
182
|
|
|
152
183
|
The output path is currently structured as follows by an example for the query:<br>
|
|
153
|
-
`http://example.com/subdir1/subdir2/assets
|
|
184
|
+
`http://example.com/subdir1/subdir2/assets/`
|
|
154
185
|
<br><br>
|
|
155
186
|
For the first and last version (`-f` or `-l`):
|
|
156
|
-
-
|
|
187
|
+
- Will only include all files/folders starting from your query-path.
|
|
157
188
|
```
|
|
158
189
|
your/path/waybackup_snapshots/
|
|
159
190
|
└── the_root_of_your_query/ (example.com/)
|
|
@@ -165,7 +196,7 @@ your/path/waybackup_snapshots/
|
|
|
165
196
|
...
|
|
166
197
|
```
|
|
167
198
|
For all versions (`-a`):
|
|
168
|
-
- Will
|
|
199
|
+
- Will create a folder named as the root of your query. Inside this folder, you will find all timestamps and per timestamp the path you requested.
|
|
169
200
|
```
|
|
170
201
|
your/path/waybackup_snapshots/
|
|
171
202
|
└── the_root_of_your_query/ (example.com/)
|
|
@@ -184,7 +215,7 @@ your/path/waybackup_snapshots/
|
|
|
184
215
|
...
|
|
185
216
|
```
|
|
186
217
|
|
|
187
|
-
|
|
218
|
+
### CSV
|
|
188
219
|
|
|
189
220
|
Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
|
|
190
221
|
|
|
@@ -210,11 +241,10 @@ For download queries:
|
|
|
210
241
|
|
|
211
242
|
Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
|
|
212
243
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
- [ ] currently there is no logic to handle if both a http and https version of a page is available
|
|
244
|
+
<br>
|
|
245
|
+
<br>
|
|
216
246
|
|
|
217
247
|
## Contributing
|
|
218
248
|
|
|
219
249
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
220
|
-
Feel free to give suggestions and report issues. Project is still far from being perfect.
|
|
250
|
+
Feel free to give suggestions and report issues. Project is still far from being perfect.
|
|
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "pywaybackup"
|
|
10
|
-
version = "3.1.0"
|
|
10
|
+
version = "3.2.1"
|
|
11
11
|
description = "Query and download archive.org as simple as possible."
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
@@ -16,9 +16,10 @@ license = { file = "LICENSE" }
|
|
|
16
16
|
readme = "README.md"
|
|
17
17
|
requires-python = ">=3.8"
|
|
18
18
|
dependencies = [
|
|
19
|
-
"pysqlite3-binary==0.5.4",
|
|
20
|
-
"
|
|
21
|
-
"
|
|
19
|
+
"pysqlite3-binary==0.5.4; sys_platform == 'linux'",
|
|
20
|
+
"pysqlite-binary; sys_platform == 'win32'",
|
|
21
|
+
"requests==2.32.3",
|
|
22
|
+
"tqdm==4.67.1",
|
|
22
23
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
23
24
|
"python-magic-bin==0.4.14; sys_platform == 'win32'",
|
|
24
25
|
]
|
|
@@ -6,15 +6,14 @@ import argparse
|
|
|
6
6
|
from importlib.metadata import version
|
|
7
7
|
|
|
8
8
|
from pywaybackup.helper import url_split, sanitize_filename
|
|
9
|
-
from pywaybackup.Exception import Exception as ex
|
|
10
9
|
|
|
11
10
|
class Arguments:
|
|
12
|
-
|
|
11
|
+
|
|
13
12
|
def __init__(self):
|
|
14
|
-
|
|
13
|
+
|
|
15
14
|
parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
|
|
16
15
|
parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version("pywaybackup") + ' by @bitdruid -> https://github.com/bitdruid')
|
|
17
|
-
|
|
16
|
+
|
|
18
17
|
required = parser.add_argument_group('required (one exclusive)')
|
|
19
18
|
required.add_argument('-u', '--url', type=str, metavar="", help='url (with subdir/subdomain) to download')
|
|
20
19
|
exclusive_required = required.add_mutually_exclusive_group(required=True)
|
|
@@ -32,16 +31,17 @@ class Arguments:
|
|
|
32
31
|
optional.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')
|
|
33
32
|
|
|
34
33
|
behavior = parser.add_argument_group('manipulate behavior')
|
|
35
|
-
behavior.add_argument('-o', '--output', type=str, metavar="", help='output
|
|
34
|
+
behavior.add_argument('-o', '--output', type=str, metavar="", help='output for all files - defaults to current directory')
|
|
35
|
+
behavior.add_argument('-m', '--metadata', type=str, metavar="", help='change directory for db/cdx/csv/log files')
|
|
36
36
|
behavior.add_argument('--log', action='store_true', help='save a log file into the output folder')
|
|
37
37
|
behavior.add_argument('--progress', action='store_true', help='show a progress bar')
|
|
38
38
|
behavior.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
|
|
39
|
-
#behavior.add_argument('--verbosity', type=str, default="info", metavar="", help='verbosity level (info, trace)')
|
|
40
39
|
behavior.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
|
|
41
40
|
behavior.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
|
|
42
41
|
# behavior.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
|
|
43
42
|
behavior.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
|
|
44
|
-
|
|
43
|
+
behavior.add_argument('--verbose', action='store_true', help='overwritten by progress - gives detailed output')
|
|
44
|
+
|
|
45
45
|
special = parser.add_argument_group('special')
|
|
46
46
|
special.add_argument('--reset', action='store_true', help='reset the job and ignore existing cdx/db/csv files')
|
|
47
47
|
special.add_argument('--keep', action='store_true', help='keep all files after the job finished')
|
|
@@ -75,11 +75,11 @@ class Configuration:
|
|
|
75
75
|
|
|
76
76
|
if cls.output is None:
|
|
77
77
|
cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
|
|
78
|
+
if cls.metadata is None:
|
|
79
|
+
cls.metadata = cls.output
|
|
78
80
|
os.makedirs(cls.output, exist_ok=True) if not cls.save else None
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
cls.log = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.log")
|
|
82
|
-
|
|
81
|
+
os.makedirs(cls.metadata, exist_ok=True) if not cls.save else None
|
|
82
|
+
|
|
83
83
|
if cls.all:
|
|
84
84
|
cls.mode = "all"
|
|
85
85
|
if cls.last:
|
|
@@ -91,10 +91,13 @@ class Configuration:
|
|
|
91
91
|
|
|
92
92
|
if cls.filetype:
|
|
93
93
|
cls.filetype = [ft.lower().strip() for ft in cls.filetype.split(",")]
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
cls.
|
|
94
|
+
|
|
95
|
+
base_path = cls.metadata
|
|
96
|
+
base_name = f"waybackup_{sanitize_filename(cls.url)}"
|
|
97
|
+
cls.cdxfile = os.path.join(base_path, f"{base_name}.cdx")
|
|
98
|
+
cls.dbfile = os.path.join(base_path, f"{base_name}.db")
|
|
99
|
+
cls.csvfile = os.path.join(base_path, f"{base_name}.csv")
|
|
100
|
+
cls.log = os.path.join(base_path, f"{base_name}.log") if cls.log else None
|
|
98
101
|
|
|
99
102
|
if cls.reset:
|
|
100
103
|
os.remove(cls.cdxfile) if os.path.isfile(cls.cdxfile) else None
|
|
@@ -29,7 +29,7 @@ class Converter:
|
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
@classmethod
|
|
32
|
-
def links(cls, filepath,
|
|
32
|
+
def links(cls, filepath, status_content=None):
|
|
33
33
|
"""
|
|
34
34
|
Convert all links in a HTML / CSS / JS file to local paths.
|
|
35
35
|
"""
|
|
@@ -72,7 +72,7 @@ class Converter:
|
|
|
72
72
|
if original_url.startswith("//"):
|
|
73
73
|
external = True
|
|
74
74
|
if external:
|
|
75
|
-
status_message.trace(status="",
|
|
75
|
+
status_message.trace(status="", info=f"{count}/{len(links)}", content="External url")
|
|
76
76
|
return original_url
|
|
77
77
|
|
|
78
78
|
# convert the url to a relative path to the local root (download dir) if it's a valid path, else return the original url
|
|
@@ -87,7 +87,7 @@ class Converter:
|
|
|
87
87
|
if original_url.startswith("../"): # if file is already ../ check if it's not too many steps up
|
|
88
88
|
original_url = f"{cls.define_root_steps(filepath)}{original_url.split('../')[-1].lstrip('/')}"
|
|
89
89
|
else:
|
|
90
|
-
status_message.trace(status="",
|
|
90
|
+
status_message.trace(status="", info="", content=f"{count}/{len(links)}: URL is not a valid path")
|
|
91
91
|
|
|
92
92
|
return original_url
|
|
93
93
|
|
|
@@ -158,24 +158,24 @@ class Converter:
|
|
|
158
158
|
|
|
159
159
|
if os.path.isfile(filepath):
|
|
160
160
|
if magic.from_file(filepath, mime=True).split("/")[1] == "javascript":
|
|
161
|
-
status_message.trace(status="Error",
|
|
161
|
+
status_message.trace(status="Error", info="", content="JS-file is not supported")
|
|
162
162
|
return
|
|
163
163
|
try:
|
|
164
|
-
with open(filepath, "r") as file:
|
|
164
|
+
with open(filepath, "r", encoding="utf-8") as file:
|
|
165
165
|
domain = config.domain
|
|
166
166
|
content = file.read()
|
|
167
167
|
links = extract_urls(content)
|
|
168
|
-
status_message.store(
|
|
168
|
+
status_message.store(verbose=True, content=f"\n-----> Convert: [{len(links)}] links in file")
|
|
169
169
|
count = 1
|
|
170
170
|
for original_link in links:
|
|
171
|
-
status_message.trace(status="ORIG",
|
|
171
|
+
status_message.trace(status="ORIG", info=f"{count}/{len(links)}", content=original_link)
|
|
172
172
|
new_link = local_url(original_link, domain, count)
|
|
173
173
|
if new_link != original_link:
|
|
174
|
-
status_message.trace(status="CONV",
|
|
174
|
+
status_message.trace(status="CONV", info=f"{count}/{len(links)}", content=new_link)
|
|
175
175
|
content = content.replace(original_link, new_link)
|
|
176
176
|
count += 1
|
|
177
|
-
file = open(filepath, "w")
|
|
177
|
+
file = open(filepath, "w", encoding="utf-8")
|
|
178
178
|
file.write(content)
|
|
179
179
|
file.close()
|
|
180
180
|
except UnicodeDecodeError:
|
|
181
|
-
status_message.trace(status="Error",
|
|
181
|
+
status_message.trace(status="Error", info="", content="Could not decode file to convert links")
|