pywaybackup 1.5.7__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/PKG-INFO +40 -45
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/README.md +39 -44
- pywaybackup-2.0.1/pywaybackup/Arguments.py +96 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/Converter.py +0 -1
- pywaybackup-2.0.1/pywaybackup/SnapshotCollection.py +305 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/Verbosity.py +23 -19
- pywaybackup-2.0.1/pywaybackup/__version__.py +1 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/archive.py +91 -201
- pywaybackup-2.0.1/pywaybackup/db.py +81 -0
- pywaybackup-2.0.1/pywaybackup/main.py +53 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/PKG-INFO +40 -45
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/SOURCES.txt +1 -1
- pywaybackup-1.5.7/pywaybackup/Arguments.py +0 -115
- pywaybackup-1.5.7/pywaybackup/SnapshotCollection copy.py +0 -168
- pywaybackup-1.5.7/pywaybackup/SnapshotCollection.py +0 -123
- pywaybackup-1.5.7/pywaybackup/__version__.py +0 -1
- pywaybackup-1.5.7/pywaybackup/main.py +0 -36
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/LICENSE +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/Exception.py +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/__init__.py +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/helper.py +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/requires.txt +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/setup.cfg +0 -0
- {pywaybackup-1.5.7 → pywaybackup-2.0.1}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.1
|
|
4
4
|
Summary: Download snapshots from the Wayback Machine
|
|
5
5
|
Home-page: https://github.com/bitdruid/python-wayback-machine-downloader
|
|
6
6
|
Author: bitdruid
|
|
@@ -48,7 +48,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
48
48
|
|
|
49
49
|
- Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
|
|
50
50
|
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
|
|
51
|
-
- The tool
|
|
51
|
+
- The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
|
|
52
52
|
|
|
53
53
|
## Arguments
|
|
54
54
|
|
|
@@ -70,8 +70,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
70
70
|
|
|
71
71
|
### Optional query parameters
|
|
72
72
|
|
|
73
|
-
- **`-l`**, **`--list`**:<br>
|
|
74
|
-
Only print the snapshots available within the specified range. Does not download the snapshots.
|
|
75
73
|
- **`-e`**, **`--explicit`**:<br>
|
|
76
74
|
Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
|
|
77
75
|
|
|
@@ -79,7 +77,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
79
77
|
Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice), then you can't filter them.
|
|
80
78
|
|
|
81
79
|
- **`--limit`** `<count>`:<br>
|
|
82
|
-
Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected
|
|
80
|
+
Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
|
|
83
81
|
|
|
84
82
|
- **Range Selection:**<br>
|
|
85
83
|
Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
|
|
@@ -94,26 +92,22 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
|
|
|
94
92
|
### Behavior manipulation
|
|
95
93
|
|
|
96
94
|
- **`-o`**, **`--output`**:<br>
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
- **`--csv`** `<path>`:<br>
|
|
100
|
-
Path defaults to output-dir. Saves a CSV file with the json-response for successfull downloads. If `--list` is set, the CSV contains the CDX list of snapshots. If `--current` or `--full` is set, CSV contains downloaded files. Named as `waybackup_<sanitized_url>.csv`.
|
|
95
|
+
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
101
96
|
|
|
102
|
-
- **`--
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
- **`--no-redirect`**:<br>
|
|
106
|
-
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
|
|
107
|
-
|
|
108
|
-
- **`--verbosity`** `<level>`:<br>
|
|
109
|
-
Sets verbosity level. Options are `json` (prints JSON response) or `progress` (shows progress bar).
|
|
110
|
-
<!-- Alternatively set verbosity level to `trace` for a very detailed output. -->
|
|
97
|
+
<!-- - **`--verbosity`** `<level>`:<br>
|
|
98
|
+
Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
|
|
111
99
|
|
|
112
|
-
- **`--log`** `<path
|
|
113
|
-
|
|
100
|
+
- **`--log`** <!-- `<path>` -->:<br>
|
|
101
|
+
Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
|
|
102
|
+
|
|
103
|
+
- **`--progress`**:<br>
|
|
104
|
+
Shows a progress bar instead of the default output.
|
|
114
105
|
|
|
115
106
|
- **`--workers`** `<count>`:<br>
|
|
116
107
|
Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
|
|
108
|
+
|
|
109
|
+
- **`--no-redirect`**:<br>
|
|
110
|
+
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
|
|
117
111
|
|
|
118
112
|
- **`--retry`** `<attempts>`:<br>
|
|
119
113
|
Specifies number of retry attempts for failed downloads.
|
|
@@ -124,39 +118,39 @@ Specifies delay between download requests in seconds. Default is no delay (0).
|
|
|
124
118
|
<!-- - **`--convert-links`**:<br>
|
|
125
119
|
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
|
|
126
120
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
- **`--cdxinject`** `<path>`:<br>
|
|
132
|
-
Path defaults to output-dir. Injects a CDX query file to download snapshots. Ensure the query matches the previous `--url` for correct folder structure. Named as `waybackup_<sanitized_url>.cdx`.
|
|
121
|
+
## Special:
|
|
122
|
+
|
|
123
|
+
- **`--reset`**:
|
|
124
|
+
If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
|
|
133
125
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.
|
|
126
|
+
- **`--keep`**:
|
|
127
|
+
If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
|
|
137
128
|
|
|
138
129
|
### Examples
|
|
139
130
|
|
|
140
|
-
Download latest snapshot of all files:<br>
|
|
131
|
+
Download the latest snapshot of all available files:<br>
|
|
141
132
|
`waybackup -u http://example.com -c`
|
|
142
133
|
|
|
143
|
-
Download latest snapshot of a specific file:<br>
|
|
144
|
-
`waybackup -u http://example.com/
|
|
134
|
+
Download the latest snapshot of a specific file (e.g., a login page):<br>
|
|
135
|
+
`waybackup -u http://example.com/login.html -c --explicit`
|
|
145
136
|
|
|
146
|
-
Download all snapshots
|
|
137
|
+
Download all snapshots within the last 5 years and prevent redirects:<br>
|
|
147
138
|
`waybackup -u http://example.com -f -r 5 --no-redirect`
|
|
148
139
|
|
|
149
|
-
Download all snapshots
|
|
140
|
+
Download all snapshots from a specific range (2020 to December 12, 2022) with 4 workers, and show a progress bar:<br>
|
|
141
|
+
`waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --progress`
|
|
142
|
+
|
|
143
|
+
Download all snapshots and save the output in a specific folder with 3 workers:<br>
|
|
150
144
|
`waybackup -u http://example.com -f -r 5 -o /home/user/Downloads/snapshots --workers 3`
|
|
151
145
|
|
|
152
|
-
Download all snapshots
|
|
153
|
-
`waybackup -u http://example.com -f --
|
|
146
|
+
Download all snapshots but only images and CSS files, filtering for specific filetypes (jpg, css):<br>
|
|
147
|
+
`waybackup -u http://example.com -f --filetype jpg,css`
|
|
154
148
|
|
|
155
|
-
Download all
|
|
156
|
-
`waybackup -u http://example.com -f --
|
|
149
|
+
Download all timestamps but start over and ignore existing progress, log the output, and retry 3 times if any error occurs:<br>
|
|
150
|
+
`waybackup -u http://example.com -f --log --retry 3 --reset`
|
|
157
151
|
|
|
158
|
-
|
|
159
|
-
`waybackup -u http://example.com -
|
|
152
|
+
Download the latest snapshot, follow no redirects but keep the database and cdx-file:<br>
|
|
153
|
+
`waybackup -u http://example.com -c --no-redirect --keep`
|
|
160
154
|
|
|
161
155
|
## Output path structure
|
|
162
156
|
|
|
@@ -195,8 +189,9 @@ your/path/waybackup_snapshots/
|
|
|
195
189
|
...
|
|
196
190
|
```
|
|
197
191
|
|
|
192
|
+
## CSV Output
|
|
198
193
|
|
|
199
|
-
|
|
194
|
+
Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
|
|
200
195
|
|
|
201
196
|
For download queries:
|
|
202
197
|
|
|
@@ -232,14 +227,14 @@ For list queries:
|
|
|
232
227
|
]
|
|
233
228
|
```
|
|
234
229
|
|
|
235
|
-
## CSV Output
|
|
236
|
-
|
|
237
|
-
The csv contains the json response in a table format.
|
|
238
|
-
|
|
239
230
|
### Debugging
|
|
240
231
|
|
|
241
232
|
Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
|
|
242
233
|
|
|
234
|
+
### Known ToDos
|
|
235
|
+
|
|
236
|
+
- [ ] Currently there is no logic to handle the case where both an http and an https version of a page are available.
|
|
237
|
+
|
|
243
238
|
## Contributing
|
|
244
239
|
|
|
245
240
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
@@ -32,7 +32,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
32
32
|
|
|
33
33
|
- Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
|
|
34
34
|
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
|
|
35
|
-
- The tool
|
|
35
|
+
- The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
|
|
36
36
|
|
|
37
37
|
## Arguments
|
|
38
38
|
|
|
@@ -54,8 +54,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
54
54
|
|
|
55
55
|
### Optional query parameters
|
|
56
56
|
|
|
57
|
-
- **`-l`**, **`--list`**:<br>
|
|
58
|
-
Only print the snapshots available within the specified range. Does not download the snapshots.
|
|
59
57
|
- **`-e`**, **`--explicit`**:<br>
|
|
60
58
|
Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
|
|
61
59
|
|
|
@@ -63,7 +61,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
63
61
|
Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice), then you can't filter them.
|
|
64
62
|
|
|
65
63
|
- **`--limit`** `<count>`:<br>
|
|
66
|
-
Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected
|
|
64
|
+
Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
|
|
67
65
|
|
|
68
66
|
- **Range Selection:**<br>
|
|
69
67
|
Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
|
|
@@ -78,26 +76,22 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
|
|
|
78
76
|
### Behavior manipulation
|
|
79
77
|
|
|
80
78
|
- **`-o`**, **`--output`**:<br>
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
- **`--csv`** `<path>`:<br>
|
|
84
|
-
Path defaults to output-dir. Saves a CSV file with the json-response for successfull downloads. If `--list` is set, the CSV contains the CDX list of snapshots. If `--current` or `--full` is set, CSV contains downloaded files. Named as `waybackup_<sanitized_url>.csv`.
|
|
79
|
+
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
85
80
|
|
|
86
|
-
- **`--
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
- **`--no-redirect`**:<br>
|
|
90
|
-
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
|
|
91
|
-
|
|
92
|
-
- **`--verbosity`** `<level>`:<br>
|
|
93
|
-
Sets verbosity level. Options are `json` (prints JSON response) or `progress` (shows progress bar).
|
|
94
|
-
<!-- Alternatively set verbosity level to `trace` for a very detailed output. -->
|
|
81
|
+
<!-- - **`--verbosity`** `<level>`:<br>
|
|
82
|
+
Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
|
|
95
83
|
|
|
96
|
-
- **`--log`** `<path
|
|
97
|
-
|
|
84
|
+
- **`--log`** <!-- `<path>` -->:<br>
|
|
85
|
+
Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
|
|
86
|
+
|
|
87
|
+
- **`--progress`**:<br>
|
|
88
|
+
Shows a progress bar instead of the default output.
|
|
98
89
|
|
|
99
90
|
- **`--workers`** `<count>`:<br>
|
|
100
91
|
Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
|
|
92
|
+
|
|
93
|
+
- **`--no-redirect`**:<br>
|
|
94
|
+
Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
|
|
101
95
|
|
|
102
96
|
- **`--retry`** `<attempts>`:<br>
|
|
103
97
|
Specifies number of retry attempts for failed downloads.
|
|
@@ -108,39 +102,39 @@ Specifies delay between download requests in seconds. Default is no delay (0).
|
|
|
108
102
|
<!-- - **`--convert-links`**:<br>
|
|
109
103
|
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
|
|
110
104
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
- **`--cdxinject`** `<path>`:<br>
|
|
116
|
-
Path defaults to output-dir. Injects a CDX query file to download snapshots. Ensure the query matches the previous `--url` for correct folder structure. Named as `waybackup_<sanitized_url>.cdx`.
|
|
105
|
+
## Special:
|
|
106
|
+
|
|
107
|
+
- **`--reset`**:
|
|
108
|
+
If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
|
|
117
109
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.
|
|
110
|
+
- **`--keep`**:
|
|
111
|
+
If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
|
|
121
112
|
|
|
122
113
|
### Examples
|
|
123
114
|
|
|
124
|
-
Download latest snapshot of all files:<br>
|
|
115
|
+
Download the latest snapshot of all available files:<br>
|
|
125
116
|
`waybackup -u http://example.com -c`
|
|
126
117
|
|
|
127
|
-
Download latest snapshot of a specific file:<br>
|
|
128
|
-
`waybackup -u http://example.com/
|
|
118
|
+
Download the latest snapshot of a specific file (e.g., a login page):<br>
|
|
119
|
+
`waybackup -u http://example.com/login.html -c --explicit`
|
|
129
120
|
|
|
130
|
-
Download all snapshots
|
|
121
|
+
Download all snapshots within the last 5 years and prevent redirects:<br>
|
|
131
122
|
`waybackup -u http://example.com -f -r 5 --no-redirect`
|
|
132
123
|
|
|
133
|
-
Download all snapshots
|
|
124
|
+
Download all snapshots from a specific range (2020 to December 12, 2022) with 4 workers, and show a progress bar:<br>
|
|
125
|
+
`waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --progress`
|
|
126
|
+
|
|
127
|
+
Download all snapshots and save the output in a specific folder with 3 workers:<br>
|
|
134
128
|
`waybackup -u http://example.com -f -r 5 -o /home/user/Downloads/snapshots --workers 3`
|
|
135
129
|
|
|
136
|
-
Download all snapshots
|
|
137
|
-
`waybackup -u http://example.com -f --
|
|
130
|
+
Download all snapshots but only images and CSS files, filtering for specific filetypes (jpg, css):<br>
|
|
131
|
+
`waybackup -u http://example.com -f --filetype jpg,css`
|
|
138
132
|
|
|
139
|
-
Download all
|
|
140
|
-
`waybackup -u http://example.com -f --
|
|
133
|
+
Download all timestamps but start over and ignore existing progress, log the output, and retry 3 times if any error occurs:<br>
|
|
134
|
+
`waybackup -u http://example.com -f --log --retry 3 --reset`
|
|
141
135
|
|
|
142
|
-
|
|
143
|
-
`waybackup -u http://example.com -
|
|
136
|
+
Download the latest snapshot, follow no redirects but keep the database and cdx-file:<br>
|
|
137
|
+
`waybackup -u http://example.com -c --no-redirect --keep`
|
|
144
138
|
|
|
145
139
|
## Output path structure
|
|
146
140
|
|
|
@@ -179,8 +173,9 @@ your/path/waybackup_snapshots/
|
|
|
179
173
|
...
|
|
180
174
|
```
|
|
181
175
|
|
|
176
|
+
## CSV Output
|
|
182
177
|
|
|
183
|
-
|
|
178
|
+
Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
|
|
184
179
|
|
|
185
180
|
For download queries:
|
|
186
181
|
|
|
@@ -216,14 +211,14 @@ For list queries:
|
|
|
216
211
|
]
|
|
217
212
|
```
|
|
218
213
|
|
|
219
|
-
## CSV Output
|
|
220
|
-
|
|
221
|
-
The csv contains the json response in a table format.
|
|
222
|
-
|
|
223
214
|
### Debugging
|
|
224
215
|
|
|
225
216
|
Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
|
|
226
217
|
|
|
218
|
+
### Known ToDos
|
|
219
|
+
|
|
220
|
+
- [ ] Currently there is no logic to handle the case where both an http and an https version of a page are available.
|
|
221
|
+
|
|
227
222
|
## Contributing
|
|
228
223
|
|
|
229
224
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
import argparse
|
|
5
|
+
|
|
6
|
+
from pywaybackup.helper import url_split, sanitize_filename
|
|
7
|
+
|
|
8
|
+
from pywaybackup.__version__ import __version__
|
|
9
|
+
|
|
10
|
+
class Arguments:
    """Parse the command line and keep the resulting ``argparse.Namespace``.

    Besides the declared options, the namespace gets one extra attribute,
    ``query_identifier``: a string built from the URL plus the values of the
    mode flags and the optional query parameters.
    """

    def __init__(self):
        """Build the parser, parse ``sys.argv`` and store the namespace in ``self.args``.

        When no command-line arguments are given at all, ``--help`` is parsed
        instead, so argparse prints the help text and exits.
        """
        parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
        parser.add_argument('-a', '--about', action='version', version='%(prog)s ' + __version__ + ' by @bitdruid -> https://github.com/bitdruid')

        required = parser.add_argument_group('required (one exclusive)')
        # NOTE(review): --url is not marked required, so it is None when omitted - confirm callers handle that.
        required.add_argument('-u', '--url', type=str, metavar="", help='url (with subdir/subdomain) to download')
        exclusive_required = required.add_mutually_exclusive_group(required=True)
        exclusive_required.add_argument('-c', '--current', action='store_true', help='download the latest version of each file snapshot')
        exclusive_required.add_argument('-f', '--full', action='store_true', help='download snapshots of all timestamps')
        exclusive_required.add_argument('-s', '--save', action='store_true', help='save a page to the wayback machine')

        optional = parser.add_argument_group('optional query parameters')
        optional.add_argument('-e', '--explicit', action='store_true', help='search only for the explicit given url')
        optional.add_argument('-r', '--range', type=int, metavar="", help='range in years to search')
        optional.add_argument('--start', type=int, metavar="", help='start timestamp format: YYYYMMDDhhmmss')
        optional.add_argument('--end', type=int, metavar="", help='end timestamp format: YYYYMMDDhhmmss')
        optional.add_argument('--filetype', type=str, metavar="", help='filetypes to download comma separated (e.g. "html,css")')
        # NOTE(review): with nargs='?' and const=True a bare `--limit` stores True, not an int - confirm this is intended.
        optional.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')

        behavior = parser.add_argument_group('manipulate behavior')
        # The help text names the real default: Configuration.init() falls back to
        # a "waybackup_snapshots" folder inside the current directory.
        behavior.add_argument('-o', '--output', type=str, metavar="", help='output folder - defaults to "waybackup_snapshots" in the current directory')
        behavior.add_argument('--log', action='store_true', help='save a log file into the output folder')
        behavior.add_argument('--progress', action='store_true', help='show a progress bar')
        behavior.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
        #behavior.add_argument('--verbosity', type=str, default="info", metavar="", help='verbosity level (info, trace)')
        behavior.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
        behavior.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
        # behavior.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
        behavior.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')

        special = parser.add_argument_group('special')
        special.add_argument('--reset', action='store_true', help='reset the job and ignore existing cdx/db/csv files')
        special.add_argument('--keep', action='store_true', help='keep all files after the job finished')

        args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])  # if no arguments are given, print help

        # `_group_actions` is a private argparse attribute; it lists the actions
        # registered on each group, which gives the option names to snapshot here.
        required_args = {action.dest: getattr(args, action.dest) for action in exclusive_required._group_actions}
        optional_args = {action.dest: getattr(args, action.dest) for action in optional._group_actions}
        args.query_identifier = str(args.url) + str(required_args) + str(optional_args)

        # if args.convert_links and not args.current:
        #     parser.error("--convert-links can only be used with the -c/--current option")

        self.args = args

    def get_args(self):
        """Return the parsed ``argparse.Namespace``."""
        return self.args
|
|
60
|
+
|
|
61
|
+
class Configuration:
    """Class-level settings for the current run, filled in by :meth:`init`.

    Nothing is stored on instances: every parsed command-line option, plus the
    derived values computed in ``init`` (paths, mode, split URL parts), is set
    as a class attribute.
    """

    @classmethod
    def init(cls):
        """Parse the command line and populate the class attributes.

        Side effects: creates the output directory if it does not exist and,
        when ``--reset`` was given, deletes an existing cdx, db and csv file
        of this job.
        """
        cls.args = Arguments().get_args()
        for key, value in vars(cls.args).items():
            setattr(Configuration, key, value)

        # args now attributes of Configuration // Configuration.output, ...
        cls.command = ' '.join(sys.argv[1:])
        cls.domain, cls.subdir, cls.filename = url_split(cls.url)

        if cls.output is None:
            cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
        os.makedirs(cls.output, exist_ok=True)

        # All job files share one base name derived from the URL; compute it once.
        job_name = f"waybackup_{sanitize_filename(cls.url)}"

        # --log is a plain flag; when set, replace it with the log file path.
        if cls.log is True:
            cls.log = os.path.join(cls.output, f"{job_name}.log")

        # Always define `mode`, so that a --save run (neither --full nor
        # --current) reads None instead of raising AttributeError.
        cls.mode = None
        if cls.full:
            cls.mode = "full"
        if cls.current:
            cls.mode = "current"

        if cls.filetype:
            cls.filetype = [ft.lower().strip() for ft in cls.filetype.split(",")]

        cls.cdxfile = os.path.join(cls.output, f"{job_name}.cdx")
        cls.dbfile = os.path.join(cls.output, f"{job_name}.db")
        cls.csvfile = os.path.join(cls.output, f"{job_name}.csv")

        if cls.reset:
            # Only regular files are removed; missing paths are skipped.
            for stale_file in (cls.cdxfile, cls.dbfile, cls.csvfile):
                if os.path.isfile(stale_file):
                    os.remove(stale_file)
|