pywaybackup 3.4.1__tar.gz → 4.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {pywaybackup-3.4.1/pywaybackup.egg-info → pywaybackup-4.1.0}/PKG-INFO +73 -43
  2. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/README.md +71 -40
  3. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pyproject.toml +2 -3
  4. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Arguments.py +3 -2
  5. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Exception.py +7 -7
  6. pywaybackup-4.1.0/pywaybackup/PyWayBackup.py +499 -0
  7. pywaybackup-4.1.0/pywaybackup/Snapshot.py +206 -0
  8. pywaybackup-4.1.0/pywaybackup/SnapshotCollection.py +340 -0
  9. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Verbosity.py +7 -7
  10. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/Worker.py +11 -74
  11. pywaybackup-4.1.0/pywaybackup/archive_download.py +389 -0
  12. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/archive_save.py +7 -22
  13. pywaybackup-4.1.0/pywaybackup/db.py +197 -0
  14. pywaybackup-4.1.0/pywaybackup/files.py +191 -0
  15. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/main.py +2 -2
  16. {pywaybackup-3.4.1 → pywaybackup-4.1.0/pywaybackup.egg-info}/PKG-INFO +73 -43
  17. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/SOURCES.txt +2 -1
  18. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/requires.txt +1 -2
  19. pywaybackup-3.4.1/pywaybackup/Converter.py +0 -181
  20. pywaybackup-3.4.1/pywaybackup/PyWayBackup.py +0 -234
  21. pywaybackup-3.4.1/pywaybackup/SnapshotCollection.py +0 -416
  22. pywaybackup-3.4.1/pywaybackup/archive_download.py +0 -358
  23. pywaybackup-3.4.1/pywaybackup/db.py +0 -94
  24. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/LICENSE +0 -0
  25. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/__init__.py +0 -0
  26. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup/helper.py +0 -0
  27. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
  28. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/entry_points.txt +0 -0
  29. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/pywaybackup.egg-info/top_level.txt +0 -0
  30. {pywaybackup-3.4.1 → pywaybackup-4.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pywaybackup
3
- Version: 3.4.1
3
+ Version: 4.1.0
4
4
  Summary: Query and download archive.org as simple as possible.
5
5
  Author-email: bitdruid <bitdruid@outlook.com>
6
6
  License: MIT License
@@ -29,8 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
29
29
  Requires-Python: >=3.8
30
30
  Description-Content-Type: text/markdown
31
31
  License-File: LICENSE
32
- Requires-Dist: pysqlite3-binary==0.5.4; sys_platform == "linux"
33
- Requires-Dist: pysqlite-binary; sys_platform == "win32"
32
+ Requires-Dist: SQLAlchemy==2.0.43
34
33
  Requires-Dist: requests==2.32.3
35
34
  Requires-Dist: tqdm==4.67.1
36
35
  Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
@@ -49,6 +48,17 @@ Internet-archive is a nice source for several OSINT-information. This tool is a
49
48
 
50
49
  This tool allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
51
50
 
51
+ # Content
52
+
53
+ ➡️ [Installation](#installation) <br>
54
+ ➡️ [notes / issues / hints](#notes--issues--hints) <br>
55
+ ➡️ [import](#import) <br>
56
+ ➡️ [cli](#cli) <br>
57
+ ➡️ [Usage](#usage) <br>
58
+ ➡️ [Examples](#examples) <br>
59
+ ➡️ [Output](#output) <br>
60
+ ➡️ [Contributing](#contributing) <br>
61
+
52
62
  ## Installation
53
63
 
54
64
  ### Pip
@@ -81,8 +91,14 @@ This tool allows you to download content from the Wayback Machine (archive.org).
81
91
  You can import pywaybackup into your own scripts and run it. Args are the same as cli.
82
92
 
83
93
  Additional args:
84
- - `silent` (default True): If True, suppresses all output to the console.
85
- - `debug` (default False): If True, disables writing errors to the error log file.
94
+ - `silent` (default False): If True, suppresses all output to the console.
95
+ - `debug` (default True): If False, disables writing errors to the error log file.
96
+
97
+ Use:
98
+ - `run()`
99
+ - `status()`
100
+ - `paths()`
101
+ - `stop()`
86
102
 
87
103
  ```python
88
104
  from pywaybackup import PyWayBackup
@@ -114,6 +130,29 @@ output:
114
130
  }
115
131
  ```
116
132
 
133
+ ... or run it asynchronously and print the current status or stop it whenever needed.
134
+
135
+ ```python
136
+ import time
137
+ from pywaybackup import PyWayBackup
138
+
139
+ backup = PyWayBackup( ... )
140
+ backup.run(daemon=True)
141
+ print(backup.status())
142
+ time.sleep(10)
143
+ print(backup.status())
144
+ backup.stop()
145
+ ```
146
+ output:
147
+ ```bash
148
+ {
149
+ 'task': 'downloading snapshots',
150
+ 'current': 15,
151
+ 'total': 84,
152
+ 'progress': '18%'
153
+ }
154
+ ```
155
+
117
156
  ## cli
118
157
 
119
158
  - `-h`, `--help`: Show the help message and exit.
@@ -127,25 +166,24 @@ output:
127
166
  #### Mode Selection (Choose One)
128
167
 
129
168
  - **`-a`**, **`--all`**:<br>
130
- Download snapshots of all timestamps. You will get a folder per timestamp with the files available at that time.
169
+ All timestamps. Gives one folder per timestamp.
131
170
  - **`-l`**, **`--last`**:<br>
132
- Download the last version of each file snapshot. You will get one directory with a rebuild of the page. It contains the last version of each file of your specified `--range`.
171
+ Last Version. Gives one folder containing the last version of each file of specified `--range`.
133
172
  - **`-f`**, **`--first`**:<br>
134
- Download the first version of each file snapshot. You will get one directory with a rebuild of the page. It contains the first version of each file of your specified `--range`.
135
- - **`-s`**, **`--save`**:<br>
136
- Save a page to the Wayback Machine. (beta)
173
+ First Version. Gives one folder containing the first version of each file of specified `--range`.
137
174
 
138
175
  #### Optional query parameters
139
176
 
177
+ Parameters for archive.org CDX query. No effect on snapshot download itself.
178
+
140
179
  - **`-e`**, **`--explicit`**:<br>
141
- Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
180
+ Only the explicit URL. No wildcard subdomains or paths. For example get: root-only (`https://example.com`) or specific file (`login.html`, `?query=this`).
142
181
 
143
182
  - **`--limit`** `<count>`:<br>
144
- Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
183
+ Limits the snapshots fetched from archive.org CDX. (Will have no effect on existing CDX files)
145
184
 
146
185
  - **Range Selection:**<br>
147
- Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range`, the `start` and `end` will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
148
- (year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
186
+ Set the query range in years (`range`) or a timestamp (`start` and/or `end`). If `range` is given, `start` and `end` are ignored. Format for timestamps: YYYYMMDDhhmmss. A timestamp can be as specific as needed (year 2019, year+month+day 20190101, ...).
149
187
 
150
188
  - **`-r`**, **`--range`**:<br>
151
189
  Specify the range in years for which to search and download snapshots.
@@ -155,57 +193,56 @@ output:
155
193
  Timestamp to end searching.
156
194
 
157
195
  - **Filtering:**<br>
158
- A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter.
159
196
 
160
197
  - **`--filetype`** `<filetype>`:<br>
161
- Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice) then you cant filter them.
198
+ Specify filetypes to download. Example: `--filetype jpg,css,js`. You can only filter filetypes which are stored by archive.org (.html mostly not)
162
199
 
163
200
  - **`--statuscode`** `<statuscode>`:<br>
164
- Specify HTTP status codes to download. Default is all statuscodes. Separate multiple status codes with a comma. Example: `--statuscode 200,301`. Pywaybackup will try to download any snapshot regardless of it's statuscode. For 404 of course this means logged errors and corresponding entries in the csv. However, you may want to get a csv that includes these negative attempts for your needs.<br>
201
+ Specify HTTP status codes to download. Example: `--statuscode 200,301`. PyWayBackup will always skip `404` and `301`.<br>
165
202
  Common status codes you may want to handle/filter:
166
203
  - `200` (OK)
167
- - `301` (Moved Permanently - will redirect snapshot)
204
+ - `301` (Moved Permanently)
168
205
  - `404` (Not Found - snapshot seems to be empty)
169
206
  - `500` (Internal Server Error - snapshot is at least for now not available)
170
207
 
171
- ### Optional
208
+ #### Optional Behavior Manipulation
172
209
 
173
- #### Behavior Manipulation
210
+ Parameters will change the download behavior for snapshots.
174
211
 
175
212
  - **`-o`**, **`--output`**:<br>
176
213
  Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
177
214
 
178
215
  - **`-m`**, **`--metadata`**<br>
179
- Change the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Especially if you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
216
+ Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
180
217
 
181
218
  - **`--verbose`**:<br>
182
219
  Increase output verbosity.
183
220
 
184
221
  - **`--log`** <!-- `<path>` -->:<br>
185
- Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
222
+ Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
186
223
 
187
224
  - **`--progress`**:<br>
188
225
  Shows a progress bar instead of the default output.
189
226
 
190
227
  - **`--workers`** `<count>`:<br>
191
- Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
228
+ Number of simultaneous download workers. Default is 1, safe range is about 10. Too many workers may lead to refused connections by archive.org.
192
229
 
193
230
  - **`--no-redirect`**:<br>
194
- Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
231
+ Disables following redirects of snapshots. Can prevent timestamp-folder mismatches caused by redirects.
195
232
 
196
233
  - **`--retry`** `<attempts>`:<br>
197
- Specifies number of retry attempts for failed downloads.
234
+ Retry attempts for failed downloads.
198
235
 
199
236
  - **`--delay`** `<seconds>`:<br>
200
- Specifies delay between download requests in seconds. Default is no delay (0).
237
+ Delay between download requests in seconds. Default is no delay (0).
201
238
 
202
239
  #### Job Handling:
203
240
 
204
241
  - **`--reset`**:
205
- If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
242
+ If set, the job will be reset, and `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch.
206
243
 
207
244
  - **`--keep`**:
208
- If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
245
+ If set, `cdx` and `db` files will be kept after the job is finished. Otherwise they will be deleted.
209
246
 
210
247
  <br>
211
248
  <br>
@@ -216,23 +253,11 @@ output:
216
253
 
217
254
  `pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
218
255
 
219
- - Detects existing `.cdx` and `.db` files in an `output dir` to resume downloading from the last successful point.
220
- - Compares `URL`, `mode`, and `optional query parameters` to ensure automatic resumption.
221
- - Skips previously downloaded files to save time.
256
+ Only resumes queries if:
257
+ - `.cdx` and `.db` files exist in the `output dir`
258
+ - command is identical by `URL`, `mode`, and `optional query parameters`
222
259
  > **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
223
260
 
224
- #### Resetting a Job (`--reset`)
225
-
226
- - Deletes `.cdx` and `.db` files and restarts the process from scratch.
227
- - Does **not** remove already downloaded files.
228
- - `waybackup -u https://example.com -a --reset`
229
-
230
- #### Keeping Job Data (`--keep`)
231
-
232
- - Normally, `.cdx` and `.db` files are deleted after a successful job.
233
- - `--keep` preserves them for future re-analysis or extending the query.
234
- - `waybackup -u https://example.com -a --keep`
235
-
236
261
  <br>
237
262
  <br>
238
263
 
@@ -338,6 +363,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
338
363
  <br>
339
364
  <br>
340
365
 
366
+ ## Future ideas (long run)
367
+
368
+ - More module functionality
369
+ - Docker UI
370
+
341
371
  ## Contributing
342
372
 
343
373
  I'm always happy for some feature requests to improve the usability of this tool.
@@ -11,6 +11,17 @@ Internet-archive is a nice source for several OSINT-information. This tool is a
11
11
 
12
12
  This tool allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
13
13
 
14
+ # Content
15
+
16
+ ➡️ [Installation](#installation) <br>
17
+ ➡️ [notes / issues / hints](#notes--issues--hints) <br>
18
+ ➡️ [import](#import) <br>
19
+ ➡️ [cli](#cli) <br>
20
+ ➡️ [Usage](#usage) <br>
21
+ ➡️ [Examples](#examples) <br>
22
+ ➡️ [Output](#output) <br>
23
+ ➡️ [Contributing](#contributing) <br>
24
+
14
25
  ## Installation
15
26
 
16
27
  ### Pip
@@ -43,8 +54,14 @@ This tool allows you to download content from the Wayback Machine (archive.org).
43
54
  You can import pywaybackup into your own scripts and run it. Args are the same as cli.
44
55
 
45
56
  Additional args:
46
- - `silent` (default True): If True, suppresses all output to the console.
47
- - `debug` (default False): If True, disables writing errors to the error log file.
57
+ - `silent` (default False): If True, suppresses all output to the console.
58
+ - `debug` (default True): If False, disables writing errors to the error log file.
59
+
60
+ Use:
61
+ - `run()`
62
+ - `status()`
63
+ - `paths()`
64
+ - `stop()`
48
65
 
49
66
  ```python
50
67
  from pywaybackup import PyWayBackup
@@ -76,6 +93,29 @@ output:
76
93
  }
77
94
  ```
78
95
 
96
+ ... or run it asynchronously and print the current status or stop it whenever needed.
97
+
98
+ ```python
99
+ import time
100
+ from pywaybackup import PyWayBackup
101
+
102
+ backup = PyWayBackup( ... )
103
+ backup.run(daemon=True)
104
+ print(backup.status())
105
+ time.sleep(10)
106
+ print(backup.status())
107
+ backup.stop()
108
+ ```
109
+ output:
110
+ ```bash
111
+ {
112
+ 'task': 'downloading snapshots',
113
+ 'current': 15,
114
+ 'total': 84,
115
+ 'progress': '18%'
116
+ }
117
+ ```
118
+
79
119
  ## cli
80
120
 
81
121
  - `-h`, `--help`: Show the help message and exit.
@@ -89,25 +129,24 @@ output:
89
129
  #### Mode Selection (Choose One)
90
130
 
91
131
  - **`-a`**, **`--all`**:<br>
92
- Download snapshots of all timestamps. You will get a folder per timestamp with the files available at that time.
132
+ All timestamps. Gives one folder per timestamp.
93
133
  - **`-l`**, **`--last`**:<br>
94
- Download the last version of each file snapshot. You will get one directory with a rebuild of the page. It contains the last version of each file of your specified `--range`.
134
+ Last Version. Gives one folder containing the last version of each file of specified `--range`.
95
135
  - **`-f`**, **`--first`**:<br>
96
- Download the first version of each file snapshot. You will get one directory with a rebuild of the page. It contains the first version of each file of your specified `--range`.
97
- - **`-s`**, **`--save`**:<br>
98
- Save a page to the Wayback Machine. (beta)
136
+ First Version. Gives one folder containing the first version of each file of specified `--range`.
99
137
 
100
138
  #### Optional query parameters
101
139
 
140
+ Parameters for archive.org CDX query. No effect on snapshot download itself.
141
+
102
142
  - **`-e`**, **`--explicit`**:<br>
103
- Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
143
+ Only the explicit URL. No wildcard subdomains or paths. For example get: root-only (`https://example.com`) or specific file (`login.html`, `?query=this`).
104
144
 
105
145
  - **`--limit`** `<count>`:<br>
106
- Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
146
+ Limits the snapshots fetched from archive.org CDX. (Will have no effect on existing CDX files)
107
147
 
108
148
  - **Range Selection:**<br>
109
- Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range`, the `start` and `end` will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
110
- (year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
149
+ Set the query range in years (`range`) or a timestamp (`start` and/or `end`). If `range` is given, `start` and `end` are ignored. Format for timestamps: YYYYMMDDhhmmss. A timestamp can be as specific as needed (year 2019, year+month+day 20190101, ...).
111
150
 
112
151
  - **`-r`**, **`--range`**:<br>
113
152
  Specify the range in years for which to search and download snapshots.
@@ -117,57 +156,56 @@ output:
117
156
  Timestamp to end searching.
118
157
 
119
158
  - **Filtering:**<br>
120
- A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter.
121
159
 
122
160
  - **`--filetype`** `<filetype>`:<br>
123
- Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice) then you cant filter them.
161
+ Specify filetypes to download. Example: `--filetype jpg,css,js`. You can only filter filetypes which are stored by archive.org (.html mostly not)
124
162
 
125
163
  - **`--statuscode`** `<statuscode>`:<br>
126
- Specify HTTP status codes to download. Default is all statuscodes. Separate multiple status codes with a comma. Example: `--statuscode 200,301`. Pywaybackup will try to download any snapshot regardless of it's statuscode. For 404 of course this means logged errors and corresponding entries in the csv. However, you may want to get a csv that includes these negative attempts for your needs.<br>
164
+ Specify HTTP status codes to download. Example: `--statuscode 200,301`. PyWayBackup will always skip `404` and `301`.<br>
127
165
  Common status codes you may want to handle/filter:
128
166
  - `200` (OK)
129
- - `301` (Moved Permanently - will redirect snapshot)
167
+ - `301` (Moved Permanently)
130
168
  - `404` (Not Found - snapshot seems to be empty)
131
169
  - `500` (Internal Server Error - snapshot is at least for now not available)
132
170
 
133
- ### Optional
171
+ #### Optional Behavior Manipulation
134
172
 
135
- #### Behavior Manipulation
173
+ Parameters will change the download behavior for snapshots.
136
174
 
137
175
  - **`-o`**, **`--output`**:<br>
138
176
  Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
139
177
 
140
178
  - **`-m`**, **`--metadata`**<br>
141
- Change the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Especially if you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
179
+ Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
142
180
 
143
181
  - **`--verbose`**:<br>
144
182
  Increase output verbosity.
145
183
 
146
184
  - **`--log`** <!-- `<path>` -->:<br>
147
- Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
185
+ Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
148
186
 
149
187
  - **`--progress`**:<br>
150
188
  Shows a progress bar instead of the default output.
151
189
 
152
190
  - **`--workers`** `<count>`:<br>
153
- Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
191
+ Number of simultaneous download workers. Default is 1, safe range is about 10. Too many workers may lead to refused connections by archive.org.
154
192
 
155
193
  - **`--no-redirect`**:<br>
156
- Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
194
+ Disables following redirects of snapshots. Can prevent timestamp-folder mismatches caused by redirects.
157
195
 
158
196
  - **`--retry`** `<attempts>`:<br>
159
- Specifies number of retry attempts for failed downloads.
197
+ Retry attempts for failed downloads.
160
198
 
161
199
  - **`--delay`** `<seconds>`:<br>
162
- Specifies delay between download requests in seconds. Default is no delay (0).
200
+ Delay between download requests in seconds. Default is no delay (0).
163
201
 
164
202
  #### Job Handling:
165
203
 
166
204
  - **`--reset`**:
167
- If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
205
+ If set, the job will be reset, and `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch.
168
206
 
169
207
  - **`--keep`**:
170
- If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
208
+ If set, `cdx` and `db` files will be kept after the job is finished. Otherwise they will be deleted.
171
209
 
172
210
  <br>
173
211
  <br>
@@ -178,23 +216,11 @@ output:
178
216
 
179
217
  `pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
180
218
 
181
- - Detects existing `.cdx` and `.db` files in an `output dir` to resume downloading from the last successful point.
182
- - Compares `URL`, `mode`, and `optional query parameters` to ensure automatic resumption.
183
- - Skips previously downloaded files to save time.
219
+ Only resumes queries if:
220
+ - `.cdx` and `.db` files exist in the `output dir`
221
+ - command is identical by `URL`, `mode`, and `optional query parameters`
184
222
  > **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
185
223
 
186
- #### Resetting a Job (`--reset`)
187
-
188
- - Deletes `.cdx` and `.db` files and restarts the process from scratch.
189
- - Does **not** remove already downloaded files.
190
- - `waybackup -u https://example.com -a --reset`
191
-
192
- #### Keeping Job Data (`--keep`)
193
-
194
- - Normally, `.cdx` and `.db` files are deleted after a successful job.
195
- - `--keep` preserves them for future re-analysis or extending the query.
196
- - `waybackup -u https://example.com -a --keep`
197
-
198
224
  <br>
199
225
  <br>
200
226
 
@@ -300,6 +326,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
300
326
  <br>
301
327
  <br>
302
328
 
329
+ ## Future ideas (long run)
330
+
331
+ - More module functionality
332
+ - Docker UI
333
+
303
334
  ## Contributing
304
335
 
305
336
  I'm always happy for some feature requests to improve the usability of this tool.
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
7
7
 
8
8
  [project]
9
9
  name = "pywaybackup"
10
- version = "3.4.1"
10
+ version = "4.1.0"
11
11
  description = "Query and download archive.org as simple as possible."
12
12
  authors = [
13
13
  { name = "bitdruid", email = "bitdruid@outlook.com" }
@@ -16,8 +16,7 @@ license = { file = "LICENSE" }
16
16
  readme = "README.md"
17
17
  requires-python = ">=3.8"
18
18
  dependencies = [
19
- "pysqlite3-binary==0.5.4; sys_platform == 'linux'",
20
- "pysqlite-binary; sys_platform == 'win32'",
19
+ "SQLAlchemy==2.0.43",
21
20
  "requests==2.32.3",
22
21
  "tqdm==4.67.1",
23
22
  "python-magic==0.4.27; sys_platform == 'linux'",
@@ -24,8 +24,8 @@ class Arguments:
24
24
  optional = parser.add_argument_group("optional query parameters")
25
25
  optional.add_argument("-e", "--explicit", action="store_true", help="search only for the explicit given url")
26
26
  optional.add_argument("-r", "--range", type=int, metavar="", help="range in years to search")
27
- optional.add_argument("--start", type=int, metavar="", help="start timestamp format: YYYYMMDDhhmmss")
28
- optional.add_argument("--end", type=int, metavar="", help="end timestamp format: YYYYMMDDhhmmss")
27
+ optional.add_argument("--start", type=int, metavar="", help="start timestamp format: YYYYMMDDHHMMSS")
28
+ optional.add_argument("--end", type=int, metavar="", help="end timestamp format: YYYYMMDDHHMMSS")
29
29
  optional.add_argument("--limit", type=int, nargs="?", const=True, metavar="int", help="limit the number of snapshots to download")
30
30
  optional.add_argument("--filetype", type=str, metavar="", help="filetypes to download comma separated (js,css,...)")
31
31
  optional.add_argument("--statuscode", type=str, metavar="", help="statuscodes to download comma separated (200,404,...)")
@@ -55,3 +55,4 @@ class Arguments:
55
55
  def get_args(self) -> dict:
56
56
  """Returns the parsed arguments as a dictionary."""
57
57
  return vars(self.args)
58
+
@@ -14,9 +14,9 @@ class Exception:
14
14
  command = None
15
15
 
16
16
  @classmethod
17
- def init(cls, debug=None, output=None, command=None):
17
+ def init(cls, debugfile=None, output=None, command=None):
18
18
  sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
19
- cls.debug = debug
19
+ cls.debugfile = debugfile
20
20
  cls.output = output
21
21
  cls.command = command
22
22
 
@@ -45,18 +45,18 @@ class Exception:
45
45
  exception_message += "!-- Traceback is None\n"
46
46
  exception_message += f"!-- Description: {e}\n-------------------------"
47
47
  print(exception_message)
48
- if cls.debug:
49
- print(f"Exception log: {cls.debug}")
48
+ if cls.debugfile:
49
+ print(f"Exception log: {cls.debugfile}")
50
50
  if cls.new_debug: # new run, overwrite file
51
51
  cls.new_debug = False
52
- f = open(cls.debug, "w", encoding="utf-8")
52
+ f = open(cls.debugfile, "w", encoding="utf-8")
53
53
  f.write("-------------------------\n")
54
54
  f.write(f"Version: {version('pywaybackup')}\n")
55
55
  f.write("-------------------------\n")
56
56
  f.write(f"Command: {cls.command}\n")
57
57
  f.write("-------------------------\n\n")
58
58
  else: # current run, append to file
59
- f = open(cls.debug, "a", encoding="utf-8")
59
+ f = open(cls.debugfile, "a", encoding="utf-8")
60
60
  f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
61
61
  f.write(exception_message + "\n")
62
62
  f.write("!-- Local Variables:\n")
@@ -96,4 +96,4 @@ class Exception:
96
96
  if issubclass(exception_type, KeyboardInterrupt):
97
97
  sys.__excepthook__(exception_type, exception, traceback)
98
98
  return
99
- Exception.exception("UNCAUGHT EXCEPTION", exception, traceback) # uncaught exceptions also with custom scheme
99
+ Exception.exception('UNCAUGHT EXCEPTION', exception, traceback) # uncaught exceptions also with custom scheme