pywaybackup 4.0.0__tar.gz → 4.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-4.0.0/pywaybackup.egg-info → pywaybackup-4.1.0}/PKG-INFO +31 -41
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/README.md +29 -38
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pyproject.toml +2 -3
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/PyWayBackup.py +12 -12
- pywaybackup-4.1.0/pywaybackup/Snapshot.py +206 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/SnapshotCollection.py +135 -110
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/Worker.py +1 -2
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/archive_download.py +147 -6
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/archive_save.py +7 -5
- pywaybackup-4.1.0/pywaybackup/db.py +197 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/files.py +32 -20
- {pywaybackup-4.0.0 → pywaybackup-4.1.0/pywaybackup.egg-info}/PKG-INFO +31 -41
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup.egg-info/requires.txt +1 -2
- pywaybackup-4.0.0/pywaybackup/Snapshot.py +0 -129
- pywaybackup-4.0.0/pywaybackup/db.py +0 -103
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/LICENSE +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/Arguments.py +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/Exception.py +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/Verbosity.py +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/__init__.py +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/helper.py +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup/main.py +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup.egg-info/SOURCES.txt +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-4.0.0 → pywaybackup-4.1.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.
|
|
3
|
+
Version: 4.1.0
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,8 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
|
|
|
29
29
|
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
|
-
Requires-Dist:
|
|
33
|
-
Requires-Dist: pysqlite-binary; sys_platform == "win32"
|
|
32
|
+
Requires-Dist: SQLAlchemy==2.0.43
|
|
34
33
|
Requires-Dist: requests==2.32.3
|
|
35
34
|
Requires-Dist: tqdm==4.67.1
|
|
36
35
|
Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
|
|
@@ -167,25 +166,24 @@ output:
|
|
|
167
166
|
#### Mode Selection (Choose One)
|
|
168
167
|
|
|
169
168
|
- **`-a`**, **`--all`**:<br>
|
|
170
|
-
|
|
169
|
+
All timestamps. Gives one folder per timestamp.
|
|
171
170
|
- **`-l`**, **`--last`**:<br>
|
|
172
|
-
|
|
171
|
+
Last Version. Gives one folder containing the last version of each file of specified `--range`.
|
|
173
172
|
- **`-f`**, **`--first`**:<br>
|
|
174
|
-
|
|
175
|
-
- **`-s`**, **`--save`**:<br>
|
|
176
|
-
Save a page to the Wayback Machine. (beta)
|
|
173
|
+
First Version. Gives one folder containing the first version of each file of specified `--range`.
|
|
177
174
|
|
|
178
175
|
#### Optional query parameters
|
|
179
176
|
|
|
177
|
+
Parameters for archive.org CDX query. No effect on snapshot download itself.
|
|
178
|
+
|
|
180
179
|
- **`-e`**, **`--explicit`**:<br>
|
|
181
|
-
Only
|
|
180
|
+
Only the explicit URL. No wildcard subdomains or paths. For example get: root-only (`https://example.com`) or specific file (`login.html`, `?query=this`).
|
|
182
181
|
|
|
183
182
|
- **`--limit`** `<count>`:<br>
|
|
184
|
-
Limits the
|
|
183
|
+
Limits the snapshots fetched from archive.org CDX. (Will have no effect on existing CDX files)
|
|
185
184
|
|
|
186
185
|
- **Range Selection:**<br>
|
|
187
|
-
|
|
188
|
-
(year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
|
|
186
|
+
Set the query range in years (`range`) or a timestamp (`start` and/or `end`). If `range` then ignores `start` and `end`. Format for timestamps: YYYYMMDDhhmmss. Timestamp can be as specific as needed (year 2019, year+month+day 20190101, ...).
|
|
189
187
|
|
|
190
188
|
- **`-r`**, **`--range`**:<br>
|
|
191
189
|
Specify the range in years for which to search and download snapshots.
|
|
@@ -195,57 +193,56 @@ output:
|
|
|
195
193
|
Timestamp to end searching.
|
|
196
194
|
|
|
197
195
|
- **Filtering:**<br>
|
|
198
|
-
A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter.
|
|
199
196
|
|
|
200
197
|
- **`--filetype`** `<filetype>`:<br>
|
|
201
|
-
Specify filetypes to download.
|
|
198
|
+
Specify filetypes to download. Example: `--filetype jpg,css,js`. You can only filter filetypes which are stored by archive.org (.html mostly not)
|
|
202
199
|
|
|
203
200
|
- **`--statuscode`** `<statuscode>`:<br>
|
|
204
|
-
Specify HTTP status codes to download.
|
|
201
|
+
Specify HTTP status codes to download. Example: `--statuscode 200,301`. PyWayBackup will always skip `404` and `301`.<br>
|
|
205
202
|
Common status codes you may want to handle/filter:
|
|
206
203
|
- `200` (OK)
|
|
207
|
-
- `301` (Moved Permanently
|
|
204
|
+
- `301` (Moved Permanently)
|
|
208
205
|
- `404` (Not Found - snapshot seems to be empty)
|
|
209
206
|
- `500` (Internal Server Error - snapshot is at least for now not available)
|
|
210
207
|
|
|
211
|
-
|
|
208
|
+
#### Optional Behavior Manipulation
|
|
212
209
|
|
|
213
|
-
|
|
210
|
+
Parameters will change the download behavior for snapshots.
|
|
214
211
|
|
|
215
212
|
- **`-o`**, **`--output`**:<br>
|
|
216
213
|
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
217
214
|
|
|
218
215
|
- **`-m`**, **`--metadata`**<br>
|
|
219
|
-
|
|
216
|
+
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
220
217
|
|
|
221
218
|
- **`--verbose`**:<br>
|
|
222
219
|
Increase output verbosity.
|
|
223
220
|
|
|
224
221
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
225
|
-
Saves a log file into the output-dir.
|
|
222
|
+
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
226
223
|
|
|
227
224
|
- **`--progress`**:<br>
|
|
228
225
|
Shows a progress bar instead of the default output.
|
|
229
226
|
|
|
230
227
|
- **`--workers`** `<count>`:<br>
|
|
231
|
-
|
|
228
|
+
Number of simultaneous download workers. Default is 1, safe range is about 10. Too many workers may lead to refused connections by archive.org.
|
|
232
229
|
|
|
233
230
|
- **`--no-redirect`**:<br>
|
|
234
|
-
Disables following redirects of snapshots.
|
|
231
|
+
Disables following redirects of snapshots. Can prevent timestamp-folder mismatches caused by redirects.
|
|
235
232
|
|
|
236
233
|
- **`--retry`** `<attempts>`:<br>
|
|
237
|
-
|
|
234
|
+
Retry attempts for failed downloads.
|
|
238
235
|
|
|
239
236
|
- **`--delay`** `<seconds>`:<br>
|
|
240
|
-
|
|
237
|
+
Delay between download requests in seconds. Default is no delay (0).
|
|
241
238
|
|
|
242
239
|
#### Job Handling:
|
|
243
240
|
|
|
244
241
|
- **`--reset`**:
|
|
245
|
-
If set, the job will be reset, and
|
|
242
|
+
If set, the job will be reset, and `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch.
|
|
246
243
|
|
|
247
244
|
- **`--keep`**:
|
|
248
|
-
If set,
|
|
245
|
+
If set, `cdx` and `db` files will be kept after the job is finished. Otherwise they will be deleted.
|
|
249
246
|
|
|
250
247
|
<br>
|
|
251
248
|
<br>
|
|
@@ -256,23 +253,11 @@ output:
|
|
|
256
253
|
|
|
257
254
|
`pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
|
|
258
255
|
|
|
259
|
-
|
|
260
|
-
-
|
|
261
|
-
-
|
|
256
|
+
Only resumes queries if:
|
|
257
|
+
- existing `.cdx` and `.db` files in an `output dir`
|
|
258
|
+
- command is identical by `URL`, `mode`, and `optional query parameters`
|
|
262
259
|
> **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
|
|
263
260
|
|
|
264
|
-
#### Resetting a Job (`--reset`)
|
|
265
|
-
|
|
266
|
-
- Deletes `.cdx` and `.db` files and restarts the process from scratch.
|
|
267
|
-
- Does **not** remove already downloaded files.
|
|
268
|
-
- `waybackup -u https://example.com -a --reset`
|
|
269
|
-
|
|
270
|
-
#### Keeping Job Data (`--keep`)
|
|
271
|
-
|
|
272
|
-
- Normally, `.cdx` and `.db` files are deleted after a successful job.
|
|
273
|
-
- `--keep` preserves them for future re-analysis or extending the query.
|
|
274
|
-
- `waybackup -u https://example.com -a --keep`
|
|
275
|
-
|
|
276
261
|
<br>
|
|
277
262
|
<br>
|
|
278
263
|
|
|
@@ -378,6 +363,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
|
|
|
378
363
|
<br>
|
|
379
364
|
<br>
|
|
380
365
|
|
|
366
|
+
## Future ideas (long run)
|
|
367
|
+
|
|
368
|
+
- More module functionality
|
|
369
|
+
- Docker UI
|
|
370
|
+
|
|
381
371
|
## Contributing
|
|
382
372
|
|
|
383
373
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
@@ -129,25 +129,24 @@ output:
|
|
|
129
129
|
#### Mode Selection (Choose One)
|
|
130
130
|
|
|
131
131
|
- **`-a`**, **`--all`**:<br>
|
|
132
|
-
|
|
132
|
+
All timestamps. Gives one folder per timestamp.
|
|
133
133
|
- **`-l`**, **`--last`**:<br>
|
|
134
|
-
|
|
134
|
+
Last Version. Gives one folder containing the last version of each file of specified `--range`.
|
|
135
135
|
- **`-f`**, **`--first`**:<br>
|
|
136
|
-
|
|
137
|
-
- **`-s`**, **`--save`**:<br>
|
|
138
|
-
Save a page to the Wayback Machine. (beta)
|
|
136
|
+
First Version. Gives one folder containing the first version of each file of specified `--range`.
|
|
139
137
|
|
|
140
138
|
#### Optional query parameters
|
|
141
139
|
|
|
140
|
+
Parameters for archive.org CDX query. No effect on snapshot download itself.
|
|
141
|
+
|
|
142
142
|
- **`-e`**, **`--explicit`**:<br>
|
|
143
|
-
Only
|
|
143
|
+
Only the explicit URL. No wildcard subdomains or paths. For example get: root-only (`https://example.com`) or specific file (`login.html`, `?query=this`).
|
|
144
144
|
|
|
145
145
|
- **`--limit`** `<count>`:<br>
|
|
146
|
-
Limits the
|
|
146
|
+
Limits the snapshots fetched from archive.org CDX. (Will have no effect on existing CDX files)
|
|
147
147
|
|
|
148
148
|
- **Range Selection:**<br>
|
|
149
|
-
|
|
150
|
-
(year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
|
|
149
|
+
Set the query range in years (`range`) or a timestamp (`start` and/or `end`). If `range` then ignores `start` and `end`. Format for timestamps: YYYYMMDDhhmmss. Timestamp can be as specific as needed (year 2019, year+month+day 20190101, ...).
|
|
151
150
|
|
|
152
151
|
- **`-r`**, **`--range`**:<br>
|
|
153
152
|
Specify the range in years for which to search and download snapshots.
|
|
@@ -157,57 +156,56 @@ output:
|
|
|
157
156
|
Timestamp to end searching.
|
|
158
157
|
|
|
159
158
|
- **Filtering:**<br>
|
|
160
|
-
A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter.
|
|
161
159
|
|
|
162
160
|
- **`--filetype`** `<filetype>`:<br>
|
|
163
|
-
Specify filetypes to download.
|
|
161
|
+
Specify filetypes to download. Example: `--filetype jpg,css,js`. You can only filter filetypes which are stored by archive.org (.html mostly not)
|
|
164
162
|
|
|
165
163
|
- **`--statuscode`** `<statuscode>`:<br>
|
|
166
|
-
Specify HTTP status codes to download.
|
|
164
|
+
Specify HTTP status codes to download. Example: `--statuscode 200,301`. PyWayBackup will always skip `404` and `301`.<br>
|
|
167
165
|
Common status codes you may want to handle/filter:
|
|
168
166
|
- `200` (OK)
|
|
169
|
-
- `301` (Moved Permanently
|
|
167
|
+
- `301` (Moved Permanently)
|
|
170
168
|
- `404` (Not Found - snapshot seems to be empty)
|
|
171
169
|
- `500` (Internal Server Error - snapshot is at least for now not available)
|
|
172
170
|
|
|
173
|
-
|
|
171
|
+
#### Optional Behavior Manipulation
|
|
174
172
|
|
|
175
|
-
|
|
173
|
+
Parameters will change the download behavior for snapshots.
|
|
176
174
|
|
|
177
175
|
- **`-o`**, **`--output`**:<br>
|
|
178
176
|
Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
|
|
179
177
|
|
|
180
178
|
- **`-m`**, **`--metadata`**<br>
|
|
181
|
-
|
|
179
|
+
Folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). If you are downloading into a network share, you SHOULD set this to a local path because sqlite locking mechanism may cause issues with network shares.
|
|
182
180
|
|
|
183
181
|
- **`--verbose`**:<br>
|
|
184
182
|
Increase output verbosity.
|
|
185
183
|
|
|
186
184
|
- **`--log`** <!-- `<path>` -->:<br>
|
|
187
|
-
Saves a log file into the output-dir.
|
|
185
|
+
Saves a log file into the output-dir. `waybackup_<sanitized_url>.log`.
|
|
188
186
|
|
|
189
187
|
- **`--progress`**:<br>
|
|
190
188
|
Shows a progress bar instead of the default output.
|
|
191
189
|
|
|
192
190
|
- **`--workers`** `<count>`:<br>
|
|
193
|
-
|
|
191
|
+
Number of simultaneous download workers. Default is 1, safe range is about 10. Too many workers may lead to refused connections by archive.org.
|
|
194
192
|
|
|
195
193
|
- **`--no-redirect`**:<br>
|
|
196
|
-
Disables following redirects of snapshots.
|
|
194
|
+
Disables following redirects of snapshots. Can prevent timestamp-folder mismatches caused by redirects.
|
|
197
195
|
|
|
198
196
|
- **`--retry`** `<attempts>`:<br>
|
|
199
|
-
|
|
197
|
+
Retry attempts for failed downloads.
|
|
200
198
|
|
|
201
199
|
- **`--delay`** `<seconds>`:<br>
|
|
202
|
-
|
|
200
|
+
Delay between download requests in seconds. Default is no delay (0).
|
|
203
201
|
|
|
204
202
|
#### Job Handling:
|
|
205
203
|
|
|
206
204
|
- **`--reset`**:
|
|
207
|
-
If set, the job will be reset, and
|
|
205
|
+
If set, the job will be reset, and `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch.
|
|
208
206
|
|
|
209
207
|
- **`--keep`**:
|
|
210
|
-
If set,
|
|
208
|
+
If set, `cdx` and `db` files will be kept after the job is finished. Otherwise they will be deleted.
|
|
211
209
|
|
|
212
210
|
<br>
|
|
213
211
|
<br>
|
|
@@ -218,23 +216,11 @@ output:
|
|
|
218
216
|
|
|
219
217
|
`pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
|
|
220
218
|
|
|
221
|
-
|
|
222
|
-
-
|
|
223
|
-
-
|
|
219
|
+
Only resumes queries if:
|
|
220
|
+
- existing `.cdx` and `.db` files in an `output dir`
|
|
221
|
+
- command is identical by `URL`, `mode`, and `optional query parameters`
|
|
224
222
|
> **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
|
|
225
223
|
|
|
226
|
-
#### Resetting a Job (`--reset`)
|
|
227
|
-
|
|
228
|
-
- Deletes `.cdx` and `.db` files and restarts the process from scratch.
|
|
229
|
-
- Does **not** remove already downloaded files.
|
|
230
|
-
- `waybackup -u https://example.com -a --reset`
|
|
231
|
-
|
|
232
|
-
#### Keeping Job Data (`--keep`)
|
|
233
|
-
|
|
234
|
-
- Normally, `.cdx` and `.db` files are deleted after a successful job.
|
|
235
|
-
- `--keep` preserves them for future re-analysis or extending the query.
|
|
236
|
-
- `waybackup -u https://example.com -a --keep`
|
|
237
|
-
|
|
238
224
|
<br>
|
|
239
225
|
<br>
|
|
240
226
|
|
|
@@ -340,6 +326,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
|
|
|
340
326
|
<br>
|
|
341
327
|
<br>
|
|
342
328
|
|
|
329
|
+
## Future ideas (long run)
|
|
330
|
+
|
|
331
|
+
- More module functionality
|
|
332
|
+
- Docker UI
|
|
333
|
+
|
|
343
334
|
## Contributing
|
|
344
335
|
|
|
345
336
|
I'm always happy for some feature requests to improve the usability of this tool.
|
|
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "pywaybackup"
|
|
10
|
-
version = "4.
|
|
10
|
+
version = "4.1.0"
|
|
11
11
|
description = "Query and download archive.org as simple as possible."
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "bitdruid", email = "bitdruid@outlook.com" }
|
|
@@ -16,8 +16,7 @@ license = { file = "LICENSE" }
|
|
|
16
16
|
readme = "README.md"
|
|
17
17
|
requires-python = ">=3.8"
|
|
18
18
|
dependencies = [
|
|
19
|
-
"
|
|
20
|
-
"pysqlite-binary; sys_platform == 'win32'",
|
|
19
|
+
"SQLAlchemy==2.0.43",
|
|
21
20
|
"requests==2.32.3",
|
|
22
21
|
"tqdm==4.67.1",
|
|
23
22
|
"python-magic==0.4.27; sys_platform == 'linux'",
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import
|
|
1
|
+
import multiprocessing
|
|
2
2
|
import os
|
|
3
|
-
import time
|
|
4
3
|
import signal
|
|
5
|
-
import
|
|
6
|
-
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
7
6
|
from importlib.metadata import version
|
|
8
7
|
|
|
9
8
|
import pywaybackup.archive_save as archive_save
|
|
9
|
+
from pywaybackup.archive_download import DownloadArchive
|
|
10
10
|
from pywaybackup.db import Database as db
|
|
11
|
-
from pywaybackup.Verbosity import Verbosity as vb
|
|
12
11
|
from pywaybackup.Exception import Exception as ex
|
|
12
|
+
from pywaybackup.files import CDXfile, CDXquery, CSVfile, File
|
|
13
|
+
from pywaybackup.helper import sanitize_filename, url_split
|
|
13
14
|
from pywaybackup.SnapshotCollection import SnapshotCollection
|
|
14
|
-
from pywaybackup.
|
|
15
|
-
from pywaybackup.files import CDXquery, CDXfile, CSVfile, File
|
|
15
|
+
from pywaybackup.Verbosity import Verbosity as vb
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class _Status:
|
|
@@ -401,15 +401,15 @@ class PyWayBackup:
|
|
|
401
401
|
"""
|
|
402
402
|
files = {
|
|
403
403
|
"snapshots": os.path.join(self._output, self._domain),
|
|
404
|
-
"cdxfile": self._cdxfile,
|
|
404
|
+
"cdxfile": self._cdxfile.filepath,
|
|
405
405
|
"dbfile": self._dbfile,
|
|
406
|
-
"csvfile": self._csvfile,
|
|
406
|
+
"csvfile": self._csvfile.filepath,
|
|
407
407
|
"log": self._log,
|
|
408
408
|
"debug": self._debug,
|
|
409
409
|
}
|
|
410
410
|
return {key: (os.path.relpath(path) if rel else path) for key, path in files.items() if path and os.path.exists(path)}
|
|
411
411
|
|
|
412
|
-
def status(self):
|
|
412
|
+
def status(self) -> dict:
|
|
413
413
|
"""
|
|
414
414
|
Return the current status of the backup process by a dictionary:
|
|
415
415
|
{'task':, 'current':, 'total':, 'progress':}
|
|
@@ -475,10 +475,10 @@ class PyWayBackup:
|
|
|
475
475
|
return False
|
|
476
476
|
|
|
477
477
|
def _startup(self):
|
|
478
|
-
if db.
|
|
478
|
+
if db.query_exist:
|
|
479
479
|
self._status.task = "resuming"
|
|
480
480
|
vb.write(
|
|
481
|
-
content=f"\nDOWNLOAD job exist - processed: {db.
|
|
481
|
+
content=f"\nDOWNLOAD job exist - processed: {db.query_progress}\nResuming download... (to reset the job use '--reset')"
|
|
482
482
|
)
|
|
483
483
|
|
|
484
484
|
if not self._silent:
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import threading
|
|
3
|
+
|
|
4
|
+
from pywaybackup.db import Database, select, update, waybackup_snapshots
|
|
5
|
+
from pywaybackup.helper import url_split
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Snapshot:
    """
    Represents a single snapshot entry and manages its state and persistence.

    On construction the next unprocessed row (``response IS NULL``) is claimed
    from the database and mirrored into instance attributes. When one of the
    following properties is assigned, the new value is written to the database
    immediately via :meth:`modify`:
        - redirect_url
        - redirect_timestamp
        - response_status
        - file

    Assigning ``None`` to one of these properties while it already is ``None``
    is a no-op (no database write).

    On SQLite, :meth:`fetch` is serialized through a class-level
    ``threading.Lock``; on other dialects it relies on the
    ``FOR UPDATE SKIP LOCKED`` clause of the select instead.
    """

    # Shared by all instances: serializes the select-and-mark step in fetch() on SQLite.
    __sqlite_lock = threading.Lock()

    def __init__(self, db: "Database", output: str, mode: str):
        """
        Initialize a Snapshot instance and fetch its database row if available.

        If no unprocessed row is left, ``counter`` is set to ``False`` and the
        row-derived attributes stay ``None``.

        Args:
            db (Database): Database connection/session manager.
            output (str): Output directory for downloaded files.
            mode (str): Download mode ('first', 'last', or default).
        """
        self._db = db
        self.output = output
        self.mode = mode

        # Backing fields of the persisted properties.
        self._redirect_url = None
        self._redirect_timestamp = None
        self._response_status = None
        self._file = None

        # Defaults so the attributes exist even when no row could be fetched.
        self.scid = None
        self.timestamp = None
        self.url_archive = None
        self.url_origin = None

        self._row = self.fetch()
        if self._row:
            self.scid = self._row.scid
            self.counter = self._row.counter
            self.timestamp = self._row.timestamp
            self.url_archive = self._row.url_archive
            self.url_origin = self._row.url_origin
            # The assignments below go through the property setters, so every
            # non-None value is written back to the database.
            self.redirect_url = self._row.redirect_url
            self.redirect_timestamp = self._row.redirect_timestamp
            self.response_status = self._row.response
            self.file = self._row.file
        else:
            self.counter = False

    def fetch(self):
        """
        Fetch a snapshot row from the database with response=NULL (not processed).

        The row with the lowest ``scid`` is selected and its ``response`` is set
        to ``"LOCK"`` inside the same transaction, so later fetches (which
        filter on ``response IS NULL``) no longer see it.

        Returns:
            waybackup_snapshots or None: The next unprocessed snapshot row, or None if none available.
        """

        def _on_sqlite():
            return self._db.session.bind.dialect.name == "sqlite"

        def _get_row():
            # session.begin() commits when the with-block exits without an error.
            with self._db.session.begin():
                row = self._db.session.execute(
                    select(waybackup_snapshots)
                    .where(waybackup_snapshots.response.is_(None))
                    .order_by(waybackup_snapshots.scid)
                    .limit(1)
                    .with_for_update(skip_locked=True)
                ).scalar_one_or_none()

                if row is None:
                    return None

                # Mark as taken for other workers; the marker is only visual
                # because fetching filters on NULL.
                row.response = "LOCK"

                return row

        # Prevent another worker from fetching between select and LOCK-update:
        # on sqlite by threading.Lock, on other dialects by the row lock of the select.
        if _on_sqlite():
            with self.__sqlite_lock:
                return _get_row()
        return _get_row()

    def modify(self, column: str, value):
        """
        Update a column value for this snapshot in the database and commit.

        Args:
            column (str): Name of the column to update (attribute name on ``waybackup_snapshots``).
            value: New value to set for the column.
        """
        column = getattr(waybackup_snapshots, column)
        self._db.session.execute(
            update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value})
        )
        self._db.session.commit()

    def create_output(self) -> str:
        """
        Generate the file path for the snapshot download.

        If mode is 'first' or 'last', the path does not include the timestamp.
        Otherwise, the timestamp is included in the path.

        Returns:
            str: Absolute path to the output file for the snapshot.
        """
        # Everything after the "id_/" marker of the archive URL is handed to url_split
        # (presumably the originally archived URL - verify against helper.url_split).
        domain, subdir, filename = url_split(self.url_archive.split("id_/")[1], index=True)

        if self.mode in ("last", "first"):
            download_dir = os.path.join(self.output, domain, subdir)
        else:
            download_dir = os.path.join(self.output, domain, self.timestamp, subdir)

        return os.path.abspath(os.path.join(download_dir, filename))

    @property
    def redirect_url(self):
        """
        str: The redirect URL for this snapshot, if any.
        """
        return self._redirect_url

    @redirect_url.setter
    def redirect_url(self, value):
        """
        Set the redirect URL and update the database.

        Args:
            value (str): The new redirect URL.
        """
        # Guard on this property's own value (not on redirect_timestamp), so a
        # None -> None assignment is skipped and an existing URL can be cleared.
        if self.redirect_url is None and value is None:
            return
        self._redirect_url = value
        self.modify(column="redirect_url", value=value)

    @property
    def redirect_timestamp(self):
        """
        str: The timestamp of the redirect, if any.
        """
        return self._redirect_timestamp

    @redirect_timestamp.setter
    def redirect_timestamp(self, value):
        """
        Set the redirect timestamp and update the database.

        Args:
            value (str): The new redirect timestamp.
        """
        # Guard on this property's own value (not on redirect_url), so a
        # None -> None assignment is skipped and an existing timestamp can be cleared.
        if self.redirect_timestamp is None and value is None:
            return
        self._redirect_timestamp = value
        self.modify(column="redirect_timestamp", value=value)

    @property
    def response_status(self):
        """
        str: The HTTP response/status for this snapshot.
        """
        return self._response_status

    @response_status.setter
    def response_status(self, value):
        """
        Set the response status and update the database (column ``response``).

        Args:
            value (str): The new response status.
        """
        if self.response_status is None and value is None:
            return
        self._response_status = value
        self.modify(column="response", value=value)

    @property
    def file(self):
        """
        str: The file path for the downloaded snapshot.
        """
        return self._file

    @file.setter
    def file(self, value):
        """
        Set the file path and update the database.

        Args:
            value (str): The new file path.
        """
        if self.file is None and value is None:
            return
        self._file = value
        self.modify(column="file", value=value)
|