pywaybackup 4.1.5__tar.gz → 4.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pywaybackup-4.1.5/pywaybackup.egg-info → pywaybackup-4.2.0}/PKG-INFO +17 -7
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/README.md +14 -4
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pyproject.toml +3 -3
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/PyWayBackup.py +8 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Snapshot.py +69 -18
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/SnapshotCollection.py +103 -74
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Worker.py +26 -1
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/archive_download.py +2 -0
- pywaybackup-4.2.0/pywaybackup/arg_parser.py +55 -0
- pywaybackup-4.2.0/pywaybackup/arg_specs.py +257 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/db.py +42 -8
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/helper.py +8 -8
- pywaybackup-4.2.0/pywaybackup/interactive.py +144 -0
- pywaybackup-4.2.0/pywaybackup/main.py +32 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0/pywaybackup.egg-info}/PKG-INFO +17 -7
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/SOURCES.txt +3 -1
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/requires.txt +1 -1
- pywaybackup-4.1.5/pywaybackup/Arguments.py +0 -71
- pywaybackup-4.1.5/pywaybackup/main.py +0 -13
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/LICENSE +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Exception.py +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Verbosity.py +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/__init__.py +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/archive_save.py +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/files.py +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/entry_points.txt +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/top_level.txt +0 -0
- {pywaybackup-4.1.5 → pywaybackup-4.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pywaybackup
|
|
3
|
-
Version: 4.1.5
|
|
3
|
+
Version: 4.2.0
|
|
4
4
|
Summary: Query and download archive.org as simple as possible.
|
|
5
5
|
Author-email: bitdruid <bitdruid@outlook.com>
|
|
6
6
|
License: MIT License
|
|
@@ -26,11 +26,11 @@ License: MIT License
|
|
|
26
26
|
SOFTWARE.
|
|
27
27
|
|
|
28
28
|
Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downloader
|
|
29
|
-
Requires-Python:
|
|
29
|
+
Requires-Python: >=3.8
|
|
30
30
|
Description-Content-Type: text/markdown
|
|
31
31
|
License-File: LICENSE
|
|
32
32
|
Requires-Dist: ruff
|
|
33
|
-
Requires-Dist: SQLAlchemy==2.0.
|
|
33
|
+
Requires-Dist: SQLAlchemy==2.0.51
|
|
34
34
|
Requires-Dist: requests==2.32.3
|
|
35
35
|
Requires-Dist: tqdm==4.67.1
|
|
36
36
|
Requires-Dist: python-magic-standalone==0.4.28
|
|
@@ -39,7 +39,7 @@ Requires-Dist: python-magic-standalone==0.4.28
|
|
|
39
39
|
|
|
40
40
|
[](https://pypi.org/project/pywaybackup/)
|
|
41
41
|
[](https://pypi.org/project/pywaybackup/)
|
|
42
|
-

|
|
42
|
+

|
|
43
43
|
[](https://opensource.org/licenses/MIT)
|
|
44
44
|
|
|
45
45
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -68,6 +68,13 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
68
68
|
2. Run the tool <br>
|
|
69
69
|
`waybackup -h`
|
|
70
70
|
|
|
71
|
+
### Standalone binary
|
|
72
|
+
|
|
73
|
+
Prebuilt executables for Windows, Linux and macOS are attached to each [release](https://github.com/bitdruid/python-wayback-machine-downloader/releases). No Python required.
|
|
74
|
+
|
|
75
|
+
- Run from a terminal with arguments like the pip version: `waybackup -h`
|
|
76
|
+
- Or start it without arguments (e.g. double-click on Windows) to enter **interactive mode** — the tool will prompt you for URL, mode and optional settings.
|
|
77
|
+
|
|
71
78
|
### Manual
|
|
72
79
|
|
|
73
80
|
1. Clone the repository <br>
|
|
@@ -155,8 +162,9 @@ output:
|
|
|
155
162
|
|
|
156
163
|
## cli
|
|
157
164
|
|
|
158
|
-
- `-h`, `--help`: Show the help message and exit.
|
|
159
|
-
|
|
165
|
+
- `-h`, `--help`: Show the help message and exit. Version info is shown in the help header.
|
|
166
|
+
|
|
167
|
+
> **Interactive mode:** running `waybackup` without any arguments in a terminal starts a guided prompt for URL, mode and optional settings. Without a terminal (scripts/cron), the help is printed instead.
|
|
160
168
|
|
|
161
169
|
#### Required
|
|
162
170
|
|
|
@@ -171,6 +179,8 @@ output:
|
|
|
171
179
|
Last Version. Gives one folder containing the last version of each file of specified `--range`.
|
|
172
180
|
- **`-f`**, **`--first`**:<br>
|
|
173
181
|
First Version. Gives one folder containing the first version of each file of specified `--range`.
|
|
182
|
+
- **`-s`**, **`--save`**:<br>
|
|
183
|
+
Save a page to the wayback machine (no download).
|
|
174
184
|
|
|
175
185
|
#### Optional query parameters
|
|
176
186
|
|
|
@@ -219,7 +229,7 @@ Parameters will change the download behavior for snapshots.
|
|
|
219
229
|
Set verbosity level. Available levels:
|
|
220
230
|
- `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
|
|
221
231
|
- `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
|
|
222
|
-
- `high` (or `
|
|
232
|
+
- `high` (or `detailed`, `max`): Detailed verbose output
|
|
223
233
|
|
|
224
234
|
Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
|
|
225
235
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://pypi.org/project/pywaybackup/)
|
|
4
4
|
[](https://pypi.org/project/pywaybackup/)
|
|
5
|
-

|
|
5
|
+

|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
7
7
|
|
|
8
8
|
Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
|
|
@@ -31,6 +31,13 @@ This tool allows you to download content from the Wayback Machine (archive.org).
|
|
|
31
31
|
2. Run the tool <br>
|
|
32
32
|
`waybackup -h`
|
|
33
33
|
|
|
34
|
+
### Standalone binary
|
|
35
|
+
|
|
36
|
+
Prebuilt executables for Windows, Linux and macOS are attached to each [release](https://github.com/bitdruid/python-wayback-machine-downloader/releases). No Python required.
|
|
37
|
+
|
|
38
|
+
- Run from a terminal with arguments like the pip version: `waybackup -h`
|
|
39
|
+
- Or start it without arguments (e.g. double-click on Windows) to enter **interactive mode** — the tool will prompt you for URL, mode and optional settings.
|
|
40
|
+
|
|
34
41
|
### Manual
|
|
35
42
|
|
|
36
43
|
1. Clone the repository <br>
|
|
@@ -118,8 +125,9 @@ output:
|
|
|
118
125
|
|
|
119
126
|
## cli
|
|
120
127
|
|
|
121
|
-
- `-h`, `--help`: Show the help message and exit.
|
|
122
|
-
|
|
128
|
+
- `-h`, `--help`: Show the help message and exit. Version info is shown in the help header.
|
|
129
|
+
|
|
130
|
+
> **Interactive mode:** running `waybackup` without any arguments in a terminal starts a guided prompt for URL, mode and optional settings. Without a terminal (scripts/cron), the help is printed instead.
|
|
123
131
|
|
|
124
132
|
#### Required
|
|
125
133
|
|
|
@@ -134,6 +142,8 @@ output:
|
|
|
134
142
|
Last Version. Gives one folder containing the last version of each file of specified `--range`.
|
|
135
143
|
- **`-f`**, **`--first`**:<br>
|
|
136
144
|
First Version. Gives one folder containing the first version of each file of specified `--range`.
|
|
145
|
+
- **`-s`**, **`--save`**:<br>
|
|
146
|
+
Save a page to the wayback machine (no download).
|
|
137
147
|
|
|
138
148
|
#### Optional query parameters
|
|
139
149
|
|
|
@@ -182,7 +192,7 @@ Parameters will change the download behavior for snapshots.
|
|
|
182
192
|
Set verbosity level. Available levels:
|
|
183
193
|
- `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
|
|
184
194
|
- `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
|
|
185
|
-
- `high` (or `
|
|
195
|
+
- `high` (or `detailed`, `max`): Detailed verbose output
|
|
186
196
|
|
|
187
197
|
Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
|
|
188
198
|
|
|
@@ -10,18 +10,18 @@ packages = ["pywaybackup"]
|
|
|
10
10
|
[project]
|
|
11
11
|
dependencies = [
|
|
12
12
|
"ruff",
|
|
13
|
-
"SQLAlchemy==2.0.
|
|
13
|
+
"SQLAlchemy==2.0.51",
|
|
14
14
|
"requests==2.32.3",
|
|
15
15
|
"tqdm==4.67.1",
|
|
16
16
|
"python-magic-standalone==0.4.28",
|
|
17
17
|
]
|
|
18
18
|
name = "pywaybackup"
|
|
19
|
-
version = "4.1.5"
|
|
19
|
+
version = "4.2.0"
|
|
20
20
|
description = "Query and download archive.org as simple as possible."
|
|
21
21
|
authors = [{ name = "bitdruid", email = "bitdruid@outlook.com" }]
|
|
22
22
|
license = { file = "LICENSE" }
|
|
23
23
|
readme = "README.md"
|
|
24
|
-
requires-python = ">=3.8
|
|
24
|
+
requires-python = ">=3.8"
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
[project.scripts]
|
|
@@ -364,6 +364,7 @@ class PyWayBackup:
|
|
|
364
364
|
resources after the backup is complete.
|
|
365
365
|
|
|
366
366
|
"""
|
|
367
|
+
collection = None
|
|
367
368
|
try:
|
|
368
369
|
self._startup()
|
|
369
370
|
|
|
@@ -385,6 +386,12 @@ class PyWayBackup:
|
|
|
385
386
|
self._keep = True
|
|
386
387
|
ex.exception(message="", e=e)
|
|
387
388
|
finally:
|
|
389
|
+
# if a collection was created during the workflow, close its DB session cleanly
|
|
390
|
+
try:
|
|
391
|
+
if collection:
|
|
392
|
+
collection.close()
|
|
393
|
+
except Exception:
|
|
394
|
+
pass
|
|
388
395
|
self._shutdown()
|
|
389
396
|
|
|
390
397
|
def paths(self, rel: bool = False) -> dict:
|
|
@@ -503,6 +510,7 @@ class PyWayBackup:
|
|
|
503
510
|
collection = SnapshotCollection()
|
|
504
511
|
collection.close()
|
|
505
512
|
self._csvfile.store_result()
|
|
513
|
+
db.close_engine()
|
|
506
514
|
self._f_keep()
|
|
507
515
|
vb.fini()
|
|
508
516
|
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import threading
|
|
3
3
|
|
|
4
|
-
from pywaybackup.db import Database, select, update, waybackup_snapshots
|
|
4
|
+
from pywaybackup.db import Database, select, update, waybackup_snapshots, and_
|
|
5
5
|
from pywaybackup.helper import url_split
|
|
6
|
+
from pywaybackup.Verbosity import Verbosity as vb
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Snapshot:
|
|
@@ -70,20 +71,58 @@ class Snapshot:
|
|
|
70
71
|
return False
|
|
71
72
|
|
|
72
73
|
def __get_row():
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
74
|
+
# Atomic claim: find next unprocessed scid, set response='LOCK' only if still unprocessed,
|
|
75
|
+
# then fetch that row. This avoids relying on FOR UPDATE or explicit nested transactions
|
|
76
|
+
# which can trigger "A transaction is already begun on this Session" errors.
|
|
77
|
+
|
|
78
|
+
session = self._db.session
|
|
79
|
+
|
|
80
|
+
# get next available SnapshotId
|
|
81
|
+
vb.write(verbose="high", content="[Snapshot.fetch] selecting next scid")
|
|
82
|
+
scid = session.execute(
|
|
83
|
+
select(waybackup_snapshots.scid)
|
|
84
|
+
.where(waybackup_snapshots.response.is_(None))
|
|
85
|
+
.order_by(waybackup_snapshots.scid)
|
|
86
|
+
.limit(1)
|
|
87
|
+
).scalar_one_or_none()
|
|
88
|
+
|
|
89
|
+
if scid is None:
|
|
90
|
+
vb.write(verbose="high", content="[Snapshot.fetch] no unprocessed scid found")
|
|
91
|
+
return None
|
|
92
|
+
|
|
93
|
+
# try to atomically claim the row by updating only if still unclaimed
|
|
94
|
+
result = session.execute(
|
|
95
|
+
update(waybackup_snapshots)
|
|
96
|
+
.where(and_(waybackup_snapshots.scid == scid, waybackup_snapshots.response.is_(None)))
|
|
97
|
+
.values(response="LOCK")
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
# if another worker claimed it first, rowcount will be 0 — retry to get next available row
|
|
101
|
+
vb.write(
|
|
102
|
+
verbose="high", content=f"[Snapshot.fetch] attempted to claim scid={scid}, rowcount={result.rowcount}"
|
|
103
|
+
)
|
|
104
|
+
if result.rowcount == 0:
|
|
105
|
+
# TOCTOU: __get_row(): another worker claimed this row between our SELECT and UPDATE.
|
|
106
|
+
# Retry instead of returning None to avoid premature worker termination.
|
|
107
|
+
try:
|
|
108
|
+
session.commit()
|
|
109
|
+
except Exception:
|
|
110
|
+
pass
|
|
111
|
+
vb.write(verbose="high", content=f"[Snapshot.fetch] scid={scid} already claimed, retrying")
|
|
112
|
+
return __get_row()
|
|
86
113
|
|
|
114
|
+
# The row has been claimed by the worker and can now be fetched.
|
|
115
|
+
row = session.execute(
|
|
116
|
+
select(waybackup_snapshots).where(waybackup_snapshots.scid == scid)
|
|
117
|
+
).scalar_one_or_none()
|
|
118
|
+
try:
|
|
119
|
+
session.commit()
|
|
120
|
+
except Exception:
|
|
121
|
+
try:
|
|
122
|
+
session.rollback()
|
|
123
|
+
except Exception:
|
|
124
|
+
pass
|
|
125
|
+
vb.write(verbose="high", content=f"[Snapshot.fetch] claimed scid={scid} and fetched row")
|
|
87
126
|
return row
|
|
88
127
|
|
|
89
128
|
if __on_sqlite():
|
|
@@ -101,10 +140,22 @@ class Snapshot:
|
|
|
101
140
|
value: New value to set for the column.
|
|
102
141
|
"""
|
|
103
142
|
column = getattr(waybackup_snapshots, column)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
143
|
+
try:
|
|
144
|
+
vb.write(verbose="high", content=f"[Snapshot.modify] updating scid={self.scid} column={column.key}")
|
|
145
|
+
self._db.session.execute(
|
|
146
|
+
update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value})
|
|
147
|
+
)
|
|
148
|
+
self._db.session.commit()
|
|
149
|
+
vb.write(verbose="high", content=f"[Snapshot.modify] update committed scid={self.scid} column={column.key}")
|
|
150
|
+
except Exception as e:
|
|
151
|
+
vb.write(
|
|
152
|
+
verbose="high", content=f"[Snapshot.modify] update failed scid={self.scid} error={e}; rolling back"
|
|
153
|
+
)
|
|
154
|
+
try:
|
|
155
|
+
self._db.session.rollback()
|
|
156
|
+
except Exception:
|
|
157
|
+
pass
|
|
158
|
+
raise
|
|
108
159
|
|
|
109
160
|
def create_output(self):
|
|
110
161
|
"""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
|
-
from pywaybackup.db import Database,
|
|
3
|
+
from pywaybackup.db import Database, and_, delete, func, or_, select, text, tuple_, update, waybackup_snapshots
|
|
4
4
|
from pywaybackup.files import CDXfile, CSVfile
|
|
5
5
|
from pywaybackup.Verbosity import Progressbar
|
|
6
6
|
from pywaybackup.Verbosity import Verbosity as vb
|
|
@@ -163,73 +163,89 @@ class SnapshotCollection:
|
|
|
163
163
|
|
|
164
164
|
vb.write(verbose=None, content="\nInserting CDX data into database...")
|
|
165
165
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
line = line.
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
166
|
+
try:
|
|
167
|
+
vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] starting insert_cdx operation")
|
|
168
|
+
progressbar = Progressbar(
|
|
169
|
+
unit=" lines",
|
|
170
|
+
total=self._cdx_total,
|
|
171
|
+
desc="process cdx".ljust(15),
|
|
172
|
+
ascii="░▒█",
|
|
173
|
+
bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
|
|
174
|
+
)
|
|
175
|
+
line_batchsize = 2500
|
|
176
|
+
line_batch = []
|
|
177
|
+
total_inserted = 0
|
|
178
|
+
first_line = True
|
|
179
|
+
|
|
180
|
+
with self.cdxfile as f:
|
|
181
|
+
for line in f:
|
|
182
|
+
if first_line:
|
|
183
|
+
first_line = False
|
|
184
|
+
continue
|
|
185
|
+
line = line.strip()
|
|
186
|
+
if line.endswith("]]"):
|
|
187
|
+
line = line.rsplit("]", 1)[0]
|
|
188
|
+
if line.endswith(","):
|
|
189
|
+
line = line.rsplit(",", 1)[0]
|
|
190
|
+
|
|
191
|
+
try:
|
|
192
|
+
line_batch.append(__parse_line(line))
|
|
193
|
+
except json.decoder.JSONDecodeError:
|
|
194
|
+
self._snapshot_faulty += 1
|
|
195
|
+
continue
|
|
196
|
+
|
|
197
|
+
if len(line_batch) >= line_batchsize:
|
|
198
|
+
total_inserted += _insert_batch_safe(line_batch=line_batch)
|
|
199
|
+
line_batch = []
|
|
200
|
+
progressbar.update(line_batchsize)
|
|
201
|
+
|
|
202
|
+
if line_batch:
|
|
196
203
|
total_inserted += _insert_batch_safe(line_batch=line_batch)
|
|
197
|
-
line_batch
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
204
|
+
progressbar.update(len(line_batch))
|
|
205
|
+
|
|
206
|
+
self.db.session.commit()
|
|
207
|
+
vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] insert_cdx commit successful")
|
|
208
|
+
except Exception as e:
|
|
209
|
+
vb.write(verbose=True, content=f"[SnapshotCollection._insert_cdx] exception: {e}; rolling back")
|
|
210
|
+
try:
|
|
211
|
+
self.db.session.rollback()
|
|
212
|
+
vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] rollback successful")
|
|
213
|
+
except Exception:
|
|
214
|
+
vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] rollback failed")
|
|
215
|
+
raise
|
|
205
216
|
|
|
206
217
|
def _index_snapshots(self):
|
|
207
218
|
"""
|
|
208
219
|
Create indexes for the snapshot table.
|
|
220
|
+
|
|
221
|
+
Raw DDL instead of sqlalchemy Index objects: Index(...) attaches to the
|
|
222
|
+
module-global table metadata, which accumulates duplicates when the
|
|
223
|
+
package is reused in-process (library usage) and breaks create_all().
|
|
209
224
|
"""
|
|
210
225
|
# index for filtering last snapshots
|
|
211
226
|
if self._mode_last:
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
227
|
+
self.db.session.execute(
|
|
228
|
+
text(
|
|
229
|
+
"CREATE INDEX IF NOT EXISTS idx_waybackup_snapshots_url_origin_timestamp_desc "
|
|
230
|
+
"ON waybackup_snapshots (url_origin, timestamp DESC)"
|
|
231
|
+
)
|
|
216
232
|
)
|
|
217
|
-
idx1.create(self.db.session.bind, checkfirst=True)
|
|
218
233
|
# index for filtering first snapshots
|
|
219
234
|
if self._mode_first:
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
235
|
+
self.db.session.execute(
|
|
236
|
+
text(
|
|
237
|
+
"CREATE INDEX IF NOT EXISTS idx_waybackup_snapshots_url_origin_timestamp_asc "
|
|
238
|
+
"ON waybackup_snapshots (url_origin, timestamp ASC)"
|
|
239
|
+
)
|
|
224
240
|
)
|
|
225
|
-
idx2.create(self.db.session.bind, checkfirst=True)
|
|
226
241
|
# index for skippable snapshots
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
242
|
+
self.db.session.execute(
|
|
243
|
+
text(
|
|
244
|
+
"CREATE INDEX IF NOT EXISTS idx_waybackup_snapshots_timestamp_url_origin_response "
|
|
245
|
+
"ON waybackup_snapshots (timestamp, url_origin)"
|
|
246
|
+
)
|
|
231
247
|
)
|
|
232
|
-
|
|
248
|
+
self.db.session.commit()
|
|
233
249
|
|
|
234
250
|
def _filter_snapshots(self):
|
|
235
251
|
"""
|
|
@@ -297,29 +313,42 @@ class SnapshotCollection:
|
|
|
297
313
|
"""
|
|
298
314
|
|
|
299
315
|
# ? for now per row / no bulk for compatibility
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
.
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
316
|
+
try:
|
|
317
|
+
vb.write(verbose=True, content="[SnapshotCollection._skip_set] applying CSV skips to DB")
|
|
318
|
+
with self.csvfile as f:
|
|
319
|
+
total_skipped = 0
|
|
320
|
+
for row in f:
|
|
321
|
+
self.db.session.execute(
|
|
322
|
+
update(waybackup_snapshots)
|
|
323
|
+
.where(
|
|
324
|
+
and_(
|
|
325
|
+
waybackup_snapshots.timestamp == row["timestamp"],
|
|
326
|
+
waybackup_snapshots.url_origin == row["url_origin"],
|
|
327
|
+
)
|
|
328
|
+
)
|
|
329
|
+
.values(
|
|
330
|
+
url_archive=row["url_archive"],
|
|
331
|
+
redirect_url=row["redirect_url"],
|
|
332
|
+
redirect_timestamp=row["redirect_timestamp"],
|
|
333
|
+
response=row["response"],
|
|
334
|
+
file=row["file"],
|
|
309
335
|
)
|
|
310
336
|
)
|
|
311
|
-
|
|
312
|
-
url_archive=row["url_archive"],
|
|
313
|
-
redirect_url=row["redirect_url"],
|
|
314
|
-
redirect_timestamp=row["redirect_timestamp"],
|
|
315
|
-
response=row["response"],
|
|
316
|
-
file=row["file"],
|
|
317
|
-
)
|
|
318
|
-
)
|
|
319
|
-
total_skipped += 1
|
|
337
|
+
total_skipped += 1
|
|
320
338
|
|
|
321
|
-
|
|
322
|
-
|
|
339
|
+
self.db.session.commit()
|
|
340
|
+
self._filter_skip = total_skipped
|
|
341
|
+
vb.write(
|
|
342
|
+
verbose=True, content=f"[SnapshotCollection._skip_set] commit successful, total_skipped={total_skipped}"
|
|
343
|
+
)
|
|
344
|
+
except Exception as e:
|
|
345
|
+
vb.write(verbose=True, content=f"[SnapshotCollection._skip_set] exception: {e}; rolling back")
|
|
346
|
+
try:
|
|
347
|
+
self.db.session.rollback()
|
|
348
|
+
vb.write(verbose=True, content="[SnapshotCollection._skip_set] rollback successful")
|
|
349
|
+
except Exception:
|
|
350
|
+
vb.write(verbose=True, content="[SnapshotCollection._skip_set] rollback failed")
|
|
351
|
+
raise
|
|
323
352
|
|
|
324
353
|
def count_total(self) -> int:
|
|
325
354
|
return self.db.session.query(waybackup_snapshots.scid).count()
|
|
@@ -21,6 +21,27 @@ class Worker:
|
|
|
21
21
|
self.db = Database()
|
|
22
22
|
self.connection = http.client.HTTPSConnection("web.archive.org")
|
|
23
23
|
|
|
24
|
+
def close(self):
|
|
25
|
+
"""
|
|
26
|
+
Try to close the database and connection.
|
|
27
|
+
"""
|
|
28
|
+
try:
|
|
29
|
+
if hasattr(self, "db") and self.db:
|
|
30
|
+
try:
|
|
31
|
+
vb.write(verbose="high", content=f"[Worker.close] closing DB for worker {self.id}")
|
|
32
|
+
self.db.close()
|
|
33
|
+
vb.write(verbose="high", content=f"[Worker.close] DB closed for worker {self.id}")
|
|
34
|
+
except Exception:
|
|
35
|
+
pass
|
|
36
|
+
finally:
|
|
37
|
+
try:
|
|
38
|
+
if hasattr(self, "connection") and self.connection:
|
|
39
|
+
vb.write(verbose="high", content=f"[Worker.close] closing connection for worker {self.id}")
|
|
40
|
+
self.connection.close()
|
|
41
|
+
vb.write(verbose="high", content=f"[Worker.close] connection closed for worker {self.id}")
|
|
42
|
+
except Exception:
|
|
43
|
+
pass
|
|
44
|
+
|
|
24
45
|
def assign_snapshot(self, total_amount: int):
|
|
25
46
|
self.snapshot = Snapshot(self.db, output=self.output, mode=self.mode)
|
|
26
47
|
self.total_amount = total_amount
|
|
@@ -84,7 +105,11 @@ class Message(Worker):
|
|
|
84
105
|
content = content + " - " if content else ""
|
|
85
106
|
self.message = {
|
|
86
107
|
"verbose": False,
|
|
87
|
-
"content":
|
|
108
|
+
"content": (
|
|
109
|
+
f"{self.worker.snapshot.counter}/{self.worker.total_amount}"
|
|
110
|
+
f" - W:{self.worker.id} - {result}{content}"
|
|
111
|
+
f"{self.worker.snapshot.timestamp} - {self.worker.snapshot.url_origin}"
|
|
112
|
+
),
|
|
88
113
|
}
|
|
89
114
|
self.buffer.append(self.message)
|
|
90
115
|
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from argparse import RawTextHelpFormatter
|
|
4
|
+
from importlib.metadata import version
|
|
5
|
+
|
|
6
|
+
from pywaybackup.arg_specs import ARG_GROUPS, ARG_SPECS, EXCLUSIVE_GROUPS
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Arguments:
|
|
10
|
+
def __init__(self):
|
|
11
|
+
parser = argparse.ArgumentParser(
|
|
12
|
+
description=f"<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>\nby @bitdruid -> https://github.com/bitdruid",
|
|
13
|
+
formatter_class=RawTextHelpFormatter,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
groups = {name: parser.add_argument_group(name) for name in ARG_GROUPS}
|
|
17
|
+
|
|
18
|
+
exclusive = {
|
|
19
|
+
ex_name: groups[ex_meta["parent_group"]].add_mutually_exclusive_group(required=ex_meta["required"])
|
|
20
|
+
for ex_name, ex_meta in EXCLUSIVE_GROUPS.items()
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
for spec in ARG_SPECS:
|
|
24
|
+
target = exclusive[spec.exclusive_group] if spec.exclusive_group else groups[spec.group]
|
|
25
|
+
target.add_argument(*spec.flags, **_argparse_kwargs(spec))
|
|
26
|
+
|
|
27
|
+
args = parser.parse_args(args=None if sys.argv[1:] else ["--help"]) # if no arguments are given, print help
|
|
28
|
+
|
|
29
|
+
args.silent = False
|
|
30
|
+
args.debug = True
|
|
31
|
+
|
|
32
|
+
self.args = args
|
|
33
|
+
|
|
34
|
+
def get_args(self) -> dict:
|
|
35
|
+
"""Returns the parsed arguments as a dictionary."""
|
|
36
|
+
return vars(self.args)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _argparse_kwargs(spec) -> dict:
|
|
40
|
+
"""Translate an ArgSpec into kwargs for argparse.add_argument()."""
|
|
41
|
+
kwargs = {"help": spec.help}
|
|
42
|
+
if spec.action == "store_true":
|
|
43
|
+
kwargs["action"] = "store_true"
|
|
44
|
+
kwargs["default"] = bool(spec.default)
|
|
45
|
+
elif spec.action == "optional_value":
|
|
46
|
+
kwargs["type"] = spec.type
|
|
47
|
+
kwargs["nargs"] = "?"
|
|
48
|
+
kwargs["const"] = spec.const
|
|
49
|
+
kwargs["metavar"] = spec.metavar
|
|
50
|
+
kwargs["default"] = spec.default
|
|
51
|
+
else:
|
|
52
|
+
kwargs["type"] = spec.type
|
|
53
|
+
kwargs["metavar"] = spec.metavar
|
|
54
|
+
kwargs["default"] = spec.default
|
|
55
|
+
return kwargs
|