pywaybackup 4.1.5__tar.gz → 4.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {pywaybackup-4.1.5/pywaybackup.egg-info → pywaybackup-4.2.0}/PKG-INFO +17 -7
  2. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/README.md +14 -4
  3. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pyproject.toml +3 -3
  4. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/PyWayBackup.py +8 -0
  5. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Snapshot.py +69 -18
  6. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/SnapshotCollection.py +103 -74
  7. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Worker.py +26 -1
  8. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/archive_download.py +2 -0
  9. pywaybackup-4.2.0/pywaybackup/arg_parser.py +55 -0
  10. pywaybackup-4.2.0/pywaybackup/arg_specs.py +257 -0
  11. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/db.py +42 -8
  12. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/helper.py +8 -8
  13. pywaybackup-4.2.0/pywaybackup/interactive.py +144 -0
  14. pywaybackup-4.2.0/pywaybackup/main.py +32 -0
  15. {pywaybackup-4.1.5 → pywaybackup-4.2.0/pywaybackup.egg-info}/PKG-INFO +17 -7
  16. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/SOURCES.txt +3 -1
  17. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/requires.txt +1 -1
  18. pywaybackup-4.1.5/pywaybackup/Arguments.py +0 -71
  19. pywaybackup-4.1.5/pywaybackup/main.py +0 -13
  20. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/LICENSE +0 -0
  21. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Exception.py +0 -0
  22. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/Verbosity.py +0 -0
  23. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/__init__.py +0 -0
  24. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/archive_save.py +0 -0
  25. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup/files.py +0 -0
  26. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
  27. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/entry_points.txt +0 -0
  28. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/pywaybackup.egg-info/top_level.txt +0 -0
  29. {pywaybackup-4.1.5 → pywaybackup-4.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pywaybackup
3
- Version: 4.1.5
3
+ Version: 4.2.0
4
4
  Summary: Query and download archive.org as simple as possible.
5
5
  Author-email: bitdruid <bitdruid@outlook.com>
6
6
  License: MIT License
@@ -26,11 +26,11 @@ License: MIT License
26
26
  SOFTWARE.
27
27
 
28
28
  Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downloader
29
- Requires-Python: <3.14,>=3.8
29
+ Requires-Python: >=3.8
30
30
  Description-Content-Type: text/markdown
31
31
  License-File: LICENSE
32
32
  Requires-Dist: ruff
33
- Requires-Dist: SQLAlchemy==2.0.43
33
+ Requires-Dist: SQLAlchemy==2.0.51
34
34
  Requires-Dist: requests==2.32.3
35
35
  Requires-Dist: tqdm==4.67.1
36
36
  Requires-Dist: python-magic-standalone==0.4.28
@@ -39,7 +39,7 @@ Requires-Dist: python-magic-standalone==0.4.28
39
39
 
40
40
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
41
41
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
42
- ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
42
+ ![Python Version](https://img.shields.io/badge/Python-3.8%2B-blue)
43
43
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
44
44
 
45
45
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -68,6 +68,13 @@ This tool allows you to download content from the Wayback Machine (archive.org).
68
68
  2. Run the tool <br>
69
69
  `waybackup -h`
70
70
 
71
+ ### Standalone binary
72
+
73
+ Prebuilt executables for Windows, Linux and macOS are attached to each [release](https://github.com/bitdruid/python-wayback-machine-downloader/releases). No Python required.
74
+
75
+ - Run from a terminal with arguments like the pip version: `waybackup -h`
76
+ - Or start it without arguments (e.g. double-click on Windows) to enter **interactive mode** — the tool will prompt you for URL, mode and optional settings.
77
+
71
78
  ### Manual
72
79
 
73
80
  1. Clone the repository <br>
@@ -155,8 +162,9 @@ output:
155
162
 
156
163
  ## cli
157
164
 
158
- - `-h`, `--help`: Show the help message and exit.
159
- - `-v`, `--version`: Show information about the tool and exit.
165
+ - `-h`, `--help`: Show the help message and exit. Version info is shown in the help header.
166
+
167
+ > **Interactive mode:** running `waybackup` without any arguments in a terminal starts a guided prompt for URL, mode and optional settings. Without a terminal (scripts/cron), the help is printed instead.
160
168
 
161
169
  #### Required
162
170
 
@@ -171,6 +179,8 @@ output:
171
179
  Last Version. Gives one folder containing the last version of each file of specified `--range`.
172
180
  - **`-f`**, **`--first`**:<br>
173
181
  First Version. Gives one folder containing the first version of each file of specified `--range`.
182
+ - **`-s`**, **`--save`**:<br>
183
+ Save a page to the wayback machine (no download).
174
184
 
175
185
  #### Optional query parameters
176
186
 
@@ -219,7 +229,7 @@ Parameters will change the download behavior for snapshots.
219
229
  Set verbosity level. Available levels:
220
230
  - `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
221
231
  - `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
222
- - `high` (or `debug`, `detailed`, `max`): Detailed verbose output
232
+ - `high` (or `detailed`, `max`): Detailed verbose output
223
233
 
224
234
  Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
225
235
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
4
4
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
5
- ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
5
+ ![Python Version](https://img.shields.io/badge/Python-3.8%2B-blue)
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
7
 
8
8
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -31,6 +31,13 @@ This tool allows you to download content from the Wayback Machine (archive.org).
31
31
  2. Run the tool <br>
32
32
  `waybackup -h`
33
33
 
34
+ ### Standalone binary
35
+
36
+ Prebuilt executables for Windows, Linux and macOS are attached to each [release](https://github.com/bitdruid/python-wayback-machine-downloader/releases). No Python required.
37
+
38
+ - Run from a terminal with arguments like the pip version: `waybackup -h`
39
+ - Or start it without arguments (e.g. double-click on Windows) to enter **interactive mode** — the tool will prompt you for URL, mode and optional settings.
40
+
34
41
  ### Manual
35
42
 
36
43
  1. Clone the repository <br>
@@ -118,8 +125,9 @@ output:
118
125
 
119
126
  ## cli
120
127
 
121
- - `-h`, `--help`: Show the help message and exit.
122
- - `-v`, `--version`: Show information about the tool and exit.
128
+ - `-h`, `--help`: Show the help message and exit. Version info is shown in the help header.
129
+
130
+ > **Interactive mode:** running `waybackup` without any arguments in a terminal starts a guided prompt for URL, mode and optional settings. Without a terminal (scripts/cron), the help is printed instead.
123
131
 
124
132
  #### Required
125
133
 
@@ -134,6 +142,8 @@ output:
134
142
  Last Version. Gives one folder containing the last version of each file of specified `--range`.
135
143
  - **`-f`**, **`--first`**:<br>
136
144
  First Version. Gives one folder containing the first version of each file of specified `--range`.
145
+ - **`-s`**, **`--save`**:<br>
146
+ Save a page to the wayback machine (no download).
137
147
 
138
148
  #### Optional query parameters
139
149
 
@@ -182,7 +192,7 @@ Parameters will change the download behavior for snapshots.
182
192
  Set verbosity level. Available levels:
183
193
  - `low` (or `quiet`, `minimal`, `min`): Essential output only (same as no flag)
184
194
  - `default` (or `normal`, `verbose`): Standard verbose output (default when flag is set)
185
- - `high` (or `debug`, `detailed`, `max`): Detailed verbose output
195
+ - `high` (or `detailed`, `max`): Detailed verbose output
186
196
 
187
197
  Examples: `--verbose`, `--verbose default`, `--verbose high`, `-v high`
188
198
 
@@ -10,18 +10,18 @@ packages = ["pywaybackup"]
10
10
  [project]
11
11
  dependencies = [
12
12
  "ruff",
13
- "SQLAlchemy==2.0.43",
13
+ "SQLAlchemy==2.0.51",
14
14
  "requests==2.32.3",
15
15
  "tqdm==4.67.1",
16
16
  "python-magic-standalone==0.4.28",
17
17
  ]
18
18
  name = "pywaybackup"
19
- version = "4.1.5"
19
+ version = "4.2.0"
20
20
  description = "Query and download archive.org as simple as possible."
21
21
  authors = [{ name = "bitdruid", email = "bitdruid@outlook.com" }]
22
22
  license = { file = "LICENSE" }
23
23
  readme = "README.md"
24
- requires-python = ">=3.8,<3.14"
24
+ requires-python = ">=3.8"
25
25
 
26
26
 
27
27
  [project.scripts]
@@ -364,6 +364,7 @@ class PyWayBackup:
364
364
  resources after the backup is complete.
365
365
 
366
366
  """
367
+ collection = None
367
368
  try:
368
369
  self._startup()
369
370
 
@@ -385,6 +386,12 @@ class PyWayBackup:
385
386
  self._keep = True
386
387
  ex.exception(message="", e=e)
387
388
  finally:
389
+ # if a collection was created during the workflow, close its DB session cleanly
390
+ try:
391
+ if collection:
392
+ collection.close()
393
+ except Exception:
394
+ pass
388
395
  self._shutdown()
389
396
 
390
397
  def paths(self, rel: bool = False) -> dict:
@@ -503,6 +510,7 @@ class PyWayBackup:
503
510
  collection = SnapshotCollection()
504
511
  collection.close()
505
512
  self._csvfile.store_result()
513
+ db.close_engine()
506
514
  self._f_keep()
507
515
  vb.fini()
508
516
  signal.signal(signal.SIGINT, signal.SIG_IGN)
@@ -1,8 +1,9 @@
1
1
  import os
2
2
  import threading
3
3
 
4
- from pywaybackup.db import Database, select, update, waybackup_snapshots
4
+ from pywaybackup.db import Database, select, update, waybackup_snapshots, and_
5
5
  from pywaybackup.helper import url_split
6
+ from pywaybackup.Verbosity import Verbosity as vb
6
7
 
7
8
 
8
9
  class Snapshot:
@@ -70,20 +71,58 @@ class Snapshot:
70
71
  return False
71
72
 
72
73
  def __get_row():
73
- with self._db.session.begin():
74
- row = self._db.session.execute(
75
- select(waybackup_snapshots)
76
- .where(waybackup_snapshots.response.is_(None))
77
- .order_by(waybackup_snapshots.scid)
78
- .limit(1)
79
- .with_for_update(skip_locked=True)
80
- ).scalar_one_or_none()
81
-
82
- if row is None:
83
- return None
84
-
85
- row.response = "LOCK"
74
+ # Atomic claim: find next unprocessed scid, set response='LOCK' only if still unprocessed,
75
+ # then fetch that row. This avoids relying on FOR UPDATE or explicit nested transactions
76
+ # which can trigger "A transaction is already begun on this Session" errors.
77
+
78
+ session = self._db.session
79
+
80
+ # get next available SnapshotId
81
+ vb.write(verbose="high", content="[Snapshot.fetch] selecting next scid")
82
+ scid = session.execute(
83
+ select(waybackup_snapshots.scid)
84
+ .where(waybackup_snapshots.response.is_(None))
85
+ .order_by(waybackup_snapshots.scid)
86
+ .limit(1)
87
+ ).scalar_one_or_none()
88
+
89
+ if scid is None:
90
+ vb.write(verbose="high", content="[Snapshot.fetch] no unprocessed scid found")
91
+ return None
92
+
93
+ # try to atomically claim the row by updating only if still unclaimed
94
+ result = session.execute(
95
+ update(waybackup_snapshots)
96
+ .where(and_(waybackup_snapshots.scid == scid, waybackup_snapshots.response.is_(None)))
97
+ .values(response="LOCK")
98
+ )
99
+
100
+ # if another worker claimed it first, rowcount will be 0 — retry to get next available row
101
+ vb.write(
102
+ verbose="high", content=f"[Snapshot.fetch] attempted to claim scid={scid}, rowcount={result.rowcount}"
103
+ )
104
+ if result.rowcount == 0:
105
+ # TOCTOU: __get_row(): another worker claimed this row between our SELECT and UPDATE.
106
+ # Retry instead of returning None to avoid premature worker termination.
107
+ try:
108
+ session.commit()
109
+ except Exception:
110
+ pass
111
+ vb.write(verbose="high", content=f"[Snapshot.fetch] scid={scid} already claimed, retrying")
112
+ return __get_row()
86
113
 
114
+ # The row has been claimed by the worker and can now be fetched.
115
+ row = session.execute(
116
+ select(waybackup_snapshots).where(waybackup_snapshots.scid == scid)
117
+ ).scalar_one_or_none()
118
+ try:
119
+ session.commit()
120
+ except Exception:
121
+ try:
122
+ session.rollback()
123
+ except Exception:
124
+ pass
125
+ vb.write(verbose="high", content=f"[Snapshot.fetch] claimed scid={scid} and fetched row")
87
126
  return row
88
127
 
89
128
  if __on_sqlite():
@@ -101,10 +140,22 @@ class Snapshot:
101
140
  value: New value to set for the column.
102
141
  """
103
142
  column = getattr(waybackup_snapshots, column)
104
- self._db.session.execute(
105
- update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value})
106
- )
107
- self._db.session.commit()
143
+ try:
144
+ vb.write(verbose="high", content=f"[Snapshot.modify] updating scid={self.scid} column={column.key}")
145
+ self._db.session.execute(
146
+ update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value})
147
+ )
148
+ self._db.session.commit()
149
+ vb.write(verbose="high", content=f"[Snapshot.modify] update committed scid={self.scid} column={column.key}")
150
+ except Exception as e:
151
+ vb.write(
152
+ verbose="high", content=f"[Snapshot.modify] update failed scid={self.scid} error={e}; rolling back"
153
+ )
154
+ try:
155
+ self._db.session.rollback()
156
+ except Exception:
157
+ pass
158
+ raise
108
159
 
109
160
  def create_output(self):
110
161
  """
@@ -1,6 +1,6 @@
1
1
  import json
2
2
 
3
- from pywaybackup.db import Database, Index, and_, delete, func, or_, select, tuple_, update, waybackup_snapshots
3
+ from pywaybackup.db import Database, and_, delete, func, or_, select, text, tuple_, update, waybackup_snapshots
4
4
  from pywaybackup.files import CDXfile, CSVfile
5
5
  from pywaybackup.Verbosity import Progressbar
6
6
  from pywaybackup.Verbosity import Verbosity as vb
@@ -163,73 +163,89 @@ class SnapshotCollection:
163
163
 
164
164
  vb.write(verbose=None, content="\nInserting CDX data into database...")
165
165
 
166
- progressbar = Progressbar(
167
- unit=" lines",
168
- total=self._cdx_total,
169
- desc="process cdx".ljust(15),
170
- ascii="░▒█",
171
- bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
172
- )
173
- line_batchsize = 2500
174
- line_batch = []
175
- total_inserted = 0
176
- first_line = True
177
-
178
- with self.cdxfile as f:
179
- for line in f:
180
- if first_line:
181
- first_line = False
182
- continue
183
- line = line.strip()
184
- if line.endswith("]]"):
185
- line = line.rsplit("]", 1)[0]
186
- if line.endswith(","):
187
- line = line.rsplit(",", 1)[0]
188
-
189
- try:
190
- line_batch.append(__parse_line(line))
191
- except json.decoder.JSONDecodeError:
192
- self._snapshot_faulty += 1
193
- continue
194
-
195
- if len(line_batch) >= line_batchsize:
166
+ try:
167
+ vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] starting insert_cdx operation")
168
+ progressbar = Progressbar(
169
+ unit=" lines",
170
+ total=self._cdx_total,
171
+ desc="process cdx".ljust(15),
172
+ ascii="░▒█",
173
+ bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
174
+ )
175
+ line_batchsize = 2500
176
+ line_batch = []
177
+ total_inserted = 0
178
+ first_line = True
179
+
180
+ with self.cdxfile as f:
181
+ for line in f:
182
+ if first_line:
183
+ first_line = False
184
+ continue
185
+ line = line.strip()
186
+ if line.endswith("]]"):
187
+ line = line.rsplit("]", 1)[0]
188
+ if line.endswith(","):
189
+ line = line.rsplit(",", 1)[0]
190
+
191
+ try:
192
+ line_batch.append(__parse_line(line))
193
+ except json.decoder.JSONDecodeError:
194
+ self._snapshot_faulty += 1
195
+ continue
196
+
197
+ if len(line_batch) >= line_batchsize:
198
+ total_inserted += _insert_batch_safe(line_batch=line_batch)
199
+ line_batch = []
200
+ progressbar.update(line_batchsize)
201
+
202
+ if line_batch:
196
203
  total_inserted += _insert_batch_safe(line_batch=line_batch)
197
- line_batch = []
198
- progressbar.update(line_batchsize)
199
-
200
- if line_batch:
201
- total_inserted += _insert_batch_safe(line_batch=line_batch)
202
- progressbar.update(len(line_batch))
203
-
204
- self.db.session.commit()
204
+ progressbar.update(len(line_batch))
205
+
206
+ self.db.session.commit()
207
+ vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] insert_cdx commit successful")
208
+ except Exception as e:
209
+ vb.write(verbose=True, content=f"[SnapshotCollection._insert_cdx] exception: {e}; rolling back")
210
+ try:
211
+ self.db.session.rollback()
212
+ vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] rollback successful")
213
+ except Exception:
214
+ vb.write(verbose=True, content="[SnapshotCollection._insert_cdx] rollback failed")
215
+ raise
205
216
 
206
217
  def _index_snapshots(self):
207
218
  """
208
219
  Create indexes for the snapshot table.
220
+
221
+ Raw DDL instead of sqlalchemy Index objects: Index(...) attaches to the
222
+ module-global table metadata, which accumulates duplicates when the
223
+ package is reused in-process (library usage) and breaks create_all().
209
224
  """
210
225
  # index for filtering last snapshots
211
226
  if self._mode_last:
212
- idx1 = Index(
213
- "idx_waybackup_snapshots_url_origin_timestamp_desc",
214
- waybackup_snapshots.url_origin,
215
- waybackup_snapshots.timestamp.desc(),
227
+ self.db.session.execute(
228
+ text(
229
+ "CREATE INDEX IF NOT EXISTS idx_waybackup_snapshots_url_origin_timestamp_desc "
230
+ "ON waybackup_snapshots (url_origin, timestamp DESC)"
231
+ )
216
232
  )
217
- idx1.create(self.db.session.bind, checkfirst=True)
218
233
  # index for filtering first snapshots
219
234
  if self._mode_first:
220
- idx2 = Index(
221
- "idx_waybackup_snapshots_url_origin_timestamp_asc",
222
- waybackup_snapshots.url_origin,
223
- waybackup_snapshots.timestamp.asc(),
235
+ self.db.session.execute(
236
+ text(
237
+ "CREATE INDEX IF NOT EXISTS idx_waybackup_snapshots_url_origin_timestamp_asc "
238
+ "ON waybackup_snapshots (url_origin, timestamp ASC)"
239
+ )
224
240
  )
225
- idx2.create(self.db.session.bind, checkfirst=True)
226
241
  # index for skippable snapshots
227
- idx3 = Index(
228
- "idx_waybackup_snapshots_timestamp_url_origin_response",
229
- waybackup_snapshots.timestamp,
230
- waybackup_snapshots.url_origin,
242
+ self.db.session.execute(
243
+ text(
244
+ "CREATE INDEX IF NOT EXISTS idx_waybackup_snapshots_timestamp_url_origin_response "
245
+ "ON waybackup_snapshots (timestamp, url_origin)"
246
+ )
231
247
  )
232
- idx3.create(self.db.session.bind, checkfirst=True)
248
+ self.db.session.commit()
233
249
 
234
250
  def _filter_snapshots(self):
235
251
  """
@@ -297,29 +313,42 @@ class SnapshotCollection:
297
313
  """
298
314
 
299
315
  # ? for now per row / no bulk for compatibility
300
- with self.csvfile as f:
301
- total_skipped = 0
302
- for row in f:
303
- self.db.session.execute(
304
- update(waybackup_snapshots)
305
- .where(
306
- and_(
307
- waybackup_snapshots.timestamp == row["timestamp"],
308
- waybackup_snapshots.url_origin == row["url_origin"],
316
+ try:
317
+ vb.write(verbose=True, content="[SnapshotCollection._skip_set] applying CSV skips to DB")
318
+ with self.csvfile as f:
319
+ total_skipped = 0
320
+ for row in f:
321
+ self.db.session.execute(
322
+ update(waybackup_snapshots)
323
+ .where(
324
+ and_(
325
+ waybackup_snapshots.timestamp == row["timestamp"],
326
+ waybackup_snapshots.url_origin == row["url_origin"],
327
+ )
328
+ )
329
+ .values(
330
+ url_archive=row["url_archive"],
331
+ redirect_url=row["redirect_url"],
332
+ redirect_timestamp=row["redirect_timestamp"],
333
+ response=row["response"],
334
+ file=row["file"],
309
335
  )
310
336
  )
311
- .values(
312
- url_archive=row["url_archive"],
313
- redirect_url=row["redirect_url"],
314
- redirect_timestamp=row["redirect_timestamp"],
315
- response=row["response"],
316
- file=row["file"],
317
- )
318
- )
319
- total_skipped += 1
337
+ total_skipped += 1
320
338
 
321
- self.db.session.commit()
322
- self._filter_skip = total_skipped
339
+ self.db.session.commit()
340
+ self._filter_skip = total_skipped
341
+ vb.write(
342
+ verbose=True, content=f"[SnapshotCollection._skip_set] commit successful, total_skipped={total_skipped}"
343
+ )
344
+ except Exception as e:
345
+ vb.write(verbose=True, content=f"[SnapshotCollection._skip_set] exception: {e}; rolling back")
346
+ try:
347
+ self.db.session.rollback()
348
+ vb.write(verbose=True, content="[SnapshotCollection._skip_set] rollback successful")
349
+ except Exception:
350
+ vb.write(verbose=True, content="[SnapshotCollection._skip_set] rollback failed")
351
+ raise
323
352
 
324
353
  def count_total(self) -> int:
325
354
  return self.db.session.query(waybackup_snapshots.scid).count()
@@ -21,6 +21,27 @@ class Worker:
21
21
  self.db = Database()
22
22
  self.connection = http.client.HTTPSConnection("web.archive.org")
23
23
 
24
def close(self):
    """
    Release the worker's database handle and HTTPS connection (best effort).

    The connection is handled in a ``finally`` clause, so closing it is
    attempted regardless of how the database step ends. Exceptions raised
    while logging or closing either resource are suppressed.
    """
    try:
        database = getattr(self, "db", None)
        if database:
            try:
                vb.write(verbose="high", content=f"[Worker.close] closing DB for worker {self.id}")
                database.close()
                vb.write(verbose="high", content=f"[Worker.close] DB closed for worker {self.id}")
            except Exception:
                # cleanup must never abort the caller
                pass
    finally:
        try:
            conn = getattr(self, "connection", None)
            if conn:
                vb.write(verbose="high", content=f"[Worker.close] closing connection for worker {self.id}")
                conn.close()
                vb.write(verbose="high", content=f"[Worker.close] connection closed for worker {self.id}")
        except Exception:
            # same best-effort policy as for the database handle
            pass
44
+
24
45
  def assign_snapshot(self, total_amount: int):
25
46
  self.snapshot = Snapshot(self.db, output=self.output, mode=self.mode)
26
47
  self.total_amount = total_amount
@@ -84,7 +105,11 @@ class Message(Worker):
84
105
  content = content + " - " if content else ""
85
106
  self.message = {
86
107
  "verbose": False,
87
- "content": f"{self.worker.snapshot.counter}/{self.worker.total_amount} - W:{self.worker.id} - {result}{content}{self.worker.snapshot.timestamp} - {self.worker.snapshot.url_origin}",
108
+ "content": (
109
+ f"{self.worker.snapshot.counter}/{self.worker.total_amount}"
110
+ f" - W:{self.worker.id} - {result}{content}"
111
+ f"{self.worker.snapshot.timestamp} - {self.worker.snapshot.url_origin}"
112
+ ),
88
113
  }
89
114
  self.buffer.append(self.message)
90
115
 
@@ -267,6 +267,8 @@ class DownloadArchive:
267
267
 
268
268
  except Exception as e:
269
269
  ex.exception(f"\nWorker: {worker.id} - Exception", e)
270
+ finally:
271
+ worker.close()
270
272
 
271
273
  def _download(self, worker: Worker):
272
274
  """
@@ -0,0 +1,55 @@
1
+ import argparse
2
+ import sys
3
+ from argparse import RawTextHelpFormatter
4
+ from importlib.metadata import version
5
+
6
+ from pywaybackup.arg_specs import ARG_GROUPS, ARG_SPECS, EXCLUSIVE_GROUPS
7
+
8
+
9
class Arguments:
    """Command-line parser assembled from the declarative tables in ``pywaybackup.arg_specs``."""

    def __init__(self):
        """Build the parser, parse ``sys.argv`` and store the result in ``self.args``."""
        banner = (
            f"<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>\n"
            "by @bitdruid -> https://github.com/bitdruid"
        )
        parser = argparse.ArgumentParser(description=banner, formatter_class=RawTextHelpFormatter)

        # one titled help section per declared argument group
        groups = {}
        for group_name in ARG_GROUPS:
            groups[group_name] = parser.add_argument_group(group_name)

        # mutually exclusive groups are nested inside their parent section
        exclusive = {}
        for ex_name, ex_meta in EXCLUSIVE_GROUPS.items():
            parent = groups[ex_meta["parent_group"]]
            exclusive[ex_name] = parent.add_mutually_exclusive_group(required=ex_meta["required"])

        for spec in ARG_SPECS:
            if spec.exclusive_group:
                target = exclusive[spec.exclusive_group]
            else:
                target = groups[spec.group]
            target.add_argument(*spec.flags, **_argparse_kwargs(spec))

        # if no arguments are given, print help
        cli_given = bool(sys.argv[1:])
        parsed = parser.parse_args(args=None if cli_given else ["--help"])

        # fixed values set on every parsed namespace
        parsed.silent = False
        parsed.debug = True

        self.args = parsed

    def get_args(self) -> dict:
        """Return the parsed arguments as a dictionary."""
        return vars(self.args)
37
+
38
+
39
def _argparse_kwargs(spec) -> dict:
    """Translate an ArgSpec into kwargs for argparse.add_argument().

    Three shapes are produced, selected by ``spec.action``:

    * ``"store_true"``: ``help``, ``action`` and a boolean ``default``.
    * ``"optional_value"``: ``help``, ``type``, ``nargs="?"``, ``const``,
      ``metavar`` and ``default``.
    * anything else: ``help``, ``type``, ``metavar`` and ``default``.
    """
    options = {"help": spec.help}
    action = spec.action

    if action == "store_true":
        # flags carry no type/metavar; the default is coerced to a real bool
        options.update(action="store_true", default=bool(spec.default))
        return options

    options["type"] = spec.type
    if action == "optional_value":
        # the flag may appear without a value, in which case ``const`` is used
        options.update(nargs="?", const=spec.const)
    options.update(metavar=spec.metavar, default=spec.default)
    return options