pywaybackup 3.0.4__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {pywaybackup-3.0.4/pywaybackup.egg-info → pywaybackup-3.1.0}/PKG-INFO +3 -2
  2. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/README.md +1 -1
  3. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pyproject.toml +2 -1
  4. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Arguments.py +2 -1
  5. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Exception.py +16 -19
  6. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/SnapshotCollection.py +24 -22
  7. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Verbosity.py +7 -5
  8. pywaybackup-3.0.4/pywaybackup/archive.py → pywaybackup-3.1.0/pywaybackup/archive_download.py +21 -81
  9. pywaybackup-3.1.0/pywaybackup/archive_save.py +81 -0
  10. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/db.py +1 -2
  11. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/helper.py +20 -15
  12. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/main.py +13 -11
  13. {pywaybackup-3.0.4 → pywaybackup-3.1.0/pywaybackup.egg-info}/PKG-INFO +3 -2
  14. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/SOURCES.txt +2 -1
  15. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/requires.txt +1 -0
  16. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/LICENSE +0 -0
  17. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/Converter.py +0 -0
  18. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup/__init__.py +0 -0
  19. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
  20. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/entry_points.txt +0 -0
  21. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/pywaybackup.egg-info/top_level.txt +0 -0
  22. {pywaybackup-3.0.4 → pywaybackup-3.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pywaybackup
3
- Version: 3.0.4
3
+ Version: 3.1.0
4
4
  Summary: Query and download archive.org as simple as possible.
5
5
  Author-email: bitdruid <bitdruid@outlook.com>
6
6
  License: MIT License
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
29
29
  Requires-Python: >=3.8
30
30
  Description-Content-Type: text/markdown
31
31
  License-File: LICENSE
32
+ Requires-Dist: pysqlite3-binary==0.5.4
32
33
  Requires-Dist: requests==2.31.0
33
34
  Requires-Dist: tqdm==4.66.2
34
35
  Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
@@ -39,7 +40,7 @@ Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
39
40
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
40
41
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
41
42
  ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
42
- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.25-blue)
43
+ <!-- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.35-blue) -->
43
44
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
44
45
 
45
46
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -3,7 +3,7 @@
3
3
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
4
4
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
5
5
  ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
6
- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.25-blue)
6
+ <!-- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.35-blue) -->
7
7
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
8
 
9
9
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
7
7
 
8
8
  [project]
9
9
  name = "pywaybackup"
10
- version = "3.0.4"
10
+ version = "3.1.0"
11
11
  description = "Query and download archive.org as simple as possible."
12
12
  authors = [
13
13
  { name = "bitdruid", email = "bitdruid@outlook.com" }
@@ -16,6 +16,7 @@ license = { file = "LICENSE" }
16
16
  readme = "README.md"
17
17
  requires-python = ">=3.8"
18
18
  dependencies = [
19
+ "pysqlite3-binary==0.5.4",
19
20
  "requests==2.31.0",
20
21
  "tqdm==4.66.2",
21
22
  "python-magic==0.4.27; sys_platform == 'linux'",
@@ -2,6 +2,7 @@
2
2
  import sys
3
3
  import os
4
4
  import argparse
5
+
5
6
  from importlib.metadata import version
6
7
 
7
8
  from pywaybackup.helper import url_split, sanitize_filename
@@ -74,7 +75,7 @@ class Configuration:
74
75
 
75
76
  if cls.output is None:
76
77
  cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
77
- os.makedirs(cls.output, exist_ok=True)
78
+ os.makedirs(cls.output, exist_ok=True) if not cls.save else None
78
79
 
79
80
  if cls.log is True:
80
81
  cls.log = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.log")
@@ -1,34 +1,33 @@
1
-
2
1
  import sys
3
2
  import os
4
- from datetime import datetime
3
+ import re
5
4
  import linecache
6
5
  import traceback
7
-
8
- import re
6
+ from datetime import datetime
9
7
 
10
8
  from importlib.metadata import version
11
9
 
12
- class Exception:
13
10
 
11
+ class Exception:
14
12
  new_debug = True
15
13
  output = None
16
14
  command = None
17
15
 
18
16
  @classmethod
19
17
  def init(cls, output=None, command=None):
20
- sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
18
+ sys.excepthook = (
19
+ cls.exception_handler
20
+ ) # set custom exception handler (uncaught exceptions)
21
21
  cls.output = output
22
22
  cls.command = command
23
23
 
24
24
  @classmethod
25
25
  def exception(cls, message: str, e: Exception, tb=None):
26
26
  custom_tb = sys.exc_info()[-1] if tb is None else tb
27
- original_tb = cls.relativate_path("".join(traceback.format_exception(type(e), e, e.__traceback__)))
28
- exception_message = (
29
- "-------------------------\n"
30
- f"!-- Exception: {message}\n"
27
+ original_tb = cls.relativate_path(
28
+ "".join(traceback.format_exception(type(e), e, e.__traceback__))
31
29
  )
30
+ exception_message = f"-------------------------\n!-- Exception: {message}\n"
32
31
  if custom_tb is not None:
33
32
  while custom_tb.tb_next: # loop to last traceback frame
34
33
  custom_tb = custom_tb.tb_next
@@ -46,10 +45,7 @@ class Exception:
46
45
  )
47
46
  else:
48
47
  exception_message += "!-- Traceback is None\n"
49
- exception_message += (
50
- f"!-- Description: {e}\n"
51
- "-------------------------"
52
- )
48
+ exception_message += f"!-- Description: {e}\n-------------------------"
53
49
  print(exception_message)
54
50
  debug_file = os.path.join(cls.output, "waybackup_error.log")
55
51
  print(f"Exception log: {debug_file}")
@@ -85,10 +81,10 @@ class Exception:
85
81
  if os.path.isfile(input): # case single path
86
82
  return os.path.relpath(input, os.getcwd())
87
83
  input_modified = ""
88
- input_lines = input.split('\n')
89
- if len(input_lines) == 1: # case single line
84
+ input_lines = input.split("\n")
85
+ if len(input_lines) == 1: # case single line
90
86
  return input
91
- for line in input.split('\n'): # case multiple lines
87
+ for line in input.split("\n"): # case multiple lines
92
88
  match = path_pattern.search(line)
93
89
  if match:
94
90
  original_path = match.group(1)
@@ -104,5 +100,6 @@ class Exception:
104
100
  if issubclass(exception_type, KeyboardInterrupt):
105
101
  sys.__excepthook__(exception_type, exception, traceback)
106
102
  return
107
- Exception.exception("UNCAUGHT EXCEPTION", exception, traceback) # uncaught exceptions also with custom scheme
108
-
103
+ Exception.exception(
104
+ "UNCAUGHT EXCEPTION", exception, traceback
105
+ ) # uncaught exceptions also with custom scheme
@@ -1,7 +1,6 @@
1
1
  import json
2
2
  import csv
3
3
  import os
4
- import threading
5
4
 
6
5
  from tqdm import tqdm
7
6
 
@@ -9,8 +8,6 @@ from pywaybackup.Verbosity import Verbosity as vb
9
8
  from pywaybackup.helper import url_split
10
9
  from pywaybackup.db import Database
11
10
 
12
- LOCK = threading.Lock() # thread safe lock
13
-
14
11
  class SnapshotCollection:
15
12
  """
16
13
  Represents the interaction with the snapshot-collection contained in the snapshot database.
@@ -283,31 +280,36 @@ class SnapshotCollection:
283
280
  """
284
281
  Modify a snapshot-row in the snapshot table.
285
282
  """
286
- global LOCK
287
- with LOCK:
288
- connection.cursor.execute(
289
- f"""
290
- UPDATE snapshot_tbl
291
- SET {column} = ?
292
- WHERE rowid = ?
293
- """,
294
- (value, snapshot_id)
295
- )
296
- connection.conn.commit()
283
+ connection.cursor.execute(
284
+ f"""
285
+ UPDATE snapshot_tbl
286
+ SET {column} = ?
287
+ WHERE rowid = ?
288
+ """,
289
+ (value, snapshot_id)
290
+ )
291
+ connection.conn.commit()
297
292
 
298
293
  def get_snapshot(connection):
299
294
  """
300
295
  Get a snapshot-row from the snapshot table with response NULL. (not processed)
301
296
  """
302
- global LOCK
303
- with LOCK:
304
- connection.cursor.execute(
305
- """
306
- SELECT rowid, * FROM snapshot_tbl WHERE response IS NULL LIMIT 1
307
- """
297
+ # mark as locked for other workers // only visual because get_snapshot fetches by NULL
298
+ connection.cursor.execute(
299
+ """
300
+ UPDATE snapshot_tbl
301
+ SET response = 'LOCK'
302
+ WHERE rowid = (
303
+ SELECT rowid FROM snapshot_tbl
304
+ WHERE response IS NULL
305
+ LIMIT 1
308
306
  )
309
- row = connection.cursor.fetchone()
310
- return row
307
+ RETURNING rowid, *;
308
+ """
309
+ )
310
+ row = connection.cursor.fetchone()
311
+ connection.conn.commit()
312
+ return row
311
313
 
312
314
  @classmethod
313
315
  def create_output(cls, url: str, timestamp: str, output: str):
@@ -1,4 +1,3 @@
1
- import sys
2
1
  from tqdm import tqdm
3
2
 
4
3
  class Verbosity:
@@ -63,21 +62,24 @@ class Verbosity:
63
62
  cls.pbar.refresh()
64
63
 
65
64
  @classmethod
66
- def generate_logline(cls, status: str = "", type: str = "", message: str = ""):
65
+ def generate_logline(cls, status: str, type: str, message: str):
67
66
  """
68
- STATUS -> TYPE: MESSAGE
67
+ STATUS TYPE: MESSAGE
69
68
  """
70
69
 
71
70
  if not status and not type:
72
71
  return message
73
72
 
74
- status_length = 11
73
+ status_length = 10
75
74
  type_length = 5
76
75
 
77
76
  status = status.ljust(status_length)
77
+ status = f"{status} -> "
78
+
78
79
  type = type.ljust(type_length)
80
+ type = f"{type}: " if type.strip() else ""
79
81
 
80
- log_entry = f"{status} -> {type}: {message}"
82
+ log_entry = f"{status}{type}{message}"
81
83
 
82
84
  return log_entry
83
85
 
@@ -1,84 +1,26 @@
1
- import requests
2
- import os
3
1
  import gzip
2
+ import http.client
3
+ import os
4
4
  import threading
5
5
  import time
6
6
  import urllib.parse
7
- import http.client
7
+ from datetime import datetime
8
+ from socket import timeout
8
9
  from urllib.parse import urljoin
9
- from datetime import datetime, timezone
10
-
11
- from tqdm import tqdm
12
10
 
13
- from socket import timeout
11
+ from importlib.metadata import version
14
12
 
15
- from pywaybackup.helper import url_get_timestamp, move_index, check_nt
13
+ import requests
14
+ from tqdm import tqdm
16
15
 
17
- from pywaybackup.SnapshotCollection import SnapshotCollection as sc
18
16
  from pywaybackup.Arguments import Configuration as config
19
- from pywaybackup.db import Database
20
-
21
- from importlib.metadata import version
22
-
17
+ from pywaybackup.Exception import Exception as ex
18
+ from pywaybackup.SnapshotCollection import SnapshotCollection as sc
23
19
  from pywaybackup.Verbosity import Message
24
20
  from pywaybackup.Verbosity import Verbosity as vb
25
- from pywaybackup.Exception import Exception as ex
26
-
27
-
28
-
29
-
30
-
31
- # GET: store page to wayback machine and response with redirect to snapshot
32
- # POST: store page to wayback machine and response with wayback machine status-page
33
- # tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
34
- # tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
35
- # tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
36
- def save_page(url: str):
37
- """
38
- Saves a webpage to the Wayback Machine.
39
-
40
- Args:
41
- url (str): The URL of the webpage to be saved.
42
-
43
- Returns:
44
- None: The function does not return any value. It only prints messages to the console.
45
- """
46
- vb.write(message="\nSaving page to the Wayback Machine...")
47
- connection = http.client.HTTPSConnection("web.archive.org")
48
- headers = {
49
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
50
- }
51
- connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
52
- vb.write(message="\n-----> Request sent")
53
- response = connection.getresponse()
54
- response_status = response.status
55
-
56
- if response_status == 302:
57
- location = response.getheader("Location")
58
- vb.write(message="\n-----> Response: 302 (redirect to snapshot)")
59
- snapshot_timestamp = datetime.strptime(url_get_timestamp(location), '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
60
- current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
61
- timestamp_difference = (datetime.strptime(current_timestamp, '%Y-%m-%d %H:%M:%S') - datetime.strptime(snapshot_timestamp, '%Y-%m-%d %H:%M:%S')).seconds / 60
62
- timestamp_difference = int(round(timestamp_difference, 0))
63
-
64
- if timestamp_difference < 1:
65
- vb.write(message="\n-----> New snapshot created")
66
- elif timestamp_difference > 1:
67
- vb.write(message=f"\n-----> Snapshot already exists. (1 hour limit) - wait for {60 - timestamp_difference} minutes")
68
- vb.write(message=f"TIMESTAMP SNAPSHOT: {snapshot_timestamp}")
69
- vb.write(message=f"TIMESTAMP REQUEST : {current_timestamp}")
70
- vb.write(message=f"\nLAST SNAPSHOT BACK: {timestamp_difference} minutes")
71
-
72
- vb.write(message=f"\nURL: {location}")
73
-
74
- elif response_status == 404:
75
- vb.write(message="\n-----> Response: 404 (not found)")
76
- vb.write(message=f"\nFAILED -> URL: {url}")
77
- else:
78
- vb.write(message="\n-----> Response: unexpected")
79
- vb.write(message=f"\nFAILED -> URL: {url}")
21
+ from pywaybackup.db import Database
22
+ from pywaybackup.helper import check_nt, move_index, url_get_timestamp
80
23
 
81
- connection.close()
82
24
 
83
25
 
84
26
 
@@ -89,7 +31,7 @@ def startup():
89
31
  vb.write(message=f"\n<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>")
90
32
 
91
33
  if Database.QUERY_EXIST:
92
- vb.write(message=f"\nExisting query snapshots processed: {Database.QUERY_PROGRESS}\nResuming download... (to reset the job use '--reset')\n")
34
+ vb.write(message=f"\nDOWNLOAD job exist - processed: {Database.QUERY_PROGRESS}\nResuming download... (to reset the job use '--reset')\n")
93
35
 
94
36
  for i in range(5, -1, -1):
95
37
  vb.write(message=f"\r{i}...")
@@ -224,8 +166,6 @@ def download_loop(output, worker, retry, no_redirect, delay):
224
166
 
225
167
  snapshot = sc.get_snapshot(db)
226
168
  if not snapshot: break
227
- # mark as locked for other workers // only visual because get_snapshot fetches by NULL
228
- sc.modify_snapshot(db, snapshot["rowid"], "response", "LOCK")
229
169
  SNAPSHOT_CURRENT = snapshot["rowid"]
230
170
 
231
171
  retry_attempt = 1
@@ -273,18 +213,18 @@ def download_loop(output, worker, retry, no_redirect, delay):
273
213
 
274
214
  # depends on user - retries after timeout or proceed to next snapshot
275
215
  if retry > 0:
276
- status_message.store(message=f"\n-----> Worker: {worker} - Attempt: [{retry_attempt}/{retry_max_attempt}] Snapshot ID: [{SNAPSHOT_CURRENT}/{sc.SNAPSHOT_TOTAL}] - Download failed - retry Timeout: 15 seconds...")
216
+ status_message.store(status="FAILED", message="retry timeout: 15 seconds...")
277
217
  status_message.write()
278
218
  time.sleep(15)
279
219
  else:
280
- status_message.store(message=f"\n-----> Worker: {worker} - Attempt: [{retry_attempt}/{retry_max_attempt}] Snapshot ID: [{SNAPSHOT_CURRENT}/{sc.SNAPSHOT_TOTAL}] - Download failed")
220
+ status_message.store(status="FAILED", message="no attempt left")
281
221
  status_message.write()
282
222
  sc.SNAPSHOT_HANDLED += 1
283
223
  break # break all loops and do a user-defined retry
284
224
 
285
225
  retry_attempt += 1
286
226
  # if retry_attempt > retry_max_attempt:
287
- # status_message.store(status="FAILED", type="HTTP", message="Max retries exceeded")
227
+ # status_message.store(status="FAILED", message="Max retries exceeded")
288
228
  # status_message.store(status="", type="URL", message=snapshot["url_archive"])
289
229
  # status_message.write()
290
230
  # vb.progress(1)
@@ -309,7 +249,7 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
309
249
  response, response_data, response_status, response_status_message = download_response(connection, encoded_download_url, headers)
310
250
  sc.modify_snapshot(db, snapshot_entry["rowid"], "response", response_status)
311
251
  if not no_redirect and response_status == 302:
312
- status_message.store(status="REDIRECT", type="HTTP", message=f"{response.status} - {response_status_message}")
252
+ status_message.store(status="REDIRECT", message=f"{response.status} - {response_status_message}")
313
253
  status_message.store(status="", type="FROM", message=download_url)
314
254
  for _ in range(5):
315
255
  response, response_data, response_status, response_status_message = download_response(connection, encoded_download_url, headers)
@@ -327,7 +267,7 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
327
267
 
328
268
  # if output_file is too long for windows, skip download
329
269
  if check_nt() and len(output_file) > 255:
330
- status_message.store(status="PATH > 255", type="HTTP", message=f"{response.status} - {response_status_message}")
270
+ status_message.store(status="PATH > 255", message=f"{response.status} - {response_status_message}")
331
271
  status_message.store(status="", type="URL", message=download_url)
332
272
  sc.entry_modify(snapshot_entry, "file", "PATH TOO LONG TO SAVE FILE")
333
273
  #status_message.write()
@@ -348,9 +288,9 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
348
288
  file.write(response_data)
349
289
  # check if file is downloaded
350
290
  if os.path.isfile(output_file):
351
- status_message.store(status="SUCCESS", type="HTTP", message=f"{response.status} - {response_status_message}")
291
+ status_message.store(status="SUCCESS", message=f"{response.status} - {response_status_message}")
352
292
  else:
353
- status_message.store(status="EXISTING", type="HTTP", message=f"{response.status} - {response_status_message}")
293
+ status_message.store(status="EXISTING", message=f"{response.status} - {response_status_message}")
354
294
  status_message.store(status="", type="URL", message=download_url)
355
295
  status_message.store(status="", type="FILE", message=output_file)
356
296
  sc.modify_snapshot(db, snapshot_entry["rowid"], "file", output_file)
@@ -359,7 +299,7 @@ def download(db, output, snapshot_entry, connection, status_message, no_redirect
359
299
  #status_message.write()
360
300
  return True
361
301
  else:
362
- status_message.store(status="UNEXPECTED", type="HTTP", message=f"{response.status} - {response_status_message}")
302
+ status_message.store(status="UNEXPECTED", message=f"{response.status} - {response_status_message}")
363
303
  status_message.store(status="", type="URL", message=download_url)
364
304
  #status_message.write()
365
305
  return False
@@ -375,7 +315,7 @@ def download_response(connection, encoded_download_url, headers):
375
315
  RESPONSE_CODE_DICT = {
376
316
  200: "OK",
377
317
  301: "Moved Permanently",
378
- 302: "Found (redirect)",
318
+ 302: "Redirect",
379
319
  400: "Bad Request",
380
320
  403: "Forbidden",
381
321
  404: "Not Found",
@@ -0,0 +1,81 @@
1
+ import http.client
2
+ from datetime import datetime, timezone
3
+
4
+ from importlib.metadata import version
5
+
6
+ from pywaybackup.helper import url_get_timestamp
7
+ from pywaybackup.Verbosity import Verbosity as vb
8
+
9
+ # def startup():
10
+ # try:
11
+ # vb.write(message=f"\n<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>")
12
+
13
+ # if Database.QUERY_EXIST:
14
+ # vb.write(message=f"\nSAVE job exist - processed {Database.QUERY_PROGRESS}\nResuming save... (to reset the job use '--reset')\n")
15
+
16
+ # for i in range(5, -1, -1):
17
+ # vb.write(message=f"\r{i}...")
18
+ # print("\033[F", end="")
19
+ # print("\033[K", end="")
20
+
21
+ # time.sleep(1)
22
+
23
+ # #vb.write(message="\n")
24
+ # except KeyboardInterrupt:
25
+ # os._exit(1)
26
+
27
+ # GET: store page to wayback machine and response with redirect to snapshot
28
+ # POST: store page to wayback machine and response with wayback machine status-page
29
+ # tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
30
+ # tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
31
+ # tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
32
+ def save_page(url: str):
33
+ """
34
+ Saves a webpage to the Wayback Machine.
35
+
36
+ Args:
37
+ url (str): The URL of the webpage to be saved.
38
+
39
+ Returns:
40
+ None: The function does not return any value. It only prints messages to the console.
41
+ """
42
+ try:
43
+ connection = http.client.HTTPSConnection("web.archive.org")
44
+ headers = {"User-Agent": f"bitdruid-python-wayback-downloader/{version('pywaybackup')}"}
45
+ vb.write(message="\nSaving page to the Wayback Machine...")
46
+ connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
47
+ vb.write(message=f"\n-----> Request sent -> URL: {url}")
48
+ response = connection.getresponse()
49
+ response_status = response.status
50
+
51
+ if response_status == 302:
52
+ location = response.getheader("Location")
53
+ snapshot_timestamp = datetime.strptime(url_get_timestamp(location), '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
54
+ current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
55
+ timestamp_difference = (datetime.strptime(current_timestamp, '%Y-%m-%d %H:%M:%S') - datetime.strptime(snapshot_timestamp, '%Y-%m-%d %H:%M:%S')).seconds / 60
56
+ timestamp_difference = int(round(timestamp_difference, 0))
57
+
58
+ if timestamp_difference < 1:
59
+ vb.write(message="\n-----> Response: 302 (new snapshot)")
60
+ vb.write(status="SNAPSHOT", type="URL", message=f"{location}")
61
+ elif timestamp_difference >= 1:
62
+ vb.write(message=f"\n-----> Response: 302 (existing snapshot - wait for {60 - timestamp_difference} minutes)")
63
+ vb.write(status="SNAPSHOT", type="URL", message=f"{location}")
64
+ vb.write(status="WAYBACK", type="TIME", message=f"{snapshot_timestamp}")
65
+ vb.write(status="REQUEST", type="TIME", message=f"{current_timestamp}")
66
+
67
+ elif response_status == 429:
68
+ vb.write(message="\n-----> Response: 429 (too many requests)")
69
+ vb.write(message="- no simultaneous allowed")
70
+ vb.write(message="- 15 per 5 minutes\n")
71
+ elif response_status == 520:
72
+ vb.write(message="\n-----> Response: 520 (job failed)")
73
+ elif response_status == 404:
74
+ vb.write(message="\n-----> Response: 404 (not found)")
75
+ else:
76
+ vb.write(message=f"\n-----> Response: {response_status} - UNHANDLED")
77
+
78
+ connection.close()
79
+ except ConnectionRefusedError:
80
+ vb.write(message="\nCONNECTION REFUSED -> could not connect to wayback machine")
81
+
@@ -1,4 +1,4 @@
1
- import sqlite3
1
+ import pysqlite3 as sqlite3
2
2
 
3
3
  class Database:
4
4
 
@@ -40,7 +40,6 @@ class Database:
40
40
  db.cursor.execute(cls.snapshot_table)
41
41
  db.cursor.execute("SELECT query_identifier FROM waybackup_table WHERE query_identifier = ?", (query_identifier,))
42
42
  if db.cursor.fetchone():
43
- print("found")
44
43
  cls.QUERY_EXIST = True
45
44
  cls.QUERY_PROGRESS = db.get_progress()
46
45
  else:
@@ -1,4 +1,3 @@
1
-
2
1
  import os
3
2
  import shutil
4
3
  import magic
@@ -15,12 +14,13 @@ def sanitize_filename(input: str) -> str:
15
14
  """
16
15
  Sanitize a string to be used as (part of) a filename.
17
16
  """
18
- disallowed = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
17
+ disallowed = ["<", ">", ":", '"', "/", "\\", "|", "?", "*"]
19
18
  for char in disallowed:
20
19
  input = input.replace(char, ".")
21
- input = '.'.join(filter(None, input.split('.')))
20
+ input = ".".join(filter(None, input.split(".")))
22
21
  return input
23
22
 
23
+
24
24
  def sanitize_url(input: str) -> str:
25
25
  """
26
26
  Sanitize a url by encoding special characters.
@@ -32,12 +32,13 @@ def sanitize_url(input: str) -> str:
32
32
 
33
33
 
34
34
  def url_get_timestamp(url):
35
- """
36
- Extract the timestamp from a wayback machine URL.
37
- """
38
- timestamp = url.split("web/")[1].split("/")[0]
39
- if "id_" in url: timestamp = timestamp.split("id_")[0]
40
- return timestamp
35
+ """
36
+ Extract the timestamp from a wayback machine URL.
37
+ """
38
+ timestamp = url.split("web/")[1].split("/")[0]
39
+ if "id_" in url:
40
+ timestamp = timestamp.split("id_")[0]
41
+ return timestamp
41
42
 
42
43
 
43
44
  def url_split(url, index=False):
@@ -52,8 +53,8 @@ def url_split(url, index=False):
52
53
  if "://" in url:
53
54
  url = url.split("://")[1]
54
55
  domain = url.split("/")[0]
55
- path = url[len(domain):]
56
- domain = domain.split("@")[-1].split(":")[0] # remove mailto and port
56
+ path = url[len(domain) :]
57
+ domain = domain.split("@")[-1].split(":")[0] # remove mailto and port
57
58
  path_parts = path.split("/")
58
59
  path_end = path_parts[-1]
59
60
  if not url.endswith("/") or "." in path_end:
@@ -87,21 +88,25 @@ def move_index(existpath: str = None, existfile: str = None, filebuffer: bytes =
87
88
  shutil.move(existpath, existpath + "_exist")
88
89
  os.makedirs(existpath, exist_ok=True)
89
90
  if not check_index_mime(existpath):
90
- new_file = os.path.join(existpath, os.path.basename(os.path.normpath(existpath)))
91
+ new_file = os.path.join(
92
+ existpath, os.path.basename(os.path.normpath(existpath))
93
+ )
91
94
  else:
92
95
  new_file = os.path.join(existpath, "index.html")
93
96
  shutil.move(existpath + "_exist", new_file)
94
97
  elif existfile:
95
98
  if filebuffer:
96
99
  if not check_index_mime(filebuffer):
97
- return os.path.join(existfile, os.path.basename(os.path.normpath(existfile)))
100
+ return os.path.join(
101
+ existfile, os.path.basename(os.path.normpath(existfile))
102
+ )
98
103
  else:
99
104
  return os.path.join(existfile, "index.html")
100
-
105
+
101
106
 
102
107
  def check_index_mime(filebuffer: bytes) -> bool:
103
108
  mime = magic.Magic(mime=True)
104
109
  mime_type = mime.from_buffer(filebuffer)
105
110
  if mime_type != "text/html":
106
111
  return False
107
- return True
112
+ return True
@@ -1,11 +1,10 @@
1
1
  import os
2
-
3
2
  import signal
4
3
 
5
- import pywaybackup.archive as archive
4
+ import pywaybackup.archive_download as archive_download
5
+ import pywaybackup.archive_save as archive_save
6
6
 
7
7
  from pywaybackup.SnapshotCollection import SnapshotCollection as sc
8
-
9
8
  from pywaybackup.Arguments import Configuration as config
10
9
  from pywaybackup.db import Database as db
11
10
  from pywaybackup.Verbosity import Verbosity as vb
@@ -16,19 +15,22 @@ def main():
16
15
  config.init()
17
16
  ex.init(config.output, config.command)
18
17
  vb.init(config.progress, config.log)
18
+
19
+ if config.save:
20
+ archive_save.save_page(config.url)
21
+ os._exit(1)
22
+
19
23
  db.init(config.dbfile, config.query_identifier)
20
24
  sc.init(config.mode)
21
25
 
22
- if config.save:
23
- archive.save_page(config.url)
24
26
 
25
- else:
27
+ if not config.save:
26
28
 
27
- archive.startup()
29
+ archive_download.startup()
28
30
 
29
31
  try:
30
- archive.query_list(config.csvfile, config.cdxfile, config.range, config.limit, config.start, config.end, config.explicit, config.filetype)
31
- archive.download_list(config.output, config.retry, config.no_redirect, config.delay, config.workers)
32
+ archive_download.query_list(config.csvfile, config.cdxfile, config.range, config.limit, config.start, config.end, config.explicit, config.filetype)
33
+ archive_download.download_list(config.output, config.retry, config.no_redirect, config.delay, config.workers)
32
34
  except KeyboardInterrupt:
33
35
  print("\nInterrupted by user\n")
34
36
  config.keep = True
@@ -44,8 +46,8 @@ def main():
44
46
  vb.fini()
45
47
 
46
48
  if not config.keep:
47
- os.remove(config.dbfile)
48
- os.remove(config.cdxfile)
49
+ os.remove(config.dbfile) if os.path.exists(config.dbfile) else None
50
+ os.remove(config.cdxfile) if os.path.exists(config.cdxfile) else None
49
51
 
50
52
  os._exit(1)
51
53
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pywaybackup
3
- Version: 3.0.4
3
+ Version: 3.1.0
4
4
  Summary: Query and download archive.org as simple as possible.
5
5
  Author-email: bitdruid <bitdruid@outlook.com>
6
6
  License: MIT License
@@ -29,6 +29,7 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
29
29
  Requires-Python: >=3.8
30
30
  Description-Content-Type: text/markdown
31
31
  License-File: LICENSE
32
+ Requires-Dist: pysqlite3-binary==0.5.4
32
33
  Requires-Dist: requests==2.31.0
33
34
  Requires-Dist: tqdm==4.66.2
34
35
  Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
@@ -39,7 +40,7 @@ Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
39
40
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
40
41
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
41
42
  ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
42
- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.25-blue)
43
+ <!-- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.35-blue) -->
43
44
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
44
45
 
45
46
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -7,7 +7,8 @@ pywaybackup/Exception.py
7
7
  pywaybackup/SnapshotCollection.py
8
8
  pywaybackup/Verbosity.py
9
9
  pywaybackup/__init__.py
10
- pywaybackup/archive.py
10
+ pywaybackup/archive_download.py
11
+ pywaybackup/archive_save.py
11
12
  pywaybackup/db.py
12
13
  pywaybackup/helper.py
13
14
  pywaybackup/main.py
@@ -1,3 +1,4 @@
1
+ pysqlite3-binary==0.5.4
1
2
  requests==2.31.0
2
3
  tqdm==4.66.2
3
4
 
File without changes
File without changes