pywaybackup 3.3.0__tar.gz → 3.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {pywaybackup-3.3.0/pywaybackup.egg-info → pywaybackup-3.4.0}/PKG-INFO +45 -18
  2. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/README.md +44 -16
  3. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pyproject.toml +2 -2
  4. pywaybackup-3.4.0/pywaybackup/Arguments.py +57 -0
  5. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/Exception.py +25 -26
  6. pywaybackup-3.4.0/pywaybackup/PyWaybackup.py +235 -0
  7. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/SnapshotCollection.py +39 -24
  8. pywaybackup-3.4.0/pywaybackup/Verbosity.py +120 -0
  9. pywaybackup-3.4.0/pywaybackup/__init__.py +1 -0
  10. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/archive_download.py +43 -21
  11. pywaybackup-3.4.0/pywaybackup/main.py +13 -0
  12. {pywaybackup-3.3.0 → pywaybackup-3.4.0/pywaybackup.egg-info}/PKG-INFO +45 -18
  13. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/SOURCES.txt +1 -0
  14. pywaybackup-3.4.0/pywaybackup.egg-info/entry_points.txt +2 -0
  15. pywaybackup-3.3.0/pywaybackup/Arguments.py +0 -157
  16. pywaybackup-3.3.0/pywaybackup/Verbosity.py +0 -93
  17. pywaybackup-3.3.0/pywaybackup/__init__.py +0 -0
  18. pywaybackup-3.3.0/pywaybackup/main.py +0 -55
  19. pywaybackup-3.3.0/pywaybackup.egg-info/entry_points.txt +0 -2
  20. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/LICENSE +0 -0
  21. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/Converter.py +0 -0
  22. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/Worker.py +0 -0
  23. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/archive_save.py +0 -0
  24. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/db.py +0 -0
  25. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup/helper.py +0 -0
  26. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/dependency_links.txt +0 -0
  27. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/requires.txt +0 -0
  28. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/pywaybackup.egg-info/top_level.txt +0 -0
  29. {pywaybackup-3.3.0 → pywaybackup-3.4.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.1
2
2
  Name: pywaybackup
3
- Version: 3.3.0
3
+ Version: 3.4.0
4
4
  Summary: Query and download archive.org as simple as possible.
5
5
  Author-email: bitdruid <bitdruid@outlook.com>
6
6
  License: MIT License
@@ -35,7 +35,6 @@ Requires-Dist: requests==2.32.3
35
35
  Requires-Dist: tqdm==4.67.1
36
36
  Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
37
37
  Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
38
- Dynamic: license-file
39
38
 
40
39
  # python wayback machine downloader
41
40
 
@@ -77,12 +76,50 @@ This tool allows you to download content from the Wayback Machine (archive.org).
77
76
  <br>
78
77
  <br>
79
78
 
80
- ## Arguments
79
+ ## import
80
+
81
+ You can import pywaybackup into your own scripts and run it. Args are the same as cli.
82
+
83
+ Additional args:
84
+ - `silent` (default True): If True, suppresses all output to the console.
85
+ - `debug` (default False): If True, enables writing errors to the error log file (`waybackup_error.log`).
86
+
87
+ ```python
88
+ from pywaybackup import PyWayBackup
89
+
90
+ backup = PyWayBackup(
91
+ url="https://example.com",
92
+ all=True,
93
+ start="20200101",
94
+ end="20201231",
95
+ silent=False,
96
+ debug=True,
97
+ log=True,
98
+ keep=True
99
+ )
100
+
101
+ backup.run()
102
+ backup_paths = backup.paths(rel=True)
103
+ print(backup_paths)
104
+ ```
105
+ output:
106
+ ```bash
107
+ {
108
+ 'snapshots': 'output/example.com',
109
+ 'cdxfile': 'output/waybackup_example.cdx',
110
+ 'dbfile': 'output/waybackup_example.com.db',
111
+ 'csvfile': 'output/waybackup_https.example.com.csv',
112
+ 'log': 'output/waybackup_example.com.log',
113
+ 'debug': 'output/waybackup_error.log'
114
+ }
115
+ ```
116
+
117
+ ## cli
81
118
 
82
119
  - `-h`, `--help`: Show the help message and exit.
83
120
  - `-v`, `--version`: Show information about the tool and exit.
84
121
 
85
- ### Required
122
+ #### Required
86
123
 
87
124
  - **`-u`**, **`--url`**:<br>
88
125
  The URL of the web page to download. This argument is required.
@@ -107,8 +144,8 @@ This tool allows you to download content from the Wayback Machine (archive.org).
107
144
  Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
108
145
 
109
146
  - **Range Selection:**<br>
110
- Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
111
- (year 2019, year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
147
+ Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range`, the `start` and `end` will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
148
+ (year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
112
149
 
113
150
  - **`-r`**, **`--range`**:<br>
114
151
  Specify the range in years for which to search and download snapshots.
@@ -144,9 +181,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
144
181
  - **`--verbose`**:<br>
145
182
  Increase output verbosity.
146
183
 
147
- <!-- - **`--verbosity`** `<level>`:<br>
148
- Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
149
-
150
184
  - **`--log`** <!-- `<path>` -->:<br>
151
185
  Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
152
186
 
@@ -165,9 +199,6 @@ Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
165
199
  - **`--delay`** `<seconds>`:<br>
166
200
  Specifies delay between download requests in seconds. Default is no delay (0).
167
201
 
168
- <!-- - **`--convert-links`**:<br>
169
- If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
170
-
171
202
  #### Job Handling:
172
203
 
173
204
  - **`--reset`**:
@@ -265,9 +296,7 @@ your/path/waybackup_snapshots/
265
296
 
266
297
  ### CSV
267
298
 
268
- Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
269
-
270
- For download queries:
299
+ The CSV contains a snapshot per row:
271
300
 
272
301
  ```
273
302
  [
@@ -313,5 +342,3 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
313
342
 
314
343
  I'm always happy for some feature requests to improve the usability of this tool.
315
344
  Feel free to give suggestions and report issues. Project is still far from being perfect.
316
-
317
- > Please PR from dev into dev.
@@ -38,12 +38,50 @@ This tool allows you to download content from the Wayback Machine (archive.org).
38
38
  <br>
39
39
  <br>
40
40
 
41
- ## Arguments
41
+ ## import
42
+
43
+ You can import pywaybackup into your own scripts and run it. Args are the same as cli.
44
+
45
+ Additional args:
46
+ - `silent` (default True): If True, suppresses all output to the console.
47
+ - `debug` (default False): If True, enables writing errors to the error log file (`waybackup_error.log`).
48
+
49
+ ```python
50
+ from pywaybackup import PyWayBackup
51
+
52
+ backup = PyWayBackup(
53
+ url="https://example.com",
54
+ all=True,
55
+ start="20200101",
56
+ end="20201231",
57
+ silent=False,
58
+ debug=True,
59
+ log=True,
60
+ keep=True
61
+ )
62
+
63
+ backup.run()
64
+ backup_paths = backup.paths(rel=True)
65
+ print(backup_paths)
66
+ ```
67
+ output:
68
+ ```bash
69
+ {
70
+ 'snapshots': 'output/example.com',
71
+ 'cdxfile': 'output/waybackup_example.cdx',
72
+ 'dbfile': 'output/waybackup_example.com.db',
73
+ 'csvfile': 'output/waybackup_https.example.com.csv',
74
+ 'log': 'output/waybackup_example.com.log',
75
+ 'debug': 'output/waybackup_error.log'
76
+ }
77
+ ```
78
+
79
+ ## cli
42
80
 
43
81
  - `-h`, `--help`: Show the help message and exit.
44
82
  - `-v`, `--version`: Show information about the tool and exit.
45
83
 
46
- ### Required
84
+ #### Required
47
85
 
48
86
  - **`-u`**, **`--url`**:<br>
49
87
  The URL of the web page to download. This argument is required.
@@ -68,8 +106,8 @@ This tool allows you to download content from the Wayback Machine (archive.org).
68
106
  Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
69
107
 
70
108
  - **Range Selection:**<br>
71
- Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
72
- (year 2019, year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
109
+ Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range`, the `start` and `end` will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
110
+ (year 2019, year+month+day 20190101, year+month+day+hour 2019010112)
73
111
 
74
112
  - **`-r`**, **`--range`**:<br>
75
113
  Specify the range in years for which to search and download snapshots.
@@ -105,9 +143,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
105
143
  - **`--verbose`**:<br>
106
144
  Increase output verbosity.
107
145
 
108
- <!-- - **`--verbosity`** `<level>`:<br>
109
- Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
110
-
111
146
  - **`--log`** <!-- `<path>` -->:<br>
112
147
  Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
113
148
 
@@ -126,9 +161,6 @@ Sets verbosity level. Options are `info`and `trace`. Default is `info`. -->
126
161
  - **`--delay`** `<seconds>`:<br>
127
162
  Specifies delay between download requests in seconds. Default is no delay (0).
128
163
 
129
- <!-- - **`--convert-links`**:<br>
130
- If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
131
-
132
164
  #### Job Handling:
133
165
 
134
166
  - **`--reset`**:
@@ -226,9 +258,7 @@ your/path/waybackup_snapshots/
226
258
 
227
259
  ### CSV
228
260
 
229
- Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
230
-
231
- For download queries:
261
+ The CSV contains a snapshot per row:
232
262
 
233
263
  ```
234
264
  [
@@ -273,6 +303,4 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
273
303
  ## Contributing
274
304
 
275
305
  I'm always happy for some feature requests to improve the usability of this tool.
276
- Feel free to give suggestions and report issues. Project is still far from being perfect.
277
-
278
- > Please PR from dev into dev.
306
+ Feel free to give suggestions and report issues. Project is still far from being perfect.
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
7
7
 
8
8
  [project]
9
9
  name = "pywaybackup"
10
- version = "3.3.0"
10
+ version = "3.4.0"
11
11
  description = "Query and download archive.org as simple as possible."
12
12
  authors = [
13
13
  { name = "bitdruid", email = "bitdruid@outlook.com" }
@@ -25,7 +25,7 @@ dependencies = [
25
25
  ]
26
26
 
27
27
  [project.scripts]
28
- waybackup = "pywaybackup.main:main"
28
+ waybackup = "pywaybackup.main:cli"
29
29
 
30
30
  [project.urls]
31
31
  homepage = "https://github.com/bitdruid/python-wayback-machine-downloader"
@@ -0,0 +1,57 @@
1
+ import sys
2
+ import argparse
3
+
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from importlib.metadata import version
7
+
8
+
9
+ class Arguments:
10
+ def __init__(self):
11
+ parser = argparse.ArgumentParser(
12
+ description=f"<<< python-wayback-machine-downloader v{version('pywaybackup')} >>>\nby @bitdruid -> https://github.com/bitdruid",
13
+ formatter_class=RawTextHelpFormatter,
14
+ )
15
+
16
+ required = parser.add_argument_group("required (one exclusive)")
17
+ required.add_argument("-u", "--url", type=str, metavar="", help="url (with subdir/subdomain) to download")
18
+ exclusive_required = required.add_mutually_exclusive_group(required=True)
19
+ exclusive_required.add_argument("-a", "--all", action="store_true", help="download snapshots of all timestamps")
20
+ exclusive_required.add_argument("-l", "--last", action="store_true", help="download the last version of each file snapshot")
21
+ exclusive_required.add_argument("-f", "--first", action="store_true", help="download the first version of each file snapshot")
22
+ exclusive_required.add_argument("-s", "--save", action="store_true", help="save a page to the wayback machine")
23
+
24
+ optional = parser.add_argument_group("optional query parameters")
25
+ optional.add_argument("-e", "--explicit", action="store_true", help="search only for the explicit given url")
26
+ optional.add_argument("-r", "--range", type=int, metavar="", help="range in years to search")
27
+ optional.add_argument("--start", type=int, metavar="", help="start timestamp format: YYYYMMDDhhmmss")
28
+ optional.add_argument("--end", type=int, metavar="", help="end timestamp format: YYYYMMDDhhmmss")
29
+ optional.add_argument("--limit", type=int, nargs="?", const=True, metavar="int", help="limit the number of snapshots to download")
30
+ optional.add_argument("--filetype", type=str, metavar="", help="filetypes to download comma separated (js,css,...)")
31
+ optional.add_argument("--statuscode", type=str, metavar="", help="statuscodes to download comma separated (200,404,...)")
32
+
33
+ behavior = parser.add_argument_group("manipulate behavior")
34
+ behavior.add_argument("-o", "--output", type=str, metavar="", help="output for all files - defaults to current directory")
35
+ behavior.add_argument("-m", "--metadata", type=str, metavar="", help="change directory for db/cdx/csv/log files")
36
+ behavior.add_argument("-v", "--verbose", action="store_true", help="overwritten by progress - gives detailed output")
37
+ behavior.add_argument("--log", action="store_true", help="save a log file into the output folder")
38
+ behavior.add_argument("--progress", action="store_true", help="show a progress bar")
39
+ behavior.add_argument("--no-redirect", action="store_true", help="do not follow redirects by archive.org")
40
+ behavior.add_argument("--retry", type=int, default=0, metavar="", help="retry failed downloads (opt tries as int, else infinite)")
41
+ behavior.add_argument("--workers", type=int, default=1, metavar="", help="number of workers (simultaneous downloads)")
42
+ behavior.add_argument("--delay", type=int, default=0, metavar="", help="delay between each download in seconds")
43
+
44
+ special = parser.add_argument_group("special")
45
+ special.add_argument("--reset", action="store_true", help="reset the job and ignore existing cdx/db/csv files")
46
+ special.add_argument("--keep", action="store_true", help="keep all files after the job finished")
47
+
48
+ args = parser.parse_args(args=None if sys.argv[1:] else ["--help"]) # if no arguments are given, print help
49
+
50
+ args.silent = False
51
+ args.debug = True
52
+
53
+ self.args = args
54
+
55
+ def get_args(self) -> dict:
56
+ """Returns the parsed arguments as a dictionary."""
57
+ return vars(self.args)
@@ -14,8 +14,9 @@ class Exception:
14
14
  command = None
15
15
 
16
16
  @classmethod
17
- def init(cls, output=None, command=None):
17
+ def init(cls, debug=None, output=None, command=None):
18
18
  sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
19
+ cls.debug = debug
19
20
  cls.output = output
20
21
  cls.command = command
21
22
 
@@ -44,32 +45,30 @@ class Exception:
44
45
  exception_message += "!-- Traceback is None\n"
45
46
  exception_message += f"!-- Description: {e}\n-------------------------"
46
47
  print(exception_message)
47
- debug_file = os.path.join(cls.output, "waybackup_error.log")
48
- print(f"Exception log: {debug_file}")
49
- # print("-------------------------")
50
- # print(f"Full traceback:\n{original_tb}")
51
- if cls.new_debug: # new run, overwrite file
52
- cls.new_debug = False
53
- f = open(debug_file, "w", encoding="utf-8")
48
+ if cls.debug:
49
+ print(f"Exception log: {cls.debug}")
50
+ if cls.new_debug: # new run, overwrite file
51
+ cls.new_debug = False
52
+ f = open(cls.debug, "w", encoding="utf-8")
53
+ f.write("-------------------------\n")
54
+ f.write(f"Version: {version('pywaybackup')}\n")
55
+ f.write("-------------------------\n")
56
+ f.write(f"Command: {cls.command}\n")
57
+ f.write("-------------------------\n\n")
58
+ else: # current run, append to file
59
+ f = open(cls.debug, "a", encoding="utf-8")
60
+ f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
61
+ f.write(exception_message + "\n")
62
+ f.write("!-- Local Variables:\n")
63
+ for var_name, value in local_vars.items():
64
+ if var_name in ["status_message", "headers"]:
65
+ continue
66
+ value = cls.relativate_path(str(value))
67
+ value = value[:666] + " ... " if len(value) > 666 else value
68
+ f.write(f" -- {var_name} = {value}\n")
54
69
  f.write("-------------------------\n")
55
- f.write(f"Version: {version('pywaybackup')}\n")
56
- f.write("-------------------------\n")
57
- f.write(f"Command: {cls.command}\n")
58
- f.write("-------------------------\n\n")
59
- else: # current run, append to file
60
- f = open(debug_file, "a", encoding="utf-8")
61
- f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
62
- f.write(exception_message + "\n")
63
- f.write("!-- Local Variables:\n")
64
- for var_name, value in local_vars.items():
65
- if var_name in ["status_message", "headers"]:
66
- continue
67
- value = cls.relativate_path(str(value))
68
- value = value[:666] + " ... " if len(value) > 666 else value
69
- f.write(f" -- {var_name} = {value}\n")
70
- f.write("-------------------------\n")
71
- f.write(original_tb + "\n")
72
- f.close()
70
+ f.write(original_tb + "\n")
71
+ f.close()
73
72
 
74
73
  @classmethod
75
74
  def relativate_path(cls, input_str: str) -> str:
@@ -0,0 +1,235 @@
1
+ import sys
2
+ import os
3
+ import signal
4
+ from pywaybackup.helper import url_split, sanitize_filename
5
+
6
+ import pywaybackup.archive_download as archive_download
7
+ import pywaybackup.archive_save as archive_save
8
+ from pywaybackup.db import Database as db
9
+ from pywaybackup.Verbosity import Verbosity as vb
10
+ from pywaybackup.Exception import Exception as ex
11
+ from pywaybackup.SnapshotCollection import SnapshotCollection as sc
12
+
13
+ class PyWayBackup:
14
+ """
15
+ PyWayBackup: A Python interface for downloading or saving archived web pages from the Wayback Machine (archive.org).
16
+
17
+ Supported Modes (only one must be selected):
18
+ - all : Download all snapshots for a URL within a time range.
19
+ - last : Download the latest version of each file in the range.
20
+ - first : Download the earliest version of each file in the range.
21
+ - save : Save a snapshot to the Wayback Machine (beta).
22
+
23
+ Args:
24
+ url (str): Target URL to download snapshots for. (Required)
25
+ all (bool): If True, downloads all snapshots within the range.
26
+ last (bool): If True, downloads the last version of each file.
27
+ first (bool): If True, downloads the first version of each file.
28
+ save (bool): If True, saves a new snapshot to archive.org (beta).
29
+ explicit (bool): Only use the explicitly provided URL without wildcards.
30
+ range (str): A year-based or timestamp-based range (e.g., '2020' or '20200101').
31
+ start (str): Start timestamp (YYYYMMDDhhmmss).
32
+ end (str): End timestamp (YYYYMMDDhhmmss).
33
+ limit (int): Limit the number of snapshots queried from the CDX API.
34
+ filetype (str): Comma-separated list of filetypes to include (e.g., 'jpg,css,js').
35
+ statuscode (str): Comma-separated list of HTTP status codes to include (e.g., '200,301').
36
+ output (str): Output path for downloaded files. Defaults to `./waybackup_snapshots`.
37
+ metadata (str): Path to store metadata files (`cdx`, `db`, `csv`, etc.).
38
+ verbose (bool): Enable verbose logging.
39
+ log (bool): Enable writing logs to a file.
40
+ progress (bool): Show a progress bar.
41
+ no_redirect (bool): Disable handling redirects.
42
+ retry (int): Retry attempts for failed downloads.
43
+ workers (int): Number of download workers (default: 1).
44
+ delay (int): Delay between download requests in seconds.
45
+ reset (bool): Reset job metadata (deletes `.cdx`/`.db`/`.csv` files).
46
+ keep (bool): Retain all job metadata after completion.
47
+ silent (bool): Suppress all output (for programmatic use).
48
+ debug (bool): Enable debug mode.
49
+ **kwargs: Catch-all for future expansion or external integration.
50
+
51
+ Methods:
52
+ run(): Executes the full download or save operation based on initialized parameters.
53
+
54
+ Example:
55
+ >>> from pywaybackup import PyWayBackup
56
+ >>> backup = PyWayBackup(url="https://example.com", all=True, start="20200101", end="20201231")
57
+ >>> backup.run()
58
+ >>> backup_paths = backup.paths(rel=True)
59
+ >>> print(backup_paths)
60
+ """
61
+
62
+ def __init__(self, url: str = None, all: bool = False, last: bool = False, first: bool = False,
63
+ save: bool = False, explicit: bool = False, range: str = None, start: str = None,
64
+ end: str = None, limit: int = None, filetype: str = None, statuscode: str = None,
65
+ output: str = None, metadata: str = None, verbose: bool = False, log: bool = False,
66
+ progress: bool = False, no_redirect: bool = False, retry: int = 0, workers: int = 1,
67
+ delay: int = 0, reset: bool = False, keep: bool = False,
68
+ silent: bool = True, debug: bool = False, **kwargs: dict):
69
+
70
+ # restrictions
71
+ # url must be given
72
+ # all, last, first, save are mutually exclusive
73
+ if not url:
74
+ raise ValueError("URL must be provided")
75
+ if sum([all, last, first, save]) != 1:
76
+ raise ValueError("Exactly one of --all, --last, --first, or --save is allowed")
77
+
78
+ self.url = url
79
+ self.all = all
80
+ self.last = last
81
+ self.first = first
82
+ self.save = save
83
+ self.explicit = explicit
84
+ self.range = range
85
+ self.start = start
86
+ self.end = end
87
+ self.limit = limit
88
+ self.filetype = filetype
89
+ self.statuscode = statuscode
90
+ self.output = output
91
+ self.metadata = metadata
92
+ self.verbose = verbose
93
+ self.log = log
94
+ self.progress = progress
95
+ self.no_redirect = no_redirect
96
+ self.retry = retry
97
+ self.workers = workers
98
+ self.delay = delay
99
+ self.reset = reset
100
+ self.keep = keep
101
+
102
+ self.silent = silent
103
+ self.debug = debug
104
+
105
+ self.query_identifier = (
106
+ str(self.url) +
107
+ # required_args
108
+ str(self.all) + str(self.last) + str(self.first) + str(self.save) +
109
+ # optional_args
110
+ str(self.explicit) + str(self.range) + str(self.start) + str(self.end) +
111
+ str(self.limit) + str(self.filetype) + str(self.statuscode)
112
+ )
113
+
114
+ # if sys.argv is empty, we assume this is being run as a module
115
+ if not sys.argv[1:]:
116
+ self.command = "pywaybackup_module"
117
+ else:
118
+ # otherwise, we take the command line arguments
119
+ self.command = ' '.join(sys.argv[1:])
120
+
121
+ self._init()
122
+
123
+ def _init(self):
124
+
125
+ self.domain, self.subdir, self.filename = url_split(self.url)
126
+
127
+ if self.output is None:
128
+ self.output = os.path.join(os.getcwd(), "waybackup_snapshots")
129
+ if self.metadata is None:
130
+ self.metadata = self.output
131
+ os.makedirs(self.output, exist_ok=True) if not self.save else None
132
+ os.makedirs(self.metadata, exist_ok=True) if not self.save else None
133
+
134
+ if self.all:
135
+ self.mode = "all"
136
+ if self.last:
137
+ self.mode = "last"
138
+ if self.first:
139
+ self.mode = "first"
140
+ if self.save:
141
+ self.mode = "save"
142
+
143
+ if self.filetype:
144
+ self.filetype = [f.lower().strip() for f in self.filetype.split(",")]
145
+ if self.statuscode:
146
+ self.statuscode = [s.lower().strip() for s in self.statuscode.split(",")]
147
+
148
+ base_path = self.metadata
149
+ base_name = f"waybackup_{sanitize_filename(self.url)}"
150
+ self.cdxfile = os.path.join(base_path, f"{base_name}.cdx")
151
+ self.dbfile = os.path.join(base_path, f"{base_name}.db")
152
+ self.csvfile = os.path.join(base_path, f"{base_name}.csv")
153
+ self.log = os.path.join(base_path, f"{base_name}.log") if self.log else None
154
+ self.debug = os.path.join(base_path, "waybackup_error.log") if self.debug else None
155
+
156
+ if self.reset:
157
+ os.remove(self.cdxfile) if os.path.isfile(self.cdxfile) else None
158
+ os.remove(self.dbfile) if os.path.isfile(self.dbfile) else None
159
+ os.remove(self.csvfile) if os.path.isfile(self.csvfile) else None
160
+
161
+ def paths(self, rel: bool = False) -> dict:
162
+ """
163
+ Return a dictionary of existing file paths associated to the backup process:
164
+ {'snapshots':, 'cdxfile':, 'dbfile':, 'csvfile':, 'log':, 'debug':}
165
+
166
+ Parameters:
167
+ rel (bool): If True, return relative paths; otherwise, return absolute paths.
168
+
169
+ Returns:
170
+ dict: Mapping of file types to their corresponding paths, including only files that exist.
171
+ """
172
+ files = {
173
+ "snapshots": os.path.join(self.output, self.domain),
174
+ "cdxfile": self.cdxfile,
175
+ "dbfile": self.dbfile,
176
+ "csvfile": self.csvfile,
177
+ "log": self.log,
178
+ "debug": self.debug
179
+ }
180
+ return {
181
+ key: (os.path.relpath(path) if rel else path)
182
+ for key, path in files.items()
183
+ if path and os.path.exists(path)
184
+ }
185
+
186
+ def run(self):
187
+ """Run the PyWayBackup process with the given configuration."""
188
+ ex.init(self.debug, self.output, self.command)
189
+ vb.init(self.silent, self.verbose, self.progress, self.log)
190
+
191
+ if self.save:
192
+ archive_save.save_page(self.url)
193
+ os._exit(1)
194
+
195
+ db.init(self.dbfile, self.query_identifier)
196
+ sc.init(self.mode)
197
+
198
+ if not self.save:
199
+ archive_download.startup()
200
+
201
+ try:
202
+ archive_download.query_list(
203
+ self.csvfile,
204
+ self.cdxfile,
205
+ self.range,
206
+ self.limit,
207
+ self.start,
208
+ self.end,
209
+ self.explicit,
210
+ self.filetype,
211
+ self.statuscode,
212
+ self.domain,
213
+ self.subdir,
214
+ self.filename,
215
+ )
216
+ archive_download.download_list(self.output, self.retry, self.no_redirect, self.delay, self.workers)
217
+ except KeyboardInterrupt:
218
+ print("\nInterrupted by user\n")
219
+ self.keep = True
220
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
221
+
222
+ except Exception as e:
223
+ self.keep = True
224
+ ex.exception(message="", e=e)
225
+
226
+ finally:
227
+ sc.csv_create(self.csvfile)
228
+ sc.fini()
229
+ vb.fini()
230
+
231
+ if not self.keep:
232
+ os.remove(self.dbfile) if os.path.exists(self.dbfile) else None
233
+ os.remove(self.cdxfile) if os.path.exists(self.cdxfile) else None
234
+
235
+ os._exit(1)