pywaybackup 1.5.7__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/PKG-INFO +40 -45
  2. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/README.md +39 -44
  3. pywaybackup-2.0.1/pywaybackup/Arguments.py +96 -0
  4. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/Converter.py +0 -1
  5. pywaybackup-2.0.1/pywaybackup/SnapshotCollection.py +305 -0
  6. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/Verbosity.py +23 -19
  7. pywaybackup-2.0.1/pywaybackup/__version__.py +1 -0
  8. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/archive.py +91 -201
  9. pywaybackup-2.0.1/pywaybackup/db.py +81 -0
  10. pywaybackup-2.0.1/pywaybackup/main.py +53 -0
  11. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/PKG-INFO +40 -45
  12. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/SOURCES.txt +1 -1
  13. pywaybackup-1.5.7/pywaybackup/Arguments.py +0 -115
  14. pywaybackup-1.5.7/pywaybackup/SnapshotCollection copy.py +0 -168
  15. pywaybackup-1.5.7/pywaybackup/SnapshotCollection.py +0 -123
  16. pywaybackup-1.5.7/pywaybackup/__version__.py +0 -1
  17. pywaybackup-1.5.7/pywaybackup/main.py +0 -36
  18. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/LICENSE +0 -0
  19. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/Exception.py +0 -0
  20. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/__init__.py +0 -0
  21. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup/helper.py +0 -0
  22. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/dependency_links.txt +0 -0
  23. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/entry_points.txt +0 -0
  24. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/requires.txt +0 -0
  25. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/pywaybackup.egg-info/top_level.txt +0 -0
  26. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/setup.cfg +0 -0
  27. {pywaybackup-1.5.7 → pywaybackup-2.0.1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pywaybackup
3
- Version: 1.5.7
3
+ Version: 2.0.1
4
4
  Summary: Download snapshots from the Wayback Machine
5
5
  Home-page: https://github.com/bitdruid/python-wayback-machine-downloader
6
6
  Author: bitdruid
@@ -48,7 +48,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
48
48
 
49
49
  - Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
50
50
  - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
51
- - The tool will inform you if your query has an immense amount of snapshots which could consume your system memory and lead to a crash. Consider splitting your query into smaller jobs by specifying a range e.g. `--start 2023 --end 2024` or `--range 1`.
51
+ - The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
52
52
 
53
53
  ## Arguments
54
54
 
@@ -70,8 +70,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
70
70
 
71
71
  ### Optional query parameters
72
72
 
73
- - **`-l`**, **`--list`**:<br>
74
- Only print the snapshots available within the specified range. Does not download the snapshots.
75
73
  - **`-e`**, **`--explicit`**:<br>
76
74
  Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
77
75
 
@@ -79,7 +77,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
79
77
  Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice) then you can't filter them.
80
78
 
81
79
  - **`--limit`** `<count>`:<br>
82
- Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected (with `--cdxinject` or `--auto`), the limit will have no effect.
80
+ Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
83
81
 
84
82
  - **Range Selection:**<br>
85
83
  Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
@@ -94,26 +92,22 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
94
92
  ### Behavior manipulation
95
93
 
96
94
  - **`-o`**, **`--output`**:<br>
97
- Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
98
-
99
- - **`--csv`** `<path>`:<br>
100
- Path defaults to output-dir. Saves a CSV file with the json-response for successfull downloads. If `--list` is set, the CSV contains the CDX list of snapshots. If `--current` or `--full` is set, CSV contains downloaded files. Named as `waybackup_<sanitized_url>.csv`.
95
+ Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
101
96
 
102
- - **`--skip`** `<path>`:<br>
103
- Path defaults to output-dir. Checks for an existing `waybackup_<sanitized_url>.csv` for URLs to skip downloading. Useful for interrupted downloads. Files are checked by their root-domain, ensuring consistency across queries. This means that if you download `http://example.com/subdir1/` and later `http://example.com`, the second query will skip the first path.
104
-
105
- - **`--no-redirect`**:<br>
106
- Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
107
-
108
- - **`--verbosity`** `<level>`:<br>
109
- Sets verbosity level. Options are `json` (prints JSON response) or `progress` (shows progress bar).
110
- <!-- Alternatively set verbosity level to `trace` for a very detailed output. -->
97
+ <!-- - **`--verbosity`** `<level>`:<br>
98
+ Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
111
99
 
112
- - **`--log`** `<path>`:<br>
113
- Path defaults to output-dir. Saves a log file with the output of the tool. Named as `waybackup_<sanitized_url>.log`.
100
+ - **`--log`** <!-- `<path>` -->:<br>
101
+ Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
102
+
103
+ - **`--progress`**:<br>
104
+ Shows a progress bar instead of the default output.
114
105
 
115
106
  - **`--workers`** `<count>`:<br>
116
107
  Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
108
+
109
+ - **`--no-redirect`**:<br>
110
+ Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
117
111
 
118
112
  - **`--retry`** `<attempts>`:<br>
119
113
  Specifies number of retry attempts for failed downloads.
@@ -124,39 +118,39 @@ Specifies delay between download requests in seconds. Default is no delay (0).
124
118
  <!-- - **`--convert-links`**:<br>
125
119
  If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
126
120
 
127
- **CDX Query Result Handling:**
128
- - **`--cdxbackup`** `<path>`:<br>
129
- Path defaults to output-dir. Saves the result of CDX query as a file. Useful for later downloading snapshots and overcoming refused connections by CDX server due to too many queries. Named as `waybackup_<sanitized_url>.cdx`.
130
-
131
- - **`--cdxinject`** `<path>`:<br>
132
- Path defaults to output-dir. Injects a CDX query file to download snapshots. Ensure the query matches the previous `--url` for correct folder structure. Named as `waybackup_<sanitized_url>.cdx`.
121
+ ## Special:
122
+
123
+ - **`--reset`**:
124
+ If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
133
125
 
134
- **Auto:**
135
- - **`--auto`**:<br>
136
- If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.
126
+ - **`--keep`**:
127
+ If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
137
128
 
138
129
  ### Examples
139
130
 
140
- Download latest snapshot of all files:<br>
131
+ Download the latest snapshot of all available files:<br>
141
132
  `waybackup -u http://example.com -c`
142
133
 
143
- Download latest snapshot of a specific file:<br>
144
- `waybackup -u http://example.com/subdir/file.html -c`
134
+ Download the latest snapshot of a specific file (e.g., a login page):<br>
135
+ `waybackup -u http://example.com/login.html -c --explicit`
145
136
 
146
- Download all snapshots sorted per timestamp with a specified range and do not follow redirects:<br>
137
+ Download all snapshots within the last 5 years and prevent redirects:<br>
147
138
  `waybackup -u http://example.com -f -r 5 --no-redirect`
148
139
 
149
- Download all snapshots sorted per timestamp with a specified range and save to a specified folder with 3 workers:<br>
140
+ Download all snapshots from a specific range (2020 to December 12, 2022) with 4 workers, and show a progress bar:<br>
141
+ `waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --progress`
142
+
143
+ Download all snapshots and save the output in a specific folder with 3 workers:<br>
150
144
  `waybackup -u http://example.com -f -r 5 -o /home/user/Downloads/snapshots --workers 3`
151
145
 
152
- Download all snapshots from 2020 to 12th of December 2022 with 4 workers, save a csv and show a progress bar:
153
- `waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --csv --verbosity progress`
146
+ Download all snapshots but only images and CSS files, filtering for specific filetypes (jpg, css):<br>
147
+ `waybackup -u http://example.com -f --filetype jpg,css`
154
148
 
155
- Download all snapshots and output a json response:<br>
156
- `waybackup -u http://example.com -f --verbosity json`
149
+ Download all timestamps but start over and ignore existing progress, log the output, and retry 3 times if any error occurs:<br>
150
+ `waybackup -u http://example.com -f --log --retry 3 --reset`
157
151
 
158
- List available snapshots per timestamp without downloading and save a csv file to home folder:<br>
159
- `waybackup -u http://example.com -f -l --csv /home/user/Downloads`
152
+ Download the latest snapshot, follow no redirects but keep the database and cdx-file:<br>
153
+ `waybackup -u http://example.com -c --no-redirect --keep`
160
154
 
161
155
  ## Output path structure
162
156
 
@@ -195,8 +189,9 @@ your/path/waybackup_snapshots/
195
189
  ...
196
190
  ```
197
191
 
192
+ ## CSV Output
198
193
 
199
- ### Json Response
194
+ Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
200
195
 
201
196
  For download queries:
202
197
 
@@ -232,14 +227,14 @@ For list queries:
232
227
  ]
233
228
  ```
234
229
 
235
- ## CSV Output
236
-
237
- The csv contains the json response in a table format.
238
-
239
230
  ### Debugging
240
231
 
241
232
  Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
242
233
 
234
+ ### Known ToDos
235
+
236
+ - [ ] currently there is no logic to handle the case where both an http and an https version of a page are available
237
+
243
238
  ## Contributing
244
239
 
245
240
  I'm always happy for some feature requests to improve the usability of this tool.
@@ -32,7 +32,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
32
32
 
33
33
  - Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
34
34
  - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
35
- - The tool will inform you if your query has an immense amount of snapshots which could consume your system memory and lead to a crash. Consider splitting your query into smaller jobs by specifying a range e.g. `--start 2023 --end 2024` or `--range 1`.
35
+ - The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
36
36
 
37
37
  ## Arguments
38
38
 
@@ -54,8 +54,6 @@ This tool allows you to download content from the Wayback Machine (archive.org).
54
54
 
55
55
  ### Optional query parameters
56
56
 
57
- - **`-l`**, **`--list`**:<br>
58
- Only print the snapshots available within the specified range. Does not download the snapshots.
59
57
  - **`-e`**, **`--explicit`**:<br>
60
58
  Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
61
59
 
@@ -63,7 +61,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
63
61
  Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice) then you can't filter them.
64
62
 
65
63
  - **`--limit`** `<count>`:<br>
66
- Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected (with `--cdxinject` or `--auto`), the limit will have no effect.
64
+ Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
67
65
 
68
66
  - **Range Selection:**<br>
69
67
  Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
@@ -78,26 +76,22 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
78
76
  ### Behavior manipulation
79
77
 
80
78
  - **`-o`**, **`--output`**:<br>
81
- Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
82
-
83
- - **`--csv`** `<path>`:<br>
84
- Path defaults to output-dir. Saves a CSV file with the json-response for successfull downloads. If `--list` is set, the CSV contains the CDX list of snapshots. If `--current` or `--full` is set, CSV contains downloaded files. Named as `waybackup_<sanitized_url>.csv`.
79
+ Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
85
80
 
86
- - **`--skip`** `<path>`:<br>
87
- Path defaults to output-dir. Checks for an existing `waybackup_<sanitized_url>.csv` for URLs to skip downloading. Useful for interrupted downloads. Files are checked by their root-domain, ensuring consistency across queries. This means that if you download `http://example.com/subdir1/` and later `http://example.com`, the second query will skip the first path.
88
-
89
- - **`--no-redirect`**:<br>
90
- Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
91
-
92
- - **`--verbosity`** `<level>`:<br>
93
- Sets verbosity level. Options are `json` (prints JSON response) or `progress` (shows progress bar).
94
- <!-- Alternatively set verbosity level to `trace` for a very detailed output. -->
81
+ <!-- - **`--verbosity`** `<level>`:<br>
82
+ Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
95
83
 
96
- - **`--log`** `<path>`:<br>
97
- Path defaults to output-dir. Saves a log file with the output of the tool. Named as `waybackup_<sanitized_url>.log`.
84
+ - **`--log`** <!-- `<path>` -->:<br>
85
+ Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
86
+
87
+ - **`--progress`**:<br>
88
+ Shows a progress bar instead of the default output.
98
89
 
99
90
  - **`--workers`** `<count>`:<br>
100
91
  Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
92
+
93
+ - **`--no-redirect`**:<br>
94
+ Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
101
95
 
102
96
  - **`--retry`** `<attempts>`:<br>
103
97
  Specifies number of retry attempts for failed downloads.
@@ -108,39 +102,39 @@ Specifies delay between download requests in seconds. Default is no delay (0).
108
102
  <!-- - **`--convert-links`**:<br>
109
103
  If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
110
104
 
111
- **CDX Query Result Handling:**
112
- - **`--cdxbackup`** `<path>`:<br>
113
- Path defaults to output-dir. Saves the result of CDX query as a file. Useful for later downloading snapshots and overcoming refused connections by CDX server due to too many queries. Named as `waybackup_<sanitized_url>.cdx`.
114
-
115
- - **`--cdxinject`** `<path>`:<br>
116
- Path defaults to output-dir. Injects a CDX query file to download snapshots. Ensure the query matches the previous `--url` for correct folder structure. Named as `waybackup_<sanitized_url>.cdx`.
105
+ ## Special:
106
+
107
+ - **`--reset`**:
108
+ If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
117
109
 
118
- **Auto:**
119
- - **`--auto`**:<br>
120
- If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.
110
+ - **`--keep`**:
111
+ If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
121
112
 
122
113
  ### Examples
123
114
 
124
- Download latest snapshot of all files:<br>
115
+ Download the latest snapshot of all available files:<br>
125
116
  `waybackup -u http://example.com -c`
126
117
 
127
- Download latest snapshot of a specific file:<br>
128
- `waybackup -u http://example.com/subdir/file.html -c`
118
+ Download the latest snapshot of a specific file (e.g., a login page):<br>
119
+ `waybackup -u http://example.com/login.html -c --explicit`
129
120
 
130
- Download all snapshots sorted per timestamp with a specified range and do not follow redirects:<br>
121
+ Download all snapshots within the last 5 years and prevent redirects:<br>
131
122
  `waybackup -u http://example.com -f -r 5 --no-redirect`
132
123
 
133
- Download all snapshots sorted per timestamp with a specified range and save to a specified folder with 3 workers:<br>
124
+ Download all snapshots from a specific range (2020 to December 12, 2022) with 4 workers, and show a progress bar:<br>
125
+ `waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --progress`
126
+
127
+ Download all snapshots and save the output in a specific folder with 3 workers:<br>
134
128
  `waybackup -u http://example.com -f -r 5 -o /home/user/Downloads/snapshots --workers 3`
135
129
 
136
- Download all snapshots from 2020 to 12th of December 2022 with 4 workers, save a csv and show a progress bar:
137
- `waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --csv --verbosity progress`
130
+ Download all snapshots but only images and CSS files, filtering for specific filetypes (jpg, css):<br>
131
+ `waybackup -u http://example.com -f --filetype jpg,css`
138
132
 
139
- Download all snapshots and output a json response:<br>
140
- `waybackup -u http://example.com -f --verbosity json`
133
+ Download all timestamps but start over and ignore existing progress, log the output, and retry 3 times if any error occurs:<br>
134
+ `waybackup -u http://example.com -f --log --retry 3 --reset`
141
135
 
142
- List available snapshots per timestamp without downloading and save a csv file to home folder:<br>
143
- `waybackup -u http://example.com -f -l --csv /home/user/Downloads`
136
+ Download the latest snapshot, follow no redirects but keep the database and cdx-file:<br>
137
+ `waybackup -u http://example.com -c --no-redirect --keep`
144
138
 
145
139
  ## Output path structure
146
140
 
@@ -179,8 +173,9 @@ your/path/waybackup_snapshots/
179
173
  ...
180
174
  ```
181
175
 
176
+ ## CSV Output
182
177
 
183
- ### Json Response
178
+ Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
184
179
 
185
180
  For download queries:
186
181
 
@@ -216,14 +211,14 @@ For list queries:
216
211
  ]
217
212
  ```
218
213
 
219
- ## CSV Output
220
-
221
- The csv contains the json response in a table format.
222
-
223
214
  ### Debugging
224
215
 
225
216
  Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
226
217
 
218
+ ### Known ToDos
219
+
220
+ - [ ] currently there is no logic to handle the case where both an http and an https version of a page are available
221
+
227
222
  ## Contributing
228
223
 
229
224
  I'm always happy for some feature requests to improve the usability of this tool.
@@ -0,0 +1,96 @@
1
+
2
+ import sys
3
+ import os
4
+ import argparse
5
+
6
+ from pywaybackup.helper import url_split, sanitize_filename
7
+
8
+ from pywaybackup.__version__ import __version__
9
+
10
class Arguments:
    """Build the waybackup command-line parser and parse ``sys.argv``.

    The parsed namespace is stored on ``self.args`` and additionally carries
    a ``query_identifier`` string built from the URL, the selected mode flags
    and the optional query parameters.
    """

    def __init__(self):
        parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
        parser.add_argument('-a', '--about', action='version', version='%(prog)s ' + __version__ + ' by @bitdruid -> https://github.com/bitdruid')

        required = parser.add_argument_group('required (one exclusive)')
        # The URL is needed by every mode; let argparse reject a missing one
        # with a usage error instead of passing None on to later processing.
        # --about and --help still work because their actions exit during parsing.
        required.add_argument('-u', '--url', type=str, required=True, metavar="", help='url (with subdir/subdomain) to download')
        exclusive_required = required.add_mutually_exclusive_group(required=True)
        exclusive_required.add_argument('-c', '--current', action='store_true', help='download the latest version of each file snapshot')
        exclusive_required.add_argument('-f', '--full', action='store_true', help='download snapshots of all timestamps')
        exclusive_required.add_argument('-s', '--save', action='store_true', help='save a page to the wayback machine')

        optional = parser.add_argument_group('optional query parameters')
        optional.add_argument('-e', '--explicit', action='store_true', help='search only for the explicit given url')
        optional.add_argument('-r', '--range', type=int, metavar="", help='range in years to search')
        optional.add_argument('--start', type=int, metavar="", help='start timestamp format: YYYYMMDDhhmmss')
        optional.add_argument('--end', type=int, metavar="", help='end timestamp format: YYYYMMDDhhmmss')
        optional.add_argument('--filetype', type=str, metavar="", help='filetypes to download comma separated (e.g. "html,css")')
        # NOTE(review): with nargs='?' and const=True a bare `--limit` yields
        # True rather than an int -- confirm this is intended.
        optional.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')

        behavior = parser.add_argument_group('manipulate behavior')
        behavior.add_argument('-o', '--output', type=str, metavar="", help='output folder - defaults to current directory')
        behavior.add_argument('--log', action='store_true', help='save a log file into the output folder')
        behavior.add_argument('--progress', action='store_true', help='show a progress bar')
        behavior.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
        #behavior.add_argument('--verbosity', type=str, default="info", metavar="", help='verbosity level (info, trace)')
        behavior.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
        behavior.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
        # behavior.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
        behavior.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')

        special = parser.add_argument_group('special')
        special.add_argument('--reset', action='store_true', help='reset the job and ignore existing cdx/db/csv files')
        special.add_argument('--keep', action='store_true', help='keep all files after the job finished')

        # With no arguments at all, behave as if --help had been given.
        args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])

        # Snapshot the mode flags and the optional query parameters in
        # declaration order; the string form below is kept exactly as before.
        required_args = {action.dest: getattr(args, action.dest) for action in exclusive_required._group_actions}
        optional_args = {action.dest: getattr(args, action.dest) for action in optional._group_actions}
        args.query_identifier = str(args.url) + str(required_args) + str(optional_args)

        # if args.convert_links and not args.current:
        #     parser.error("--convert-links can only be used with the -c/--current option")

        self.args = args

    def get_args(self):
        """Return the parsed ``argparse.Namespace``."""
        return self.args
60
+
61
class Configuration:
    """Class-level holder for the settings of the current run.

    ``init`` parses the command line via ``Arguments`` and copies every parsed
    value onto the class, so other modules can read them as
    ``Configuration.output``, ``Configuration.url`` and so on. It then derives
    the output directory, the job file paths and the download mode.
    """

    @classmethod
    def init(cls):
        """Parse the command line and populate the class attributes.

        Side effects: creates the output directory and, when ``--reset`` is
        given, deletes existing cdx/db/csv files of this job.
        """
        cls.args = Arguments().get_args()
        for key, value in vars(cls.args).items():
            setattr(Configuration, key, value)

        # args now attributes of Configuration // Configuration.output, ...
        cls.command = ' '.join(sys.argv[1:])
        cls.domain, cls.subdir, cls.filename = url_split(cls.url)

        if cls.output is None:
            cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
        # Always make sure the directory exists: the log/cdx/db/csv paths
        # below are placed inside it whether or not --output was given.
        os.makedirs(cls.output, exist_ok=True)

        # Common path stem of all per-job files: <output>/waybackup_<sanitized_url>
        job_stem = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}")

        # --log is a flag; when set, replace it with the log file path.
        if cls.log is True:
            cls.log = f"{job_stem}.log"

        # -c/-f/-s are mutually exclusive; no mode is assigned for --save.
        if cls.current:
            cls.mode = "current"
        elif cls.full:
            cls.mode = "full"

        if cls.filetype:
            cls.filetype = [ft.lower().strip() for ft in cls.filetype.split(",")]

        cls.cdxfile = f"{job_stem}.cdx"
        cls.dbfile = f"{job_stem}.db"
        cls.csvfile = f"{job_stem}.csv"

        if cls.reset:
            # Start from scratch: drop whatever job files already exist.
            for job_file in (cls.cdxfile, cls.dbfile, cls.csvfile):
                if os.path.isfile(job_file):
                    os.remove(job_file)
@@ -4,7 +4,6 @@ import magic
4
4
  from pywaybackup.helper import url_split
5
5
 
6
6
  from pywaybackup.Arguments import Configuration as config
7
- from pywaybackup.Verbosity import Verbosity as vb
8
7
  import re
9
8
 
10
9
  class Converter: