pywaybackup 3.1.0__tar.gz → 3.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {pywaybackup-3.1.0/pywaybackup.egg-info → pywaybackup-3.2.1}/PKG-INFO +85 -53
  2. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/README.md +79 -49
  3. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pyproject.toml +5 -4
  4. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/Arguments.py +18 -15
  5. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/Converter.py +10 -10
  6. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/Exception.py +13 -18
  7. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/SnapshotCollection.py +118 -60
  8. pywaybackup-3.2.1/pywaybackup/Verbosity.py +92 -0
  9. pywaybackup-3.2.1/pywaybackup/Worker.py +158 -0
  10. pywaybackup-3.2.1/pywaybackup/archive_download.py +335 -0
  11. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/archive_save.py +19 -19
  12. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/helper.py +7 -7
  13. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/main.py +2 -2
  14. {pywaybackup-3.1.0 → pywaybackup-3.2.1/pywaybackup.egg-info}/PKG-INFO +85 -53
  15. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/SOURCES.txt +1 -0
  16. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/requires.txt +4 -3
  17. pywaybackup-3.1.0/pywaybackup/Verbosity.py +0 -121
  18. pywaybackup-3.1.0/pywaybackup/archive_download.py +0 -332
  19. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/LICENSE +0 -0
  20. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/__init__.py +0 -0
  21. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup/db.py +0 -0
  22. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/dependency_links.txt +0 -0
  23. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/entry_points.txt +0 -0
  24. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/pywaybackup.egg-info/top_level.txt +0 -0
  25. {pywaybackup-3.1.0 → pywaybackup-3.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: pywaybackup
3
- Version: 3.1.0
3
+ Version: 3.2.1
4
4
  Summary: Query and download archive.org as simple as possible.
5
5
  Author-email: bitdruid <bitdruid@outlook.com>
6
6
  License: MIT License
@@ -29,18 +29,19 @@ Project-URL: homepage, https://github.com/bitdruid/python-wayback-machine-downlo
29
29
  Requires-Python: >=3.8
30
30
  Description-Content-Type: text/markdown
31
31
  License-File: LICENSE
32
- Requires-Dist: pysqlite3-binary==0.5.4
33
- Requires-Dist: requests==2.31.0
34
- Requires-Dist: tqdm==4.66.2
32
+ Requires-Dist: pysqlite3-binary==0.5.4; sys_platform == "linux"
33
+ Requires-Dist: pysqlite-binary; sys_platform == "win32"
34
+ Requires-Dist: requests==2.32.3
35
+ Requires-Dist: tqdm==4.67.1
35
36
  Requires-Dist: python-magic==0.4.27; sys_platform == "linux"
36
37
  Requires-Dist: python-magic-bin==0.4.14; sys_platform == "win32"
38
+ Dynamic: license-file
37
39
 
38
40
  # python wayback machine downloader
39
41
 
40
42
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
41
43
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
42
44
  ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
43
- <!-- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.35-blue) -->
44
45
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
45
46
 
46
47
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -66,11 +67,15 @@ This tool allows you to download content from the Wayback Machine (archive.org).
66
67
  ```pip install .```
67
68
  - in a virtual env or use `--break-system-packages`
68
69
 
69
- ## Usage infos - important notes
70
+ ## notes / issues / hints
70
71
 
71
- - Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
72
- - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
72
+ - Linux recommended: On Windows machines, the path length is limited. Files that exceed the path length will not be downloaded.
73
73
  - The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
74
+ - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
75
+ - Downloading directly into a network share is not recommended. The sqlite locking mechanism may cause issues. If you need to download into a network share, set the `--metadata` argument to a local path.
76
+
77
+ <br>
78
+ <br>
74
79
 
75
80
  ## Arguments
76
81
 
@@ -92,7 +97,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
92
97
  - **`-s`**, **`--save`**:<br>
93
98
  Save a page to the Wayback Machine. (beta)
94
99
 
95
- ### Optional query parameters
100
+ #### Optional query parameters
96
101
 
97
102
  - **`-e`**, **`--explicit`**:<br>
98
103
  Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
@@ -113,11 +118,16 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
113
118
  - **`--end`**:<br>
114
119
  Timestamp to end searching.
115
120
 
116
- ### Behavior manipulation
121
+ ### Optional
122
+
123
+ #### Behavior Manipulation
117
124
 
118
125
  - **`-o`**, **`--output`**:<br>
119
126
  Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
120
127
 
128
+ - **`-m`**, **`--metadata`**<br>
129
+ Change the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Especially if you are downloading into a network share, you SHOULD set this to a local path because the sqlite locking mechanism may cause issues with network shares.
130
+
121
131
  <!-- - **`--verbosity`** `<level>`:<br>
122
132
  Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
123
133
 
@@ -132,17 +142,31 @@ Sets the number of simultaneous download workers. Default is 1, safe range is ab
132
142
 
133
143
  - **`--no-redirect`**:<br>
134
144
  Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
135
-
145
+
136
146
  - **`--retry`** `<attempts>`:<br>
137
147
  Specifies number of retry attempts for failed downloads.
138
148
 
139
149
  - **`--delay`** `<seconds>`:<br>
140
150
  Specifies delay between download requests in seconds. Default is no delay (0).
141
151
 
152
+ - **`--verbose`**:<br>
153
+ Increase output verbosity.
154
+ - verbose:
155
+ ```
156
+ -----> Worker: 2 - Attempt: [1/1] Snapshot ID: [23/81]
157
+ SUCCESS -> 200 OK
158
+ -> URL: https://web.archive.org/web/20240225193302id_/https://example.com/assets/css/custom-styles.css
159
+ -> FILE: /home/manjaro/Stuff/python-wayback-machine-downloader/waybackup_snapshots/example.com/20240225193302id_/assets/css/custom-styles.css
160
+ ```
161
+ - non-verbose:
162
+ ```
163
+ 55/81 - W:2 - SUCCESS - 20240225193302 - https://example.com/assets/css/custom-styles.css
164
+ ```
165
+
142
166
  <!-- - **`--convert-links`**:<br>
143
167
  If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
144
168
 
145
- ### Special:
169
+ #### Job Handling:
146
170
 
147
171
  - **`--reset`**:
148
172
  If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
@@ -150,47 +174,56 @@ If set, all links in the downloaded files will be converted to local links. This
150
174
  - **`--keep`**:
151
175
  If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
152
176
 
153
- # Usage
177
+ <br>
178
+ <br>
179
+
180
+ ## Usage
154
181
 
155
182
  ### Handling Interrupted Jobs
156
- When a job is interrupted (by any reason), `pywaybackup` is designed to resume the job from where it left off. It automatically detects existing job data (based on the URL and <u>**optional query parameters**</u> - including output directory) and resumes the process without requiring manual intervention. Here's how the tool handles different scenarios:
157
-
158
- - **Default Behavior:**
159
- - On restarting the same job (same URL, <u>**optional query parameters**</u>, and output directory), the tool will:
160
- - Reuse the existing `.cdx` and `.db` files.
161
- - Resume downloading snapshots from the last successful point.
162
- - Skip previously downloaded files to save time and resources.
163
-
164
- - **Manual Reset with `--reset`:**
165
- - This command deletes any existing `.cdx` and `.db` files associated with the job and starts the process from scratch.
166
- - Useful if:
167
- - The previous data is corrupted.
168
- - You want to re-query the snapshots without considering previously downloaded data.
169
-
170
- - **Preserving Job Data with `--keep`:**
171
- - Normally, `.cdx` and `.db` files are deleted after the job finishes successfully.
172
- - Use `--keep` to retain these files for future use (e.g., re-analysis or extending the query later).
173
-
174
- > **Note1:** The resumption process only works if the output directory remains the same as the one used during the initial job.
175
- >
176
- > **Note2:** `--reset` will NOT delete the already downloaded files for now. You have to remove them 'by hand'.
177
-
178
- ### Example
179
-
180
- 1. Start downloading all available snapshots:<br>`waybackup -u https://example.com -a`
181
- 2. Interrupt the process `CTRL+C`<br>
182
- 3. The tool will detect the existing job data and resume downloading from the last completed point:<br>`waybackup -u https://example.com -a`
183
- > **Important:** `waybackup -u https://example.com -c` -> The tool will NOT resume because a necessary identifier-changed
184
- 4. This deletes any existing .cdx and .db files associated with the job and starts the process from scratch:<br>`waybackup -u https://example.com -a --reset`
185
- 5. This ensures all job-related files are kept for future use, such as re-analysis or extending the query later:<br>`waybackup -u https://example.com -a --keep`
186
-
187
- ## Output path structure
183
+
184
+ `pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
185
+
186
+ - Detects existing `.cdx` and `.db` files in an `output dir` to resume downloading from the last successful point.
187
+ - Compares `URL`, `mode`, and `optional query parameters` to ensure automatic resumption.
188
+ - Skips previously downloaded files to save time.
189
+ > **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
190
+
191
+ #### Resetting a Job (`--reset`)
192
+ - Deletes `.cdx` and `.db` files and restarts the process from scratch.
193
+ - Does **not** remove already downloaded files.
194
+ - `waybackup -u https://example.com -a --reset`
195
+
196
+ #### Keeping Job Data (`--keep`)
197
+ - Normally, `.cdx` and `.db` files are deleted after a successful job.
198
+ - `--keep` preserves them for future re-analysis or extending the query.
199
+ - `waybackup -u https://example.com -a --keep`
200
+
201
+ <br>
202
+ <br>
203
+
204
+ ## Examples
205
+
206
+ 1. Download a specific single snapshot of all available files (starting from root):<br>
207
+ `waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000`
208
+ 2. Download a specific single snapshot of all available files (starting from a subdirectory):<br>
209
+ `waybackup -u https://example.com/subdir1/subdir2/assets/ -a --start 20210101000000 --end 20210101000000`
210
+ 3. Download a specific single snapshot of the exact given URL (no subdirs):<br>
211
+ `waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000 --explicit`
212
+ 4. Download all snapshots of all available files in the given range:<br>
213
+ `waybackup -u https://example.com -a --start 20210101000000 --end 20231122000000`
214
+
215
+ <br>
216
+ <br>
217
+
218
+ ## Output
219
+
220
+ ### Path Structure
188
221
 
189
222
  The output path is currently structured as follows, shown for the example query:<br>
190
- `http://example.com/subdir1/subdir2/assets/`:
223
+ `http://example.com/subdir1/subdir2/assets/`
191
224
  <br><br>
192
225
  For the first and last version (`-f` or `-l`):
193
- - The requested path will only include all files/folders starting from your query-path.
226
+ - Will only include all files/folders starting from your query-path.
194
227
  ```
195
228
  your/path/waybackup_snapshots/
196
229
  └── the_root_of_your_query/ (example.com/)
@@ -202,7 +235,7 @@ your/path/waybackup_snapshots/
202
235
  ...
203
236
  ```
204
237
  For all versions (`-a`):
205
- - Will currently create a folder named as the root of your query. Inside this folder, you will find all timestamps and per timestamp the path you requested.
238
+ - Will create a folder named as the root of your query. Inside this folder, you will find all timestamps and per timestamp the path you requested.
206
239
  ```
207
240
  your/path/waybackup_snapshots/
208
241
  └── the_root_of_your_query/ (example.com/)
@@ -221,7 +254,7 @@ your/path/waybackup_snapshots/
221
254
  ...
222
255
  ```
223
256
 
224
- ## CSV Output
257
+ ### CSV
225
258
 
226
259
  Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
227
260
 
@@ -247,9 +280,8 @@ For download queries:
247
280
 
248
281
  Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
249
282
 
250
- ### Known ToDos
251
-
252
- - [ ] currently there is no logic to handle if both a http and https version of a page is available
283
+ <br>
284
+ <br>
253
285
 
254
286
  ## Contributing
255
287
 
@@ -3,7 +3,6 @@
3
3
  [![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
4
4
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
5
5
  ![Python Version](https://img.shields.io/badge/Python-3.8-blue)
6
- <!-- ![Python_Sqlite3 Version](https://img.shields.io/badge/Python_Sqlite3-3.35-blue) -->
7
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
7
 
9
8
  Downloading archived web pages from the [Wayback Machine](https://archive.org/web/).
@@ -29,11 +28,15 @@ This tool allows you to download content from the Wayback Machine (archive.org).
29
28
  ```pip install .```
30
29
  - in a virtual env or use `--break-system-packages`
31
30
 
32
- ## Usage infos - important notes
31
+ ## notes / issues / hints
33
32
 
34
- - Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
35
- - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
33
+ - Linux recommended: On Windows machines, the path length is limited. Files that exceed the path length will not be downloaded.
36
34
  - The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
35
+ - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
36
+ - Downloading directly into a network share is not recommended. The sqlite locking mechanism may cause issues. If you need to download into a network share, set the `--metadata` argument to a local path.
37
+
38
+ <br>
39
+ <br>
37
40
 
38
41
  ## Arguments
39
42
 
@@ -55,7 +58,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
55
58
  - **`-s`**, **`--save`**:<br>
56
59
  Save a page to the Wayback Machine. (beta)
57
60
 
58
- ### Optional query parameters
61
+ #### Optional query parameters
59
62
 
60
63
  - **`-e`**, **`--explicit`**:<br>
61
64
  Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
@@ -76,11 +79,16 @@ Limits the amount of snapshots to query from the CDX server. If an existing CDX
76
79
  - **`--end`**:<br>
77
80
  Timestamp to end searching.
78
81
 
79
- ### Behavior manipulation
82
+ ### Optional
83
+
84
+ #### Behavior Manipulation
80
85
 
81
86
  - **`-o`**, **`--output`**:<br>
82
87
  Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
83
88
 
89
+ - **`-m`**, **`--metadata`**<br>
90
+ Change the folder where metadata will be saved (`cdx`/`db`/`csv`/`log`). Especially if you are downloading into a network share, you SHOULD set this to a local path because the sqlite locking mechanism may cause issues with network shares.
91
+
84
92
  <!-- - **`--verbosity`** `<level>`:<br>
85
93
  Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
86
94
 
@@ -95,17 +103,31 @@ Sets the number of simultaneous download workers. Default is 1, safe range is ab
95
103
 
96
104
  - **`--no-redirect`**:<br>
97
105
  Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
98
-
106
+
99
107
  - **`--retry`** `<attempts>`:<br>
100
108
  Specifies number of retry attempts for failed downloads.
101
109
 
102
110
  - **`--delay`** `<seconds>`:<br>
103
111
  Specifies delay between download requests in seconds. Default is no delay (0).
104
112
 
113
+ - **`--verbose`**:<br>
114
+ Increase output verbosity.
115
+ - verbose:
116
+ ```
117
+ -----> Worker: 2 - Attempt: [1/1] Snapshot ID: [23/81]
118
+ SUCCESS -> 200 OK
119
+ -> URL: https://web.archive.org/web/20240225193302id_/https://example.com/assets/css/custom-styles.css
120
+ -> FILE: /home/manjaro/Stuff/python-wayback-machine-downloader/waybackup_snapshots/example.com/20240225193302id_/assets/css/custom-styles.css
121
+ ```
122
+ - non-verbose:
123
+ ```
124
+ 55/81 - W:2 - SUCCESS - 20240225193302 - https://example.com/assets/css/custom-styles.css
125
+ ```
126
+
105
127
  <!-- - **`--convert-links`**:<br>
106
128
  If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
107
129
 
108
- ### Special:
130
+ #### Job Handling:
109
131
 
110
132
  - **`--reset`**:
111
133
  If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
@@ -113,47 +135,56 @@ If set, all links in the downloaded files will be converted to local links. This
113
135
  - **`--keep`**:
114
136
  If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
115
137
 
116
- # Usage
138
+ <br>
139
+ <br>
140
+
141
+ ## Usage
117
142
 
118
143
  ### Handling Interrupted Jobs
119
- When a job is interrupted (by any reason), `pywaybackup` is designed to resume the job from where it left off. It automatically detects existing job data (based on the URL and <u>**optional query parameters**</u> - including output directory) and resumes the process without requiring manual intervention. Here's how the tool handles different scenarios:
120
-
121
- - **Default Behavior:**
122
- - On restarting the same job (same URL, <u>**optional query parameters**</u>, and output directory), the tool will:
123
- - Reuse the existing `.cdx` and `.db` files.
124
- - Resume downloading snapshots from the last successful point.
125
- - Skip previously downloaded files to save time and resources.
126
-
127
- - **Manual Reset with `--reset`:**
128
- - This command deletes any existing `.cdx` and `.db` files associated with the job and starts the process from scratch.
129
- - Useful if:
130
- - The previous data is corrupted.
131
- - You want to re-query the snapshots without considering previously downloaded data.
132
-
133
- - **Preserving Job Data with `--keep`:**
134
- - Normally, `.cdx` and `.db` files are deleted after the job finishes successfully.
135
- - Use `--keep` to retain these files for future use (e.g., re-analysis or extending the query later).
136
-
137
- > **Note1:** The resumption process only works if the output directory remains the same as the one used during the initial job.
138
- >
139
- > **Note2:** `--reset` will NOT delete the already downloaded files for now. You have to remove them 'by hand'.
140
-
141
- ### Example
142
-
143
- 1. Start downloading all available snapshots:<br>`waybackup -u https://example.com -a`
144
- 2. Interrupt the process `CTRL+C`<br>
145
- 3. The tool will detect the existing job data and resume downloading from the last completed point:<br>`waybackup -u https://example.com -a`
146
- > **Important:** `waybackup -u https://example.com -c` -> The tool will NOT resume because a necessary identifier-changed
147
- 4. This deletes any existing .cdx and .db files associated with the job and starts the process from scratch:<br>`waybackup -u https://example.com -a --reset`
148
- 5. This ensures all job-related files are kept for future use, such as re-analysis or extending the query later:<br>`waybackup -u https://example.com -a --keep`
149
-
150
- ## Output path structure
144
+
145
+ `pywaybackup` resumes interrupted jobs. The tool automatically continues from where it left off.
146
+
147
+ - Detects existing `.cdx` and `.db` files in an `output dir` to resume downloading from the last successful point.
148
+ - Compares `URL`, `mode`, and `optional query parameters` to ensure automatic resumption.
149
+ - Skips previously downloaded files to save time.
150
+ > **Note:** Changing URL, mode selection, query parameters or output prevents automatic resumption.
151
+
152
+ #### Resetting a Job (`--reset`)
153
+ - Deletes `.cdx` and `.db` files and restarts the process from scratch.
154
+ - Does **not** remove already downloaded files.
155
+ - `waybackup -u https://example.com -a --reset`
156
+
157
+ #### Keeping Job Data (`--keep`)
158
+ - Normally, `.cdx` and `.db` files are deleted after a successful job.
159
+ - `--keep` preserves them for future re-analysis or extending the query.
160
+ - `waybackup -u https://example.com -a --keep`
161
+
162
+ <br>
163
+ <br>
164
+
165
+ ## Examples
166
+
167
+ 1. Download a specific single snapshot of all available files (starting from root):<br>
168
+ `waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000`
169
+ 2. Download a specific single snapshot of all available files (starting from a subdirectory):<br>
170
+ `waybackup -u https://example.com/subdir1/subdir2/assets/ -a --start 20210101000000 --end 20210101000000`
171
+ 3. Download a specific single snapshot of the exact given URL (no subdirs):<br>
172
+ `waybackup -u https://example.com -a --start 20210101000000 --end 20210101000000 --explicit`
173
+ 4. Download all snapshots of all available files in the given range:<br>
174
+ `waybackup -u https://example.com -a --start 20210101000000 --end 20231122000000`
175
+
176
+ <br>
177
+ <br>
178
+
179
+ ## Output
180
+
181
+ ### Path Structure
151
182
 
152
183
  The output path is currently structured as follows, shown for the example query:<br>
153
- `http://example.com/subdir1/subdir2/assets/`:
184
+ `http://example.com/subdir1/subdir2/assets/`
154
185
  <br><br>
155
186
  For the first and last version (`-f` or `-l`):
156
- - The requested path will only include all files/folders starting from your query-path.
187
+ - Will only include all files/folders starting from your query-path.
157
188
  ```
158
189
  your/path/waybackup_snapshots/
159
190
  └── the_root_of_your_query/ (example.com/)
@@ -165,7 +196,7 @@ your/path/waybackup_snapshots/
165
196
  ...
166
197
  ```
167
198
  For all versions (`-a`):
168
- - Will currently create a folder named as the root of your query. Inside this folder, you will find all timestamps and per timestamp the path you requested.
199
+ - Will create a folder named as the root of your query. Inside this folder, you will find all timestamps and per timestamp the path you requested.
169
200
  ```
170
201
  your/path/waybackup_snapshots/
171
202
  └── the_root_of_your_query/ (example.com/)
@@ -184,7 +215,7 @@ your/path/waybackup_snapshots/
184
215
  ...
185
216
  ```
186
217
 
187
- ## CSV Output
218
+ ### CSV
188
219
 
189
220
  Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
190
221
 
@@ -210,11 +241,10 @@ For download queries:
210
241
 
211
242
  Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
212
243
 
213
- ### Known ToDos
214
-
215
- - [ ] currently there is no logic to handle if both a http and https version of a page is available
244
+ <br>
245
+ <br>
216
246
 
217
247
  ## Contributing
218
248
 
219
249
  I'm always happy for some feature requests to improve the usability of this tool.
220
- Feel free to give suggestions and report issues. Project is still far from being perfect.
250
+ Feel free to give suggestions and report issues. Project is still far from being perfect.
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
7
7
 
8
8
  [project]
9
9
  name = "pywaybackup"
10
- version = "3.1.0"
10
+ version = "3.2.1"
11
11
  description = "Query and download archive.org as simple as possible."
12
12
  authors = [
13
13
  { name = "bitdruid", email = "bitdruid@outlook.com" }
@@ -16,9 +16,10 @@ license = { file = "LICENSE" }
16
16
  readme = "README.md"
17
17
  requires-python = ">=3.8"
18
18
  dependencies = [
19
- "pysqlite3-binary==0.5.4",
20
- "requests==2.31.0",
21
- "tqdm==4.66.2",
19
+ "pysqlite3-binary==0.5.4; sys_platform == 'linux'",
20
+ "pysqlite-binary; sys_platform == 'win32'",
21
+ "requests==2.32.3",
22
+ "tqdm==4.67.1",
22
23
  "python-magic==0.4.27; sys_platform == 'linux'",
23
24
  "python-magic-bin==0.4.14; sys_platform == 'win32'",
24
25
  ]
@@ -6,15 +6,14 @@ import argparse
6
6
  from importlib.metadata import version
7
7
 
8
8
  from pywaybackup.helper import url_split, sanitize_filename
9
- from pywaybackup.Exception import Exception as ex
10
9
 
11
10
  class Arguments:
12
-
11
+
13
12
  def __init__(self):
14
-
13
+
15
14
  parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
16
15
  parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version("pywaybackup") + ' by @bitdruid -> https://github.com/bitdruid')
17
-
16
+
18
17
  required = parser.add_argument_group('required (one exclusive)')
19
18
  required.add_argument('-u', '--url', type=str, metavar="", help='url (with subdir/subdomain) to download')
20
19
  exclusive_required = required.add_mutually_exclusive_group(required=True)
@@ -32,16 +31,17 @@ class Arguments:
32
31
  optional.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')
33
32
 
34
33
  behavior = parser.add_argument_group('manipulate behavior')
35
- behavior.add_argument('-o', '--output', type=str, metavar="", help='output folder - defaults to current directory')
34
+ behavior.add_argument('-o', '--output', type=str, metavar="", help='output for all files - defaults to current directory')
35
+ behavior.add_argument('-m', '--metadata', type=str, metavar="", help='change directory for db/cdx/csv/log files')
36
36
  behavior.add_argument('--log', action='store_true', help='save a log file into the output folder')
37
37
  behavior.add_argument('--progress', action='store_true', help='show a progress bar')
38
38
  behavior.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
39
- #behavior.add_argument('--verbosity', type=str, default="info", metavar="", help='verbosity level (info, trace)')
40
39
  behavior.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
41
40
  behavior.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
42
41
  # behavior.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
43
42
  behavior.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
44
-
43
+ behavior.add_argument('--verbose', action='store_true', help='overwritten by progress - gives detailed output')
44
+
45
45
  special = parser.add_argument_group('special')
46
46
  special.add_argument('--reset', action='store_true', help='reset the job and ignore existing cdx/db/csv files')
47
47
  special.add_argument('--keep', action='store_true', help='keep all files after the job finished')
@@ -75,11 +75,11 @@ class Configuration:
75
75
 
76
76
  if cls.output is None:
77
77
  cls.output = os.path.join(os.getcwd(), "waybackup_snapshots")
78
+ if cls.metadata is None:
79
+ cls.metadata = cls.output
78
80
  os.makedirs(cls.output, exist_ok=True) if not cls.save else None
79
-
80
- if cls.log is True:
81
- cls.log = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.log")
82
-
81
+ os.makedirs(cls.metadata, exist_ok=True) if not cls.save else None
82
+
83
83
  if cls.all:
84
84
  cls.mode = "all"
85
85
  if cls.last:
@@ -91,10 +91,13 @@ class Configuration:
91
91
 
92
92
  if cls.filetype:
93
93
  cls.filetype = [ft.lower().strip() for ft in cls.filetype.split(",")]
94
-
95
- cls.cdxfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.cdx")
96
- cls.dbfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.db")
97
- cls.csvfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.csv")
94
+
95
+ base_path = cls.metadata
96
+ base_name = f"waybackup_{sanitize_filename(cls.url)}"
97
+ cls.cdxfile = os.path.join(base_path, f"{base_name}.cdx")
98
+ cls.dbfile = os.path.join(base_path, f"{base_name}.db")
99
+ cls.csvfile = os.path.join(base_path, f"{base_name}.csv")
100
+ cls.log = os.path.join(base_path, f"{base_name}.log") if cls.log else None
98
101
 
99
102
  if cls.reset:
100
103
  os.remove(cls.cdxfile) if os.path.isfile(cls.cdxfile) else None
@@ -29,7 +29,7 @@ class Converter:
29
29
 
30
30
 
31
31
  @classmethod
32
- def links(cls, filepath, status_message=None):
32
+ def links(cls, filepath, status_content=None):
33
33
  """
34
34
  Convert all links in a HTML / CSS / JS file to local paths.
35
35
  """
@@ -72,7 +72,7 @@ class Converter:
72
72
  if original_url.startswith("//"):
73
73
  external = True
74
74
  if external:
75
- status_message.trace(status="", type=f"{count}/{len(links)}", message="External url")
75
+ status_message.trace(status="", info=f"{count}/{len(links)}", content="External url")
76
76
  return original_url
77
77
 
78
78
  # convert the url to a relative path to the local root (download dir) if it's a valid path, else return the original url
@@ -87,7 +87,7 @@ class Converter:
87
87
  if original_url.startswith("../"): # if file is already ../ check if it's not too many steps up
88
88
  original_url = f"{cls.define_root_steps(filepath)}{original_url.split('../')[-1].lstrip('/')}"
89
89
  else:
90
- status_message.trace(status="", type="", message=f"{count}/{len(links)}: URL is not a valid path")
90
+ status_message.trace(status="", info="", content=f"{count}/{len(links)}: URL is not a valid path")
91
91
 
92
92
  return original_url
93
93
 
@@ -158,24 +158,24 @@ class Converter:
158
158
 
159
159
  if os.path.isfile(filepath):
160
160
  if magic.from_file(filepath, mime=True).split("/")[1] == "javascript":
161
- status_message.trace(status="Error", type="", message="JS-file is not supported")
161
+ status_message.trace(status="Error", info="", content="JS-file is not supported")
162
162
  return
163
163
  try:
164
- with open(filepath, "r") as file:
164
+ with open(filepath, "r", encoding="utf-8") as file:
165
165
  domain = config.domain
166
166
  content = file.read()
167
167
  links = extract_urls(content)
168
- status_message.store(message=f"\n-----> Convert: [{len(links)}] links in file")
168
+ status_message.store(verbose=True, content=f"\n-----> Convert: [{len(links)}] links in file")
169
169
  count = 1
170
170
  for original_link in links:
171
- status_message.trace(status="ORIG", type=f"{count}/{len(links)}", message=original_link)
171
+ status_message.trace(status="ORIG", info=f"{count}/{len(links)}", content=original_link)
172
172
  new_link = local_url(original_link, domain, count)
173
173
  if new_link != original_link:
174
- status_message.trace(status="CONV", type=f"{count}/{len(links)}", message=new_link)
174
+ status_message.trace(status="CONV", info=f"{count}/{len(links)}", content=new_link)
175
175
  content = content.replace(original_link, new_link)
176
176
  count += 1
177
- file = open(filepath, "w")
177
+ file = open(filepath, "w", encoding="utf-8")
178
178
  file.write(content)
179
179
  file.close()
180
180
  except UnicodeDecodeError:
181
- status_message.trace(status="Error", type="", message="Could not decode file to convert links")
181
+ status_message.trace(status="Error", info="", content="Could not decode file to convert links")