wayback_machine_downloader_straw 2.3.3 → 2.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +9 -1
- data/lib/wayback_machine_downloader/archive_api.rb +1 -1
- data/lib/wayback_machine_downloader.rb +249 -53
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d04d7ddf4b722425377ce84ad5c0f917e35553e38edb755c029ae7a2b8f8055d
+  data.tar.gz: 3d93f41ef2ba3b366a3adf071b947eabb66caf931d022b7ad8a521d3930dfe27
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 312dcd879a3589aa0a75d47296dbe04920eee06ae2fb83c18274d918d12517c8a2064c1e9e7a3c774e656636187cd5fc510f249ae71ea6a319cb437ee8d0314b
+  data.tar.gz: 8cd5dcd421077405f920a8ff966387817450e6169aa9578a9b5f25284d4100f1f5bf77304cda727f975464782064643c96d3ee0985a2b57e16aec523c9e17429
data/bin/wayback_machine_downloader
CHANGED
@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
   end
 
   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] =
+    options[:rewritten] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
   end
 
   opts.on("-v", "--version", "Display version") do |t|
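The two new flags feed straight into the downloader's options hash. A minimal sketch of the equivalent programmatic call; the URL, thread count, and the :base_url/:threads_count keys are assumptions based on the existing constructor, and only :reset and :keep are new in this release:

    require 'wayback_machine_downloader'

    # Hypothetical programmatic use of the new 2.3.5 options.
    options = {
      base_url: "https://example.com",  # placeholder site
      threads_count: 4,
      reset: false,   # true deletes .cdx.json and .downloaded.txt before starting
      keep: true      # keep both state files after a successful run
    }

    WaybackMachineDownloader.new(options).download_files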
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED
@@ -4,7 +4,7 @@ require 'uri'
 module ArchiveAPI
 
   def get_raw_list_from_api(url, page_index, http)
-    request_url = URI("https://web.archive.org/cdx/search/
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -9,6 +9,8 @@ require 'json'
 require 'time'
 require 'concurrent-ruby'
 require 'logger'
+require 'zlib'
+require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -111,17 +113,19 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.3"
+  VERSION = "2.3.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
   RATE_LIMIT = 0.25 # Delay between requests in seconds
   CONNECTION_POOL_SIZE = 10
   MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+  STATE_CDX_FILENAME = ".cdx.json"
+  STATE_DB_FILENAME = ".downloaded.txt"
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
 
   def initialize params
     validate_params(params)
@@ -137,10 +141,15 @@ class WaybackMachineDownloader
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
     @rewritten = params[:rewritten]
+    @reset = params[:reset]
+    @keep = params[:keep]
     @timeout = params[:timeout] || DEFAULT_TIMEOUT
     @logger = setup_logger
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+    @db_mutex = Mutex.new
+
+    handle_reset
   end
 
   def backup_name
@@ -163,6 +172,23 @@ class WaybackMachineDownloader
     end
   end
 
+  def cdx_path
+    File.join(backup_path, STATE_CDX_FILENAME)
+  end
+
+  def db_path
+    File.join(backup_path, STATE_DB_FILENAME)
+  end
+
+  def handle_reset
+    if @reset
+      puts "Resetting download state..."
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+      puts "Removed state files: #{cdx_path}, #{db_path}"
+    end
+  end
+
   def match_only_filter file_url
     if @only_filter
       only_filter_regex = @only_filter.to_regex
@@ -190,28 +216,100 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-
+    if File.exist?(cdx_path) && !@reset
+      puts "Loading snapshot list from #{cdx_path}"
+      begin
+        snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+        puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+        puts
+        return Concurrent::Array.new(snapshot_list_to_consider)
+      rescue JSON::ParserError => e
+        puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      rescue => e
+        puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      end
+    end
+
+    snapshot_list_to_consider = Concurrent::Array.new
+    mutex = Mutex.new
 
+    puts "Getting snapshot pages from Wayback Machine API..."
+
+    # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
-
+      initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      mutex.synchronize do
+        snapshot_list_to_consider.concat(initial_list)
+        print "."
+      end
+    end
+
+    # Fetch additional pages if the exact URL flag is not set
+    unless @exact_url
+      page_index = 0
+      batch_size = [@threads_count, 5].min
+      continue_fetching = true
+
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            [page, result]
+          end
+        end
 
-
-      snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
-      print "."
+        results = []
 
-
-
-
-
-
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
+        end
+
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
 
-
-
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
+          end
        end
+
+        page_index = end_index
+
+        sleep(RATE_LIMIT) if continue_fetching
      end
    end
 
-    puts " found #{snapshot_list_to_consider.length} snapshots
+    puts " found #{snapshot_list_to_consider.length} snapshots."
+
+    # Save the fetched list to the cache file
+    begin
+      FileUtils.mkdir_p(File.dirname(cdx_path))
+      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+      puts "Saved snapshot list to #{cdx_path}"
+    rescue => e
+      puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+    end
     puts
 
     snapshot_list_to_consider
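The new branch is a cache-or-fetch pattern around the CDX snapshot list: reuse .cdx.json when it parses, otherwise refetch and rewrite it. A condensed, self-contained sketch of the same idea; the dummy fetcher block below stands in for the gem's paginated API calls:

    require 'json'
    require 'fileutils'

    # Reuse a cached snapshot list when present and valid; otherwise fetch and persist it.
    def snapshots(cache_file, &fetcher)
      if File.exist?(cache_file)
        begin
          return JSON.parse(File.read(cache_file))
        rescue JSON::ParserError
          FileUtils.rm_f(cache_file)   # corrupt cache: fall through and refetch
        end
      end
      list = fetcher.call
      FileUtils.mkdir_p(File.dirname(cache_file))
      File.write(cache_file, JSON.pretty_generate(list))
      list
    end

    # Example with a dummy fetcher returning two fake snapshot rows.
    puts snapshots("./websites/example.com/.cdx.json") {
      [["20240101000000", "https://example.com/"], ["20240102000000", "https://example.com/about"]]
    }.length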
@@ -301,32 +399,103 @@ class WaybackMachineDownloader
     puts "]"
   end
 
+  def load_downloaded_ids
+    downloaded_ids = Set.new
+    if File.exist?(db_path) && !@reset
+      puts "Loading list of already downloaded files from #{db_path}"
+      begin
+        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+      rescue => e
+        puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+        downloaded_ids.clear
+      end
+    end
+    downloaded_ids
+  end
+
+  def append_to_db(file_id)
+    @db_mutex.synchronize do
+      begin
+        FileUtils.mkdir_p(File.dirname(db_path))
+        File.open(db_path, 'a') { |f| f.puts(file_id) }
+      rescue => e
+        @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+      end
+    end
+  end
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-
-
-
+
+    FileUtils.mkdir_p(backup_path)
+
+    # Load the list of files to potentially download
+    files_to_download = file_list_by_timestamp
+
+    if files_to_download.empty?
+      puts "No files found matching criteria."
+      cleanup
+      return
+    end
+
+    total_files = files_to_download.count
+    puts "#{total_files} files found matching criteria."
+
+    # Load IDs of already downloaded files
+    downloaded_ids = load_downloaded_ids
+    files_to_process = files_to_download.reject do |file_info|
+      downloaded_ids.include?(file_info[:file_id])
+    end
+
+    remaining_count = files_to_process.count
+    skipped_count = total_files - remaining_count
+
+    if skipped_count > 0
+      puts "Found #{skipped_count} previously downloaded files, skipping them."
+    end
+
+    if remaining_count == 0
+      puts "All matching files have already been downloaded."
+      cleanup
       return
     end
 
-
-
-
+    puts "#{remaining_count} files to download:"
+
     @processed_file_count = 0
+    @total_to_download = remaining_count
     @download_mutex = Mutex.new
-
+
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
-
-
+
+    files_to_process.each do |file_remote_info|
       pool.post do
-
-
-        @
-
-
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # for now, assume success if no exception and message doesn't indicate error/skip
+            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+              download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # append to DB only after successful download, outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
           end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+          end
         end
         sleep(RATE_LIMIT)
       end
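Resumability comes from .downloaded.txt: one file ID per line, appended after each successful download and used as a reject filter on the next run. A minimal sketch of that filter, assuming file IDs are plain strings; the sample list below is made up:

    require 'set'

    # Hypothetical file list; in the gem each entry also carries a URL and timestamp.
    all_files  = [{ file_id: "index.html" }, { file_id: "about.html" }]
    state_file = ".downloaded.txt"

    # Anything recorded by a previous run is skipped.
    downloaded = File.exist?(state_file) ? File.foreach(state_file).map(&:strip).to_set : Set.new
    pending    = all_files.reject { |f| downloaded.include?(f[:file_id]) }

    # Record each new success, one ID per line, so the next run can resume.
    pending.each { |f| File.open(state_file, "a") { |io| io.puts(f[:file_id]) } }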
@@ -336,7 +505,8 @@ class WaybackMachineDownloader
     pool.wait_for_termination
 
     end_time = Time.now
-    puts "\nDownload
+    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+    puts "Results saved in #{backup_path}"
     cleanup
   end
 
@@ -384,21 +554,24 @@ class WaybackMachineDownloader
       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # check existence *before* download attempt
+    # this handles cases where a file was created manually or by a previous partial run without a .db entry
+    if File.exist? file_path
+      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+    end
+
+    begin
+      structure_dir_path dir_path
+      download_with_retry(file_path, file_url, file_timestamp, http)
+      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+    rescue StandardError => e
+      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+        File.delete(file_path)
+        msg += "\n#{file_path} was empty and was removed."
       end
-
-      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+      msg
     end
   end
 
@@ -431,23 +604,33 @@ class WaybackMachineDownloader
     begin
       wayback_url = if @rewritten
         "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
-      else
+      else
         "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
       end
-
+
       request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
-
+      request["Accept-Encoding"] = "gzip, deflate"
+
       response = connection.request(request)
-
+
       case response
       when Net::HTTPSuccess
         File.open(file_path, "wb") do |file|
-
-
+          body = response.body
+          if response['content-encoding'] == 'gzip' && body && !body.empty?
+            begin
+              gz = Zlib::GzipReader.new(StringIO.new(body))
+              decompressed_body = gz.read
+              gz.close
+              file.write(decompressed_body)
+            rescue Zlib::GzipFile::Error => e
+              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+              file.write(body)
+            end
           else
-            file.write(
+            file.write(body) if body
           end
         end
       when Net::HTTPRedirection
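Because requests now send Accept-Encoding: gzip, deflate, the body may arrive compressed and is inflated before being written to disk. A standalone sketch of that step, falling back to the raw bytes when the stream is not valid gzip:

    require 'zlib'
    require 'stringio'

    # Inflate a gzip-encoded HTTP body, or return it unchanged if it is not valid gzip.
    def maybe_gunzip(body, content_encoding)
      return body unless content_encoding == 'gzip' && body && !body.empty?
      Zlib::GzipReader.new(StringIO.new(body)).read
    rescue Zlib::GzipFile::Error
      body
    end

    # Round-trip example: compress a string, then decompress it again.
    compressed = StringIO.new.tap { |io| Zlib::GzipWriter.wrap(io) { |gz| gz.write("hello") } }.string
    puts maybe_gunzip(compressed, 'gzip')   # => hello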
@@ -464,7 +647,7 @@ class WaybackMachineDownloader
       else
         raise "HTTP Error: #{response.code} #{response.message}"
       end
-
+
     rescue StandardError => e
       if retries < MAX_RETRIES
         retries += 1
@@ -480,12 +663,25 @@ class WaybackMachineDownloader
 
   def cleanup
     @connection_pool.shutdown
-
+
     if @failed_downloads.any?
+      @logger.error("Download completed with errors.")
       @logger.error("Failed downloads summary:")
       @failed_downloads.each do |failure|
        @logger.error(" #{failure[:url]} - #{failure[:error]}")
       end
+      unless @reset
+        puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+        return
+      end
+    end
+
+    if !@keep || @reset
+      puts "Cleaning up state files..." unless @keep && !@reset
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+    elsif @keep
+      puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
     end
   end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.3
+  version: 2.3.5
 platform: ruby
 authors:
 - strawberrymaster
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-04-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
|