wayback_machine_downloader_straw 2.3.3 → 2.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +9 -1
- data/lib/wayback_machine_downloader/archive_api.rb +1 -1
- data/lib/wayback_machine_downloader.rb +252 -55
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 220999514eb0c1dd5bce948a2ac028e4527eb07f089b9f6b437f02a6a00860be
+  data.tar.gz: '03780351285ee37d38ba04652725ffa33f6112837e01e68469c8be3cda13eb45'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3b05448a6271b8e45d5655b5ee415851f6e8e2daaec5f9bb12b0681e58292c06fe4ab91ab4f2ca1530edb0632755808dc8a465165c5e73fea2673481dddad610
+  data.tar.gz: 95440ee51316da6f2e48c3ec1d54f9fc391b2d59447625f07052222ecfeacf6fc26d430ac64ede6da589c0115fddc7a71fc7eb2fa45ff403491f1b3dc51b66ec
data/bin/wayback_machine_downloader
CHANGED
@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
   end
 
   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] =
+    options[:rewritten] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
   end
 
   opts.on("-v", "--version", "Display version") do |t|
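The two new switches are plain booleans that the binary forwards to the library as params[:reset] and params[:keep] (see the initialize change further down). A minimal sketch of driving the class directly with the same options; the reset/keep keys come from this diff, while base_url and directory are assumed constructor keys based on the accessor list below:

    require 'wayback_machine_downloader'

    # Hypothetical direct use of the library, roughly equivalent to
    #   wayback_machine_downloader https://example.com --reset
    downloader = WaybackMachineDownloader.new(
      base_url:  "https://example.com",    # site to mirror (assumed key)
      directory: "websites/example.com",   # output directory (assumed key; optional)
      reset:     true,                     # delete .cdx.json / .downloaded.txt and start over
      keep:      false                     # true would preserve the state files after success (--keep)
    )
    downloader.download_files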
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED
@@ -4,7 +4,7 @@ require 'uri'
 module ArchiveAPI
 
   def get_raw_list_from_api(url, page_index, http)
-    request_url = URI("https://web.archive.org/cdx/search/
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -9,6 +9,8 @@ require 'json'
 require 'time'
 require 'concurrent-ruby'
 require 'logger'
+require 'zlib'
+require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -111,17 +113,19 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.3"
+  VERSION = "2.3.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
   RATE_LIMIT = 0.25 # Delay between requests in seconds
   CONNECTION_POOL_SIZE = 10
   MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+  STATE_CDX_FILENAME = ".cdx.json"
+  STATE_DB_FILENAME = ".downloaded.txt"
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
 
   def initialize params
     validate_params(params)
@@ -137,10 +141,15 @@ class WaybackMachineDownloader
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
     @rewritten = params[:rewritten]
+    @reset = params[:reset]
+    @keep = params[:keep]
     @timeout = params[:timeout] || DEFAULT_TIMEOUT
     @logger = setup_logger
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+    @db_mutex = Mutex.new
+
+    handle_reset
   end
 
   def backup_name
@@ -163,6 +172,23 @@ class WaybackMachineDownloader
     end
   end
 
+  def cdx_path
+    File.join(backup_path, STATE_CDX_FILENAME)
+  end
+
+  def db_path
+    File.join(backup_path, STATE_DB_FILENAME)
+  end
+
+  def handle_reset
+    if @reset
+      puts "Resetting download state..."
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+      puts "Removed state files: #{cdx_path}, #{db_path}"
+    end
+  end
+
   def match_only_filter file_url
     if @only_filter
       only_filter_regex = @only_filter.to_regex
@@ -190,28 +216,100 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-
+    if File.exist?(cdx_path) && !@reset
+      puts "Loading snapshot list from #{cdx_path}"
+      begin
+        snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+        puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+        puts
+        return Concurrent::Array.new(snapshot_list_to_consider)
+      rescue JSON::ParserError => e
+        puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      rescue => e
+        puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      end
+    end
+
+    snapshot_list_to_consider = Concurrent::Array.new
+    mutex = Mutex.new
 
+    puts "Getting snapshot pages from Wayback Machine API..."
+
+    # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
-
+      initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      mutex.synchronize do
+        snapshot_list_to_consider.concat(initial_list)
+        print "."
+      end
+    end
+
+    # Fetch additional pages if the exact URL flag is not set
+    unless @exact_url
+      page_index = 0
+      batch_size = [@threads_count, 5].min
+      continue_fetching = true
+
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            [page, result]
+          end
+        end
 
-
-      snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
-      print "."
+        results = []
 
-
-
-
-
-
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
+        end
+
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
 
-
-
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
+          end
         end
+
+        page_index = end_index
+
+        sleep(RATE_LIMIT) if continue_fetching
       end
     end
 
-    puts " found #{snapshot_list_to_consider.length} snapshots
+    puts " found #{snapshot_list_to_consider.length} snapshots."
+
+    # Save the fetched list to the cache file
+    begin
+      FileUtils.mkdir_p(File.dirname(cdx_path))
+      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+      puts "Saved snapshot list to #{cdx_path}"
+    rescue => e
+      puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+    end
     puts
 
     snapshot_list_to_consider
@@ -301,32 +399,103 @@ class WaybackMachineDownloader
     puts "]"
   end
 
+  def load_downloaded_ids
+    downloaded_ids = Set.new
+    if File.exist?(db_path) && !@reset
+      puts "Loading list of already downloaded files from #{db_path}"
+      begin
+        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+      rescue => e
+        puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+        downloaded_ids.clear
+      end
+    end
+    downloaded_ids
+  end
+
+  def append_to_db(file_id)
+    @db_mutex.synchronize do
+      begin
+        FileUtils.mkdir_p(File.dirname(db_path))
+        File.open(db_path, 'a') { |f| f.puts(file_id) }
+      rescue => e
+        @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+      end
+    end
+  end
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-
-
-
+
+    FileUtils.mkdir_p(backup_path)
+
+    # Load the list of files to potentially download
+    files_to_download = file_list_by_timestamp
+
+    if files_to_download.empty?
+      puts "No files found matching criteria."
+      cleanup
+      return
+    end
+
+    total_files = files_to_download.count
+    puts "#{total_files} files found matching criteria."
+
+    # Load IDs of already downloaded files
+    downloaded_ids = load_downloaded_ids
+    files_to_process = files_to_download.reject do |file_info|
+      downloaded_ids.include?(file_info[:file_id])
+    end
+
+    remaining_count = files_to_process.count
+    skipped_count = total_files - remaining_count
+
+    if skipped_count > 0
+      puts "Found #{skipped_count} previously downloaded files, skipping them."
+    end
+
+    if remaining_count == 0
+      puts "All matching files have already been downloaded."
+      cleanup
       return
     end
 
-
-
-
+    puts "#{remaining_count} files to download:"
+
     @processed_file_count = 0
+    @total_to_download = remaining_count
     @download_mutex = Mutex.new
-
+
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
-
-
+
+    files_to_process.each do |file_remote_info|
       pool.post do
-
-
-        @
-
-
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # for now, assume success if no exception and message doesn't indicate error/skip
+            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+              download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # append to DB only after successful download, outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
           end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+          end
         end
         sleep(RATE_LIMIT)
       end
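The resume logic above boils down to an append-only log: each successfully downloaded file_id is written to .downloaded.txt, and the next run reads the whole log into a Set and rejects those entries before queueing work. A standalone sketch of the same pattern (file path and IDs are illustrative, not the gem's API):

    require 'set'
    require 'fileutils'

    log_path = "websites/example.com/.downloaded.txt"   # assumed location, mirroring STATE_DB_FILENAME

    # Load previously completed IDs (empty set on the first run).
    done = Set.new
    File.foreach(log_path) { |line| done.add(line.strip) } if File.exist?(log_path)

    pending = ["index.html", "about.html"].reject { |id| done.include?(id) }

    pending.each do |id|
      # ... download id here ...
      FileUtils.mkdir_p(File.dirname(log_path))
      # Record success immediately, so a crash loses at most one entry.
      File.open(log_path, "a") { |f| f.puts(id) }
    end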
@@ -336,7 +505,8 @@ class WaybackMachineDownloader
     pool.wait_for_termination
 
     end_time = Time.now
-    puts "\nDownload
+    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+    puts "Results saved in #{backup_path}"
     cleanup
   end
 
@@ -368,9 +538,10 @@ class WaybackMachineDownloader
     file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
-
+    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
+    file_path_elements = original_file_id.split('/')
 
-    if
+    if original_file_id == ""
       dir_path = backup_path
       file_path = backup_path + 'index.html'
     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -384,21 +555,24 @@ class WaybackMachineDownloader
       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # check existence *before* download attempt
+    # this handles cases where a file was created manually or by a previous partial run without a .db entry
+    if File.exist? file_path
+      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+    end
+
+    begin
+      structure_dir_path dir_path
+      download_with_retry(file_path, file_url, file_timestamp, http)
+      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+    rescue StandardError => e
+      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+        File.delete(file_path)
+        msg += "\n#{file_path} was empty and was removed."
       end
-
-    "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+      msg
     end
   end
 
@@ -431,23 +605,33 @@ class WaybackMachineDownloader
     begin
       wayback_url = if @rewritten
         "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
-      else
+      else
         "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
       end
-
+
       request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
-
+      request["Accept-Encoding"] = "gzip, deflate"
+
       response = connection.request(request)
-
+
       case response
       when Net::HTTPSuccess
         File.open(file_path, "wb") do |file|
-
-
+          body = response.body
+          if response['content-encoding'] == 'gzip' && body && !body.empty?
+            begin
+              gz = Zlib::GzipReader.new(StringIO.new(body))
+              decompressed_body = gz.read
+              gz.close
+              file.write(decompressed_body)
+            rescue Zlib::GzipFile::Error => e
+              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+              file.write(body)
+            end
           else
-            file.write(
+            file.write(body) if body
           end
         end
       when Net::HTTPRedirection
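With the request now advertising Accept-Encoding: gzip, deflate, the success branch has to undo the transport compression itself before writing to disk: Zlib::GzipReader over an in-memory StringIO, with a fallback to the raw bytes when the payload turns out not to be gzip. A standalone sketch of that decode step (method name and arguments are illustrative, not the gem's API):

    require 'zlib'
    require 'stringio'

    # Decompress body if content_encoding says it is gzip; otherwise return it untouched.
    # Falls back to the raw bytes when the data is not actually valid gzip.
    def decode_body(body, content_encoding)
      return body unless content_encoding == 'gzip' && body && !body.empty?
      Zlib::GzipReader.new(StringIO.new(body)).read
    rescue Zlib::GzipFile::Error, Zlib::Error
      body
    end

    # Usage with a hypothetical Net::HTTP response object:
    # data = decode_body(response.body, response['content-encoding'])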
@@ -464,7 +648,7 @@ class WaybackMachineDownloader
       else
         raise "HTTP Error: #{response.code} #{response.message}"
       end
-
+
     rescue StandardError => e
       if retries < MAX_RETRIES
         retries += 1
@@ -480,12 +664,25 @@ class WaybackMachineDownloader
 
   def cleanup
     @connection_pool.shutdown
-
+
     if @failed_downloads.any?
+      @logger.error("Download completed with errors.")
       @logger.error("Failed downloads summary:")
      @failed_downloads.each do |failure|
        @logger.error("  #{failure[:url]} - #{failure[:error]}")
      end
+      unless @reset
+        puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+        return
+      end
+    end
+
+    if !@keep || @reset
+      puts "Cleaning up state files..." unless @keep && !@reset
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+    elsif @keep
+      puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
     end
   end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.3
+  version: 2.3.4
 platform: ruby
 authors:
 - strawberrymaster
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-04-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby