wayback_machine_downloader_straw 2.3.3 → 2.3.4

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a8d577ca08cca3858efd95bfd879b198a57aa6262fa8e0a7f83ab4f3a362f1fc
- data.tar.gz: ef73d81d745e7b3e9226458a66b5d54c2410db646ea85cc7145813bc26789dc7
+ metadata.gz: 220999514eb0c1dd5bce948a2ac028e4527eb07f089b9f6b437f02a6a00860be
+ data.tar.gz: '03780351285ee37d38ba04652725ffa33f6112837e01e68469c8be3cda13eb45'
  SHA512:
- metadata.gz: 938e8544bb16b4afc6c81d0e4da602b5d3cd3e05482b3cc945ad3405681278fc03c7ccc1b84992b1ecafe66cf202aa71f306226b92f4b93b30b1c5c7edcbc86e
- data.tar.gz: 8c236877be6274b9bb3c474fde9ad5a72a30abd5db3eadfc11eeae997488a2ddf27a72388b6d7d47455235ed0e084b1d14a7782911d0dd69e41f2fdcada713e2
+ metadata.gz: 3b05448a6271b8e45d5655b5ee415851f6e8e2daaec5f9bb12b0681e58292c06fe4ab91ab4f2ca1530edb0632755808dc8a465165c5e73fea2673481dddad610
+ data.tar.gz: 95440ee51316da6f2e48c3ec1d54f9fc391b2d59447625f07052222ecfeacf6fc26d430ac64ede6da589c0115fddc7a71fc7eb2fa45ff403491f1b3dc51b66ec
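These digests cover the metadata.gz and data.tar.gz members inside the released .gem archive (a .gem file is a plain tar). A minimal Ruby sketch for recomputing the SHA256 values locally; the filename is an assumption about what `gem fetch wayback_machine_downloader_straw -v 2.3.4` produces:

    require 'digest'
    require 'rubygems/package'

    # Assumed local filename of the fetched release
    gem_file = 'wayback_machine_downloader_straw-2.3.4.gem'

    # A .gem is a tar archive containing metadata.gz, data.tar.gz and checksums.yaml.gz
    Gem::Package::TarReader.new(File.open(gem_file, 'rb')) do |tar|
      tar.each do |entry|
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
        puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
      end
    end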
@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
  end
 
  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
- options[:rewritten] = t
+ options[:rewritten] = true
+ end
+
+ opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+ options[:reset] = true
+ end
+
+ opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+ options[:keep] = true
  end
 
  opts.on("-v", "--version", "Display version") do |t|
@@ -4,7 +4,7 @@ require 'uri'
  module ArchiveAPI
 
  def get_raw_list_from_api(url, page_index, http)
- request_url = URI("https://web.archive.org/cdx/search/xd")
+ request_url = URI("https://web.archive.org/cdx/search/cdx")
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
  request_url.query = URI.encode_www_form(params)
 
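Version 2.3.3 queried /cdx/search/xd, a typo for the real CDX endpoint; 2.3.4 restores the correct path. A standalone sketch of the corrected query, using example.com as a placeholder URL and a small limit to keep the response short:

    require 'net/http'
    require 'uri'
    require 'json'

    # Same query shape the gem builds, against the corrected endpoint
    request_url = URI("https://web.archive.org/cdx/search/cdx")
    request_url.query = URI.encode_www_form([["output", "json"], ["url", "example.com"], ["limit", "5"]])

    rows = JSON.parse(Net::HTTP.get(request_url))
    # With output=json the first row is the column header,
    # e.g. ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"]
    puts rows.first.inspect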
@@ -9,6 +9,8 @@ require 'json'
  require 'time'
  require 'concurrent-ruby'
  require 'logger'
+ require 'zlib'
+ require 'stringio'
  require_relative 'wayback_machine_downloader/tidy_bytes'
  require_relative 'wayback_machine_downloader/to_regex'
  require_relative 'wayback_machine_downloader/archive_api'
@@ -111,17 +113,19 @@ class WaybackMachineDownloader
 
  include ArchiveAPI
 
- VERSION = "2.3.3"
+ VERSION = "2.3.4"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
  RATE_LIMIT = 0.25 # Delay between requests in seconds
  CONNECTION_POOL_SIZE = 10
  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+ STATE_CDX_FILENAME = ".cdx.json"
+ STATE_DB_FILENAME = ".downloaded.txt"
 
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
- :all, :maximum_pages, :threads_count, :logger
+ :all, :maximum_pages, :threads_count, :logger, :reset, :keep
 
  def initialize params
  validate_params(params)
@@ -137,10 +141,15 @@ class WaybackMachineDownloader
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
  @threads_count = [params[:threads_count].to_i, 1].max
  @rewritten = params[:rewritten]
+ @reset = params[:reset]
+ @keep = params[:keep]
  @timeout = params[:timeout] || DEFAULT_TIMEOUT
  @logger = setup_logger
  @failed_downloads = Concurrent::Array.new
  @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+ @db_mutex = Mutex.new
+
+ handle_reset
  end
 
  def backup_name
@@ -163,6 +172,23 @@ class WaybackMachineDownloader
  end
  end
 
+ def cdx_path
+ File.join(backup_path, STATE_CDX_FILENAME)
+ end
+
+ def db_path
+ File.join(backup_path, STATE_DB_FILENAME)
+ end
+
+ def handle_reset
+ if @reset
+ puts "Resetting download state..."
+ FileUtils.rm_f(cdx_path)
+ FileUtils.rm_f(db_path)
+ puts "Removed state files: #{cdx_path}, #{db_path}"
+ end
+ end
+
  def match_only_filter file_url
  if @only_filter
  only_filter_regex = @only_filter.to_regex
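Both state files live directly inside the backup directory: .cdx.json caches the snapshot list as a JSON array, and .downloaded.txt is an append-only log with one downloaded file ID per line. A small inspection sketch; the websites/example.com path is only an assumption about where the backup was written:

    require 'json'

    # Assumed backup directory; adjust to wherever the downloader wrote its output
    backup_path = File.join("websites", "example.com")

    snapshots = JSON.parse(File.read(File.join(backup_path, ".cdx.json")))
    puts "#{snapshots.length} cached snapshots"

    downloaded = File.readlines(File.join(backup_path, ".downloaded.txt"), chomp: true)
    puts "#{downloaded.length} file IDs already downloaded"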
@@ -190,28 +216,100 @@ class WaybackMachineDownloader
  end
 
  def get_all_snapshots_to_consider
- snapshot_list_to_consider = []
+ if File.exist?(cdx_path) && !@reset
+ puts "Loading snapshot list from #{cdx_path}"
+ begin
+ snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+ puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+ puts
+ return Concurrent::Array.new(snapshot_list_to_consider)
+ rescue JSON::ParserError => e
+ puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+ FileUtils.rm_f(cdx_path)
+ rescue => e
+ puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+ FileUtils.rm_f(cdx_path)
+ end
+ end
+
+ snapshot_list_to_consider = Concurrent::Array.new
+ mutex = Mutex.new
 
+ puts "Getting snapshot pages from Wayback Machine API..."
+
+ # Fetch the initial set of snapshots, sequentially
  @connection_pool.with_connection do |connection|
- puts "Getting snapshot pages"
+ initial_list = get_raw_list_from_api(@base_url, nil, connection)
+ mutex.synchronize do
+ snapshot_list_to_consider.concat(initial_list)
+ print "."
+ end
+ end
+
+ # Fetch additional pages if the exact URL flag is not set
+ unless @exact_url
+ page_index = 0
+ batch_size = [@threads_count, 5].min
+ continue_fetching = true
+
+ while continue_fetching && page_index < @maximum_pages
+ # Determine the range of pages to fetch in this batch
+ end_index = [page_index + batch_size, @maximum_pages].min
+ current_batch = (page_index...end_index).to_a
+
+ # Create futures for concurrent API calls
+ futures = current_batch.map do |page|
+ Concurrent::Future.execute do
+ result = nil
+ @connection_pool.with_connection do |connection|
+ result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+ end
+ [page, result]
+ end
+ end
 
- # Fetch the initial set of snapshots
- snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
- print "."
+ results = []
 
- # Fetch additional pages if the exact URL flag is not set
- unless @exact_url
- @maximum_pages.times do |page_index|
- snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
- break if snapshot_list.empty?
+ futures.each do |future|
+ begin
+ results << future.value
+ rescue => e
+ puts "\nError fetching page #{future}: #{e.message}"
+ end
+ end
+
+ # Sort results by page number to maintain order
+ results.sort_by! { |page, _| page }
 
- snapshot_list_to_consider += snapshot_list
- print "."
+ # Process results and check for empty pages
+ results.each do |page, result|
+ if result.empty?
+ continue_fetching = false
+ break
+ else
+ mutex.synchronize do
+ snapshot_list_to_consider.concat(result)
+ print "."
+ end
+ end
  end
+
+ page_index = end_index
+
+ sleep(RATE_LIMIT) if continue_fetching
  end
  end
 
- puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+ puts " found #{snapshot_list_to_consider.length} snapshots."
+
+ # Save the fetched list to the cache file
+ begin
+ FileUtils.mkdir_p(File.dirname(cdx_path))
+ File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+ puts "Saved snapshot list to #{cdx_path}"
+ rescue => e
+ puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+ end
  puts
 
  snapshot_list_to_consider
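The rewritten pagination loop fans each batch of CDX pages out to Concurrent::Future tasks and stops at the first empty page. A reduced, standalone sketch of that pattern, with a stubbed page fetch standing in for the real API call:

    require 'concurrent'

    # Stub standing in for get_raw_list_from_api; returns an empty page after page 11
    fetch_page = ->(page) { page < 12 ? ["row-#{page}"] : [] }

    rows_collected = []
    batch_size = 5
    page_index = 0
    keep_going = true

    while keep_going && page_index < 100
      batch = (page_index...[page_index + batch_size, 100].min).to_a

      futures = batch.map do |page|
        Concurrent::Future.execute { [page, fetch_page.call(page)] }
      end

      # value blocks until each task finishes; re-sort to preserve page order
      futures.map(&:value).sort_by { |page, _| page }.each do |_page, rows|
        if rows.empty?
          keep_going = false
          break
        end
        rows_collected.concat(rows)
      end

      page_index += batch_size
    end

    puts "collected #{rows_collected.length} rows"   # => collected 12 rows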
@@ -301,32 +399,103 @@ class WaybackMachineDownloader
  puts "]"
  end
 
+ def load_downloaded_ids
+ downloaded_ids = Set.new
+ if File.exist?(db_path) && !@reset
+ puts "Loading list of already downloaded files from #{db_path}"
+ begin
+ File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+ rescue => e
+ puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+ downloaded_ids.clear
+ end
+ end
+ downloaded_ids
+ end
+
+ def append_to_db(file_id)
+ @db_mutex.synchronize do
+ begin
+ FileUtils.mkdir_p(File.dirname(db_path))
+ File.open(db_path, 'a') { |f| f.puts(file_id) }
+ rescue => e
+ @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+ end
+ end
+ end
+
  def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-
- if file_list_by_timestamp.empty?
- puts "No files to download."
+
+ FileUtils.mkdir_p(backup_path)
+
+ # Load the list of files to potentially download
+ files_to_download = file_list_by_timestamp
+
+ if files_to_download.empty?
+ puts "No files found matching criteria."
+ cleanup
+ return
+ end
+
+ total_files = files_to_download.count
+ puts "#{total_files} files found matching criteria."
+
+ # Load IDs of already downloaded files
+ downloaded_ids = load_downloaded_ids
+ files_to_process = files_to_download.reject do |file_info|
+ downloaded_ids.include?(file_info[:file_id])
+ end
+
+ remaining_count = files_to_process.count
+ skipped_count = total_files - remaining_count
+
+ if skipped_count > 0
+ puts "Found #{skipped_count} previously downloaded files, skipping them."
+ end
+
+ if remaining_count == 0
+ puts "All matching files have already been downloaded."
+ cleanup
  return
  end
 
- total_files = file_list_by_timestamp.count
- puts "#{total_files} files to download:"
-
+ puts "#{remaining_count} files to download:"
+
  @processed_file_count = 0
+ @total_to_download = remaining_count
  @download_mutex = Mutex.new
-
+
  thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
  pool = Concurrent::FixedThreadPool.new(thread_count)
-
- file_list_by_timestamp.each do |file_remote_info|
+
+ files_to_process.each do |file_remote_info|
  pool.post do
- @connection_pool.with_connection do |connection|
- result = download_file(file_remote_info, connection)
- @download_mutex.synchronize do
- @processed_file_count += 1
- puts result if result
+ download_success = false
+ begin
+ @connection_pool.with_connection do |connection|
+ result_message = download_file(file_remote_info, connection)
+ # for now, assume success if no exception and message doesn't indicate error/skip
+ if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+ download_success = true
+ end
+ @download_mutex.synchronize do
+ @processed_file_count += 1
+ # adjust progress message to reflect remaining files
+ progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+ puts progress_message if progress_message
+ end
+ end
+ # sppend to DB only after successful download outside the connection block
+ if download_success
+ append_to_db(file_remote_info[:file_id])
  end
+ rescue => e
+ @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+ @download_mutex.synchronize do
+ @processed_file_count += 1
+ end
  end
  sleep(RATE_LIMIT)
  end
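Resumption is a plain Set difference: IDs already logged in .downloaded.txt are loaded into a Set and rejected from the work list before the thread pool starts, and each new success is appended to the log. A reduced sketch of that filter with made-up file records:

    require 'set'

    # Hypothetical work list, shaped like the entries from file_list_by_timestamp
    files_to_download = [
      { file_id: "index.html",       file_url: "http://example.com/" },
      { file_id: "about/index.html", file_url: "http://example.com/about/" }
    ]

    # IDs read from .downloaded.txt, one per line
    downloaded_ids = Set.new(["index.html"])

    files_to_process = files_to_download.reject { |f| downloaded_ids.include?(f[:file_id]) }
    puts "#{files_to_process.length} of #{files_to_download.length} files still to download"
    # => 1 of 2 files still to download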
@@ -336,7 +505,8 @@ class WaybackMachineDownloader
  pool.wait_for_termination
 
  end_time = Time.now
- puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+ puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+ puts "Results saved in #{backup_path}"
  cleanup
  end
 
@@ -368,9 +538,10 @@ class WaybackMachineDownloader
  file_url = file_remote_info[:file_url].encode(current_encoding)
  file_id = file_remote_info[:file_id]
  file_timestamp = file_remote_info[:timestamp]
- file_path_elements = file_id.split('/')
+ original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
+ file_path_elements = original_file_id.split('/')
 
- if file_id == ""
+ if original_file_id == ""
  dir_path = backup_path
  file_path = backup_path + 'index.html'
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -384,21 +555,24 @@ class WaybackMachineDownloader
  dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  end
- unless File.exist? file_path
- begin
- structure_dir_path dir_path
- download_with_retry(file_path, file_url, file_timestamp, http)
- "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
- rescue StandardError => e
- msg = "#{file_url} # #{e}"
- if not @all and File.exist?(file_path) and File.size(file_path) == 0
- File.delete(file_path)
- msg += "\n#{file_path} was empty and was removed."
- end
- msg
+
+ # check existence *before* download attempt
+ # this handles cases where a file was created manually or by a previous partial run without a .db entry
+ if File.exist? file_path
+ return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+ end
+
+ begin
+ structure_dir_path dir_path
+ download_with_retry(file_path, file_url, file_timestamp, http)
+ "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+ rescue StandardError => e
+ msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+ if not @all and File.exist?(file_path) and File.size(file_path) == 0
+ File.delete(file_path)
+ msg += "\n#{file_path} was empty and was removed."
  end
- else
- "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+ msg
  end
  end
 
@@ -431,23 +605,33 @@ class WaybackMachineDownloader
  begin
  wayback_url = if @rewritten
  "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
- else
+ else
  "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
  end
-
+
  request = Net::HTTP::Get.new(URI(wayback_url))
  request["Connection"] = "keep-alive"
  request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
-
+ request["Accept-Encoding"] = "gzip, deflate"
+
  response = connection.request(request)
-
+
  case response
  when Net::HTTPSuccess
  File.open(file_path, "wb") do |file|
- if block_given?
- yield(response, file)
+ body = response.body
+ if response['content-encoding'] == 'gzip' && body && !body.empty?
+ begin
+ gz = Zlib::GzipReader.new(StringIO.new(body))
+ decompressed_body = gz.read
+ gz.close
+ file.write(decompressed_body)
+ rescue Zlib::GzipFile::Error => e
+ @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+ file.write(body)
+ end
  else
- file.write(response.body)
+ file.write(body) if body
  end
  end
  when Net::HTTPRedirection
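Since requests now advertise Accept-Encoding: gzip, deflate, the archived body can arrive gzip-compressed and must be inflated before it is written to disk. A standalone sketch of the same Zlib round trip, compressing a local string instead of using a real HTTP response:

    require 'zlib'
    require 'stringio'

    original = "<html><body>hello</body></html>"

    # Produce a gzip-encoded body, standing in for a response with Content-Encoding: gzip
    buffer = StringIO.new
    gz_writer = Zlib::GzipWriter.new(buffer)
    gz_writer.write(original)
    gz_writer.close
    compressed = buffer.string

    # The inflate path the downloader takes before writing the file
    body = Zlib::GzipReader.new(StringIO.new(compressed)).read
    puts body == original   # => true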
@@ -464,7 +648,7 @@ class WaybackMachineDownloader
  else
  raise "HTTP Error: #{response.code} #{response.message}"
  end
-
+
  rescue StandardError => e
  if retries < MAX_RETRIES
  retries += 1
@@ -480,12 +664,25 @@ class WaybackMachineDownloader
 
  def cleanup
  @connection_pool.shutdown
-
+
  if @failed_downloads.any?
+ @logger.error("Download completed with errors.")
  @logger.error("Failed downloads summary:")
  @failed_downloads.each do |failure|
  @logger.error(" #{failure[:url]} - #{failure[:error]}")
  end
+ unless @reset
+ puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+ return
+ end
+ end
+
+ if !@keep || @reset
+ puts "Cleaning up state files..." unless @keep && !@reset
+ FileUtils.rm_f(cdx_path)
+ FileUtils.rm_f(db_path)
+ elsif @keep
+ puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
  end
  end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
- version: 2.3.3
+ version: 2.3.4
  platform: ruby
  authors:
  - strawberrymaster
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2025-03-08 00:00:00.000000000 Z
+ date: 2025-04-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: concurrent-ruby