RubyGems - wayback_machine_downloader - Versions diffs - 2.1.1 → 2.3.1 - Mend

wayback_machine_downloader 2.1.1 → 2.3.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +5 -5
data/bin/wayback_machine_downloader +5 -1
data/lib/wayback_machine_downloader/archive_api.rb +22 -12
data/lib/wayback_machine_downloader/tidy_bytes.rb +1 -1
data/lib/wayback_machine_downloader.rb +54 -15
metadata +6 -7

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 48f524cedc0e9f66c7b0acca132a71557a327ea2
-  data.tar.gz: 1d70bb2a76cd07c82c08674fdc96b543caec48c0
+SHA256:
+  metadata.gz: 54752c73ebfac815e91ef6bba40547a36282e5ec9c3ef2792370c13352fce0b6
+  data.tar.gz: df2f5d94981eeb2d1e55d2b4a9dd8fe57a24e8b29cf79a700ca520b7c3bc1a21
 SHA512:
-  metadata.gz: 26eb05cbeebd911502bd01513535c7cc2d4ad0fe3850adc0205ca4f649351e56855af66915d86c501fb8be64963fe1d409d013d8afcd24064cc15673b2cc0854
-  data.tar.gz: 0dbbd54b4b4ab231adcae908bbf6cd3865768590263e767fe5e45fb3a9d70676c337f79aba576378272ddc14647ecd06fc26820fc1dec8cb52704aa6740582b7
+  metadata.gz: 108d33cf57b738ba69ccf960f503ab5ea44b296ba043716fb2e83e9fa5bebcaec9a488bc4a5ab64dad55c1f23434c2b71005a86389e9b26fd07b38372f96b6d4
+  data.tar.gz: 62afad1698415e0c80b85599da7aba1e19574ec571862f8d69c56d1fe718f8c65cae3e3be2293d8418ecc7dd09803b4d9908186e93f3062ccd85b363a5e7dde4

data/bin/wayback_machine_downloader CHANGED Viewed

@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
     options[:directory] = t
   end
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
     options[:from_timestamp] = t
   end
@@ -42,7 +46,7 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end

data/lib/wayback_machine_downloader/archive_api.rb CHANGED Viewed

@@ -1,28 +1,38 @@
+require 'json'
+require 'uri'
 module ArchiveAPI
   def get_raw_list_from_api url, page_index
-    request_url = "http://web.archive.org/cdx/search/xd?url="
-    request_url += url
-    request_url += parameters_for_api page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
-    open(request_url).read
+    begin
+      json = JSON.parse(URI(request_url).open.read)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
   end
   def parameters_for_api page_index
-    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
-    if @all
-      parameters += ""
-    else
-      parameters += "&filter=statuscode:200"
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters += "&from=" + @from_timestamp.to_s
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters += "&to=" + @to_timestamp.to_s
+      parameters.push(["to", @to_timestamp.to_s])
     end
     if page_index
-      parameters += "&page=#{page_index}"
+      parameters.push(["page", page_index])
     end
     parameters
   end

data/lib/wayback_machine_downloader/tidy_bytes.rb CHANGED Viewed

@@ -70,7 +70,7 @@ module TibyBytes
         if is_unused || is_restricted
           bytes[i] = tidy_byte(byte)
         elsif is_cont
-          # Not expecting contination byte? Clean up. Otherwise, now expect one less.
+          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
           conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
         else
           if conts_expected > 0

data/lib/wayback_machine_downloader.rb CHANGED Viewed

@@ -14,9 +14,9 @@ class WaybackMachineDownloader
   include ArchiveAPI
-  VERSION = "2.1.1"
+  VERSION = "2.3.1"
-  attr_accessor :base_url, :exact_url, :directory,
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
     :all, :maximum_pages, :threads_count
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
     @base_url = params[:base_url]
     @exact_url = params[:exact_url]
     @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
     @only_filter = params[:only_filter]
@@ -83,7 +84,7 @@ class WaybackMachineDownloader
     # Note: Passing a page index parameter allow us to get more snapshots,
     # but from a less fresh index
     print "Getting snapshot pages"
-    snapshot_list_to_consider = ""
+    snapshot_list_to_consider = []
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
     unless @exact_url
@@ -94,17 +95,15 @@ class WaybackMachineDownloader
         print "."
       end
     end
-    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
     puts
     snapshot_list_to_consider
   end
   def get_file_list_curated
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.each_line do |line|
-      next unless line.include?('/')
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = CGI::unescape file_id
       file_id = file_id.tidy_bytes unless file_id == ""
@@ -127,22 +126,61 @@ class WaybackMachineDownloader
     file_list_curated
   end
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
   def get_file_list_by_timestamp
-    file_list_curated = get_file_list_curated
-    file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
-    file_list_curated.map do |file_remote_info|
-      file_remote_info[1][:file_id] = file_remote_info[0]
-      file_remote_info[1]
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
     end
   end
   def list_files
     # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
     files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
     puts "["
-    files.each do |file|
+    files[0...-1].each do |file|
       puts file.to_json + ","
     end
+    puts files[-1].to_json
     puts "]"
   end
@@ -222,6 +260,7 @@ class WaybackMachineDownloader
       file_path = backup_path + file_path_elements[0..-1].join('/')
     end
     if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
     unless File.exist? file_path
@@ -229,7 +268,7 @@ class WaybackMachineDownloader
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
-            open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
+            URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.1.1
+  version: 2.3.1
 platform: ruby
 authors:
 - hartator
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-06-12 00:00:00.000000000 Z
+date: 2021-09-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.5.2
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []