wayback_machine_downloader 2.0.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
+  data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
+  data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
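The new checksums above can be re-derived from a local copy of the gem. A minimal Ruby sketch, not part of the package (the .gem file name and path are assumptions):

  require 'digest'
  require 'rubygems/package'

  # A .gem file is a tar archive whose members include metadata.gz and data.tar.gz;
  # hashing those members should reproduce the SHA256 values listed above.
  tar = Gem::Package::TarReader.new(File.open('wayback_machine_downloader-2.3.0.gem', 'rb'))
  tar.each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end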
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
     options[:directory] = t
   end
 
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
     options[:from_timestamp] = t
   end
@@ -26,6 +30,10 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
+  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
@@ -38,15 +46,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
     options[:maximum_pages] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
     options[:list] = true
   end
 
@@ -58,7 +66,7 @@ end.parse!
 if (base_url = ARGV[-1])
   options[:base_url] = base_url
   wayback_machine_downloader = WaybackMachineDownloader.new options
-  if
+  if options[:list]
     wayback_machine_downloader.list_files
   else
     wayback_machine_downloader.download_files
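For reference, the two new flags simply add keys to the options hash handed to WaybackMachineDownloader, exactly as the existing flags do. A minimal sketch of the equivalent library call (the example URL is hypothetical; the require path is the gem's main lib file):

  require 'wayback_machine_downloader'

  options = {
    base_url: 'http://example.com',   # positional ARGV[-1] in the executable
    all_timestamps: true,             # set by the new -s / --all-timestamps flag
    exact_url: false,                 # set (to true) by the new -e / --exact-url flag
    list: false                       # -l / --list
  }

  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end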
@@ -14,19 +14,22 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.
+  VERSION = "2.3.0"
 
-  attr_accessor :base_url, :
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
     @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
-    @list = params[:list]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
@@ -78,30 +81,29 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots,
+    # Note: Passing a page index parameter allow us to get more snapshots,
+    # but from a less fresh index
     print "Getting snapshot pages"
-    snapshot_list_to_consider =
+    snapshot_list_to_consider = []
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
-
-
-
-
-
-
-
+    unless @exact_url
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
     end
-    puts " found #{snapshot_list_to_consider.
+    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
     puts
     snapshot_list_to_consider
   end
 
   def get_file_list_curated
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.
-      next unless
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = CGI::unescape file_id
       file_id = file_id.tidy_bytes unless file_id == ""
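Each entry yielded by get_all_snapshots_to_consider is a timestamp/URL pair coming from the CDX API response (see the ArchiveAPI changes further down). Illustratively, an entry looks something like this; the values are hypothetical:

  # Illustrative values only; real entries come from the Wayback Machine CDX API.
  snapshot = ["20060716231334", "http://example.com:80/index.html"]
  file_timestamp, file_url = snapshot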
@@ -124,20 +126,61 @@ class WaybackMachineDownloader
     file_list_curated
   end
 
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
+
+
   def get_file_list_by_timestamp
-
-
-
-
-
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
     end
   end
 
   def list_files
+    # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
+    files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
     puts "["
-
+    files[0...-1].each do |file|
       puts file.to_json + ","
     end
+    puts files[-1].to_json
     puts "]"
   end
 
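With this change, --list prints the last entry without a trailing comma, so the listing is valid JSON, and progress messages go to stderr instead of polluting it. The output shape is roughly the following (example URLs and timestamps, not taken from the diff):

  [
  {"file_url":"http://example.com/index.html","timestamp":"20060716231334","file_id":"index.html"},
  {"file_url":"http://example.com/style.css","timestamp":"20060716231334","file_id":"style.css"}
  ]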
@@ -179,7 +222,7 @@ class WaybackMachineDownloader
 
   def structure_dir_path dir_path
     begin
-      FileUtils::mkdir_p dir_path unless File.
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
     rescue Errno::EEXIST => e
       error_to_string = e.to_s
       puts "# #{error_to_string}"
@@ -217,14 +260,15 @@ class WaybackMachineDownloader
       file_path = backup_path + file_path_elements[0..-1].join('/')
     end
     if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.
+    unless File.exist? file_path
       begin
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
-            open("
+            URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
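The new URI.open call is open-uri's supported entry point (the bare Kernel#open form was deprecated for URLs in newer Rubies), and the "id_" URL form asks the Wayback Machine for the original response body without its replay toolbar. A standalone sketch of that single-file fetch, with hypothetical values:

  require 'open-uri'

  file_timestamp = 20060716231334                 # hypothetical snapshot timestamp
  file_url = 'http://example.com/index.html'      # hypothetical archived URL

  URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
    File.binwrite('index.html', uri.read)
  end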
@@ -240,7 +284,7 @@ class WaybackMachineDownloader
     rescue StandardError => e
       puts "#{file_url} # #{e}"
     ensure
-      if not @all and File.
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         puts "#{file_path} was empty and was removed."
       end
@@ -1,28 +1,38 @@
+require 'json'
+require 'uri'
+
 module ArchiveAPI
 
-
-
-
-
+  def get_raw_list_from_api url, page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
 
-
-
+    begin
+      json = JSON.parse(URI(request_url).open.read)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
+  end
 
-
-
-    if
-      parameters
-    else
-      parameters += "&filter=statuscode:200"
+  def parameters_for_api page_index
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters
+      parameters.push(["to", @to_timestamp.to_s])
     end
     if page_index
-      parameters
+      parameters.push(["page", page_index])
     end
     parameters
   end
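The rewritten ArchiveAPI builds its query with URI.encode_www_form instead of string concatenation, which handles escaping. A sketch of the URL produced for the first page with the default status filter (example host only, outside the gem):

  require 'uri'

  request_url = URI("https://web.archive.org/cdx/search/xd")
  params = [["output", "json"], ["url", "example.com"]]
  params += [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"],
             ["filter", "statuscode:200"], ["page", 0]]
  request_url.query = URI.encode_www_form(params)
  puts request_url
  # => https://web.archive.org/cdx/search/xd?output=json&url=example.com&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200&page=0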
@@ -60,7 +60,7 @@ module TibyBytes
     bytes.each_index do |i|
 
       byte = bytes[i]
-
+      _is_ascii = byte < 128
       is_cont = byte > 127 && byte < 192
       is_lead = byte > 191 && byte < 245
       is_unused = byte > 240
@@ -70,7 +70,7 @@ module TibyBytes
       if is_unused || is_restricted
         bytes[i] = tidy_byte(byte)
       elsif is_cont
-        # Not expecting
+        # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
        conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
       else
         if conts_expected > 0
@@ -78,7 +78,7 @@ module TibyBytes
           # the leading byte.
           begin
             (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-          rescue NoMethodError
+          rescue NoMethodError
             next
           end
           conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
     end
     begin
       bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
-    rescue ArgumentError
+    rescue ArgumentError
       nil
     end
   end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.3.0
 platform: ruby
 authors:
 - hartator
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-
-
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []