RubyGems - wayback_machine_downloader - Versions diffs - 2.0.0 → 2.1.0 - Mend

wayback_machine_downloader 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/bin/wayback_machine_downloader +7 -3
data/lib/wayback_machine_downloader.rb +20 -15
data/lib/wayback_machine_downloader/archive_api.rb +7 -7
data/lib/wayback_machine_downloader/tidy_bytes.rb +3 -3
data/lib/wayback_machine_downloader/to_regex.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e2132b28dea0a03978384a3b337b1107562e644a
-  data.tar.gz: e8b6421b78d02505a8498c79cd1761ebb28a3290
+  metadata.gz: 520f637efbb03d1e3ac87aadf1a937cc132f6c32
+  data.tar.gz: 90ec9079f5420153e1b7c149ce5aedb45bd4ba2c
 SHA512:
-  metadata.gz: d1d0944e9593aadc02db950aa9826491d727f93c6185f23aac20b24b48da086ed67f2be76d91184d2709610b84160c2665ca4b30bcddfcd4981b6840c988e1d0
-  data.tar.gz: aa40fb4da67241e972c86631b9390703ec77643b17b7f62ae2cfbffe49f276ff6f77d901aac87df66aca94d6da7ec3d230ceb001aa9b47f697b5ef1c98b4194f
+  metadata.gz: 6009139fbad22b7e269582d905956147df5cec565e376cbf1bdbb1125bb906fdea31c6501b3dfd9fa1b35d730444c976bcf187b8c869e8ed7db146ea155ba8fb
+  data.tar.gz: b4cff49e64c3ec528b544184e1426b76d1284c7f97ee9c787f370cc305b652f328c53ac48d3fc9bd9c9405b26edceb7a888e7f4a8f061ee704608e95a0642108

data/bin/wayback_machine_downloader CHANGED Viewed

@@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
+  opts.on("-e", "--exact_url", String, "Download only the url provied and not the full site") do |t|
+    options[:only_filter] = t
+  end
   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
@@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts|
     options[:threads_count] = t
   end
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t|
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
     options[:maximum_pages] = t
   end
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
     options[:list] = true
   end
@@ -58,7 +62,7 @@ end.parse!
 if (base_url = ARGV[-1])
   options[:base_url] = base_url
   wayback_machine_downloader = WaybackMachineDownloader.new options
-  if wayback_machine_downloader.list
+  if options[:list]
     wayback_machine_downloader.list_files
   else
     wayback_machine_downloader.download_files

data/lib/wayback_machine_downloader.rb CHANGED Viewed

@@ -14,19 +14,21 @@ class WaybackMachineDownloader
   include ArchiveAPI
-  VERSION = "2.0.0"
+  VERSION = "2.1.0"
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
+  attr_accessor :base_url, :exact_url, :directory,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count
   def initialize params
     @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
     @directory = params[:directory]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
-    @list = params[:list]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
@@ -78,18 +80,19 @@ class WaybackMachineDownloader
   end
   def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
+    # Note: Passing a page index parameter allow us to get more snapshots,
+    # but from a less fresh index
     print "Getting snapshot pages"
     snapshot_list_to_consider = ""
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
-    snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
-    print "."
-    @maximum_pages.times do |page_index|
-      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
-      break if snapshot_list.empty?
-      snapshot_list_to_consider += snapshot_list
-      print "."
+    unless @exact_url
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
     end
     puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
     puts
@@ -134,8 +137,10 @@ class WaybackMachineDownloader
   end
   def list_files
+    # retrieval produces its own output
+    files = get_file_list_by_timestamp
     puts "["
-    get_file_list_by_timestamp.each do |file|
+    files.each do |file|
       puts file.to_json + ","
     end
     puts "]"
@@ -179,7 +184,7 @@ class WaybackMachineDownloader
   def structure_dir_path dir_path
     begin
-      FileUtils::mkdir_p dir_path unless File.exists? dir_path
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
     rescue Errno::EEXIST => e
       error_to_string = e.to_s
       puts "# #{error_to_string}"
@@ -219,7 +224,7 @@ class WaybackMachineDownloader
     if Gem.win_platform?
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.exists? file_path
+    unless File.exist? file_path
       begin
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
@@ -240,7 +245,7 @@ class WaybackMachineDownloader
       rescue StandardError => e
         puts "#{file_url} # #{e}"
       ensure
-        if not @all and File.exists?(file_path) and File.size(file_path) == 0
+        if not @all and File.exist?(file_path) and File.size(file_path) == 0
           File.delete(file_path)
           puts "#{file_path} was empty and was removed."
         end

data/lib/wayback_machine_downloader/archive_api.rb CHANGED Viewed

@@ -1,15 +1,15 @@
 module ArchiveAPI
-	def get_raw_list_from_api url, page_index
-		request_url = "http://web.archive.org/cdx/search/xd?url="
-		request_url += url
-		request_url += parameters_for_api page_index
+  def get_raw_list_from_api url, page_index
+    request_url = "http://web.archive.org/cdx/search/xd?url="
+    request_url += url
+    request_url += parameters_for_api page_index
     open(request_url).read
-	end
+  end
-	def parameters_for_api page_index
-		parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
       parameters += ""
     else

data/lib/wayback_machine_downloader/tidy_bytes.rb CHANGED Viewed

@@ -60,7 +60,7 @@ module TibyBytes
       bytes.each_index do |i|
         byte          = bytes[i]
-        is_ascii      = byte < 128
+        _is_ascii     = byte < 128
         is_cont       = byte > 127 && byte < 192
         is_lead       = byte > 191 && byte < 245
         is_unused     = byte > 240
@@ -78,7 +78,7 @@ module TibyBytes
             # the leading byte.
             begin
               (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-            rescue NoMethodError => e
+            rescue NoMethodError
               next
             end
             conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
       end
       begin
         bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
-      rescue ArgumentError => e
+      rescue ArgumentError
         nil
       end
     end

data/lib/wayback_machine_downloader/to_regex.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module ToRegex
     # @option options [true,false] :lang /foo/[nesu]
     def to_regex(options = {})
       if args = as_regexp(options)
-        ::Regexp.new *args
+        ::Regexp.new(*args)
       end
     end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.0.0
+  version: 2.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-06-10 00:00:00.000000000 Z
+date: 2017-06-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake