wayback_machine_downloader 1.0.0 → 1.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a09bb38fc8d8a248fe84bd05eda03a92b49cc16e
-  data.tar.gz: 13d4bbb5e2041c3414a568afdbe719e475e57039
+  metadata.gz: 88515e4c80ca24904b31eb4261287894176a35a4
+  data.tar.gz: b7d82944989a68cb64c84fc78bc7bbbcce15d58b
 SHA512:
-  metadata.gz: 7070199a5c9935b6617d7c5a390756ca517acbbb28a193619e192a8c31b6b3cdaa83bbbbae3fbebfb4ce379af4bd2a0e12dd7969739b6cbcf00f357a997a9ded
-  data.tar.gz: b17716daad7329c37ff9a2e876bf443aed97878c69e41ac0143c9040955f67269070f5193f26ac7093050e9edc9def8084223c02805fdf3ebf38ba53b18bf554
+  metadata.gz: 3567625f16ff6b38ce3da0b0e720f1a20a16ee2aef34ab5085eaa320dba72f7fad1e42e644704af6ff70506c24fc778fd690a2268386eb226e43010bf7724029
+  data.tar.gz: cd81b5a1b75a1e077806966c9d253d171167d81c09b49cfcdde8e315193142907a49a6d99e062ee4925aeaf9885621d2195d5d358c9e9e0f1bf2ae4ef55df26f
bin/wayback_machine_downloader CHANGED
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
   opts.separator ""
   opts.separator "Optional options:"
 
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into\n\t\t\t\t Default is ./websites/ plus the domain name") do |t|
     options[:directory] = t
   end
 
@@ -26,11 +26,11 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
 
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:exclude_filter] = t
   end
 
@@ -38,11 +38,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time. Default is one file at a time. (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time\n\t\t\t\t Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)\n\t\t\t\t Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
     options[:list] = true
   end
 
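For illustration, a hedged example of the new flag in use (the domain and the numbers are placeholder values, not from this diff): capping the snapshot crawl at 10 CDX pages while downloading 20 files at a time.

  wayback_machine_downloader http://example.com --maximum-snapshot 10 --concurrency 20
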
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -1,20 +1,20 @@
 module ArchiveAPI
 
-  def get_raw_list_from_api url
+  def get_raw_list_from_api url, page_index
     request_url = "http://web.archive.org/cdx/search/cdx?url="
     request_url += url
-    request_url += parameters_for_api
-    request_uri = URI.parse request_url
-    response = Net::HTTP.get_response request_uri
-    response.body
+    request_url += parameters_for_api page_index
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
   end
 
-  def parameters_for_api
-    parameters = "&fl=timestamp,original&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
-      parameters += "&collapse=digest"
+      parameters += ""
     else
-      parameters += "&filter=statuscode:200&collapse=original"
+      parameters += "&filter=statuscode:200"
     end
     if @from_timestamp and @from_timestamp != 0
       parameters += "&from=" + @from_timestamp.to_s
@@ -22,6 +22,9 @@ module ArchiveAPI
     if @to_timestamp and @to_timestamp != 0
       parameters += "&to=" + @to_timestamp.to_s
     end
+    if page_index
+      parameters += "&page=#{page_index}"
+    end
     parameters
   end
 
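For illustration, a minimal Ruby sketch (assumed values; example.com and the page index 2 are placeholders) of the CDX request these two methods assemble after this change:

  # Sketch only: extend a bare object with ArchiveAPI (assumed already loaded)
  # and set the instance variables the module reads.
  client = Object.new
  client.extend(ArchiveAPI)
  client.instance_variable_set(:@all, false)            # keep the statuscode:200 filter
  client.instance_variable_set(:@from_timestamp, nil)
  client.instance_variable_set(:@to_timestamp, nil)
  puts "http://web.archive.org/cdx/search/cdx?url=example.com/*" +
       client.parameters_for_api(2)
  # => http://web.archive.org/cdx/search/cdx?url=example.com/*&fl=timestamp,original&collapse=digest&gzip=false&filter=statuscode:200&page=2
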
lib/wayback_machine_downloader.rb CHANGED
@@ -13,9 +13,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
+  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
@@ -26,6 +26,7 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @list = params[:list]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
 
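For illustration, a minimal sketch (assumed usage; the domain and the numbers are placeholders) of constructing the downloader with the new :maximum_pages option, which falls back to 100 when omitted:

  require 'wayback_machine_downloader'

  downloader = WaybackMachineDownloader.new(
    base_url: 'http://example.com',
    maximum_pages: 10,   # consider at most 10 CDX snapshot pages instead of the default 100
    threads_count: 1
  )
  downloader.download_files
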
@@ -75,32 +76,44 @@ class WaybackMachineDownloader
     end
   end
 
+  def get_all_snapshots_to_consider
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = ""
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+    print "."
+    @maximum_pages.times do |page_index|
+      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+      break if snapshot_list.empty?
+      snapshot_list_to_consider += snapshot_list
+      print "."
+    end
+    puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
   def get_file_list_curated
-    index_file_list_raw = get_raw_list_from_api(@base_url)
-    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
-    [index_file_list_raw, all_file_list_raw].each do |file|
-      file.each_line do |line|
-        next unless line.include?('/')
-        file_timestamp = line[0..13].to_i
-        file_url = line[15..-2]
-        file_id = file_url.split('/')[3..-1].join('/')
-        file_id = CGI::unescape file_id
-        file_id = file_id.tidy_bytes unless file_id == ""
-        if file_id.nil?
-          puts "Malformed file url, ignoring: #{file_url}"
-        else
-          if match_exclude_filter(file_url)
-            puts "File url matches exclude filter, ignoring: #{file_url}"
-          elsif not match_only_filter(file_url)
-            puts "File url doesn't match only filter, ignoring: #{file_url}"
-          elsif file_list_curated[file_id]
-            unless file_list_curated[file_id][:timestamp] > file_timestamp
-              file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
-            end
-          else
+    get_all_snapshots_to_consider.each_line do |line|
+      next unless line.include?('/')
+      file_timestamp = line[0..13].to_i
+      file_url = line[15..-2]
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
             file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
           end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
         end
       end
     end
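For illustration, a minimal sketch (the snapshot line is an invented example) of how the parsing above slices each CDX result line, given fl=timestamp,original:

  line = "20160918123456 http://example.com/assets/app.js\n"

  file_timestamp = line[0..13].to_i   # => 20160918123456, the 14-digit archive timestamp
  file_url = line[15..-2]             # => "http://example.com/assets/app.js" (drops the trailing newline)
  file_id = file_url.split('/')[3..-1].join('/')   # => "assets/app.js", the path after the host
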
@@ -126,7 +139,7 @@ class WaybackMachineDownloader
 
   def download_files
     start_time = Time.now
-    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
     puts
 
     if file_list_by_timestamp.count == 0
@@ -139,6 +152,8 @@ class WaybackMachineDownloader
       puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
+
+    puts "#{file_list_by_timestamp.count} files to download:"
 
     threads = []
     @processed_file_count = 0
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-09-18 00:00:00.000000000 Z
+date: 2016-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake