wayback_machine_downloader 1.0.0 → 1.1.0

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a09bb38fc8d8a248fe84bd05eda03a92b49cc16e
-  data.tar.gz: 13d4bbb5e2041c3414a568afdbe719e475e57039
+  metadata.gz: 88515e4c80ca24904b31eb4261287894176a35a4
+  data.tar.gz: b7d82944989a68cb64c84fc78bc7bbbcce15d58b
 SHA512:
-  metadata.gz: 7070199a5c9935b6617d7c5a390756ca517acbbb28a193619e192a8c31b6b3cdaa83bbbbae3fbebfb4ce379af4bd2a0e12dd7969739b6cbcf00f357a997a9ded
-  data.tar.gz: b17716daad7329c37ff9a2e876bf443aed97878c69e41ac0143c9040955f67269070f5193f26ac7093050e9edc9def8084223c02805fdf3ebf38ba53b18bf554
+  metadata.gz: 3567625f16ff6b38ce3da0b0e720f1a20a16ee2aef34ab5085eaa320dba72f7fad1e42e644704af6ff70506c24fc778fd690a2268386eb226e43010bf7724029
+  data.tar.gz: cd81b5a1b75a1e077806966c9d253d171167d81c09b49cfcdde8e315193142907a49a6d99e062ee4925aeaf9885621d2195d5d358c9e9e0f1bf2ae4ef55df26f
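These published checksums can be re-derived locally. A minimal sketch in Ruby, assuming the .gem archive has already been fetched and unpacked (a .gem file is a plain tar containing metadata.gz and data.tar.gz, so e.g. tar -xf wayback_machine_downloader-1.1.0.gem first; the local paths are hypothetical):

    require 'digest'

    # Hash the two members extracted from the .gem archive and compare the
    # output against the SHA1/SHA512 entries in checksums.yaml above.
    puts Digest::SHA1.file('metadata.gz').hexdigest
    puts Digest::SHA512.file('data.tar.gz').hexdigest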
bin/wayback_machine_downloader CHANGED
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
   opts.separator ""
   opts.separator "Optional options:"
 
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into\n\t\t\t\t Default is ./websites/ plus the domain name") do |t|
     options[:directory] = t
   end
 
@@ -26,11 +26,11 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
 
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:exclude_filter] = t
   end
 
@@ -38,11 +38,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time\n\t\t\t\t Default is one file at a time (ie. 20)") do |t|
    options[:threads_count] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)\n\t\t\t\t Count an average of 150,000 snapshots per page ") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
     options[:list] = true
   end
 
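The new -p/--maximum-snapshot flag populates options[:maximum_pages], which WaybackMachineDownloader#initialize picks up further down in this diff. A minimal sketch of the equivalent programmatic call, using only parameters visible in this diff (the base_url value is a placeholder):

    require 'wayback_machine_downloader'

    # maximum_pages caps how many CDX result pages are fetched;
    # it falls back to 100 when omitted (see initialize below).
    downloader = WaybackMachineDownloader.new(
      base_url: 'http://example.com',
      maximum_pages: 5,
      threads_count: 1
    )
    downloader.download_files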
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -1,20 +1,20 @@
 module ArchiveAPI
 
-  def get_raw_list_from_api url
+  def get_raw_list_from_api url, page_index
     request_url = "http://web.archive.org/cdx/search/cdx?url="
     request_url += url
-    request_url += parameters_for_api
-    request_uri = URI.parse request_url
-    response = Net::HTTP.get_response request_uri
-    response.body
+    request_url += parameters_for_api page_index
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
   end
 
-  def parameters_for_api
-    parameters = "&fl=timestamp,original&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
-      parameters += "&collapse=digest"
+      parameters += ""
     else
-      parameters += "&filter=statuscode:200&collapse=original"
+      parameters += "&filter=statuscode:200"
     end
     if @from_timestamp and @from_timestamp != 0
       parameters += "&from=" + @from_timestamp.to_s
@@ -22,6 +22,9 @@ module ArchiveAPI
     if @to_timestamp and @to_timestamp != 0
       parameters += "&to=" + @to_timestamp.to_s
     end
+    if page_index
+      parameters += "&page=#{page_index}"
+    end
     parameters
   end
 
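Taken together, get_raw_list_from_api now issues one CDX request per page. For page 0 of a default (non --all) run, the URL assembled above comes out as follows, with example.com standing in for the real domain:

    http://web.archive.org/cdx/search/cdx?url=example.com/*&fl=timestamp,original&collapse=digest&gzip=false&filter=statuscode:200&page=0

Note the guard is plain "if page_index": in Ruby only nil and false are falsy, so page 0 still appends &page=0; only the initial call that passes page_index = nil omits the parameter.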
lib/wayback_machine_downloader.rb CHANGED
@@ -13,9 +13,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
+  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
@@ -26,6 +26,7 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @list = params[:list]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
 
@@ -75,32 +76,44 @@ class WaybackMachineDownloader
     end
   end
 
+  def get_all_snapshots_to_consider
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = ""
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+    print "."
+    @maximum_pages.times do |page_index|
+      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+      break if snapshot_list.empty?
+      snapshot_list_to_consider += snapshot_list
+      print "."
+    end
+    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
   def get_file_list_curated
-    index_file_list_raw = get_raw_list_from_api(@base_url)
-    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
-    [index_file_list_raw, all_file_list_raw].each do |file|
-      file.each_line do |line|
-        next unless line.include?('/')
-        file_timestamp = line[0..13].to_i
-        file_url = line[15..-2]
-        file_id = file_url.split('/')[3..-1].join('/')
-        file_id = CGI::unescape file_id
-        file_id = file_id.tidy_bytes unless file_id == ""
-        if file_id.nil?
-          puts "Malformed file url, ignoring: #{file_url}"
-        else
-          if match_exclude_filter(file_url)
-            puts "File url matches exclude filter, ignoring: #{file_url}"
-          elsif not match_only_filter(file_url)
-            puts "File url doesn't match only filter, ignoring: #{file_url}"
-          elsif file_list_curated[file_id]
-            unless file_list_curated[file_id][:timestamp] > file_timestamp
-              file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
-            end
-          else
+    get_all_snapshots_to_consider.each_line do |line|
+      next unless line.include?('/')
+      file_timestamp = line[0..13].to_i
+      file_url = line[15..-2]
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
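The per-line parsing above depends on the fixed layout of a CDX row requested via &fl=timestamp,original: a 14-digit timestamp, one space, then the original URL. A short sketch with a made-up row:

    # Hypothetical CDX row (timestamp, space, original URL, trailing newline)
    line = "20160918012345 http://example.com/assets/app.js\n"

    line[0..13].to_i    # => 20160918012345, the first 14 characters
    line[15..-2]        # => "http://example.com/assets/app.js", newline dropped
    "http://example.com/assets/app.js".split('/')[3..-1].join('/')
                        # => "assets/app.js", scheme and host removed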
@@ -126,7 +139,7 @@ class WaybackMachineDownloader
 
   def download_files
     start_time = Time.now
-    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts
 
    if file_list_by_timestamp.count == 0
@@ -139,6 +152,8 @@ class WaybackMachineDownloader
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end
+
+    puts "#{file_list_by_timestamp.count} files to download:"
 
    threads = []
    @processed_file_count = 0
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-09-18 00:00:00.000000000 Z
+date: 2016-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake