wayback_machine_downloader 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +2 -5
- data/lib/wayback_machine_downloader.rb +43 -15
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8532168d675aff20ea6578d90cbf8c1087cdbd9a
+  data.tar.gz: e4476b5c8504a2466b42be02c5a1fefe7e898c95
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3527721f2675c6aba366c88b5a70da8b85615d9beefc9db51403a607040d8dfa012ba9f58bb202d6f0b6d5f8b8b4316ee892d8cbc588153da72086df14bbd509
+  data.tar.gz: 240f40fbca00affa948adfcecc67fa8426888c166c926b5b890761502426baf1b4a5f93c12c6a7b64847a4e0276c36cf1e8f3766943c61ac2a3e279ea913b6fd
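To check a locally fetched copy of the gem against these published digests, Ruby's standard Digest library is sufficient; the artifact names match the entries above, but the fetch commands and working directory are assumptions for illustration.

```ruby
require 'digest'

# Assumes the gem was fetched and unpacked first, e.g.:
#   gem fetch wayback_machine_downloader -v 0.1.5
#   tar -xf wayback_machine_downloader-0.1.5.gem
%w[metadata.gz data.tar.gz].each do |artifact|
  puts "#{artifact} SHA1:   #{Digest::SHA1.file(artifact).hexdigest}"
  puts "#{artifact} SHA512: #{Digest::SHA512.file(artifact).hexdigest}"
end
```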
data/bin/wayback_machine_downloader
CHANGED

@@ -2,14 +2,13 @@
 
 require 'wayback_machine_downloader'
 require 'optparse'
-require 'pry-rescue'
 
 options = {}
 option_parser = OptionParser.new do |opts|
   opts.banner = "Usage: wayback_machine_downloader http://example.com"
 
   opts.separator ""
-  opts.separator "Download
+  opts.separator "Download any website from the Wayback Machine."
 
   opts.separator ""
   opts.separator "Optional option:"
@@ -21,10 +20,8 @@ end.parse!
 
 if base_url = ARGV[0]
   wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
-  puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
-  binding.pry
   wayback_machine_downloader.download_files
 else
-  puts "You need to specify a
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
   puts "Run `wayback_machine_downloader --help` for more help."
 end
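With `pry-rescue` and `binding.pry` removed, the executable now only parses options, builds the downloader, and calls `#download_files` (the progress message moves into the library, as the next file shows). A minimal programmatic equivalent of what the script does, with a placeholder URL:

```ruby
require 'wayback_machine_downloader'

# Same call the executable makes; timestamp is optional and, as of this
# release, coerced with #to_i (nil becomes 0, which means "no cutoff").
downloader = WaybackMachineDownloader.new base_url: 'http://example.com',
                                          timestamp: nil
downloader.download_files
```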
data/lib/wayback_machine_downloader.rb
CHANGED

@@ -3,11 +3,13 @@ require 'fileutils'
 
 class WaybackMachineDownloader
 
+  VERSION = "0.1.5"
+
   attr_accessor :base_url, :timestamp
 
   def initialize params
     @base_url = params[:base_url]
-    @timestamp = params[:timestamp]
+    @timestamp = params[:timestamp].to_i
   end
 
   def backup_name
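The `.to_i` coercion on the constructor argument feeds the snapshot filter added in the next hunk: a missing timestamp becomes `0`, which that filter treats as "no cutoff", while a string such as "20130101" becomes an integer that can be compared against CDX timestamps. Plain Ruby, just to illustrate the coercion:

```ruby
nil.to_i                                 # => 0, i.e. no cutoff was given
"20130101".to_i                          # => 20130101
"20130101000000".to_i <= 20140101000000  # => true, snapshot is old enough to keep
```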
@@ -18,58 +20,84 @@ class WaybackMachineDownloader
     'websites/' + backup_name + '/'
   end
 
-  def
+  def get_file_list_curated
     file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
     file_list_curated = Hash.new
     file_list_raw.each_line do |line|
       line = line.split(' ')
-
+      file_timestamp = line[1].to_i
       file_url = line[2]
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = URI.unescape file_id
-      if
-
-      file_list_curated[file_id]
+      if @timestamp == 0 or file_timestamp <= @timestamp
+        if file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
+            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
         end
-      else
-        file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
       end
     end
     file_list_curated
   end
 
   def download_files
+    puts "Downlading #{@base_url} from Wayback Machine..."
+    puts
+    file_list_curated = get_file_list_curated
+    count = 0
     file_list_curated.each do |file_id, file_remote_info|
-
+      count += 1
       file_url = file_remote_info[:file_url]
       file_path_elements = file_id.split('/')
      if file_id == ""
         dir_path = backup_path
         file_path = backup_path + 'index.html'
-      elsif file_url[-1] == '/'
+      elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
         dir_path = backup_path + file_path_elements[0..-1].join('/')
-        file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
+        file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
       else
         dir_path = backup_path + file_path_elements[0..-2].join('/')
         file_path = backup_path + file_path_elements[0..-1].join('/')
       end
       unless File.exists? file_path
-
+        structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
             open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
-            puts "#{file_url} #
+            puts "#{file_url} # #{e}"
             file.write(e.io.read)
+          rescue Exception => e
+            puts "#{file_url} # #{e}"
           end
         end
-        puts "#{file_url} -> #{file_path}"
+        puts "#{file_url} -> #{file_path} (#{count}/#{file_list_curated.size})"
       else
-        puts "#{file_url} # #{file_path} already exists."
+        puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_curated.size})"
       end
     end
+    puts
+    puts "Download complete, saved in #{backup_path}. (#{file_list_curated.size} files)"
+  end
+
+  def structure_dir_path dir_path
+    begin
+      FileUtils::mkdir_p dir_path unless File.exists? dir_path
+    rescue Errno::EEXIST => e
+      puts "# #{e}"
+      file_already_existing = e.to_s.split("File exists @ dir_s_mkdir - ")[-1]
+      file_already_existing_temporary = file_already_existing + '.temp'
+      file_already_existing_permanent = file_already_existing + '/index.html'
+      FileUtils::mv file_already_existing, file_already_existing_temporary
+      FileUtils::mkdir_p file_already_existing
+      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+      structure_dir_path dir_path
+    end
   end
 
 end
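Taken together, the new `get_file_list_curated` keeps, for each path, the newest snapshot whose CDX timestamp does not exceed the cutoff (a cutoff of 0 means no limit), and `download_files` then requests each file through a `http://web.archive.org/web/<timestamp>id_/<url>` URL, where the `id_` suffix asks the Wayback Machine for the original bytes rather than the rewritten page. A self-contained sketch of that selection rule over made-up CDX rows (the sample data and variable names are illustrative, not part of the gem):

```ruby
# Each CDX row looks like "urlkey timestamp original ..."; only the
# timestamp (field 1) and original URL (field 2) matter here.
rows = [
  "com,example)/about 20120101000000 http://example.com/about",
  "com,example)/about 20130601000000 http://example.com/about",
  "com,example)/about 20150101000000 http://example.com/about"
]
cutoff = 20140101000000 # 0 would mean "keep every snapshot"

curated = {}
rows.each do |row|
  fields = row.split(' ')
  ts, url = fields[1].to_i, fields[2]
  next unless cutoff == 0 || ts <= cutoff
  # Keep the newest snapshot seen so far for this URL, mirroring the
  # unless/else branches in get_file_list_curated.
  curated[url] = { file_url: url, timestamp: ts } if curated[url].nil? || curated[url][:timestamp] <= ts
end

curated.each_value do |info|
  puts "#{info[:file_url]} @ #{info[:timestamp]}"
  # => http://example.com/about @ 20130601000000 (newest snapshot at or before the cutoff)
end
```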