RubyGems - wayback_machine_downloader - Versions diffs - 0.1.2 → 0.1.3 - Mend

wayback_machine_downloader 0.1.2 → 0.1.3

Files changed (4)

checksums.yaml +4 -4
data/bin/wayback_machine_downloader +27 -1
data/lib/wayback_machine_downloader.rb +53 -6
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 77db4ae324d457ea724d2316c75e2221b4971279
-  data.tar.gz: f274b7de7e8b2948a5eb2b9bd207ecd5da5bf832
+  metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
+  data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
 SHA512:
-  metadata.gz: f78bf1585bb402a71e6970b084085369c836168379003555d61525eb19bb35ed7acb34510e36ea52611cf417b600e175f345ce1317e1db4c8c53684414d37557
-  data.tar.gz: f2f14bf88f8e8726c48e850ec91eea1a9055e8aabf6a569e1972cd3b67cc69a094ba1c7603b4178378d3a7cd3fdec99930382c5396c4ad2fbcb21c9b45a73e03
+  metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
+  data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd

data/bin/wayback_machine_downloader CHANGED

@@ -1,4 +1,30 @@
 #!/usr/bin/env ruby
 require 'wayback_machine_downloader'
-puts WaybackMachineDownloader.hi(ARGV[0])
+require 'optparse'
+require 'pry-rescue'
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+  opts.separator ""
+  opts.separator "Download a website from Wayback Machine."
+  opts.separator ""
+  opts.separator "Optional option:"
+  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
+    options[:timestamp] = t
+  end
+end.parse!
+if base_url = ARGV[0]
+  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
+  puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
+  binding.pry
+  wayback_machine_downloader.download_files
+else
+  puts "You need to specify a websire to backup. (ie. http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end

data/lib/wayback_machine_downloader.rb CHANGED

@@ -1,28 +1,75 @@
 require 'open-uri'
+require 'fileutils'
 class WaybackMachineDownloader
-  attr_accessor :base_url
+  attr_accessor :base_url, :timestamp
   def initialize params
     @base_url = params[:base_url]
+    @timestamp = params[:timestamp]
+  end
+  def backup_name
+    @base_url.split('/')[2]
+  end
+  def backup_path
+    'websites/' + backup_name + '/'
   end
   def file_list_curated
-    file_list_raw = open "http://web.archive.org/web/*/#{@base_url}/*"
+    file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
     file_list_curated = Hash.new
     file_list_raw.each_line do |line|
       line = line.split(' ')
       timestamp = line[1].to_i
       file_url = line[2]
-      if file_list_curated[file_url]
-        unless file_list_curated[file_url] > timestamp
-          file_list_curated[file_url] = timestamp
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = URI.unescape file_id
+      if file_list_curated[file_id]
+        unless file_list_curated[file_id][:timestamp] > timestamp
+          file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
         end
       else
-        file_list_curated[file_url] = timestamp
+        file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
       end
     end
     file_list_curated
   end
+  def download_files
+    file_list_curated.each do |file_id, file_remote_info|
+      timestamp = file_remote_info[:timestamp]
+      file_url = file_remote_info[:file_url]
+      file_path_elements = file_id.split('/')
+      if file_id == ""
+        dir_path = backup_path
+        file_path = backup_path + 'index.html'
+      elsif file_url[-1] == '/'
+        dir_path = backup_path + file_path_elements[0..-1].join('/')
+        file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
+      else
+        dir_path = backup_path + file_path_elements[0..-2].join('/')
+        file_path = backup_path + file_path_elements[0..-1].join('/')
+      end
+      unless File.exists? file_path
+        FileUtils::mkdir_p dir_path unless File.exists? dir_path
+        open(file_path, "wb") do |file|
+          begin
+            open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
+              file.write(uri.read)
+            end
+          rescue OpenURI::HTTPError => e
+            puts "#{file_url} # 404"
+            file.write(e.io.read)
+          end
+        end
+        puts "#{file_url} -> #{file_path}"
+      else
+        puts "#{file_url} # #{file_path} already exists."
+      end
+    end
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - hartator