wayback_machine_downloader 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 77db4ae324d457ea724d2316c75e2221b4971279
4
- data.tar.gz: f274b7de7e8b2948a5eb2b9bd207ecd5da5bf832
3
+ metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
4
+ data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
5
5
  SHA512:
6
- metadata.gz: f78bf1585bb402a71e6970b084085369c836168379003555d61525eb19bb35ed7acb34510e36ea52611cf417b600e175f345ce1317e1db4c8c53684414d37557
7
- data.tar.gz: f2f14bf88f8e8726c48e850ec91eea1a9055e8aabf6a569e1972cd3b67cc69a094ba1c7603b4178378d3a7cd3fdec99930382c5396c4ad2fbcb21c9b45a73e03
6
+ metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
7
+ data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd
@@ -1,4 +1,30 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'wayback_machine_downloader'
4
- puts WaybackMachineDownloader.hi(ARGV[0])
4
+ require 'optparse'
5
+ require 'pry-rescue'
6
+
7
+ options = {}
8
+ option_parser = OptionParser.new do |opts|
9
+ opts.banner = "Usage: wayback_machine_downloader http://example.com"
10
+
11
+ opts.separator ""
12
+ opts.separator "Download a website from Wayback Machine."
13
+
14
+ opts.separator ""
15
+ opts.separator "Optional option:"
16
+
17
+ opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
18
+ options[:timestamp] = t
19
+ end
20
+ end.parse!
21
+
22
+ if base_url = ARGV[0]
23
+ wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
24
+ puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
25
+ binding.pry
26
+ wayback_machine_downloader.download_files
27
+ else
28
+ puts "You need to specify a websire to backup. (ie. http://example.com)"
29
+ puts "Run `wayback_machine_downloader --help` for more help."
30
+ end
@@ -1,28 +1,75 @@
1
1
  require 'open-uri'
2
+ require 'fileutils'
2
3
 
3
4
  class WaybackMachineDownloader
4
5
 
5
- attr_accessor :base_url
6
+ attr_accessor :base_url, :timestamp
6
7
 
7
8
  def initialize params
8
9
  @base_url = params[:base_url]
10
+ @timestamp = params[:timestamp]
11
+ end
12
+
13
+ def backup_name
14
+ @base_url.split('/')[2]
15
+ end
16
+
17
+ def backup_path
18
+ 'websites/' + backup_name + '/'
9
19
  end
10
20
 
11
21
  def file_list_curated
12
- file_list_raw = open "http://web.archive.org/web/*/#{@base_url}/*"
22
+ file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
13
23
  file_list_curated = Hash.new
14
24
  file_list_raw.each_line do |line|
15
25
  line = line.split(' ')
16
26
  timestamp = line[1].to_i
17
27
  file_url = line[2]
18
- if file_list_curated[file_url]
19
- unless file_list_curated[file_url] > timestamp
20
- file_list_curated[file_url] = timestamp
28
+ file_id = file_url.split('/')[3..-1].join('/')
29
+ file_id = URI.unescape file_id
30
+ if file_list_curated[file_id]
31
+ unless file_list_curated[file_id][:timestamp] > timestamp
32
+ file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
21
33
  end
22
34
  else
23
- file_list_curated[file_url] = timestamp
35
+ file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
24
36
  end
25
37
  end
26
38
  file_list_curated
27
39
  end
40
+
41
+ def download_files
42
+ file_list_curated.each do |file_id, file_remote_info|
43
+ timestamp = file_remote_info[:timestamp]
44
+ file_url = file_remote_info[:file_url]
45
+ file_path_elements = file_id.split('/')
46
+ if file_id == ""
47
+ dir_path = backup_path
48
+ file_path = backup_path + 'index.html'
49
+ elsif file_url[-1] == '/'
50
+ dir_path = backup_path + file_path_elements[0..-1].join('/')
51
+ file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
52
+ else
53
+ dir_path = backup_path + file_path_elements[0..-2].join('/')
54
+ file_path = backup_path + file_path_elements[0..-1].join('/')
55
+ end
56
+ unless File.exists? file_path
57
+ FileUtils::mkdir_p dir_path unless File.exists? dir_path
58
+ open(file_path, "wb") do |file|
59
+ begin
60
+ open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
61
+ file.write(uri.read)
62
+ end
63
+ rescue OpenURI::HTTPError => e
64
+ puts "#{file_url} # 404"
65
+ file.write(e.io.read)
66
+ end
67
+ end
68
+ puts "#{file_url} -> #{file_path}"
69
+ else
70
+ puts "#{file_url} # #{file_path} already exists."
71
+ end
72
+ end
73
+ end
74
+
28
75
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator