wayback_machine_downloader 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 77db4ae324d457ea724d2316c75e2221b4971279
4
- data.tar.gz: f274b7de7e8b2948a5eb2b9bd207ecd5da5bf832
3
+ metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
4
+ data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
5
5
  SHA512:
6
- metadata.gz: f78bf1585bb402a71e6970b084085369c836168379003555d61525eb19bb35ed7acb34510e36ea52611cf417b600e175f345ce1317e1db4c8c53684414d37557
7
- data.tar.gz: f2f14bf88f8e8726c48e850ec91eea1a9055e8aabf6a569e1972cd3b67cc69a094ba1c7603b4178378d3a7cd3fdec99930382c5396c4ad2fbcb21c9b45a73e03
6
+ metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
7
+ data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd
@@ -1,4 +1,30 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'wayback_machine_downloader'
4
- puts WaybackMachineDownloader.hi(ARGV[0])
4
+ require 'optparse'
5
+ require 'pry-rescue'
6
+
7
+ options = {}
8
+ option_parser = OptionParser.new do |opts|
9
+ opts.banner = "Usage: wayback_machine_downloader http://example.com"
10
+
11
+ opts.separator ""
12
+ opts.separator "Download a website from Wayback Machine."
13
+
14
+ opts.separator ""
15
+ opts.separator "Optional option:"
16
+
17
+ opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
18
+ options[:timestamp] = t
19
+ end
20
+ end.parse!
21
+
22
+ if base_url = ARGV[0]
23
+ wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
24
+ puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
25
+ binding.pry
26
+ wayback_machine_downloader.download_files
27
+ else
28
+ puts "You need to specify a websire to backup. (ie. http://example.com)"
29
+ puts "Run `wayback_machine_downloader --help` for more help."
30
+ end
@@ -1,28 +1,75 @@
1
1
  require 'open-uri'
2
+ require 'fileutils'
2
3
 
3
4
  class WaybackMachineDownloader
4
5
 
5
- attr_accessor :base_url
6
+ attr_accessor :base_url, :timestamp
6
7
 
7
8
  def initialize params
8
9
  @base_url = params[:base_url]
10
+ @timestamp = params[:timestamp]
11
+ end
12
+
13
+ def backup_name
14
+ @base_url.split('/')[2]
15
+ end
16
+
17
+ def backup_path
18
+ 'websites/' + backup_name + '/'
9
19
  end
10
20
 
11
21
  def file_list_curated
12
- file_list_raw = open "http://web.archive.org/web/*/#{@base_url}/*"
22
+ file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
13
23
  file_list_curated = Hash.new
14
24
  file_list_raw.each_line do |line|
15
25
  line = line.split(' ')
16
26
  timestamp = line[1].to_i
17
27
  file_url = line[2]
18
- if file_list_curated[file_url]
19
- unless file_list_curated[file_url] > timestamp
20
- file_list_curated[file_url] = timestamp
28
+ file_id = file_url.split('/')[3..-1].join('/')
29
+ file_id = URI.unescape file_id
30
+ if file_list_curated[file_id]
31
+ unless file_list_curated[file_id][:timestamp] > timestamp
32
+ file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
21
33
  end
22
34
  else
23
- file_list_curated[file_url] = timestamp
35
+ file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
24
36
  end
25
37
  end
26
38
  file_list_curated
27
39
  end
40
+
41
+ def download_files
42
+ file_list_curated.each do |file_id, file_remote_info|
43
+ timestamp = file_remote_info[:timestamp]
44
+ file_url = file_remote_info[:file_url]
45
+ file_path_elements = file_id.split('/')
46
+ if file_id == ""
47
+ dir_path = backup_path
48
+ file_path = backup_path + 'index.html'
49
+ elsif file_url[-1] == '/'
50
+ dir_path = backup_path + file_path_elements[0..-1].join('/')
51
+ file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
52
+ else
53
+ dir_path = backup_path + file_path_elements[0..-2].join('/')
54
+ file_path = backup_path + file_path_elements[0..-1].join('/')
55
+ end
56
+ unless File.exists? file_path
57
+ FileUtils::mkdir_p dir_path unless File.exists? dir_path
58
+ open(file_path, "wb") do |file|
59
+ begin
60
+ open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
61
+ file.write(uri.read)
62
+ end
63
+ rescue OpenURI::HTTPError => e
64
+ puts "#{file_url} # 404"
65
+ file.write(e.io.read)
66
+ end
67
+ end
68
+ puts "#{file_url} -> #{file_path}"
69
+ else
70
+ puts "#{file_url} # #{file_path} already exists."
71
+ end
72
+ end
73
+ end
74
+
28
75
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator