wayback_machine_downloader 0.1.3 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
4
- data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
3
+ metadata.gz: 8532168d675aff20ea6578d90cbf8c1087cdbd9a
4
+ data.tar.gz: e4476b5c8504a2466b42be02c5a1fefe7e898c95
5
5
  SHA512:
6
- metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
7
- data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd
6
+ metadata.gz: 3527721f2675c6aba366c88b5a70da8b85615d9beefc9db51403a607040d8dfa012ba9f58bb202d6f0b6d5f8b8b4316ee892d8cbc588153da72086df14bbd509
7
+ data.tar.gz: 240f40fbca00affa948adfcecc67fa8426888c166c926b5b890761502426baf1b4a5f93c12c6a7b64847a4e0276c36cf1e8f3766943c61ac2a3e279ea913b6fd
@@ -2,14 +2,13 @@
2
2
 
3
3
  require 'wayback_machine_downloader'
4
4
  require 'optparse'
5
- require 'pry-rescue'
6
5
 
7
6
  options = {}
8
7
  option_parser = OptionParser.new do |opts|
9
8
  opts.banner = "Usage: wayback_machine_downloader http://example.com"
10
9
 
11
10
  opts.separator ""
12
- opts.separator "Download a website from Wayback Machine."
11
+ opts.separator "Download any website from the Wayback Machine."
13
12
 
14
13
  opts.separator ""
15
14
  opts.separator "Optional option:"
@@ -21,10 +20,8 @@ end.parse!
21
20
 
22
21
  if base_url = ARGV[0]
23
22
  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
24
- puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
25
- binding.pry
26
23
  wayback_machine_downloader.download_files
27
24
  else
28
- puts "You need to specify a websire to backup. (ie. http://example.com)"
25
+ puts "You need to specify a website to backup. (e.g., http://example.com)"
29
26
  puts "Run `wayback_machine_downloader --help` for more help."
30
27
  end
@@ -3,11 +3,13 @@ require 'fileutils'
3
3
 
4
4
  class WaybackMachineDownloader
5
5
 
6
+ VERSION = "0.1.5"
7
+
6
8
  attr_accessor :base_url, :timestamp
7
9
 
8
10
  def initialize params
9
11
  @base_url = params[:base_url]
10
- @timestamp = params[:timestamp]
12
+ @timestamp = params[:timestamp].to_i
11
13
  end
12
14
 
13
15
  def backup_name
@@ -18,58 +20,84 @@ class WaybackMachineDownloader
18
20
  'websites/' + backup_name + '/'
19
21
  end
20
22
 
21
- def file_list_curated
23
+ def get_file_list_curated
22
24
  file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
23
25
  file_list_curated = Hash.new
24
26
  file_list_raw.each_line do |line|
25
27
  line = line.split(' ')
26
- timestamp = line[1].to_i
28
+ file_timestamp = line[1].to_i
27
29
  file_url = line[2]
28
30
  file_id = file_url.split('/')[3..-1].join('/')
29
31
  file_id = URI.unescape file_id
30
- if file_list_curated[file_id]
31
- unless file_list_curated[file_id][:timestamp] > timestamp
32
- file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
32
+ if @timestamp == 0 or file_timestamp <= @timestamp
33
+ if file_list_curated[file_id]
34
+ unless file_list_curated[file_id][:timestamp] > file_timestamp
35
+ file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
36
+ end
37
+ else
38
+ file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
33
39
  end
34
- else
35
- file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
36
40
  end
37
41
  end
38
42
  file_list_curated
39
43
  end
40
44
 
41
45
  def download_files
46
+ puts "Downlading #{@base_url} from Wayback Machine..."
47
+ puts
48
+ file_list_curated = get_file_list_curated
49
+ count = 0
42
50
  file_list_curated.each do |file_id, file_remote_info|
43
- timestamp = file_remote_info[:timestamp]
51
+ count += 1
44
52
  file_url = file_remote_info[:file_url]
45
53
  file_path_elements = file_id.split('/')
46
54
  if file_id == ""
47
55
  dir_path = backup_path
48
56
  file_path = backup_path + 'index.html'
49
- elsif file_url[-1] == '/'
57
+ elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
50
58
  dir_path = backup_path + file_path_elements[0..-1].join('/')
51
- file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
59
+ file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
52
60
  else
53
61
  dir_path = backup_path + file_path_elements[0..-2].join('/')
54
62
  file_path = backup_path + file_path_elements[0..-1].join('/')
55
63
  end
56
64
  unless File.exists? file_path
57
- FileUtils::mkdir_p dir_path unless File.exists? dir_path
65
+ structure_dir_path dir_path
58
66
  open(file_path, "wb") do |file|
59
67
  begin
60
68
  open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
61
69
  file.write(uri.read)
62
70
  end
63
71
  rescue OpenURI::HTTPError => e
64
- puts "#{file_url} # 404"
72
+ puts "#{file_url} # #{e}"
65
73
  file.write(e.io.read)
74
+ rescue Exception => e
75
+ puts "#{file_url} # #{e}"
66
76
  end
67
77
  end
68
- puts "#{file_url} -> #{file_path}"
78
+ puts "#{file_url} -> #{file_path} (#{count}/#{file_list_curated.size})"
69
79
  else
70
- puts "#{file_url} # #{file_path} already exists."
80
+ puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_curated.size})"
71
81
  end
72
82
  end
83
+ puts
84
+ puts "Download complete, saved in #{backup_path}. (#{file_list_curated.size} files)"
85
+ end
86
+
87
+ def structure_dir_path dir_path
88
+ begin
89
+ FileUtils::mkdir_p dir_path unless File.exists? dir_path
90
+ rescue Errno::EEXIST => e
91
+ puts "# #{e}"
92
+ file_already_existing = e.to_s.split("File exists @ dir_s_mkdir - ")[-1]
93
+ file_already_existing_temporary = file_already_existing + '.temp'
94
+ file_already_existing_permanent = file_already_existing + '/index.html'
95
+ FileUtils::mv file_already_existing, file_already_existing_temporary
96
+ FileUtils::mkdir_p file_already_existing
97
+ FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
98
+ puts "#{file_already_existing} -> #{file_already_existing_permanent}"
99
+ structure_dir_path dir_path
100
+ end
73
101
  end
74
102
 
75
103
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator