wayback_machine_downloader 0.1.3 → 0.1.5

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
4
- data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
3
+ metadata.gz: 8532168d675aff20ea6578d90cbf8c1087cdbd9a
4
+ data.tar.gz: e4476b5c8504a2466b42be02c5a1fefe7e898c95
5
5
  SHA512:
6
- metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
7
- data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd
6
+ metadata.gz: 3527721f2675c6aba366c88b5a70da8b85615d9beefc9db51403a607040d8dfa012ba9f58bb202d6f0b6d5f8b8b4316ee892d8cbc588153da72086df14bbd509
7
+ data.tar.gz: 240f40fbca00affa948adfcecc67fa8426888c166c926b5b890761502426baf1b4a5f93c12c6a7b64847a4e0276c36cf1e8f3766943c61ac2a3e279ea913b6fd
@@ -2,14 +2,13 @@
2
2
 
3
3
  require 'wayback_machine_downloader'
4
4
  require 'optparse'
5
- require 'pry-rescue'
6
5
 
7
6
  options = {}
8
7
  option_parser = OptionParser.new do |opts|
9
8
  opts.banner = "Usage: wayback_machine_downloader http://example.com"
10
9
 
11
10
  opts.separator ""
12
- opts.separator "Download a website from Wayback Machine."
11
+ opts.separator "Download any website from the Wayback Machine."
13
12
 
14
13
  opts.separator ""
15
14
  opts.separator "Optional option:"
@@ -21,10 +20,8 @@ end.parse!
21
20
 
22
21
  if base_url = ARGV[0]
23
22
  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
24
- puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
25
- binding.pry
26
23
  wayback_machine_downloader.download_files
27
24
  else
28
- puts "You need to specify a websire to backup. (ie. http://example.com)"
25
+ puts "You need to specify a website to backup. (e.g., http://example.com)"
29
26
  puts "Run `wayback_machine_downloader --help` for more help."
30
27
  end
@@ -3,11 +3,13 @@ require 'fileutils'
3
3
 
4
4
  class WaybackMachineDownloader
5
5
 
6
+ VERSION = "0.1.5"
7
+
6
8
  attr_accessor :base_url, :timestamp
7
9
 
8
10
  def initialize params
9
11
  @base_url = params[:base_url]
10
- @timestamp = params[:timestamp]
12
+ @timestamp = params[:timestamp].to_i
11
13
  end
12
14
 
13
15
  def backup_name
@@ -18,58 +20,84 @@ class WaybackMachineDownloader
18
20
  'websites/' + backup_name + '/'
19
21
  end
20
22
 
21
- def file_list_curated
23
+ def get_file_list_curated
22
24
  file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
23
25
  file_list_curated = Hash.new
24
26
  file_list_raw.each_line do |line|
25
27
  line = line.split(' ')
26
- timestamp = line[1].to_i
28
+ file_timestamp = line[1].to_i
27
29
  file_url = line[2]
28
30
  file_id = file_url.split('/')[3..-1].join('/')
29
31
  file_id = URI.unescape file_id
30
- if file_list_curated[file_id]
31
- unless file_list_curated[file_id][:timestamp] > timestamp
32
- file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
32
+ if @timestamp == 0 or file_timestamp <= @timestamp
33
+ if file_list_curated[file_id]
34
+ unless file_list_curated[file_id][:timestamp] > file_timestamp
35
+ file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
36
+ end
37
+ else
38
+ file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
33
39
  end
34
- else
35
- file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
36
40
  end
37
41
  end
38
42
  file_list_curated
39
43
  end
40
44
 
41
45
  def download_files
46
+ puts "Downlading #{@base_url} from Wayback Machine..."
47
+ puts
48
+ file_list_curated = get_file_list_curated
49
+ count = 0
42
50
  file_list_curated.each do |file_id, file_remote_info|
43
- timestamp = file_remote_info[:timestamp]
51
+ count += 1
44
52
  file_url = file_remote_info[:file_url]
45
53
  file_path_elements = file_id.split('/')
46
54
  if file_id == ""
47
55
  dir_path = backup_path
48
56
  file_path = backup_path + 'index.html'
49
- elsif file_url[-1] == '/'
57
+ elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
50
58
  dir_path = backup_path + file_path_elements[0..-1].join('/')
51
- file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
59
+ file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
52
60
  else
53
61
  dir_path = backup_path + file_path_elements[0..-2].join('/')
54
62
  file_path = backup_path + file_path_elements[0..-1].join('/')
55
63
  end
56
64
  unless File.exists? file_path
57
- FileUtils::mkdir_p dir_path unless File.exists? dir_path
65
+ structure_dir_path dir_path
58
66
  open(file_path, "wb") do |file|
59
67
  begin
60
68
  open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
61
69
  file.write(uri.read)
62
70
  end
63
71
  rescue OpenURI::HTTPError => e
64
- puts "#{file_url} # 404"
72
+ puts "#{file_url} # #{e}"
65
73
  file.write(e.io.read)
74
+ rescue Exception => e
75
+ puts "#{file_url} # #{e}"
66
76
  end
67
77
  end
68
- puts "#{file_url} -> #{file_path}"
78
+ puts "#{file_url} -> #{file_path} (#{count}/#{file_list_curated.size})"
69
79
  else
70
- puts "#{file_url} # #{file_path} already exists."
80
+ puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_curated.size})"
71
81
  end
72
82
  end
83
+ puts
84
+ puts "Download complete, saved in #{backup_path}. (#{file_list_curated.size} files)"
85
+ end
86
+
87
+ def structure_dir_path dir_path
88
+ begin
89
+ FileUtils::mkdir_p dir_path unless File.exists? dir_path
90
+ rescue Errno::EEXIST => e
91
+ puts "# #{e}"
92
+ file_already_existing = e.to_s.split("File exists @ dir_s_mkdir - ")[-1]
93
+ file_already_existing_temporary = file_already_existing + '.temp'
94
+ file_already_existing_permanent = file_already_existing + '/index.html'
95
+ FileUtils::mv file_already_existing, file_already_existing_temporary
96
+ FileUtils::mkdir_p file_already_existing
97
+ FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
98
+ puts "#{file_already_existing} -> #{file_already_existing_permanent}"
99
+ structure_dir_path dir_path
100
+ end
73
101
  end
74
102
 
75
103
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator