wayback_machine_downloader 2.0.0 → 2.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: e2132b28dea0a03978384a3b337b1107562e644a
-  data.tar.gz: e8b6421b78d02505a8498c79cd1761ebb28a3290
+SHA256:
+  metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
+  data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
 SHA512:
-  metadata.gz: d1d0944e9593aadc02db950aa9826491d727f93c6185f23aac20b24b48da086ed67f2be76d91184d2709610b84160c2665ca4b30bcddfcd4981b6840c988e1d0
-  data.tar.gz: aa40fb4da67241e972c86631b9390703ec77643b17b7f62ae2cfbffe49f276ff6f77d901aac87df66aca94d6da7ec3d230ceb001aa9b47f697b5ef1c98b4194f
+  metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
+  data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
bin/wayback_machine_downloader CHANGED
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
     options[:directory] = t
   end
 
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
     options[:from_timestamp] = t
   end
@@ -26,6 +30,10 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
+  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
@@ -38,15 +46,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t|
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
     options[:maximum_pages] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
     options[:list] = true
   end
 
@@ -58,7 +66,7 @@ end.parse!
 if (base_url = ARGV[-1])
   options[:base_url] = base_url
   wayback_machine_downloader = WaybackMachineDownloader.new options
-  if wayback_machine_downloader.list
+  if options[:list]
     wayback_machine_downloader.list_files
   else
     wayback_machine_downloader.download_files
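
The two new flags feed straight into the options hash consumed by WaybackMachineDownloader.new. A minimal sketch of the equivalent library call (hypothetical example; "example.com" is a placeholder URL):

require 'wayback_machine_downloader'

# Equivalent of `wayback_machine_downloader -e -s http://example.com`:
# -e/--exact-url skips the sitewide /* CDX lookup,
# -s/--all-timestamps keeps one copy of each file per archived timestamp.
options = {
  base_url: "http://example.com", # placeholder
  exact_url: true,
  all_timestamps: true
}
WaybackMachineDownloader.new(options).download_files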
lib/wayback_machine_downloader.rb CHANGED
@@ -14,19 +14,22 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.0.0"
+  VERSION = "2.3.0"
 
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
     @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
-    @list = params[:list]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
@@ -78,30 +81,29 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
+    # Note: Passing a page index parameter allow us to get more snapshots,
+    # but from a less fresh index
     print "Getting snapshot pages"
-    snapshot_list_to_consider = ""
+    snapshot_list_to_consider = []
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
-    snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
-    print "."
-    @maximum_pages.times do |page_index|
-      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
-      break if snapshot_list.empty?
-      snapshot_list_to_consider += snapshot_list
-      print "."
+    unless @exact_url
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
     end
-    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
     puts
     snapshot_list_to_consider
   end
 
   def get_file_list_curated
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.each_line do |line|
-      next unless line.include?('/')
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = CGI::unescape file_id
       file_id = file_id.tidy_bytes unless file_id == ""
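
The switch from a string to an array here follows from the ArchiveAPI change further down: get_raw_list_from_api now returns parsed JSON rows instead of raw CDX text, so each snapshot is a [timestamp, original_url] pair that the block can destructure. A small sketch with made-up values:

# 2.0.0: one newline-delimited string, sliced by character offsets:
#   "20060716231334 http://example.com/index.html\n..."
# 2.3.0: an array of [timestamp, original] pairs (values invented here):
snapshot_list = [
  ["20060716231334", "http://example.com/index.html"],
  ["20070801123456", "http://example.com/style.css"]
]
snapshot_list.each do |file_timestamp, file_url|
  puts "#{file_timestamp} -> #{file_url}"
end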
@@ -124,20 +126,61 @@ class WaybackMachineDownloader
     file_list_curated
   end
 
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
+
+
   def get_file_list_by_timestamp
-    file_list_curated = get_file_list_curated
-    file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
-    file_list_curated.map do |file_remote_info|
-      file_remote_info[1][:file_id] = file_remote_info[0]
-      file_remote_info[1]
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
     end
   end
 
   def list_files
+    # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
+    files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
     puts "["
-    get_file_list_by_timestamp.each do |file|
+    files[0...-1].each do |file|
       puts file.to_json + ","
     end
+    puts files[-1].to_json
     puts "]"
   end
 
@@ -179,7 +222,7 @@ class WaybackMachineDownloader
 
   def structure_dir_path dir_path
     begin
-      FileUtils::mkdir_p dir_path unless File.exists? dir_path
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
     rescue Errno::EEXIST => e
       error_to_string = e.to_s
       puts "# #{error_to_string}"
@@ -217,14 +260,15 @@ class WaybackMachineDownloader
       file_path = backup_path + file_path_elements[0..-1].join('/')
     end
     if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.exists? file_path
+    unless File.exist? file_path
       begin
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
-            open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
+            URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
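
The Windows sanitization previously applied only to file_path, so a directory path containing characters such as "?" or ":" could still fail at mkdir_p; the same gsub now covers both. An illustration of the escaping (made-up path):

# Characters invalid in Windows file names are replaced by '%' plus
# their hex byte value, e.g. '?' (0x3f) becomes '%3f':
path = "example.com/search?q=1&page=2"
puts path.gsub(/[:*?&=<>\\|]/) { |s| '%' + s.ord.to_s(16) }
# => example.com/search%3fq%3d1%26page%3d2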
@@ -240,7 +284,7 @@ class WaybackMachineDownloader
       rescue StandardError => e
         puts "#{file_url} # #{e}"
       ensure
-        if not @all and File.exists?(file_path) and File.size(file_path) == 0
+        if not @all and File.exist?(file_path) and File.size(file_path) == 0
           File.delete(file_path)
           puts "#{file_path} was empty and was removed."
         end
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -1,28 +1,38 @@
+require 'json'
+require 'uri'
+
 module ArchiveAPI
 
-  def get_raw_list_from_api url, page_index
-    request_url = "http://web.archive.org/cdx/search/xd?url="
-    request_url += url
-    request_url += parameters_for_api page_index
+  def get_raw_list_from_api url, page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
 
-    open(request_url).read
-  end
+    begin
+      json = JSON.parse(URI(request_url).open.read)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
+  end
 
-  def parameters_for_api page_index
-    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
-    if @all
-      parameters += ""
-    else
-      parameters += "&filter=statuscode:200"
+  def parameters_for_api page_index
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters += "&from=" + @from_timestamp.to_s
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters += "&to=" + @to_timestamp.to_s
+      parameters.push(["to", @to_timestamp.to_s])
    end
    if page_index
-      parameters += "&page=#{page_index}"
+      parameters.push(["page", page_index])
    end
    parameters
  end
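
The CDX request is now assembled with URI.encode_www_form, which percent-escapes each parameter, and output=json makes the response machine-parseable; the (json[0] <=> ["timestamp","original"]) == 0 check drops the header row that the JSON format prepends. A sketch of the resulting URL (illustrative parameter list, mirroring parameters_for_api with the status filter on):

require 'uri'

request_url = URI("https://web.archive.org/cdx/search/xd")
params = [["output", "json"], ["url", "example.com"],
          ["fl", "timestamp,original"], ["collapse", "digest"],
          ["gzip", "false"], ["filter", "statuscode:200"]]
request_url.query = URI.encode_www_form(params)
puts request_url
# => https://web.archive.org/cdx/search/xd?output=json&url=example.com&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200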
lib/wayback_machine_downloader/tidy_bytes.rb CHANGED
@@ -60,7 +60,7 @@ module TibyBytes
     bytes.each_index do |i|
 
       byte = bytes[i]
-      is_ascii = byte < 128
+      _is_ascii = byte < 128
       is_cont = byte > 127 && byte < 192
       is_lead = byte > 191 && byte < 245
       is_unused = byte > 240
@@ -70,7 +70,7 @@ module TibyBytes
       if is_unused || is_restricted
         bytes[i] = tidy_byte(byte)
       elsif is_cont
-        # Not expecting contination byte? Clean up. Otherwise, now expect one less.
+        # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
         conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
       else
         if conts_expected > 0
@@ -78,7 +78,7 @@ module TibyBytes
           # the leading byte.
           begin
             (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-          rescue NoMethodError => e
+          rescue NoMethodError
             next
           end
           conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
     end
     begin
       bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
-    rescue ArgumentError => e
+    rescue ArgumentError
       nil
     end
   end
lib/wayback_machine_downloader/to_regex.rb CHANGED
@@ -25,7 +25,7 @@ module ToRegex
   # @option options [true,false] :lang /foo/[nesu]
   def to_regex(options = {})
     if args = as_regexp(options)
-      ::Regexp.new *args
+      ::Regexp.new(*args)
     end
   end
 
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.0.0
+  version: 2.3.0
 platform: ruby
 authors:
 - hartator
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-06-10 00:00:00.000000000 Z
+date: 2021-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.5.2
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []