wayback_machine_downloader 2.0.0 → 2.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e2132b28dea0a03978384a3b337b1107562e644a
4
- data.tar.gz: e8b6421b78d02505a8498c79cd1761ebb28a3290
2
+ SHA256:
3
+ metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
4
+ data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
5
5
  SHA512:
6
- metadata.gz: d1d0944e9593aadc02db950aa9826491d727f93c6185f23aac20b24b48da086ed67f2be76d91184d2709610b84160c2665ca4b30bcddfcd4981b6840c988e1d0
7
- data.tar.gz: aa40fb4da67241e972c86631b9390703ec77643b17b7f62ae2cfbffe49f276ff6f77d901aac87df66aca94d6da7ec3d230ceb001aa9b47f697b5ef1c98b4194f
6
+ metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
7
+ data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:directory] = t
19
19
  end
20
20
 
21
+ opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
22
+ options[:all_timestamps] = true
23
+ end
24
+
21
25
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
22
26
  options[:from_timestamp] = t
23
27
  end
@@ -26,6 +30,10 @@ option_parser = OptionParser.new do |opts|
26
30
  options[:to_timestamp] = t
27
31
  end
28
32
 
33
+ opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
34
+ options[:exact_url] = t
35
+ end
36
+
29
37
  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
30
38
  options[:only_filter] = t
31
39
  end
@@ -38,15 +46,15 @@ option_parser = OptionParser.new do |opts|
38
46
  options[:all] = true
39
47
  end
40
48
 
41
- opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
49
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
42
50
  options[:threads_count] = t
43
51
  end
44
52
 
45
- opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t|
53
+ opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
46
54
  options[:maximum_pages] = t
47
55
  end
48
56
 
49
- opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
57
+ opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
50
58
  options[:list] = true
51
59
  end
52
60
 
@@ -58,7 +66,7 @@ end.parse!
58
66
  if (base_url = ARGV[-1])
59
67
  options[:base_url] = base_url
60
68
  wayback_machine_downloader = WaybackMachineDownloader.new options
61
- if wayback_machine_downloader.list
69
+ if options[:list]
62
70
  wayback_machine_downloader.list_files
63
71
  else
64
72
  wayback_machine_downloader.download_files
@@ -14,19 +14,22 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.0.0"
17
+ VERSION = "2.3.0"
18
18
 
19
- attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
19
+ attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
+ :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
+ :all, :maximum_pages, :threads_count
20
22
 
21
23
  def initialize params
22
24
  @base_url = params[:base_url]
25
+ @exact_url = params[:exact_url]
23
26
  @directory = params[:directory]
27
+ @all_timestamps = params[:all_timestamps]
24
28
  @from_timestamp = params[:from_timestamp].to_i
25
29
  @to_timestamp = params[:to_timestamp].to_i
26
30
  @only_filter = params[:only_filter]
27
31
  @exclude_filter = params[:exclude_filter]
28
32
  @all = params[:all]
29
- @list = params[:list]
30
33
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
31
34
  @threads_count = params[:threads_count].to_i
32
35
  end
@@ -78,30 +81,29 @@ class WaybackMachineDownloader
78
81
  end
79
82
 
80
83
  def get_all_snapshots_to_consider
81
- # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
84
+ # Note: Passing a page index parameter allow us to get more snapshots,
85
+ # but from a less fresh index
82
86
  print "Getting snapshot pages"
83
- snapshot_list_to_consider = ""
87
+ snapshot_list_to_consider = []
84
88
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
85
89
  print "."
86
- snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
87
- print "."
88
- @maximum_pages.times do |page_index|
89
- snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
90
- break if snapshot_list.empty?
91
- snapshot_list_to_consider += snapshot_list
92
- print "."
90
+ unless @exact_url
91
+ @maximum_pages.times do |page_index|
92
+ snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
93
+ break if snapshot_list.empty?
94
+ snapshot_list_to_consider += snapshot_list
95
+ print "."
96
+ end
93
97
  end
94
- puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
98
+ puts " found #{snapshot_list_to_consider.length} snaphots to consider."
95
99
  puts
96
100
  snapshot_list_to_consider
97
101
  end
98
102
 
99
103
  def get_file_list_curated
100
104
  file_list_curated = Hash.new
101
- get_all_snapshots_to_consider.each_line do |line|
102
- next unless line.include?('/')
103
- file_timestamp = line[0..13].to_i
104
- file_url = line[15..-2]
105
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
106
+ next unless file_url.include?('/')
105
107
  file_id = file_url.split('/')[3..-1].join('/')
106
108
  file_id = CGI::unescape file_id
107
109
  file_id = file_id.tidy_bytes unless file_id == ""
@@ -124,20 +126,61 @@ class WaybackMachineDownloader
124
126
  file_list_curated
125
127
  end
126
128
 
129
+ def get_file_list_all_timestamps
130
+ file_list_curated = Hash.new
131
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
132
+ next unless file_url.include?('/')
133
+ file_id = file_url.split('/')[3..-1].join('/')
134
+ file_id_and_timestamp = [file_timestamp, file_id].join('/')
135
+ file_id_and_timestamp = CGI::unescape file_id_and_timestamp
136
+ file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
137
+ if file_id.nil?
138
+ puts "Malformed file url, ignoring: #{file_url}"
139
+ else
140
+ if match_exclude_filter(file_url)
141
+ puts "File url matches exclude filter, ignoring: #{file_url}"
142
+ elsif not match_only_filter(file_url)
143
+ puts "File url doesn't match only filter, ignoring: #{file_url}"
144
+ elsif file_list_curated[file_id_and_timestamp]
145
+ puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
146
+ else
147
+ file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
148
+ end
149
+ end
150
+ end
151
+ puts "file_list_curated: " + file_list_curated.count.to_s
152
+ file_list_curated
153
+ end
154
+
155
+
127
156
  def get_file_list_by_timestamp
128
- file_list_curated = get_file_list_curated
129
- file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
130
- file_list_curated.map do |file_remote_info|
131
- file_remote_info[1][:file_id] = file_remote_info[0]
132
- file_remote_info[1]
157
+ if @all_timestamps
158
+ file_list_curated = get_file_list_all_timestamps
159
+ file_list_curated.map do |file_remote_info|
160
+ file_remote_info[1][:file_id] = file_remote_info[0]
161
+ file_remote_info[1]
162
+ end
163
+ else
164
+ file_list_curated = get_file_list_curated
165
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
166
+ file_list_curated.map do |file_remote_info|
167
+ file_remote_info[1][:file_id] = file_remote_info[0]
168
+ file_remote_info[1]
169
+ end
133
170
  end
134
171
  end
135
172
 
136
173
  def list_files
174
+ # retrieval produces its own output
175
+ @orig_stdout = $stdout
176
+ $stdout = $stderr
177
+ files = get_file_list_by_timestamp
178
+ $stdout = @orig_stdout
137
179
  puts "["
138
- get_file_list_by_timestamp.each do |file|
180
+ files[0...-1].each do |file|
139
181
  puts file.to_json + ","
140
182
  end
183
+ puts files[-1].to_json
141
184
  puts "]"
142
185
  end
143
186
 
@@ -179,7 +222,7 @@ class WaybackMachineDownloader
179
222
 
180
223
  def structure_dir_path dir_path
181
224
  begin
182
- FileUtils::mkdir_p dir_path unless File.exists? dir_path
225
+ FileUtils::mkdir_p dir_path unless File.exist? dir_path
183
226
  rescue Errno::EEXIST => e
184
227
  error_to_string = e.to_s
185
228
  puts "# #{error_to_string}"
@@ -217,14 +260,15 @@ class WaybackMachineDownloader
217
260
  file_path = backup_path + file_path_elements[0..-1].join('/')
218
261
  end
219
262
  if Gem.win_platform?
263
+ dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
220
264
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
221
265
  end
222
- unless File.exists? file_path
266
+ unless File.exist? file_path
223
267
  begin
224
268
  structure_dir_path dir_path
225
269
  open(file_path, "wb") do |file|
226
270
  begin
227
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
271
+ URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
228
272
  file.write(uri.read)
229
273
  end
230
274
  rescue OpenURI::HTTPError => e
@@ -240,7 +284,7 @@ class WaybackMachineDownloader
240
284
  rescue StandardError => e
241
285
  puts "#{file_url} # #{e}"
242
286
  ensure
243
- if not @all and File.exists?(file_path) and File.size(file_path) == 0
287
+ if not @all and File.exist?(file_path) and File.size(file_path) == 0
244
288
  File.delete(file_path)
245
289
  puts "#{file_path} was empty and was removed."
246
290
  end
@@ -1,28 +1,38 @@
1
+ require 'json'
2
+ require 'uri'
3
+
1
4
  module ArchiveAPI
2
5
 
3
- def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
6
+ def get_raw_list_from_api url, page_index
7
+ request_url = URI("https://web.archive.org/cdx/search/xd")
8
+ params = [["output", "json"], ["url", url]]
9
+ params += parameters_for_api page_index
10
+ request_url.query = URI.encode_www_form(params)
7
11
 
8
- open(request_url).read
9
- end
12
+ begin
13
+ json = JSON.parse(URI(request_url).open.read)
14
+ if (json[0] <=> ["timestamp","original"]) == 0
15
+ json.shift
16
+ end
17
+ json
18
+ rescue JSON::ParserError
19
+ []
20
+ end
21
+ end
10
22
 
11
- def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
- if @all
14
- parameters += ""
15
- else
16
- parameters += "&filter=statuscode:200"
23
+ def parameters_for_api page_index
24
+ parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
25
+ if !@all
26
+ parameters.push(["filter", "statuscode:200"])
17
27
  end
18
28
  if @from_timestamp and @from_timestamp != 0
19
- parameters += "&from=" + @from_timestamp.to_s
29
+ parameters.push(["from", @from_timestamp.to_s])
20
30
  end
21
31
  if @to_timestamp and @to_timestamp != 0
22
- parameters += "&to=" + @to_timestamp.to_s
32
+ parameters.push(["to", @to_timestamp.to_s])
23
33
  end
24
34
  if page_index
25
- parameters += "&page=#{page_index}"
35
+ parameters.push(["page", page_index])
26
36
  end
27
37
  parameters
28
38
  end
@@ -60,7 +60,7 @@ module TibyBytes
60
60
  bytes.each_index do |i|
61
61
 
62
62
  byte = bytes[i]
63
- is_ascii = byte < 128
63
+ _is_ascii = byte < 128
64
64
  is_cont = byte > 127 && byte < 192
65
65
  is_lead = byte > 191 && byte < 245
66
66
  is_unused = byte > 240
@@ -70,7 +70,7 @@ module TibyBytes
70
70
  if is_unused || is_restricted
71
71
  bytes[i] = tidy_byte(byte)
72
72
  elsif is_cont
73
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
73
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
74
74
  conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
75
75
  else
76
76
  if conts_expected > 0
@@ -78,7 +78,7 @@ module TibyBytes
78
78
  # the leading byte.
79
79
  begin
80
80
  (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
81
- rescue NoMethodError => e
81
+ rescue NoMethodError
82
82
  next
83
83
  end
84
84
  conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
98
98
  end
99
99
  begin
100
100
  bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
101
- rescue ArgumentError => e
101
+ rescue ArgumentError
102
102
  nil
103
103
  end
104
104
  end
@@ -25,7 +25,7 @@ module ToRegex
25
25
  # @option options [true,false] :lang /foo/[nesu]
26
26
  def to_regex(options = {})
27
27
  if args = as_regexp(options)
28
- ::Regexp.new *args
28
+ ::Regexp.new(*args)
29
29
  end
30
30
  end
31
31
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-10 00:00:00.000000000 Z
11
+ date: 2021-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
57
57
  licenses:
58
58
  - MIT
59
59
  metadata: {}
60
- post_install_message:
60
+ post_install_message:
61
61
  rdoc_options: []
62
62
  require_paths:
63
63
  - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  requirements: []
75
- rubyforge_project:
76
- rubygems_version: 2.5.2
77
- signing_key:
75
+ rubygems_version: 3.1.4
76
+ signing_key:
78
77
  specification_version: 4
79
78
  summary: Download an entire website from the Wayback Machine.
80
79
  test_files: []