wayback_machine_downloader 2.1.1 → 2.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 48f524cedc0e9f66c7b0acca132a71557a327ea2
4
- data.tar.gz: 1d70bb2a76cd07c82c08674fdc96b543caec48c0
2
+ SHA256:
3
+ metadata.gz: 54752c73ebfac815e91ef6bba40547a36282e5ec9c3ef2792370c13352fce0b6
4
+ data.tar.gz: df2f5d94981eeb2d1e55d2b4a9dd8fe57a24e8b29cf79a700ca520b7c3bc1a21
5
5
  SHA512:
6
- metadata.gz: 26eb05cbeebd911502bd01513535c7cc2d4ad0fe3850adc0205ca4f649351e56855af66915d86c501fb8be64963fe1d409d013d8afcd24064cc15673b2cc0854
7
- data.tar.gz: 0dbbd54b4b4ab231adcae908bbf6cd3865768590263e767fe5e45fb3a9d70676c337f79aba576378272ddc14647ecd06fc26820fc1dec8cb52704aa6740582b7
6
+ metadata.gz: 108d33cf57b738ba69ccf960f503ab5ea44b296ba043716fb2e83e9fa5bebcaec9a488bc4a5ab64dad55c1f23434c2b71005a86389e9b26fd07b38372f96b6d4
7
+ data.tar.gz: 62afad1698415e0c80b85599da7aba1e19574ec571862f8d69c56d1fe718f8c65cae3e3be2293d8418ecc7dd09803b4d9908186e93f3062ccd85b363a5e7dde4
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:directory] = t
19
19
  end
20
20
 
21
+ opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
22
+ options[:all_timestamps] = true
23
+ end
24
+
21
25
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
22
26
  options[:from_timestamp] = t
23
27
  end
@@ -42,7 +46,7 @@ option_parser = OptionParser.new do |opts|
42
46
  options[:all] = true
43
47
  end
44
48
 
45
- opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
49
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
46
50
  options[:threads_count] = t
47
51
  end
48
52
 
@@ -1,28 +1,38 @@
1
+ require 'json'
2
+ require 'uri'
3
+
1
4
  module ArchiveAPI
2
5
 
3
6
  def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
7
+ request_url = URI("https://web.archive.org/cdx/search/xd")
8
+ params = [["output", "json"], ["url", url]]
9
+ params += parameters_for_api page_index
10
+ request_url.query = URI.encode_www_form(params)
7
11
 
8
- open(request_url).read
12
+ begin
13
+ json = JSON.parse(URI(request_url).open.read)
14
+ if (json[0] <=> ["timestamp","original"]) == 0
15
+ json.shift
16
+ end
17
+ json
18
+ rescue JSON::ParserError
19
+ []
20
+ end
9
21
  end
10
22
 
11
23
  def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
- if @all
14
- parameters += ""
15
- else
16
- parameters += "&filter=statuscode:200"
24
+ parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
25
+ if !@all
26
+ parameters.push(["filter", "statuscode:200"])
17
27
  end
18
28
  if @from_timestamp and @from_timestamp != 0
19
- parameters += "&from=" + @from_timestamp.to_s
29
+ parameters.push(["from", @from_timestamp.to_s])
20
30
  end
21
31
  if @to_timestamp and @to_timestamp != 0
22
- parameters += "&to=" + @to_timestamp.to_s
32
+ parameters.push(["to", @to_timestamp.to_s])
23
33
  end
24
34
  if page_index
25
- parameters += "&page=#{page_index}"
35
+ parameters.push(["page", page_index])
26
36
  end
27
37
  parameters
28
38
  end
@@ -70,7 +70,7 @@ module TibyBytes
70
70
  if is_unused || is_restricted
71
71
  bytes[i] = tidy_byte(byte)
72
72
  elsif is_cont
73
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
73
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
74
74
  conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
75
75
  else
76
76
  if conts_expected > 0
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.1.1"
17
+ VERSION = "2.3.1"
18
18
 
19
- attr_accessor :base_url, :exact_url, :directory,
19
+ attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
20
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
21
  :all, :maximum_pages, :threads_count
22
22
 
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
24
24
  @base_url = params[:base_url]
25
25
  @exact_url = params[:exact_url]
26
26
  @directory = params[:directory]
27
+ @all_timestamps = params[:all_timestamps]
27
28
  @from_timestamp = params[:from_timestamp].to_i
28
29
  @to_timestamp = params[:to_timestamp].to_i
29
30
  @only_filter = params[:only_filter]
@@ -83,7 +84,7 @@ class WaybackMachineDownloader
83
84
  # Note: Passing a page index parameter allow us to get more snapshots,
84
85
  # but from a less fresh index
85
86
  print "Getting snapshot pages"
86
- snapshot_list_to_consider = ""
87
+ snapshot_list_to_consider = []
87
88
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
88
89
  print "."
89
90
  unless @exact_url
@@ -94,17 +95,15 @@ class WaybackMachineDownloader
94
95
  print "."
95
96
  end
96
97
  end
97
- puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
98
+ puts " found #{snapshot_list_to_consider.length} snaphots to consider."
98
99
  puts
99
100
  snapshot_list_to_consider
100
101
  end
101
102
 
102
103
  def get_file_list_curated
103
104
  file_list_curated = Hash.new
104
- get_all_snapshots_to_consider.each_line do |line|
105
- next unless line.include?('/')
106
- file_timestamp = line[0..13].to_i
107
- file_url = line[15..-2]
105
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
106
+ next unless file_url.include?('/')
108
107
  file_id = file_url.split('/')[3..-1].join('/')
109
108
  file_id = CGI::unescape file_id
110
109
  file_id = file_id.tidy_bytes unless file_id == ""
@@ -127,22 +126,61 @@ class WaybackMachineDownloader
127
126
  file_list_curated
128
127
  end
129
128
 
129
+ def get_file_list_all_timestamps
130
+ file_list_curated = Hash.new
131
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
132
+ next unless file_url.include?('/')
133
+ file_id = file_url.split('/')[3..-1].join('/')
134
+ file_id_and_timestamp = [file_timestamp, file_id].join('/')
135
+ file_id_and_timestamp = CGI::unescape file_id_and_timestamp
136
+ file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
137
+ if file_id.nil?
138
+ puts "Malformed file url, ignoring: #{file_url}"
139
+ else
140
+ if match_exclude_filter(file_url)
141
+ puts "File url matches exclude filter, ignoring: #{file_url}"
142
+ elsif not match_only_filter(file_url)
143
+ puts "File url doesn't match only filter, ignoring: #{file_url}"
144
+ elsif file_list_curated[file_id_and_timestamp]
145
+ puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
146
+ else
147
+ file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
148
+ end
149
+ end
150
+ end
151
+ puts "file_list_curated: " + file_list_curated.count.to_s
152
+ file_list_curated
153
+ end
154
+
155
+
130
156
  def get_file_list_by_timestamp
131
- file_list_curated = get_file_list_curated
132
- file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
133
- file_list_curated.map do |file_remote_info|
134
- file_remote_info[1][:file_id] = file_remote_info[0]
135
- file_remote_info[1]
157
+ if @all_timestamps
158
+ file_list_curated = get_file_list_all_timestamps
159
+ file_list_curated.map do |file_remote_info|
160
+ file_remote_info[1][:file_id] = file_remote_info[0]
161
+ file_remote_info[1]
162
+ end
163
+ else
164
+ file_list_curated = get_file_list_curated
165
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
166
+ file_list_curated.map do |file_remote_info|
167
+ file_remote_info[1][:file_id] = file_remote_info[0]
168
+ file_remote_info[1]
169
+ end
136
170
  end
137
171
  end
138
172
 
139
173
  def list_files
140
174
  # retrieval produces its own output
175
+ @orig_stdout = $stdout
176
+ $stdout = $stderr
141
177
  files = get_file_list_by_timestamp
178
+ $stdout = @orig_stdout
142
179
  puts "["
143
- files.each do |file|
180
+ files[0...-1].each do |file|
144
181
  puts file.to_json + ","
145
182
  end
183
+ puts files[-1].to_json
146
184
  puts "]"
147
185
  end
148
186
 
@@ -222,6 +260,7 @@ class WaybackMachineDownloader
222
260
  file_path = backup_path + file_path_elements[0..-1].join('/')
223
261
  end
224
262
  if Gem.win_platform?
263
+ dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
225
264
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
226
265
  end
227
266
  unless File.exist? file_path
@@ -229,7 +268,7 @@ class WaybackMachineDownloader
229
268
  structure_dir_path dir_path
230
269
  open(file_path, "wb") do |file|
231
270
  begin
232
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
271
+ URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
233
272
  file.write(uri.read)
234
273
  end
235
274
  rescue OpenURI::HTTPError => e
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-12 00:00:00.000000000 Z
11
+ date: 2021-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
57
57
  licenses:
58
58
  - MIT
59
59
  metadata: {}
60
- post_install_message:
60
+ post_install_message:
61
61
  rdoc_options: []
62
62
  require_paths:
63
63
  - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  requirements: []
75
- rubyforge_project:
76
- rubygems_version: 2.5.2
77
- signing_key:
75
+ rubygems_version: 3.1.4
76
+ signing_key:
78
77
  specification_version: 4
79
78
  summary: Download an entire website from the Wayback Machine.
80
79
  test_files: []