wayback_machine_downloader 2.1.1 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 48f524cedc0e9f66c7b0acca132a71557a327ea2
4
- data.tar.gz: 1d70bb2a76cd07c82c08674fdc96b543caec48c0
2
+ SHA256:
3
+ metadata.gz: 54752c73ebfac815e91ef6bba40547a36282e5ec9c3ef2792370c13352fce0b6
4
+ data.tar.gz: df2f5d94981eeb2d1e55d2b4a9dd8fe57a24e8b29cf79a700ca520b7c3bc1a21
5
5
  SHA512:
6
- metadata.gz: 26eb05cbeebd911502bd01513535c7cc2d4ad0fe3850adc0205ca4f649351e56855af66915d86c501fb8be64963fe1d409d013d8afcd24064cc15673b2cc0854
7
- data.tar.gz: 0dbbd54b4b4ab231adcae908bbf6cd3865768590263e767fe5e45fb3a9d70676c337f79aba576378272ddc14647ecd06fc26820fc1dec8cb52704aa6740582b7
6
+ metadata.gz: 108d33cf57b738ba69ccf960f503ab5ea44b296ba043716fb2e83e9fa5bebcaec9a488bc4a5ab64dad55c1f23434c2b71005a86389e9b26fd07b38372f96b6d4
7
+ data.tar.gz: 62afad1698415e0c80b85599da7aba1e19574ec571862f8d69c56d1fe718f8c65cae3e3be2293d8418ecc7dd09803b4d9908186e93f3062ccd85b363a5e7dde4
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:directory] = t
19
19
  end
20
20
 
21
+ opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
22
+ options[:all_timestamps] = true
23
+ end
24
+
21
25
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
22
26
  options[:from_timestamp] = t
23
27
  end
@@ -42,7 +46,7 @@ option_parser = OptionParser.new do |opts|
42
46
  options[:all] = true
43
47
  end
44
48
 
45
- opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
49
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
46
50
  options[:threads_count] = t
47
51
  end
48
52
 
@@ -1,28 +1,38 @@
1
+ require 'json'
2
+ require 'uri'
3
+
1
4
  module ArchiveAPI
2
5
 
3
6
  def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
7
+ request_url = URI("https://web.archive.org/cdx/search/xd")
8
+ params = [["output", "json"], ["url", url]]
9
+ params += parameters_for_api page_index
10
+ request_url.query = URI.encode_www_form(params)
7
11
 
8
- open(request_url).read
12
+ begin
13
+ json = JSON.parse(URI(request_url).open.read)
14
+ if (json[0] <=> ["timestamp","original"]) == 0
15
+ json.shift
16
+ end
17
+ json
18
+ rescue JSON::ParserError
19
+ []
20
+ end
9
21
  end
10
22
 
11
23
  def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
- if @all
14
- parameters += ""
15
- else
16
- parameters += "&filter=statuscode:200"
24
+ parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
25
+ if !@all
26
+ parameters.push(["filter", "statuscode:200"])
17
27
  end
18
28
  if @from_timestamp and @from_timestamp != 0
19
- parameters += "&from=" + @from_timestamp.to_s
29
+ parameters.push(["from", @from_timestamp.to_s])
20
30
  end
21
31
  if @to_timestamp and @to_timestamp != 0
22
- parameters += "&to=" + @to_timestamp.to_s
32
+ parameters.push(["to", @to_timestamp.to_s])
23
33
  end
24
34
  if page_index
25
- parameters += "&page=#{page_index}"
35
+ parameters.push(["page", page_index])
26
36
  end
27
37
  parameters
28
38
  end
@@ -70,7 +70,7 @@ module TibyBytes
70
70
  if is_unused || is_restricted
71
71
  bytes[i] = tidy_byte(byte)
72
72
  elsif is_cont
73
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
73
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
74
74
  conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
75
75
  else
76
76
  if conts_expected > 0
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.1.1"
17
+ VERSION = "2.3.1"
18
18
 
19
- attr_accessor :base_url, :exact_url, :directory,
19
+ attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
20
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
21
  :all, :maximum_pages, :threads_count
22
22
 
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
24
24
  @base_url = params[:base_url]
25
25
  @exact_url = params[:exact_url]
26
26
  @directory = params[:directory]
27
+ @all_timestamps = params[:all_timestamps]
27
28
  @from_timestamp = params[:from_timestamp].to_i
28
29
  @to_timestamp = params[:to_timestamp].to_i
29
30
  @only_filter = params[:only_filter]
@@ -83,7 +84,7 @@ class WaybackMachineDownloader
83
84
  # Note: Passing a page index parameter allows us to get more snapshots,
84
85
  # but from a less fresh index
85
86
  print "Getting snapshot pages"
86
- snapshot_list_to_consider = ""
87
+ snapshot_list_to_consider = []
87
88
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
88
89
  print "."
89
90
  unless @exact_url
@@ -94,17 +95,15 @@ class WaybackMachineDownloader
94
95
  print "."
95
96
  end
96
97
  end
97
- puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
98
+ puts " found #{snapshot_list_to_consider.length} snapshots to consider."
98
99
  puts
99
100
  snapshot_list_to_consider
100
101
  end
101
102
 
102
103
  def get_file_list_curated
103
104
  file_list_curated = Hash.new
104
- get_all_snapshots_to_consider.each_line do |line|
105
- next unless line.include?('/')
106
- file_timestamp = line[0..13].to_i
107
- file_url = line[15..-2]
105
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
106
+ next unless file_url.include?('/')
108
107
  file_id = file_url.split('/')[3..-1].join('/')
109
108
  file_id = CGI::unescape file_id
110
109
  file_id = file_id.tidy_bytes unless file_id == ""
@@ -127,22 +126,61 @@ class WaybackMachineDownloader
127
126
  file_list_curated
128
127
  end
129
128
 
129
+ def get_file_list_all_timestamps
130
+ file_list_curated = Hash.new
131
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
132
+ next unless file_url.include?('/')
133
+ file_id = file_url.split('/')[3..-1].join('/')
134
+ file_id_and_timestamp = [file_timestamp, file_id].join('/')
135
+ file_id_and_timestamp = CGI::unescape file_id_and_timestamp
136
+ file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
137
+ if file_id.nil?
138
+ puts "Malformed file url, ignoring: #{file_url}"
139
+ else
140
+ if match_exclude_filter(file_url)
141
+ puts "File url matches exclude filter, ignoring: #{file_url}"
142
+ elsif not match_only_filter(file_url)
143
+ puts "File url doesn't match only filter, ignoring: #{file_url}"
144
+ elsif file_list_curated[file_id_and_timestamp]
145
+ puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
146
+ else
147
+ file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
148
+ end
149
+ end
150
+ end
151
+ puts "file_list_curated: " + file_list_curated.count.to_s
152
+ file_list_curated
153
+ end
154
+
155
+
130
156
  def get_file_list_by_timestamp
131
- file_list_curated = get_file_list_curated
132
- file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
133
- file_list_curated.map do |file_remote_info|
134
- file_remote_info[1][:file_id] = file_remote_info[0]
135
- file_remote_info[1]
157
+ if @all_timestamps
158
+ file_list_curated = get_file_list_all_timestamps
159
+ file_list_curated.map do |file_remote_info|
160
+ file_remote_info[1][:file_id] = file_remote_info[0]
161
+ file_remote_info[1]
162
+ end
163
+ else
164
+ file_list_curated = get_file_list_curated
165
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
166
+ file_list_curated.map do |file_remote_info|
167
+ file_remote_info[1][:file_id] = file_remote_info[0]
168
+ file_remote_info[1]
169
+ end
136
170
  end
137
171
  end
138
172
 
139
173
  def list_files
140
174
  # retrieval produces its own output
175
+ @orig_stdout = $stdout
176
+ $stdout = $stderr
141
177
  files = get_file_list_by_timestamp
178
+ $stdout = @orig_stdout
142
179
  puts "["
143
- files.each do |file|
180
+ files[0...-1].each do |file|
144
181
  puts file.to_json + ","
145
182
  end
183
+ puts files[-1].to_json
146
184
  puts "]"
147
185
  end
148
186
 
@@ -222,6 +260,7 @@ class WaybackMachineDownloader
222
260
  file_path = backup_path + file_path_elements[0..-1].join('/')
223
261
  end
224
262
  if Gem.win_platform?
263
+ dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
225
264
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
226
265
  end
227
266
  unless File.exist? file_path
@@ -229,7 +268,7 @@ class WaybackMachineDownloader
229
268
  structure_dir_path dir_path
230
269
  open(file_path, "wb") do |file|
231
270
  begin
232
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
271
+ URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
233
272
  file.write(uri.read)
234
273
  end
235
274
  rescue OpenURI::HTTPError => e
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-12 00:00:00.000000000 Z
11
+ date: 2021-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
57
57
  licenses:
58
58
  - MIT
59
59
  metadata: {}
60
- post_install_message:
60
+ post_install_message:
61
61
  rdoc_options: []
62
62
  require_paths:
63
63
  - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  requirements: []
75
- rubyforge_project:
76
- rubygems_version: 2.5.2
77
- signing_key:
75
+ rubygems_version: 3.1.4
76
+ signing_key:
78
77
  specification_version: 4
79
78
  summary: Download an entire website from the Wayback Machine.
80
79
  test_files: []