wayback_machine_downloader 2.2.1 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d037bdcdc516a9366f9d6181d63e61970f3a2ec1
4
- data.tar.gz: 03eef551fbb7be1d6dfb29f05c380e517b2870f8
2
+ SHA256:
3
+ metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
4
+ data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
5
5
  SHA512:
6
- metadata.gz: ee25c7e833907143a08d9b6438482d1c9bc76219eb62b4ac179ef770c323b60c3956bb7c1c1bd33ce0e7f5b3cda620fda4c06258837066762541b4404fd4d2cc
7
- data.tar.gz: cffca8734ae0ee449b35aae26f77032cd27bd749bbc34e925a64ea51f52a8a55c9b7fe85c5a3c2a8787715fba15f5c4138a04d4499ab170a0404d4fb0c2624d2
6
+ metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
7
+ data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
@@ -46,7 +46,7 @@ option_parser = OptionParser.new do |opts|
46
46
  options[:all] = true
47
47
  end
48
48
 
49
- opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
49
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
50
50
  options[:threads_count] = t
51
51
  end
52
52
 
@@ -14,7 +14,7 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.2.1"
17
+ VERSION = "2.3.0"
18
18
 
19
19
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
20
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
@@ -84,7 +84,7 @@ class WaybackMachineDownloader
84
84
  # Note: Passing a page index parameter allow us to get more snapshots,
85
85
  # but from a less fresh index
86
86
  print "Getting snapshot pages"
87
- snapshot_list_to_consider = ""
87
+ snapshot_list_to_consider = []
88
88
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
89
89
  print "."
90
90
  unless @exact_url
@@ -95,17 +95,15 @@ class WaybackMachineDownloader
95
95
  print "."
96
96
  end
97
97
  end
98
- puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
98
+ puts " found #{snapshot_list_to_consider.length} snaphots to consider."
99
99
  puts
100
100
  snapshot_list_to_consider
101
101
  end
102
102
 
103
103
  def get_file_list_curated
104
104
  file_list_curated = Hash.new
105
- get_all_snapshots_to_consider.each_line do |line|
106
- next unless line.include?('/')
107
- file_timestamp = line[0..13].to_i
108
- file_url = line[15..-2]
105
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
106
+ next unless file_url.include?('/')
109
107
  file_id = file_url.split('/')[3..-1].join('/')
110
108
  file_id = CGI::unescape file_id
111
109
  file_id = file_id.tidy_bytes unless file_id == ""
@@ -130,10 +128,8 @@ class WaybackMachineDownloader
130
128
 
131
129
  def get_file_list_all_timestamps
132
130
  file_list_curated = Hash.new
133
- get_all_snapshots_to_consider.each_line do |line|
134
- next unless line.include?('/')
135
- file_timestamp = line[0..13].to_i
136
- file_url = line[15..-2]
131
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
132
+ next unless file_url.include?('/')
137
133
  file_id = file_url.split('/')[3..-1].join('/')
138
134
  file_id_and_timestamp = [file_timestamp, file_id].join('/')
139
135
  file_id_and_timestamp = CGI::unescape file_id_and_timestamp
@@ -176,11 +172,15 @@ class WaybackMachineDownloader
176
172
 
177
173
  def list_files
178
174
  # retrieval produces its own output
175
+ @orig_stdout = $stdout
176
+ $stdout = $stderr
179
177
  files = get_file_list_by_timestamp
178
+ $stdout = @orig_stdout
180
179
  puts "["
181
- files.each do |file|
180
+ files[0...-1].each do |file|
182
181
  puts file.to_json + ","
183
182
  end
183
+ puts files[-1].to_json
184
184
  puts "]"
185
185
  end
186
186
 
@@ -268,7 +268,7 @@ class WaybackMachineDownloader
268
268
  structure_dir_path dir_path
269
269
  open(file_path, "wb") do |file|
270
270
  begin
271
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
271
+ URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
272
272
  file.write(uri.read)
273
273
  end
274
274
  rescue OpenURI::HTTPError => e
@@ -1,28 +1,38 @@
1
+ require 'json'
2
+ require 'uri'
3
+
1
4
  module ArchiveAPI
2
5
 
3
6
  def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
7
+ request_url = URI("https://web.archive.org/cdx/search/xd")
8
+ params = [["output", "json"], ["url", url]]
9
+ params += parameters_for_api page_index
10
+ request_url.query = URI.encode_www_form(params)
7
11
 
8
- open(request_url).read
12
+ begin
13
+ json = JSON.parse(URI(request_url).open.read)
14
+ if (json[0] <=> ["timestamp","original"]) == 0
15
+ json.shift
16
+ end
17
+ json
18
+ rescue JSON::ParserError
19
+ []
20
+ end
9
21
  end
10
22
 
11
23
  def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
- if @all
14
- parameters += ""
15
- else
16
- parameters += "&filter=statuscode:200"
24
+ parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
25
+ if !@all
26
+ parameters.push(["filter", "statuscode:200"])
17
27
  end
18
28
  if @from_timestamp and @from_timestamp != 0
19
- parameters += "&from=" + @from_timestamp.to_s
29
+ parameters.push(["from", @from_timestamp.to_s])
20
30
  end
21
31
  if @to_timestamp and @to_timestamp != 0
22
- parameters += "&to=" + @to_timestamp.to_s
32
+ parameters.push(["to", @to_timestamp.to_s])
23
33
  end
24
34
  if page_index
25
- parameters += "&page=#{page_index}"
35
+ parameters.push(["page", page_index])
26
36
  end
27
37
  parameters
28
38
  end
@@ -70,7 +70,7 @@ module TibyBytes
70
70
  if is_unused || is_restricted
71
71
  bytes[i] = tidy_byte(byte)
72
72
  elsif is_cont
73
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
73
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
74
74
  conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
75
75
  else
76
76
  if conts_expected > 0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-27 00:00:00.000000000 Z
11
+ date: 2021-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
57
57
  licenses:
58
58
  - MIT
59
59
  metadata: {}
60
- post_install_message:
60
+ post_install_message:
61
61
  rdoc_options: []
62
62
  require_paths:
63
63
  - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  requirements: []
75
- rubyforge_project:
76
- rubygems_version: 2.5.2
77
- signing_key:
75
+ rubygems_version: 3.1.4
76
+ signing_key:
78
77
  specification_version: 4
79
78
  summary: Download an entire website from the Wayback Machine.
80
79
  test_files: []