wayback_machine_downloader 2.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 520f637efbb03d1e3ac87aadf1a937cc132f6c32
|
4
|
+
data.tar.gz: 90ec9079f5420153e1b7c149ce5aedb45bd4ba2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6009139fbad22b7e269582d905956147df5cec565e376cbf1bdbb1125bb906fdea31c6501b3dfd9fa1b35d730444c976bcf187b8c869e8ed7db146ea155ba8fb
|
7
|
+
data.tar.gz: b4cff49e64c3ec528b544184e1426b76d1284c7f97ee9c787f370cc305b652f328c53ac48d3fc9bd9c9405b26edceb7a888e7f4a8f061ee704608e95a0642108
|
@@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts|
|
|
26
26
|
options[:to_timestamp] = t
|
27
27
|
end
|
28
28
|
|
29
|
+
opts.on("-e", "--exact_url", String, "Download only the url provied and not the full site") do |t|
|
30
|
+
options[:only_filter] = t
|
31
|
+
end
|
32
|
+
|
29
33
|
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
|
30
34
|
options[:only_filter] = t
|
31
35
|
end
|
@@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts|
|
|
42
46
|
options[:threads_count] = t
|
43
47
|
end
|
44
48
|
|
45
|
-
opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page
|
49
|
+
opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
|
46
50
|
options[:maximum_pages] = t
|
47
51
|
end
|
48
52
|
|
49
|
-
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything
|
53
|
+
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
|
50
54
|
options[:list] = true
|
51
55
|
end
|
52
56
|
|
@@ -58,7 +62,7 @@ end.parse!
|
|
58
62
|
if (base_url = ARGV[-1])
|
59
63
|
options[:base_url] = base_url
|
60
64
|
wayback_machine_downloader = WaybackMachineDownloader.new options
|
61
|
-
if
|
65
|
+
if options[:list]
|
62
66
|
wayback_machine_downloader.list_files
|
63
67
|
else
|
64
68
|
wayback_machine_downloader.download_files
|
@@ -14,19 +14,21 @@ class WaybackMachineDownloader
|
|
14
14
|
|
15
15
|
include ArchiveAPI
|
16
16
|
|
17
|
-
VERSION = "2.0.0"
|
17
|
+
VERSION = "2.1.0"
|
18
18
|
|
19
|
-
attr_accessor :base_url, :
|
19
|
+
attr_accessor :base_url, :exact_url, :directory,
|
20
|
+
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
21
|
+
:all, :maximum_pages, :threads_count
|
20
22
|
|
21
23
|
def initialize params
|
22
24
|
@base_url = params[:base_url]
|
25
|
+
@exact_url = params[:exact_url]
|
23
26
|
@directory = params[:directory]
|
24
27
|
@from_timestamp = params[:from_timestamp].to_i
|
25
28
|
@to_timestamp = params[:to_timestamp].to_i
|
26
29
|
@only_filter = params[:only_filter]
|
27
30
|
@exclude_filter = params[:exclude_filter]
|
28
31
|
@all = params[:all]
|
29
|
-
@list = params[:list]
|
30
32
|
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
31
33
|
@threads_count = params[:threads_count].to_i
|
32
34
|
end
|
@@ -78,18 +80,19 @@ class WaybackMachineDownloader
|
|
78
80
|
end
|
79
81
|
|
80
82
|
def get_all_snapshots_to_consider
|
81
|
-
# Note: Passing a page index parameter allow us to get more snapshots,
|
83
|
+
# Note: Passing a page index parameter allow us to get more snapshots,
|
84
|
+
# but from a less fresh index
|
82
85
|
print "Getting snapshot pages"
|
83
86
|
snapshot_list_to_consider = ""
|
84
87
|
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
|
85
88
|
print "."
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
89
|
+
unless @exact_url
|
90
|
+
@maximum_pages.times do |page_index|
|
91
|
+
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
|
92
|
+
break if snapshot_list.empty?
|
93
|
+
snapshot_list_to_consider += snapshot_list
|
94
|
+
print "."
|
95
|
+
end
|
93
96
|
end
|
94
97
|
puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
|
95
98
|
puts
|
@@ -134,8 +137,10 @@ class WaybackMachineDownloader
|
|
134
137
|
end
|
135
138
|
|
136
139
|
def list_files
|
140
|
+
# retrieval produces its own output
|
141
|
+
files = get_file_list_by_timestamp
|
137
142
|
puts "["
|
138
|
-
|
143
|
+
files.each do |file|
|
139
144
|
puts file.to_json + ","
|
140
145
|
end
|
141
146
|
puts "]"
|
@@ -179,7 +184,7 @@ class WaybackMachineDownloader
|
|
179
184
|
|
180
185
|
def structure_dir_path dir_path
|
181
186
|
begin
|
182
|
-
FileUtils::mkdir_p dir_path unless File.
|
187
|
+
FileUtils::mkdir_p dir_path unless File.exist? dir_path
|
183
188
|
rescue Errno::EEXIST => e
|
184
189
|
error_to_string = e.to_s
|
185
190
|
puts "# #{error_to_string}"
|
@@ -219,7 +224,7 @@ class WaybackMachineDownloader
|
|
219
224
|
if Gem.win_platform?
|
220
225
|
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
221
226
|
end
|
222
|
-
unless File.
|
227
|
+
unless File.exist? file_path
|
223
228
|
begin
|
224
229
|
structure_dir_path dir_path
|
225
230
|
open(file_path, "wb") do |file|
|
@@ -240,7 +245,7 @@ class WaybackMachineDownloader
|
|
240
245
|
rescue StandardError => e
|
241
246
|
puts "#{file_url} # #{e}"
|
242
247
|
ensure
|
243
|
-
if not @all and File.
|
248
|
+
if not @all and File.exist?(file_path) and File.size(file_path) == 0
|
244
249
|
File.delete(file_path)
|
245
250
|
puts "#{file_path} was empty and was removed."
|
246
251
|
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
module ArchiveAPI
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
def get_raw_list_from_api url, page_index
|
4
|
+
request_url = "http://web.archive.org/cdx/search/xd?url="
|
5
|
+
request_url += url
|
6
|
+
request_url += parameters_for_api page_index
|
7
7
|
|
8
8
|
open(request_url).read
|
9
|
-
|
9
|
+
end
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
def parameters_for_api page_index
|
12
|
+
parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
|
13
13
|
if @all
|
14
14
|
parameters += ""
|
15
15
|
else
|
@@ -60,7 +60,7 @@ module TibyBytes
|
|
60
60
|
bytes.each_index do |i|
|
61
61
|
|
62
62
|
byte = bytes[i]
|
63
|
-
|
63
|
+
_is_ascii = byte < 128
|
64
64
|
is_cont = byte > 127 && byte < 192
|
65
65
|
is_lead = byte > 191 && byte < 245
|
66
66
|
is_unused = byte > 240
|
@@ -78,7 +78,7 @@ module TibyBytes
|
|
78
78
|
# the leading byte.
|
79
79
|
begin
|
80
80
|
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
81
|
-
rescue NoMethodError
|
81
|
+
rescue NoMethodError
|
82
82
|
next
|
83
83
|
end
|
84
84
|
conts_expected = 0
|
@@ -98,7 +98,7 @@ module TibyBytes
|
|
98
98
|
end
|
99
99
|
begin
|
100
100
|
bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
101
|
-
rescue ArgumentError
|
101
|
+
rescue ArgumentError
|
102
102
|
nil
|
103
103
|
end
|
104
104
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-06-
|
11
|
+
date: 2017-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|