wayback_machine_downloader 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +87 -54
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6dd8b2f20e56a00a8601a4e4f9ecd9e4305bcce7
|
4
|
+
data.tar.gz: 3274121f9abb86d37fc9820bb8465f608ec78366
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 739d5f8a16e0a0d8f6ba81ff6eddffb9bd7d9337e02bd9215cce2e31a033e67037a0bf3678192f4872e891f52dff240565a0b8e202b92ad38c1d6d6ca3940878
|
7
|
+
data.tar.gz: 98f4387c00da2c5c827e7cf9b2f63d51a0c29f98f7b5bf2a2f30c36552e524294f824a74bc9124f6b1d7ed098f6a7afcc6508de68e1cc6249fa8ff737a36a441
|
@@ -38,6 +38,10 @@ option_parser = OptionParser.new do |opts|
|
|
38
38
|
options[:list] = true
|
39
39
|
end
|
40
40
|
|
41
|
+
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
|
42
|
+
options[:threads_count] = t
|
43
|
+
end
|
44
|
+
|
41
45
|
opts.on("-v", "--version", "Display version") do |t|
|
42
46
|
options[:version] = t
|
43
47
|
end
|
@@ -9,9 +9,9 @@ require_relative 'wayback_machine_downloader/to_regex'
|
|
9
9
|
|
10
10
|
class WaybackMachineDownloader
|
11
11
|
|
12
|
-
VERSION = "0.4.9"
|
12
|
+
VERSION = "0.5.0"
|
13
13
|
|
14
|
-
attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list
|
14
|
+
attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
|
15
15
|
|
16
16
|
def initialize params
|
17
17
|
@base_url = params[:base_url]
|
@@ -21,6 +21,7 @@ class WaybackMachineDownloader
|
|
21
21
|
@exclude_filter = params[:exclude_filter]
|
22
22
|
@all = params[:all]
|
23
23
|
@list = params[:list]
|
24
|
+
@threads_count = params[:threads_count].to_i
|
24
25
|
end
|
25
26
|
|
26
27
|
def backup_name
|
@@ -121,72 +122,37 @@ class WaybackMachineDownloader
|
|
121
122
|
end
|
122
123
|
|
123
124
|
def download_files
|
125
|
+
start_time = Time.now
|
124
126
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
|
125
127
|
puts
|
126
|
-
|
128
|
+
|
127
129
|
if file_list_by_timestamp.count == 0
|
128
130
|
puts "No files to download."
|
129
131
|
puts "Possible reasons:"
|
130
132
|
puts "\t* Site is not in Wayback Machine Archive."
|
131
|
-
puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
|
132
|
-
puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
|
133
|
+
puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
|
134
|
+
puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
|
133
135
|
puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
|
134
136
|
puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
|
135
137
|
return
|
136
138
|
end
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
file_path = backup_path + 'index.html'
|
147
|
-
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
148
|
-
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
149
|
-
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
150
|
-
else
|
151
|
-
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
152
|
-
file_path = backup_path + file_path_elements[0..-1].join('/')
|
153
|
-
end
|
154
|
-
if Gem.win_platform?
|
155
|
-
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
156
|
-
end
|
157
|
-
unless File.exists? file_path
|
158
|
-
begin
|
159
|
-
structure_dir_path dir_path
|
160
|
-
open(file_path, "wb") do |file|
|
161
|
-
begin
|
162
|
-
open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
|
163
|
-
file.write(uri.read)
|
164
|
-
end
|
165
|
-
rescue OpenURI::HTTPError => e
|
166
|
-
puts "#{file_url} # #{e}"
|
167
|
-
if @all
|
168
|
-
file.write(e.io.read)
|
169
|
-
puts "#{file_path} saved anyway."
|
170
|
-
end
|
171
|
-
rescue StandardError => e
|
172
|
-
puts "#{file_url} # #{e}"
|
173
|
-
end
|
174
|
-
end
|
175
|
-
rescue StandardError => e
|
176
|
-
puts "#{file_url} # #{e}"
|
177
|
-
ensure
|
178
|
-
if not @all and File.exists?(file_path) and File.size(file_path) == 0
|
179
|
-
File.delete(file_path)
|
180
|
-
puts "#{file_path} was empty and was removed."
|
181
|
-
end
|
139
|
+
|
140
|
+
threads = []
|
141
|
+
@processed_file_count = 0
|
142
|
+
@threads_count = 1 unless @threads_count != 0
|
143
|
+
@threads_count.times do
|
144
|
+
threads << Thread.new do
|
145
|
+
until file_queue.empty?
|
146
|
+
file_remote_info = file_queue.pop(true) rescue nil
|
147
|
+
download_file(file_remote_info) if file_remote_info
|
182
148
|
end
|
183
|
-
puts "#{file_url} -> #{file_path} (#{count}/#{file_list_by_timestamp.size})"
|
184
|
-
else
|
185
|
-
puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_by_timestamp.size})"
|
186
149
|
end
|
187
150
|
end
|
151
|
+
|
152
|
+
threads.each(&:join)
|
153
|
+
end_time = Time.now
|
188
154
|
puts
|
189
|
-
puts "Download
|
155
|
+
puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
|
190
156
|
end
|
191
157
|
|
192
158
|
def structure_dir_path dir_path
|
@@ -212,4 +178,71 @@ class WaybackMachineDownloader
|
|
212
178
|
end
|
213
179
|
end
|
214
180
|
|
181
|
+
def download_file file_remote_info
|
182
|
+
file_url = file_remote_info[:file_url]
|
183
|
+
file_id = file_remote_info[:file_id]
|
184
|
+
file_timestamp = file_remote_info[:timestamp]
|
185
|
+
file_path_elements = file_id.split('/')
|
186
|
+
if file_id == ""
|
187
|
+
dir_path = backup_path
|
188
|
+
file_path = backup_path + 'index.html'
|
189
|
+
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
190
|
+
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
191
|
+
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
192
|
+
else
|
193
|
+
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
194
|
+
file_path = backup_path + file_path_elements[0..-1].join('/')
|
195
|
+
end
|
196
|
+
if Gem.win_platform?
|
197
|
+
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
198
|
+
end
|
199
|
+
unless File.exists? file_path
|
200
|
+
begin
|
201
|
+
structure_dir_path dir_path
|
202
|
+
open(file_path, "wb") do |file|
|
203
|
+
begin
|
204
|
+
open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
|
205
|
+
file.write(uri.read)
|
206
|
+
end
|
207
|
+
rescue OpenURI::HTTPError => e
|
208
|
+
puts "#{file_url} # #{e}"
|
209
|
+
if @all
|
210
|
+
file.write(e.io.read)
|
211
|
+
puts "#{file_path} saved anyway."
|
212
|
+
end
|
213
|
+
rescue StandardError => e
|
214
|
+
puts "#{file_url} # #{e}"
|
215
|
+
end
|
216
|
+
end
|
217
|
+
rescue StandardError => e
|
218
|
+
puts "#{file_url} # #{e}"
|
219
|
+
ensure
|
220
|
+
if not @all and File.exists?(file_path) and File.size(file_path) == 0
|
221
|
+
File.delete(file_path)
|
222
|
+
puts "#{file_path} was empty and was removed."
|
223
|
+
end
|
224
|
+
end
|
225
|
+
semaphore.synchronize do
|
226
|
+
@processed_file_count += 1
|
227
|
+
puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
|
228
|
+
end
|
229
|
+
else
|
230
|
+
semaphore.synchronize do
|
231
|
+
@processed_file_count += 1
|
232
|
+
puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
def file_queue
|
238
|
+
@file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
|
239
|
+
end
|
240
|
+
|
241
|
+
def file_list_by_timestamp
|
242
|
+
@file_list_by_timestamp ||= get_file_list_by_timestamp
|
243
|
+
end
|
244
|
+
|
245
|
+
def semaphore
|
246
|
+
@semaphore ||= Mutex.new
|
247
|
+
end
|
215
248
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.9
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|