wayback_machine_downloader 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +87 -54
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6dd8b2f20e56a00a8601a4e4f9ecd9e4305bcce7
|
4
|
+
data.tar.gz: 3274121f9abb86d37fc9820bb8465f608ec78366
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 739d5f8a16e0a0d8f6ba81ff6eddffb9bd7d9337e02bd9215cce2e31a033e67037a0bf3678192f4872e891f52dff240565a0b8e202b92ad38c1d6d6ca3940878
|
7
|
+
data.tar.gz: 98f4387c00da2c5c827e7cf9b2f63d51a0c29f98f7b5bf2a2f30c36552e524294f824a74bc9124f6b1d7ed098f6a7afcc6508de68e1cc6249fa8ff737a36a441
|
data/bin/wayback_machine_downloader
CHANGED
@@ -38,6 +38,10 @@ option_parser = OptionParser.new do |opts|
|
|
38
38
|
options[:list] = true
|
39
39
|
end
|
40
40
|
|
41
|
+
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
|
42
|
+
options[:threads_count] = t
|
43
|
+
end
|
44
|
+
|
41
45
|
opts.on("-v", "--version", "Display version") do |t|
|
42
46
|
options[:version] = t
|
43
47
|
end
|
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -9,9 +9,9 @@ require_relative 'wayback_machine_downloader/to_regex'
|
|
9
9
|
|
10
10
|
class WaybackMachineDownloader
|
11
11
|
|
12
|
-
VERSION = "0.4.9"
|
12
|
+
VERSION = "0.5.0"
|
13
13
|
|
14
|
-
attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list
|
14
|
+
attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
|
15
15
|
|
16
16
|
def initialize params
|
17
17
|
@base_url = params[:base_url]
|
@@ -21,6 +21,7 @@ class WaybackMachineDownloader
|
|
21
21
|
@exclude_filter = params[:exclude_filter]
|
22
22
|
@all = params[:all]
|
23
23
|
@list = params[:list]
|
24
|
+
@threads_count = params[:threads_count].to_i
|
24
25
|
end
|
25
26
|
|
26
27
|
def backup_name
|
@@ -121,72 +122,37 @@ class WaybackMachineDownloader
|
|
121
122
|
end
|
122
123
|
|
123
124
|
def download_files
|
125
|
+
start_time = Time.now
|
124
126
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
|
125
127
|
puts
|
126
|
-
|
128
|
+
|
127
129
|
if file_list_by_timestamp.count == 0
|
128
130
|
puts "No files to download."
|
129
131
|
puts "Possible reasons:"
|
130
132
|
puts "\t* Site is not in Wayback Machine Archive."
|
131
|
-
puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
|
132
|
-
puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
|
133
|
+
puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
|
134
|
+
puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
|
133
135
|
puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
|
134
136
|
puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
|
135
137
|
return
|
136
138
|
end
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
file_path = backup_path + 'index.html'
|
147
|
-
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
148
|
-
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
149
|
-
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
150
|
-
else
|
151
|
-
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
152
|
-
file_path = backup_path + file_path_elements[0..-1].join('/')
|
153
|
-
end
|
154
|
-
if Gem.win_platform?
|
155
|
-
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
156
|
-
end
|
157
|
-
unless File.exists? file_path
|
158
|
-
begin
|
159
|
-
structure_dir_path dir_path
|
160
|
-
open(file_path, "wb") do |file|
|
161
|
-
begin
|
162
|
-
open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
|
163
|
-
file.write(uri.read)
|
164
|
-
end
|
165
|
-
rescue OpenURI::HTTPError => e
|
166
|
-
puts "#{file_url} # #{e}"
|
167
|
-
if @all
|
168
|
-
file.write(e.io.read)
|
169
|
-
puts "#{file_path} saved anyway."
|
170
|
-
end
|
171
|
-
rescue StandardError => e
|
172
|
-
puts "#{file_url} # #{e}"
|
173
|
-
end
|
174
|
-
end
|
175
|
-
rescue StandardError => e
|
176
|
-
puts "#{file_url} # #{e}"
|
177
|
-
ensure
|
178
|
-
if not @all and File.exists?(file_path) and File.size(file_path) == 0
|
179
|
-
File.delete(file_path)
|
180
|
-
puts "#{file_path} was empty and was removed."
|
181
|
-
end
|
139
|
+
|
140
|
+
threads = []
|
141
|
+
@processed_file_count = 0
|
142
|
+
@threads_count = 1 unless @threads_count != 0
|
143
|
+
@threads_count.times do
|
144
|
+
threads << Thread.new do
|
145
|
+
until file_queue.empty?
|
146
|
+
file_remote_info = file_queue.pop(true) rescue nil
|
147
|
+
download_file(file_remote_info) if file_remote_info
|
182
148
|
end
|
183
|
-
puts "#{file_url} -> #{file_path} (#{count}/#{file_list_by_timestamp.size})"
|
184
|
-
else
|
185
|
-
puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_by_timestamp.size})"
|
186
149
|
end
|
187
150
|
end
|
151
|
+
|
152
|
+
threads.each(&:join)
|
153
|
+
end_time = Time.now
|
188
154
|
puts
|
189
|
-
puts "Download
|
155
|
+
puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
|
190
156
|
end
|
191
157
|
|
192
158
|
def structure_dir_path dir_path
|
@@ -212,4 +178,71 @@ class WaybackMachineDownloader
|
|
212
178
|
end
|
213
179
|
end
|
214
180
|
|
181
|
+
def download_file file_remote_info
|
182
|
+
file_url = file_remote_info[:file_url]
|
183
|
+
file_id = file_remote_info[:file_id]
|
184
|
+
file_timestamp = file_remote_info[:timestamp]
|
185
|
+
file_path_elements = file_id.split('/')
|
186
|
+
if file_id == ""
|
187
|
+
dir_path = backup_path
|
188
|
+
file_path = backup_path + 'index.html'
|
189
|
+
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
190
|
+
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
191
|
+
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
192
|
+
else
|
193
|
+
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
194
|
+
file_path = backup_path + file_path_elements[0..-1].join('/')
|
195
|
+
end
|
196
|
+
if Gem.win_platform?
|
197
|
+
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
198
|
+
end
|
199
|
+
unless File.exists? file_path
|
200
|
+
begin
|
201
|
+
structure_dir_path dir_path
|
202
|
+
open(file_path, "wb") do |file|
|
203
|
+
begin
|
204
|
+
open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
|
205
|
+
file.write(uri.read)
|
206
|
+
end
|
207
|
+
rescue OpenURI::HTTPError => e
|
208
|
+
puts "#{file_url} # #{e}"
|
209
|
+
if @all
|
210
|
+
file.write(e.io.read)
|
211
|
+
puts "#{file_path} saved anyway."
|
212
|
+
end
|
213
|
+
rescue StandardError => e
|
214
|
+
puts "#{file_url} # #{e}"
|
215
|
+
end
|
216
|
+
end
|
217
|
+
rescue StandardError => e
|
218
|
+
puts "#{file_url} # #{e}"
|
219
|
+
ensure
|
220
|
+
if not @all and File.exists?(file_path) and File.size(file_path) == 0
|
221
|
+
File.delete(file_path)
|
222
|
+
puts "#{file_path} was empty and was removed."
|
223
|
+
end
|
224
|
+
end
|
225
|
+
semaphore.synchronize do
|
226
|
+
@processed_file_count += 1
|
227
|
+
puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
|
228
|
+
end
|
229
|
+
else
|
230
|
+
semaphore.synchronize do
|
231
|
+
@processed_file_count += 1
|
232
|
+
puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
def file_queue
|
238
|
+
@file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
|
239
|
+
end
|
240
|
+
|
241
|
+
def file_list_by_timestamp
|
242
|
+
@file_list_by_timestamp ||= get_file_list_by_timestamp
|
243
|
+
end
|
244
|
+
|
245
|
+
def semaphore
|
246
|
+
@semaphore ||= Mutex.new
|
247
|
+
end
|
215
248
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.9
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|