wayback_machine_downloader 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 73c0ec7ba53bc62a4ef801970325bcc56144e04d
4
- data.tar.gz: 9096da258417570d757decce4ca2a847f39febac
3
+ metadata.gz: 6dd8b2f20e56a00a8601a4e4f9ecd9e4305bcce7
4
+ data.tar.gz: 3274121f9abb86d37fc9820bb8465f608ec78366
5
5
  SHA512:
6
- metadata.gz: 27407c32b02bfc2d1ffb267c6b654552a37fa17bb0605605bae4a2dd0fe89573a1e0e092ee3bc00faad97895d0e31277c7db0e675ea846c4e8be908d0d3b317f
7
- data.tar.gz: 2155e4e85e4192b252c5b92b46237b824b34c6b905d1446586452163255687fc946e1e433cf03c764b6df0c71065be302f742c5a0b631ed4ef748c6ed1cc2d77
6
+ metadata.gz: 739d5f8a16e0a0d8f6ba81ff6eddffb9bd7d9337e02bd9215cce2e31a033e67037a0bf3678192f4872e891f52dff240565a0b8e202b92ad38c1d6d6ca3940878
7
+ data.tar.gz: 98f4387c00da2c5c827e7cf9b2f63d51a0c29f98f7b5bf2a2f30c36552e524294f824a74bc9124f6b1d7ed098f6a7afcc6508de68e1cc6249fa8ff737a36a441
@@ -38,6 +38,10 @@ option_parser = OptionParser.new do |opts|
38
38
  options[:list] = true
39
39
  end
40
40
 
41
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time. Default is one file at a time. (ie. 20)") do |t|
42
+ options[:threads_count] = t
43
+ end
44
+
41
45
  opts.on("-v", "--version", "Display version") do |t|
42
46
  options[:version] = t
43
47
  end
@@ -9,9 +9,9 @@ require_relative 'wayback_machine_downloader/to_regex'
9
9
 
10
10
  class WaybackMachineDownloader
11
11
 
12
- VERSION = "0.4.9"
12
+ VERSION = "0.5.0"
13
13
 
14
- attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list
14
+ attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
15
15
 
16
16
  def initialize params
17
17
  @base_url = params[:base_url]
@@ -21,6 +21,7 @@ class WaybackMachineDownloader
21
21
  @exclude_filter = params[:exclude_filter]
22
22
  @all = params[:all]
23
23
  @list = params[:list]
24
+ @threads_count = params[:threads_count].to_i
24
25
  end
25
26
 
26
27
  def backup_name
@@ -121,72 +122,37 @@ class WaybackMachineDownloader
121
122
  end
122
123
 
123
124
  def download_files
125
+ start_time = Time.now
124
126
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
125
127
  puts
126
- file_list_by_timestamp = get_file_list_by_timestamp
128
+
127
129
  if file_list_by_timestamp.count == 0
128
130
  puts "No files to download."
129
131
  puts "Possible reasons:"
130
132
  puts "\t* Site is not in Wayback Machine Archive."
131
- puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
132
- puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
133
+ puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
134
+ puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
133
135
  puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
134
136
  puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
135
137
  return
136
138
  end
137
- count = 0
138
- file_list_by_timestamp.each do |file_remote_info|
139
- count += 1
140
- file_url = file_remote_info[:file_url]
141
- file_id = file_remote_info[:file_id]
142
- file_timestamp = file_remote_info[:timestamp]
143
- file_path_elements = file_id.split('/')
144
- if file_id == ""
145
- dir_path = backup_path
146
- file_path = backup_path + 'index.html'
147
- elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
148
- dir_path = backup_path + file_path_elements[0..-1].join('/')
149
- file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
150
- else
151
- dir_path = backup_path + file_path_elements[0..-2].join('/')
152
- file_path = backup_path + file_path_elements[0..-1].join('/')
153
- end
154
- if Gem.win_platform?
155
- file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
156
- end
157
- unless File.exists? file_path
158
- begin
159
- structure_dir_path dir_path
160
- open(file_path, "wb") do |file|
161
- begin
162
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
163
- file.write(uri.read)
164
- end
165
- rescue OpenURI::HTTPError => e
166
- puts "#{file_url} # #{e}"
167
- if @all
168
- file.write(e.io.read)
169
- puts "#{file_path} saved anyway."
170
- end
171
- rescue StandardError => e
172
- puts "#{file_url} # #{e}"
173
- end
174
- end
175
- rescue StandardError => e
176
- puts "#{file_url} # #{e}"
177
- ensure
178
- if not @all and File.exists?(file_path) and File.size(file_path) == 0
179
- File.delete(file_path)
180
- puts "#{file_path} was empty and was removed."
181
- end
139
+
140
+ threads = []
141
+ @processed_file_count = 0
142
+ @threads_count = 1 unless @threads_count != 0
143
+ @threads_count.times do
144
+ threads << Thread.new do
145
+ until file_queue.empty?
146
+ file_remote_info = file_queue.pop(true) rescue nil
147
+ download_file(file_remote_info) if file_remote_info
182
148
  end
183
- puts "#{file_url} -> #{file_path} (#{count}/#{file_list_by_timestamp.size})"
184
- else
185
- puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_by_timestamp.size})"
186
149
  end
187
150
  end
151
+
152
+ threads.each(&:join)
153
+ end_time = Time.now
188
154
  puts
189
- puts "Download complete, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
155
+ puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
190
156
  end
191
157
 
192
158
  def structure_dir_path dir_path
@@ -212,4 +178,71 @@ class WaybackMachineDownloader
212
178
  end
213
179
  end
214
180
 
181
+ def download_file file_remote_info
182
+ file_url = file_remote_info[:file_url]
183
+ file_id = file_remote_info[:file_id]
184
+ file_timestamp = file_remote_info[:timestamp]
185
+ file_path_elements = file_id.split('/')
186
+ if file_id == ""
187
+ dir_path = backup_path
188
+ file_path = backup_path + 'index.html'
189
+ elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
190
+ dir_path = backup_path + file_path_elements[0..-1].join('/')
191
+ file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
192
+ else
193
+ dir_path = backup_path + file_path_elements[0..-2].join('/')
194
+ file_path = backup_path + file_path_elements[0..-1].join('/')
195
+ end
196
+ if Gem.win_platform?
197
+ file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
198
+ end
199
+ unless File.exists? file_path
200
+ begin
201
+ structure_dir_path dir_path
202
+ open(file_path, "wb") do |file|
203
+ begin
204
+ open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
205
+ file.write(uri.read)
206
+ end
207
+ rescue OpenURI::HTTPError => e
208
+ puts "#{file_url} # #{e}"
209
+ if @all
210
+ file.write(e.io.read)
211
+ puts "#{file_path} saved anyway."
212
+ end
213
+ rescue StandardError => e
214
+ puts "#{file_url} # #{e}"
215
+ end
216
+ end
217
+ rescue StandardError => e
218
+ puts "#{file_url} # #{e}"
219
+ ensure
220
+ if not @all and File.exists?(file_path) and File.size(file_path) == 0
221
+ File.delete(file_path)
222
+ puts "#{file_path} was empty and was removed."
223
+ end
224
+ end
225
+ semaphore.synchronize do
226
+ @processed_file_count += 1
227
+ puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
228
+ end
229
+ else
230
+ semaphore.synchronize do
231
+ @processed_file_count += 1
232
+ puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
233
+ end
234
+ end
235
+ end
236
+
237
+ def file_queue
238
+ @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
239
+ end
240
+
241
+ def file_list_by_timestamp
242
+ @file_list_by_timestamp ||= get_file_list_by_timestamp
243
+ end
244
+
245
+ def semaphore
246
+ @semaphore ||= Mutex.new
247
+ end
215
248
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.9
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-16 00:00:00.000000000 Z
11
+ date: 2016-09-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake