wayback_machine_downloader 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 73c0ec7ba53bc62a4ef801970325bcc56144e04d
4
- data.tar.gz: 9096da258417570d757decce4ca2a847f39febac
3
+ metadata.gz: 6dd8b2f20e56a00a8601a4e4f9ecd9e4305bcce7
4
+ data.tar.gz: 3274121f9abb86d37fc9820bb8465f608ec78366
5
5
  SHA512:
6
- metadata.gz: 27407c32b02bfc2d1ffb267c6b654552a37fa17bb0605605bae4a2dd0fe89573a1e0e092ee3bc00faad97895d0e31277c7db0e675ea846c4e8be908d0d3b317f
7
- data.tar.gz: 2155e4e85e4192b252c5b92b46237b824b34c6b905d1446586452163255687fc946e1e433cf03c764b6df0c71065be302f742c5a0b631ed4ef748c6ed1cc2d77
6
+ metadata.gz: 739d5f8a16e0a0d8f6ba81ff6eddffb9bd7d9337e02bd9215cce2e31a033e67037a0bf3678192f4872e891f52dff240565a0b8e202b92ad38c1d6d6ca3940878
7
+ data.tar.gz: 98f4387c00da2c5c827e7cf9b2f63d51a0c29f98f7b5bf2a2f30c36552e524294f824a74bc9124f6b1d7ed098f6a7afcc6508de68e1cc6249fa8ff737a36a441
@@ -38,6 +38,10 @@ option_parser = OptionParser.new do |opts|
38
38
  options[:list] = true
39
39
  end
40
40
 
41
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
42
+ options[:threads_count] = t
43
+ end
44
+
41
45
  opts.on("-v", "--version", "Display version") do |t|
42
46
  options[:version] = t
43
47
  end
@@ -9,9 +9,9 @@ require_relative 'wayback_machine_downloader/to_regex'
9
9
 
10
10
  class WaybackMachineDownloader
11
11
 
12
- VERSION = "0.4.9"
12
+ VERSION = "0.5.0"
13
13
 
14
- attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list
14
+ attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
15
15
 
16
16
  def initialize params
17
17
  @base_url = params[:base_url]
@@ -21,6 +21,7 @@ class WaybackMachineDownloader
21
21
  @exclude_filter = params[:exclude_filter]
22
22
  @all = params[:all]
23
23
  @list = params[:list]
24
+ @threads_count = params[:threads_count].to_i
24
25
  end
25
26
 
26
27
  def backup_name
@@ -121,72 +122,37 @@ class WaybackMachineDownloader
121
122
  end
122
123
 
123
124
  def download_files
125
+ start_time = Time.now
124
126
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
125
127
  puts
126
- file_list_by_timestamp = get_file_list_by_timestamp
128
+
127
129
  if file_list_by_timestamp.count == 0
128
130
  puts "No files to download."
129
131
  puts "Possible reasons:"
130
132
  puts "\t* Site is not in Wayback Machine Archive."
131
- puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
132
- puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
133
+ puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
134
+ puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
133
135
  puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
134
136
  puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
135
137
  return
136
138
  end
137
- count = 0
138
- file_list_by_timestamp.each do |file_remote_info|
139
- count += 1
140
- file_url = file_remote_info[:file_url]
141
- file_id = file_remote_info[:file_id]
142
- file_timestamp = file_remote_info[:timestamp]
143
- file_path_elements = file_id.split('/')
144
- if file_id == ""
145
- dir_path = backup_path
146
- file_path = backup_path + 'index.html'
147
- elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
148
- dir_path = backup_path + file_path_elements[0..-1].join('/')
149
- file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
150
- else
151
- dir_path = backup_path + file_path_elements[0..-2].join('/')
152
- file_path = backup_path + file_path_elements[0..-1].join('/')
153
- end
154
- if Gem.win_platform?
155
- file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
156
- end
157
- unless File.exists? file_path
158
- begin
159
- structure_dir_path dir_path
160
- open(file_path, "wb") do |file|
161
- begin
162
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
163
- file.write(uri.read)
164
- end
165
- rescue OpenURI::HTTPError => e
166
- puts "#{file_url} # #{e}"
167
- if @all
168
- file.write(e.io.read)
169
- puts "#{file_path} saved anyway."
170
- end
171
- rescue StandardError => e
172
- puts "#{file_url} # #{e}"
173
- end
174
- end
175
- rescue StandardError => e
176
- puts "#{file_url} # #{e}"
177
- ensure
178
- if not @all and File.exists?(file_path) and File.size(file_path) == 0
179
- File.delete(file_path)
180
- puts "#{file_path} was empty and was removed."
181
- end
139
+
140
+ threads = []
141
+ @processed_file_count = 0
142
+ @threads_count = 1 unless @threads_count != 0
143
+ @threads_count.times do
144
+ threads << Thread.new do
145
+ until file_queue.empty?
146
+ file_remote_info = file_queue.pop(true) rescue nil
147
+ download_file(file_remote_info) if file_remote_info
182
148
  end
183
- puts "#{file_url} -> #{file_path} (#{count}/#{file_list_by_timestamp.size})"
184
- else
185
- puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_by_timestamp.size})"
186
149
  end
187
150
  end
151
+
152
+ threads.each(&:join)
153
+ end_time = Time.now
188
154
  puts
189
- puts "Download complete, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
155
+ puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
190
156
  end
191
157
 
192
158
  def structure_dir_path dir_path
@@ -212,4 +178,71 @@ class WaybackMachineDownloader
212
178
  end
213
179
  end
214
180
 
181
+ def download_file file_remote_info
182
+ file_url = file_remote_info[:file_url]
183
+ file_id = file_remote_info[:file_id]
184
+ file_timestamp = file_remote_info[:timestamp]
185
+ file_path_elements = file_id.split('/')
186
+ if file_id == ""
187
+ dir_path = backup_path
188
+ file_path = backup_path + 'index.html'
189
+ elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
190
+ dir_path = backup_path + file_path_elements[0..-1].join('/')
191
+ file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
192
+ else
193
+ dir_path = backup_path + file_path_elements[0..-2].join('/')
194
+ file_path = backup_path + file_path_elements[0..-1].join('/')
195
+ end
196
+ if Gem.win_platform?
197
+ file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
198
+ end
199
+ unless File.exists? file_path
200
+ begin
201
+ structure_dir_path dir_path
202
+ open(file_path, "wb") do |file|
203
+ begin
204
+ open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
205
+ file.write(uri.read)
206
+ end
207
+ rescue OpenURI::HTTPError => e
208
+ puts "#{file_url} # #{e}"
209
+ if @all
210
+ file.write(e.io.read)
211
+ puts "#{file_path} saved anyway."
212
+ end
213
+ rescue StandardError => e
214
+ puts "#{file_url} # #{e}"
215
+ end
216
+ end
217
+ rescue StandardError => e
218
+ puts "#{file_url} # #{e}"
219
+ ensure
220
+ if not @all and File.exists?(file_path) and File.size(file_path) == 0
221
+ File.delete(file_path)
222
+ puts "#{file_path} was empty and was removed."
223
+ end
224
+ end
225
+ semaphore.synchronize do
226
+ @processed_file_count += 1
227
+ puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
228
+ end
229
+ else
230
+ semaphore.synchronize do
231
+ @processed_file_count += 1
232
+ puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
233
+ end
234
+ end
235
+ end
236
+
237
+ def file_queue
238
+ @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
239
+ end
240
+
241
+ def file_list_by_timestamp
242
+ @file_list_by_timestamp ||= get_file_list_by_timestamp
243
+ end
244
+
245
+ def semaphore
246
+ @semaphore ||= Mutex.new
247
+ end
215
248
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.9
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-16 00:00:00.000000000 Z
11
+ date: 2016-09-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake