webarchive 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74bc28f7e8b343b9ef2c558991d22ea8560a9fc29a5501fc0ee0514078df4d14
4
- data.tar.gz: 6ec0a915b37359d681caad36ddfdb6f2febce1cea43bfb8493b4702c611e67aa
3
+ metadata.gz: c9fc5065342461039496f77e140585d7023878acf1053ac820bc262fdf6917db
4
+ data.tar.gz: d093202d5cf905a359d7abb7af6cbf67a88898217493a7239eef5d63015ae831
5
5
  SHA512:
6
- metadata.gz: 5666adba0ad58e9168874a205dde99288e407a3a2260543c9aa4c6ebc23c04b5705f61d4926dba7bb219d3cd5cab17406732aef45ec50b5d1fd2f289c58a272e
7
- data.tar.gz: 7a5cf05ee6857a85e87508bd3c404be5a4e5cc7b3d1c8589cde4db8a374a3365f03c68fd52f0b9ac2e36301266f7f011c117a3ebd8d8539a04d9f1411e9ba766
6
+ metadata.gz: 3eedec1586a6a33d9a6a95b63dc98ef3b0fde2a000389743b807f2a074c74b05341b2870acc3d20ee7b994f3cfcbb931e9451f02d70a5645ec327bce6ac80679
7
+ data.tar.gz: 1c8b60d67cce9e228c8785a79df5bc056bf88141c7bf7dd7c83018aa0fc28d11ca101bfd78af4ea9289761797fee214dfa2831be7e6acf45a6d887356927c0ba
data/README.md CHANGED
@@ -1,9 +1,16 @@
1
1
  # Webarchive
2
2
 
3
- This is a CUI tool for sending URIs to public web archiving tools such as web.archive.org and archive.today.
3
+ This is a CUI tool for sending URIs to public web archiving tools such
4
+ as web.archive.org and archive.today.
4
5
 
5
6
  Requests are throttled.
6
7
 
8
+ ## Rationale
9
+
10
+ This tool's motivation is simple - increased availability by redundancy. Your favorite web archiving service might be down at some point in time, or blocked by certain websites. Use 2 or more services to archive something, and your archive will be safe if at least one of them remains available.
11
+
12
+ Browser extensions with similar functionality might exist, but this tool might be for you if you need to archive a large number of URLs, and, well, if you like a CUI.
13
+
7
14
  ## Installation
8
15
 
9
16
  Use this line to install it:
@@ -22,25 +29,36 @@ If you have a list of URIs in a file, use pipe.
22
29
 
23
30
  $ cat list.txt | webarchive
24
31
 
32
+ Note that, by default, this program logs all the URIs you enter into
33
+ `~/.webarchive_history`.
34
+
25
35
  It has optional command-line parameters:
26
36
 
27
37
  $ webarchive -h
28
-
38
+
29
39
  Usage: webarchive [options]
30
- -w, --wait=SECS wait for SECS before sending a request
40
+ -w, --wait=SECONDS wait for SECONDS between requests [default: 5.0]
41
+ -r, --retry=N retry for N times when failed [default: 5]
42
+ --[no-]history record history [default: enabled]
31
43
  -d, --debug add debug output, implies verbose
32
44
  --verbose
33
- --version
45
+ -h, --help show help
34
46
 
35
47
  ## Development
36
48
 
37
- After checking out the repo, run `bundle install` to install dependencies. Then, run `rake spec` to run the tests.
49
+ After checking out the repo, run `bundle install` to install
50
+ dependencies. Then, run `rake spec` to run the tests.
38
51
 
39
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
52
+ To install this gem onto your local machine, run `bundle exec rake
53
+ install`. To release a new version, update the version number in
54
+ `version.rb`, and then run `bundle exec rake release`, which will
55
+ create a git tag for the version, push git commits and tags, and push
56
+ the `.gem` file to [rubygems.org](https://rubygems.org).
40
57
 
41
58
  ## Contributing
42
59
 
43
- Bug reports and pull requests are welcome at https://gitlab.com/yusuke.matsubara/webarchive.
60
+ Bug reports and pull requests are welcome at
61
+ https://gitlab.com/yusuke.matsubara/webarchive.
44
62
 
45
63
  ## License
46
64
 
data/bin/webarchive CHANGED
@@ -1,18 +1,34 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'webarchive'
4
5
  require 'webarchive/version'
5
6
  require 'optparse'
6
7
 
7
- wait = 4.0
8
+ wait = 5.0
9
+ retry_ = 5
8
10
  debug = false
9
11
  verbose = false
10
12
  history = true
11
13
  Version = WebArchive::VERSION
14
+
12
15
  OptionParser.new do |opt|
13
- opt.on('-w', '--wait=SECS', 'wait for SECS before sending a request') { |v| wait = v.to_f }
14
- opt.on('-d', '--debug', 'add debug output, implies verbose') { debug = true }
15
- opt.on('--[no-]history', 'record history (enabled by default)') { |v| history = v }
16
+ opt.on('-w', '--wait=SECONDS',
17
+ "wait for SECONDS between requests [default: #{wait}]") do |v|
18
+ wait = v.to_f
19
+ end
20
+ opt.on('-r', '--retry=N',
21
+ "retry for N times when failed [default: #{retry_}]") do |v|
22
+ retry_ = v.to_i
23
+ end
24
+ opt.on('--[no-]history',
25
+ "record history [default: #{history ? 'enabled' : 'disabled'}]") do |v|
26
+ history = v
27
+ end
28
+ opt.on('-d', '--debug', 'add debug output, implies verbose') do
29
+ debug = true
30
+ verbose = true
31
+ end
16
32
  opt.on('--verbose') { verbose = true }
17
33
  opt.on('-h', '--help', 'show help') do
18
34
  puts opt
@@ -20,5 +36,9 @@ OptionParser.new do |opt|
20
36
  end
21
37
  end.parse!(ARGV)
22
38
 
23
- warn "#{WebArchive} #{WebArchive::VERSION}" if verbose
24
- WebArchive.launch(wait_secs: wait, debug: debug, verbose: verbose, history: history)
39
+ tagline = "#{WebArchive} #{WebArchive::VERSION}"
40
+ tagline += ' (debug)' if debug
41
+ warn tagline if verbose
42
+
43
+ WebArchive.launch(wait_secs: wait, max_retry: retry_, history: history,
44
+ debug: debug, verbose: verbose)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module WebArchive
2
- VERSION = '0.1.3'
4
+ VERSION = '0.1.4'
3
5
  end
data/lib/webarchive.rb CHANGED
@@ -1,6 +1,11 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
- # (this file is also an executable - see the bottom)
4
+ # This tool allows you to call multiple web archiving services. It
5
+ # works interactively from the command line. Internally, it maintains a
6
+ # throttled queue for each service provider supported.
7
+
8
+ # (This file itself is executable without installation - see the bottom.)
4
9
 
5
10
  require 'open-uri'
6
11
  require 'readline'
@@ -8,22 +13,40 @@ require 'tempfile'
8
13
  require 'simpleidn'
9
14
  require 'net/http'
10
15
  require 'addressable/uri'
16
+ require 'digest/sha2'
11
17
  require 'mechanize'
12
18
  require 'trie'
19
+ require 'concurrent'
13
20
 
14
21
  # classes and functions of webarchive package
15
22
  module WebArchive
23
+ Req = Struct.new("Req", :uri, :wait, :max_retry)
24
+
25
+ # error class for unexpected response from archiving service
26
+ class UnexpectedResponseError < StandardError
27
+ def initialize(cause = nil)
28
+ super(cause)
29
+ end
30
+ end
31
+
32
+ # error class for when redirect/canonical uri is not found
33
+ class NoAlternativeURIError < StandardError
34
+ def initialize(cause = nil)
35
+ super(cause)
36
+ end
37
+ end
38
+
16
39
  begin
17
40
  require 'libnotify'
18
- def self.warn_archive_fail(uri, archiver, body)
19
- warn "Not archived: #{uri} by #{archiver}; #{body}"
20
- Libnotify.show(summary: "Not archived: #{uri} by #{archiver}",
41
+ def self.warn_archive_fail(req, archiver, body)
42
+ warn "Not archived: #{req} by #{archiver}; #{body}"
43
+ Libnotify.show(summary: "Not archived: #{req} by #{archiver}",
21
44
  body: body, timeout: 3)
22
45
  end
23
46
  rescue LoadError
24
- unless defined? warn_archive_fail
25
- def self.warn_archive_fail(uri, archiver, body)
26
- warn "Not archived: #{uri} by #{archiver}; #{body}"
47
+ unless respond_to? :warn_archive_fail
48
+ def self.warn_archive_fail(req, archiver, body)
49
+ warn "Not archived: #{req} by #{archiver}; #{body}"
27
50
  end
28
51
  end
29
52
  end
@@ -31,44 +54,183 @@ module WebArchive
31
54
  # Queue for sending URLs to a certain archiving web site
32
55
  # The block given to constructor will be executed for each '<<'
33
56
  class ArchiveQueue < Queue
34
- def initialize(name, wait)
57
+ # Create a new instance of ArchiveQueue
58
+ # @param name [String] name of the queue
59
+ # @param interval [Float] length of the wait between requests
60
+ # @yield [String] URI that the queue receives
61
+ def initialize(name, interval)
35
62
  super()
36
63
  @name = name
64
+ @interval = interval
37
65
  @all_sent = false
38
- @in_process = 0 # always <= 1
66
+ @in_process = Concurrent::AtomicFixnum.new(0)
67
+ last_request_time = Time.now - interval
39
68
  @consumer = Thread.new do
40
69
  loop do
41
- uri = self.pop
42
- @in_process += 1
70
+ req = self.deq # deq blocks until non-empty
71
+ @in_process.value += 1
43
72
  begin
44
- yield uri
73
+ sleep time_until_next_req(last_request_time, Time.now)
74
+ last_request_time = Time.now
75
+ yield req.uri
45
76
  rescue StandardError => e
46
- WebArchive.warn_archive_fail(
47
- uri, name, ([e.inspect] + e.backtrace).join("\n")
48
- )
77
+ if retry?(e) && req.max_retry.positive?
78
+ buff = [].tap { |a| a << self.deq until self.empty? }
79
+ @in_process.value += buff.size + 1
80
+ Concurrent::ScheduledTask.execute(req.wait) do
81
+ @in_process.value -= buff.size + 1
82
+ self.enq Req.new(req.uri, req.wait * 2, req.max_retry - 1)
83
+ buff.each { |x| self.enq x }
84
+ end
85
+ else
86
+ WebArchive.warn_archive_fail(
87
+ req.uri, name, ([e.inspect] + e.backtrace).join("\n")
88
+ )
89
+ end
49
90
  ensure
50
- @in_process -= 1
51
- break if @all_sent && self.empty?
52
-
53
- sleep wait
91
+ @in_process.value -= 1
92
+ break if @all_sent && self.remaining.zero?
54
93
  end
55
94
  end
56
95
  end
57
96
  end
58
97
 
98
+ def time_until_next_req(last_req, current)
99
+ elapsed = [current - last_req, 0].max
100
+ [@interval - elapsed, 0].max
101
+ end
102
+
103
+ # @param exc [Exception]
104
+ # @return [Boolean]
105
+ def retry?(exc)
106
+ [
107
+ Errno::ECONNRESET,
108
+ Errno::EHOSTUNREACH
109
+ ].include?(exc.class) ||
110
+ (exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('429 ')) ||
111
+ (exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('502 ')) ||
112
+ (exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('503 ')) ||
113
+ (exc.is_a?(Mechanize::ResponseCodeError) && exc.response_code == '503')
114
+ end
115
+
59
116
  # mark as 'sending done' and wait for items to be processed
117
+ # @return [Thread, nil] the joined consumer thread, or nil if nothing remained
60
118
  def done_sending
61
119
  @all_sent = true
62
- @consumer.join if self.remaining > 0
120
+ @consumer.join if self.remaining.positive?
63
121
  end
64
122
 
65
123
  # number of queued items (including those being processed)
124
+ # @return [Integer]
66
125
  def remaining
67
- self.size + @in_process
126
+ self.size + @in_process.value
68
127
  end
69
128
  end
70
129
 
71
- def self.my_normalize(str)
130
+ # Client with multiple queues
131
+ class Client
132
+ def initialize(wait_secs: 1, max_retry: 3,
133
+ redirect: false, canonical_uri: true)
134
+ @wait_secs = wait_secs
135
+ @max_retry = max_retry
136
+ @redirect = redirect
137
+ @canonical_uri = canonical_uri
138
+
139
+ @wait_secs = 0 if @wait_secs.negative?
140
+ @max_retry = 0 if @max_retry.negative?
141
+ @queues = []
142
+ end
143
+
144
+ # @param queue [ArchiveQueue]
145
+ def add_queue(queue)
146
+ @queues << queue
147
+ end
148
+
149
+ def queued_uris
150
+ @queues.map(&:remaining).inject(:+)
151
+ end
152
+
153
+ # @param uri [String]
154
+ # @return [Concurrent::Promises::Future] Gives the target URI if redirected
155
+ def with_redirect(uri)
156
+ Concurrent::Promises.future do
157
+ res = Net::HTTP.get_response(Addressable::URI.parse(uri))
158
+ raise NoAlternativeURIError, 'no redirect found' if
159
+ !res['location'] || res['location'] == uri
160
+
161
+ res['location']
162
+ end
163
+ end
164
+
165
+ def add_scheme(uri, scheme)
166
+ if uri.relative?
167
+ uri = uri.dup
168
+ uri.scheme = scheme
169
+ end
170
+ uri
171
+ end
172
+
173
+ def equivalent_uri?(uri, str)
174
+ uri = add_scheme(uri, Addressable::URI.parse(str).scheme)
175
+ uri.to_s == str
176
+ end
177
+
178
+ # @param uri [String]
179
+ # @return [Concurrent::Promises::Future] Gives the canonical URI if there is one
180
+ def with_canonical_uri(uri)
181
+ Concurrent::Promises.future do
182
+ agent = Mechanize.new
183
+ page = agent.get(uri)
184
+ ret = nil
185
+ raise NoAlternativeURIError, 'no canonical URI found' unless
186
+ page.canonical_uri &&
187
+ page.class == Mechanize::Page &&
188
+ page.canonical_uri != page.uri
189
+
190
+ if page.canonical_uri.relative?
191
+ u2 = URI.join(page.uri, page.canonical_uri)
192
+ ret = u2.to_s if !equivalent_uri?(u2, uri) &&
193
+ !equivalent_uri?(u2, page.uri)
194
+ else
195
+ u1 = page.canonical_uri
196
+ u1 = add_scheme(u1, 'http') unless u1.scheme
197
+ ret = u1.to_s if !equivalent_uri?(u1, uri) &&
198
+ !equivalent_uri?(u1, page.uri)
199
+ end
200
+
201
+ raise NoAlternativeURIError, 'no canonical URI found' unless ret
202
+
203
+ ret
204
+ end
205
+ end
206
+
207
+ # @param uri [String]
208
+ # @return [void]
209
+ def send_single_uri(uri)
210
+ @queues.each do |q|
211
+ q.enq Req.new(uri, @wait_secs, @max_retry)
212
+ end
213
+ end
214
+
215
+ # @param uri [String]
216
+ # @return [Concurrent::Promises::Future]
217
+ def send_uri(uri)
218
+ f0 = Concurrent::Promises.future{ send_single_uri(uri) }
219
+ f1 = with_canonical_uri(uri).then { |x| send_single_uri(x) } if @canonical_uri
220
+ f2 = with_redirect(uri).then { |x| send_single_uri(x) } if @redirect
221
+ f1 ||= Concurrent::Promises.future{}
222
+ f2 ||= Concurrent::Promises.future{}
223
+ f0.zip(f1).zip(f2)
224
+ end
225
+
226
+ def wait_for_queues
227
+ @queues.each(&:done_sending)
228
+ end
229
+ end
230
+
231
+ # @param str [String]
232
+ # @return [String]
233
+ def self.encode_non_ascii(str)
72
234
  if str =~ /[^[:ascii:]]/
73
235
  Addressable::URI.encode(str)
74
236
  else
@@ -76,51 +238,58 @@ module WebArchive
76
238
  end
77
239
  end
78
240
 
79
- def self.to_ascii_uri(str)
80
- uri = str.strip
81
- if %r/\.[a-z]{2,4}(\/|$)/.match(uri) && !uri.include?('://') && !uri.start_with?('http')
82
- uri = 'http://' + uri
241
+ # @param uri [String]
242
+ # @return [String]
243
+ def self.prepend_http(uri)
244
+ if %r/\.[a-z]{2,4}(\/|$)/.match(uri) && %r{(^http|://)}.match(uri).nil?
245
+ 'http://' + uri
246
+ else
247
+ uri
83
248
  end
249
+ end
84
250
 
251
+ # Encode non-ASCII components in the given string and make a URI instance from it
252
+ # @param str [String]
253
+ # @return [Addressable::URI]
254
+ def self.to_ascii_uri(str)
255
+ uri = prepend_http(str.strip)
85
256
  u = Addressable::URI.parse(uri)
86
257
  u.host = SimpleIDN.to_ascii(u.host)
87
- u.path = my_normalize(u.path)
88
- u.query = my_normalize(u.query)
89
- u.fragment = my_normalize(u.fragment)
258
+ u.path, u.query, u.fragment = [
259
+ u.path, u.query, u.fragment
260
+ ].map(&method(:encode_non_ascii))
90
261
  u
91
262
  end
92
263
 
93
- def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
94
- if redirect
95
- res = Net::HTTP.get_response(URI.parse(uri))
96
- yield res['location'] if res['location'] &&
97
- res['location'] != uri
98
- end
99
- if canonical
100
- agent = Mechanize.new
101
- page = agent.get(uri)
102
- yield page.canonical_uri.to_s if page.class == Mechanize::Page &&
103
- page.canonical_uri &&
104
- page.canonical_uri.to_s != uri &&
105
- page.canonical_uri != page.uri
106
- end
107
- rescue Net::HTTPClientError, Mechanize::ResponseCodeError
108
- # ignore since it will cause a warning later anyway
264
+ # Write debug output to a temporary file
265
+ # @param source [String]
266
+ # @param content [String]
267
+ # @return [void]
268
+ def self.debug_output(source, uri, content)
269
+ ts = Time.now.strftime('%Y%m%d%H%M%S')
270
+ filename = "#{self}-#{source}-#{uri.gsub(/\W+/, '_')[0..30]}-"
271
+ filename += Digest::SHA256.hexdigest(uri + ts)[0..8]
272
+ Tempfile.open(filename) do |f|
273
+ f.puts content
274
+ end
109
275
  end
110
276
 
111
277
  # completer for URLs
112
278
  class Completer
279
+ # @param history_file [String] path to the history file
113
280
  def initialize(history_file)
114
- @file = File.expand_path(history_file)
281
+ @file = history_file
115
282
  @trie = Trie.new
116
- self.reload
283
+ reload!
117
284
  end
118
285
 
119
- def update
120
- self.reload if File.stat(@file).mtime > @lastupdate
286
+ # @return [void]
287
+ def update!
288
+ reload! if File.stat(@file).mtime > @lastupdate
121
289
  end
122
290
 
123
- def reload
291
+ # @return [void]
292
+ def reload!
124
293
  if File.exist? @file
125
294
  File.open(@file, encoding: 'utf-8').each_line do |x|
126
295
  @trie.add x.strip
@@ -131,13 +300,16 @@ module WebArchive
131
300
  @lastupdate = Time.now
132
301
  end
133
302
 
303
+ # @return [Proc]
134
304
  def to_proc
135
305
  proc do |s|
136
- self.update
306
+ update!
137
307
  @trie.children(s)
138
308
  end
139
309
  end
140
310
 
311
+ # @param str [String]
312
+ # @return [void]
141
313
  def append_to_history(str)
142
314
  File.open(@file, mode: 'a', encoding: 'utf-8') do |f|
143
315
  f.puts str
@@ -147,65 +319,89 @@ module WebArchive
147
319
 
148
320
  HISTORY_FILE = '~/.webarchive_history'
149
321
 
150
- def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true, history: true)
151
- verbose = true if debug
322
+ # Launch the CLI
323
+ # @return [Concurrent::Promises::Future]
324
+ def self.launch(wait_secs: 1, max_retry: 3,
325
+ redirect: false, canonical_uri: true,
326
+ history: true, debug: false, verbose: false)
327
+ wait_secs = 0 if wait_secs.negative?
328
+ max_retry = 0 if max_retry.negative?
329
+
152
330
  Thread.abort_on_exception = true
153
331
  completer = nil
154
332
  if history
155
- completer = Completer.new(HISTORY_FILE)
333
+ completer = Completer.new(File.expand_path(HISTORY_FILE))
156
334
  Readline.completion_proc = completer.to_proc
157
335
  Readline.completion_append_character = ''
158
336
  end
159
337
 
160
- queues = []
338
+ client = Client.new(wait_secs: wait_secs, max_retry: max_retry,
339
+ redirect: redirect, canonical_uri: canonical_uri)
161
340
 
162
- queues << ArchiveQueue.new('archive.org', wait_secs) do |uri|
163
- URI.parse("https://web.archive.org/save/#{uri}").open do |f|
164
- if f.meta['content-location'] && verbose
165
- puts "<https://web.archive.org#{f.meta['content-location']}>"
166
- elsif verbose
167
- puts f.meta.inspect
341
+ # prepare queues
342
+ client.add_queue(
343
+ ArchiveQueue.new('archive.org (logged out)', wait_secs) do |uri|
344
+ URI.parse('https://web.archive.org/save/' + uri).open do |f|
345
+ if f.meta['content-location'] && verbose
346
+ puts "<https://web.archive.org#{f.meta['content-location']}>"
347
+ elsif verbose
348
+ puts f.meta.inspect
349
+ end
168
350
  end
169
351
  end
170
- end
352
+ )
171
353
 
172
- queues << ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
173
- agent = Mechanize.new
174
- page = agent.get('https://megalodon.jp/pc/?' +
175
- Addressable::URI.form_encode(url: uri))
176
- res = agent.submit(page.forms.first)
177
- if debug
178
- Tempfile.open("#{self}-#{uri.gsub(/\W/, '_')}") do |f|
179
- f.puts res.body
354
+ client.add_queue(
355
+ ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
356
+ agent = Mechanize.new
357
+ page = agent.get('https://megalodon.jp/pc/?' +
358
+ Addressable::URI.form_encode(url: uri))
359
+ form = page.forms.first
360
+ raise UnexpectedResponseError, page.inspect unless form
361
+
362
+ res = agent.submit(form)
363
+ if debug
364
+ debug_output('megalodonjp', uri, res.body)
180
365
  end
181
- end
182
- og = res.at('meta[property="og:url"]')
183
- uri = if og
184
- og[:content]
185
- else
186
- res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
187
- x =~ %r{megalodon\.jp/[\d-]+/}
366
+ og = res.at('meta[property="og:url"]')
367
+ uri = if og
368
+ og[:content]
369
+ else
370
+ res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
371
+ x =~ %r{megalodon\.jp/[\d-]+/}
372
+ end
188
373
  end
189
- end
190
- puts "<#{uri}>" if verbose
191
- agent.shutdown
192
- end
374
+ puts "<#{uri}>" if verbose
375
+ agent.shutdown
376
+ end
377
+ )
193
378
 
194
- queues << ArchiveQueue.new('archive.today', wait_secs) do |uri|
195
- agent = Mechanize.new
196
- agent.follow_meta_refresh = true
379
+ client.add_queue(
380
+ ArchiveQueue.new('archive.today', wait_secs) do |uri|
381
+ agent = Mechanize.new
382
+ agent.follow_meta_refresh = true
197
383
 
198
- page = agent.get('https://archive.today/')
199
- form = page.form_with(id: 'submiturl')
200
- form['anyway'] = '1'
201
- form.field_with(name: 'url').value = uri
202
- page = agent.submit(form)
203
- puts "<#{page.uri}>" if verbose
204
- agent.shutdown
205
- end
384
+ page = agent.get('https://archive.is/')
385
+ form = page.form_with(id: 'submiturl')
386
+ if debug
387
+ debug_output('archivetoday', uri, page.inspect)
388
+ end
389
+ raise UnexpectedResponseError, page.inspect unless form
390
+
391
+ form['anyway'] = '1'
392
+ form.field_with(name: 'url').value = uri
393
+ sleep 5.0 # not submit too fast
394
+ page = agent.submit(form)
395
+ puts "<#{page.uri}>" if verbose
396
+ agent.shutdown
397
+ end
398
+ )
399
+
400
+ # main loop
206
401
 
207
402
  uri_regexp = URI::DEFAULT_PARSER.make_regexp
208
- while line = Readline.readline("Q(#{queues.map(&:remaining).inject(:+)})> ", add_hist: true)
403
+ all = Concurrent::Promises.future{}
404
+ while line = Readline.readline("Q(#{client.queued_uris})> ", add_hist: true)
209
405
  uri = ''
210
406
  begin
211
407
  uri = to_ascii_uri(line).to_s
@@ -221,25 +417,22 @@ module WebArchive
221
417
  next
222
418
  end
223
419
 
224
- queues.each do |q|
225
- q << uri
226
- end
227
- begin
228
- with_canonical_uri_and_redirect(uri, canonical_uri, redirect) do |x|
229
- queues.each do |q|
230
- q << x
231
- end
232
- end
233
- rescue StandardError => e
234
- warn "skipping canonical/redirect for #{uri}: #{e.message}"
235
- end
236
-
237
- completer.append_to_history(uri) if completer
420
+ f = client.send_uri(uri).then {
421
+ completer&.append_to_history(uri)
422
+ }.on_rejection { |reason, _|
423
+ warn "skipping canonical/redirect for #{uri}: #{reason}" if
424
+ !x.is_a?(NoAlternativeURIError)
425
+ }
426
+ all = all.zip(f)
238
427
  end
239
428
 
240
- queues.each(&:done_sending)
429
+ all.wait
430
+ client.wait_for_queues
241
431
  # TODO: trap INT and ask for confirmation
432
+ all
242
433
  end
243
434
  end
244
435
 
245
- WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true) if $PROGRAM_NAME == __FILE__
436
+ if $PROGRAM_NAME == __FILE__
437
+ WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true)
438
+ end
metadata CHANGED
@@ -1,11 +1,11 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webarchive
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yusuke Matsubara
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
  date: 2019-06-30 00:00:00.000000000 Z
@@ -16,14 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 2.6.0
19
+ version: 2.8.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 2.6.0
26
+ version: 2.8.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: concurrent-ruby
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.6
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.6
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: fast_trie
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -44,72 +58,114 @@ dependencies:
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: 2.7.6
61
+ version: 2.8.0
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: 2.7.6
68
+ version: 2.8.0
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: net-http-persistent
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: 3.1.0
75
+ version: 4.0.0
62
76
  type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
- version: 3.1.0
82
+ version: 4.0.0
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: simpleidn
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: 0.1.1
89
+ version: 0.2.1
76
90
  type: :runtime
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
94
  - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: 0.1.1
96
+ version: 0.2.1
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: bundler
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
101
  - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '1.17'
103
+ version: '2.1'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
108
  - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '1.17'
110
+ version: '2.1'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: rake
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '10.0'
117
+ version: '13.0'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '10.0'
124
+ version: '13.0'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: rspec
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.10'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.10'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rubocop
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.81.0
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.81.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: webmock
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.13'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.13'
167
+ - !ruby/object:Gem::Dependency
168
+ name: pry
113
169
  requirement: !ruby/object:Gem::Requirement
114
170
  requirements:
115
171
  - - ">="
@@ -123,7 +179,7 @@ dependencies:
123
179
  - !ruby/object:Gem::Version
124
180
  version: '0'
125
181
  - !ruby/object:Gem::Dependency
126
- name: webmock
182
+ name: pry-doc
127
183
  requirement: !ruby/object:Gem::Requirement
128
184
  requirements:
129
185
  - - ">="
@@ -154,7 +210,7 @@ licenses:
154
210
  metadata:
155
211
  homepage_uri: https://rubygems.org/gems/webarchive
156
212
  source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
157
- post_install_message:
213
+ post_install_message:
158
214
  rdoc_options: []
159
215
  require_paths:
160
216
  - lib
@@ -162,15 +218,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
162
218
  requirements:
163
219
  - - "~>"
164
220
  - !ruby/object:Gem::Version
165
- version: '2.0'
221
+ version: '3.0'
166
222
  required_rubygems_version: !ruby/object:Gem::Requirement
167
223
  requirements:
168
224
  - - ">="
169
225
  - !ruby/object:Gem::Version
170
226
  version: '0'
171
227
  requirements: []
172
- rubygems_version: 3.0.1
173
- signing_key:
228
+ rubygems_version: 3.3.7
229
+ signing_key:
174
230
  specification_version: 4
175
231
  summary: webarchive - CUI tool to archive URIs
176
232
  test_files: []