webarchive 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74bc28f7e8b343b9ef2c558991d22ea8560a9fc29a5501fc0ee0514078df4d14
4
- data.tar.gz: 6ec0a915b37359d681caad36ddfdb6f2febce1cea43bfb8493b4702c611e67aa
3
+ metadata.gz: 16eba058e7574f02d9e444a6d72b2aefb41b55fb0acb9adc1b6eef492cef306d
4
+ data.tar.gz: 33411ba62ea808f72b930ef52a5f112f4df80b4febb1d2bd88c7017a62785550
5
5
  SHA512:
6
- metadata.gz: 5666adba0ad58e9168874a205dde99288e407a3a2260543c9aa4c6ebc23c04b5705f61d4926dba7bb219d3cd5cab17406732aef45ec50b5d1fd2f289c58a272e
7
- data.tar.gz: 7a5cf05ee6857a85e87508bd3c404be5a4e5cc7b3d1c8589cde4db8a374a3365f03c68fd52f0b9ac2e36301266f7f011c117a3ebd8d8539a04d9f1411e9ba766
6
+ metadata.gz: 8787995fe6bdc059e2275371ed33eb332a8f1c99aa099b995a20805258ed2d05983507a8f2958e104edcc31e7259bc3be68d22f838c917294687533718a799c5
7
+ data.tar.gz: f9e73ef39c3cb49b6bd25569cc2549292982aa47e71e1c31ce735eb350e8a1fe0882fe8cb0a3d9ee23d3b6075d33ec149a951f4d70f9095d4b8f03bc8c306e6c
data/README.md CHANGED
@@ -1,9 +1,16 @@
1
1
  # Webarchive
2
2
 
3
- This is a CUI tool for sending URIs to public web archiving tools such as web.archive.org and archive.today.
3
+ This is a CUI tool for sending URIs to public web archiving tools such
4
+ as web.archive.org and archive.today.
4
5
 
5
6
  Requests are throttled.
6
7
 
8
+ ## Rationale
9
+
10
+ This tool's motivation is simple - increased availability by redundancy. Your favorite web archiving service might be down at some point in time, or blocked by certain websites. Use 2 or more services to archive something, and your archive will be safe if at least one of them remains available.
11
+
12
+ Browser extensions with similar functionality might exist, but this tool might be for you when you need to archive a large number of URLs, and, well, if you like CUI.
13
+
7
14
  ## Installation
8
15
 
9
16
  Use this line to install it:
@@ -22,25 +29,37 @@ If you have a list of URIs in a file, use pipe.
22
29
 
23
30
  $ cat list.txt | webarchive
24
31
 
32
+ Note that, by default, this program logs all the URIs you enter into
33
+ `~/.webarchive_history`.
34
+
25
35
  It has optional command-line parameters:
26
36
 
27
37
  $ webarchive -h
28
-
38
+
29
39
  Usage: webarchive [options]
30
- -w, --wait=SECS wait for SECS before sending a request
40
+ -w, --wait=SECONDS wait for SECONDS between requests [default: 5.0]
41
+ -r, --retry=N retry for N times when failed [default: 5]
42
+ -t, --timeout=SECONDS timeout after SECONDS [default: 60.0]
43
+ --[no-]history record history [default: enabled]
31
44
  -d, --debug add debug output, implies verbose
32
45
  --verbose
33
- --version
46
+ -h, --help show help
34
47
 
35
48
  ## Development
36
49
 
37
- After checking out the repo, run `bundle install` to install dependencies. Then, run `rake spec` to run the tests.
50
+ After checking out the repo, run `bundle install` to install
51
+ dependencies. Then, run `rake spec` to run the tests.
38
52
 
39
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
53
+ To install this gem onto your local machine, run `bundle exec rake
54
+ install`. To release a new version, update the version number in
55
+ `version.rb`, and then run `bundle exec rake release`, which will
56
+ create a git tag for the version, push git commits and tags, and push
57
+ the `.gem` file to [rubygems.org](https://rubygems.org).
40
58
 
41
59
  ## Contributing
42
60
 
43
- Bug reports and pull requests are welcome at https://gitlab.com/yusuke.matsubara/webarchive.
61
+ Bug reports and pull requests are welcome at
62
+ https://gitlab.com/yusuke.matsubara/webarchive.
44
63
 
45
64
  ## License
46
65
 
data/bin/webarchive CHANGED
@@ -1,18 +1,39 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'webarchive'
4
5
  require 'webarchive/version'
5
6
  require 'optparse'
6
7
 
7
- wait = 4.0
8
+ wait = 5.0
9
+ retry_ = 5
10
+ timeout = 60.0
8
11
  debug = false
9
12
  verbose = false
10
13
  history = true
11
14
  Version = WebArchive::VERSION
15
+
12
16
  OptionParser.new do |opt|
13
- opt.on('-w', '--wait=SECS', 'wait for SECS before sending a request') { |v| wait = v.to_f }
14
- opt.on('-d', '--debug', 'add debug output, implies verbose') { debug = true }
15
- opt.on('--[no-]history', 'record history (enabled by default)') { |v| history = v }
17
+ opt.on('-w', '--wait=SECONDS',
18
+ "wait for SECONDS between requests [default: #{wait}]") do |v|
19
+ wait = v.to_f
20
+ end
21
+ opt.on('-r', '--retry=N',
22
+ "retry for N times when failed [default: #{retry_}]") do |v|
23
+ retry_ = v.to_i
24
+ end
25
+ opt.on('-t', '--timeout=SECONDS',
26
+ "timeout after SECONDS [default: #{timeout}]") do |v|
27
+ timeout = v.to_i
28
+ end
29
+ opt.on('--[no-]history',
30
+ "record history [default: #{history ? 'enabled' : 'disabled'}]") do |v|
31
+ history = v
32
+ end
33
+ opt.on('-d', '--debug', 'add debug output, implies verbose') do
34
+ debug = true
35
+ verbose = true
36
+ end
16
37
  opt.on('--verbose') { verbose = true }
17
38
  opt.on('-h', '--help', 'show help') do
18
39
  puts opt
@@ -20,5 +41,10 @@ OptionParser.new do |opt|
20
41
  end
21
42
  end.parse!(ARGV)
22
43
 
23
- warn "#{WebArchive} #{WebArchive::VERSION}" if verbose
24
- WebArchive.launch(wait_secs: wait, debug: debug, verbose: verbose, history: history)
44
+ tagline = "#{WebArchive} #{WebArchive::VERSION}"
45
+ tagline += ' (debug)' if debug
46
+ warn tagline if verbose
47
+
48
+ WebArchive.launch(wait_secs: wait, max_retry: retry_, history: history,
49
+ read_timeout_secs: timeout,
50
+ debug: debug, verbose: verbose)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module WebArchive
2
- VERSION = '0.1.3'
4
+ VERSION = '0.1.5'
3
5
  end
data/lib/webarchive.rb CHANGED
@@ -1,6 +1,11 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
- # (this file is also an executable - see the bottom)
4
+ # This tool allows you to call multiple web archiving services. It
5
+ # works interactively from the command line. Internally, it maintains a
6
+ # throttled queue for each service provider supported.
7
+
8
+ # (This file itself is executable without installation - see the bottom.)
4
9
 
5
10
  require 'open-uri'
6
11
  require 'readline'
@@ -8,22 +13,40 @@ require 'tempfile'
8
13
  require 'simpleidn'
9
14
  require 'net/http'
10
15
  require 'addressable/uri'
16
+ require 'digest/sha2'
11
17
  require 'mechanize'
12
18
  require 'trie'
19
+ require 'concurrent'
13
20
 
14
21
  # classes and functions of webarchive package
15
22
  module WebArchive
23
+ Req = Struct.new("Req", :uri, :wait, :max_retry)
24
+
25
+ # error class for unexpected response from archiving service
26
+ class UnexpectedResponseError < StandardError
27
+ def initialize(cause = nil)
28
+ super(cause)
29
+ end
30
+ end
31
+
32
+ # error class for when redirect/canonical uri is not found
33
+ class NoAlternativeURIError < StandardError
34
+ def initialize(cause = nil)
35
+ super(cause)
36
+ end
37
+ end
38
+
16
39
  begin
17
40
  require 'libnotify'
18
- def self.warn_archive_fail(uri, archiver, body)
19
- warn "Not archived: #{uri} by #{archiver}; #{body}"
20
- Libnotify.show(summary: "Not archived: #{uri} by #{archiver}",
41
+ def self.warn_archive_fail(req, archiver, body)
42
+ warn "Not archived: #{req} by #{archiver}; #{body}"
43
+ Libnotify.show(summary: "Not archived: #{req} by #{archiver}",
21
44
  body: body, timeout: 3)
22
45
  end
23
46
  rescue LoadError
24
- unless defined? warn_archive_fail
25
- def self.warn_archive_fail(uri, archiver, body)
26
- warn "Not archived: #{uri} by #{archiver}; #{body}"
47
+ unless respond_to? :warn_archive_fail
48
+ def self.warn_archive_fail(req, archiver, body)
49
+ warn "Not archived: #{req} by #{archiver}; #{body}"
27
50
  end
28
51
  end
29
52
  end
@@ -31,44 +54,183 @@ module WebArchive
31
54
  # Queue for sending URLs to a certain archiving web site
32
55
  # The block given to constructor will be executed for each '<<'
33
56
  class ArchiveQueue < Queue
34
- def initialize(name, wait)
57
+ # Create a new instance of ArchiveQueue
58
+ # @param name [String] name of the queue
59
+ # @param interval [Float] length of the wait between requests
60
+ # @yield [String] URI that the queue receives
61
+ def initialize(name, interval)
35
62
  super()
36
63
  @name = name
64
+ @interval = interval
37
65
  @all_sent = false
38
- @in_process = 0 # always <= 1
66
+ @in_process = Concurrent::AtomicFixnum.new(0)
67
+ last_request_time = Time.now - interval
39
68
  @consumer = Thread.new do
40
69
  loop do
41
- uri = self.pop
42
- @in_process += 1
70
+ req = self.deq # deq blocks until non-empty
71
+ @in_process.value += 1
43
72
  begin
44
- yield uri
73
+ sleep time_until_next_req(last_request_time, Time.now)
74
+ last_request_time = Time.now
75
+ yield req.uri
45
76
  rescue StandardError => e
46
- WebArchive.warn_archive_fail(
47
- uri, name, ([e.inspect] + e.backtrace).join("\n")
48
- )
77
+ if retry?(e) && req.max_retry.positive?
78
+ buff = [].tap { |a| a << self.deq until self.empty? }
79
+ @in_process.value += buff.size + 1
80
+ Concurrent::ScheduledTask.execute(req.wait) do
81
+ @in_process.value -= buff.size + 1
82
+ self.enq Req.new(req.uri, req.wait * 2, req.max_retry - 1)
83
+ buff.each { |x| self.enq x }
84
+ end
85
+ else
86
+ WebArchive.warn_archive_fail(
87
+ req.uri, name, ([e.inspect] + e.backtrace).join("\n")
88
+ )
89
+ end
49
90
  ensure
50
- @in_process -= 1
51
- break if @all_sent && self.empty?
52
-
53
- sleep wait
91
+ @in_process.value -= 1
92
+ break if @all_sent && self.remaining.zero?
54
93
  end
55
94
  end
56
95
  end
57
96
  end
58
97
 
98
+ def time_until_next_req(last_req, current)
99
+ elapsed = [current - last_req, 0].max
100
+ [@interval - elapsed, 0].max
101
+ end
102
+
103
+ # @param exc [Exception]
104
+ # @return [Boolean]
105
+ def retry?(exc)
106
+ [
107
+ Errno::ECONNRESET,
108
+ Errno::EHOSTUNREACH
109
+ ].include?(exc.class) ||
110
+ (exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('429 ')) ||
111
+ (exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('502 ')) ||
112
+ (exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('503 ')) ||
113
+ (exc.is_a?(Mechanize::ResponseCodeError) && exc.response_code == '503')
114
+ end
115
+
59
116
  # mark as 'sending done' and wait for items to be processed
117
+ # @return [Thread, nil] the joined consumer thread, or nil when nothing remained
60
118
  def done_sending
61
119
  @all_sent = true
62
- @consumer.join if self.remaining > 0
120
+ @consumer.join if self.remaining.positive?
63
121
  end
64
122
 
65
123
  # number of queued items (including those being processed)
124
+ # @return [Integer]
66
125
  def remaining
67
- self.size + @in_process
126
+ self.size + @in_process.value
68
127
  end
69
128
  end
70
129
 
71
- def self.my_normalize(str)
130
+ # Client with multiple queues
131
+ class Client
132
+ def initialize(wait_secs: 1, max_retry: 3,
133
+ redirect: false, canonical_uri: true)
134
+ @wait_secs = wait_secs
135
+ @max_retry = max_retry
136
+ @redirect = redirect
137
+ @canonical_uri = canonical_uri
138
+
139
+ @wait_secs = 0 if @wait_secs.negative?
140
+ @max_retry = 0 if @max_retry.negative?
141
+ @queues = []
142
+ end
143
+
144
+ # @param queue [ArchiveQueue]
145
+ def add_queue(queue)
146
+ @queues << queue
147
+ end
148
+
149
+ def queued_uris
150
+ @queues.map(&:remaining).inject(:+)
151
+ end
152
+
153
+ # @param uri [String]
154
+ # @return [Concurrent::Promises::Future] Gives the target URI if redirected
155
+ def with_redirect(uri)
156
+ Concurrent::Promises.future do
157
+ res = Net::HTTP.get_response(Addressable::URI.parse(uri))
158
+ raise NoAlternativeURIError, 'no redirect found' if
159
+ !res['location'] || res['location'] == uri
160
+
161
+ res['location']
162
+ end
163
+ end
164
+
165
+ def add_scheme(uri, scheme)
166
+ if uri.relative?
167
+ uri = uri.dup
168
+ uri.scheme = scheme
169
+ end
170
+ uri
171
+ end
172
+
173
+ def equivalent_uri?(uri, str)
174
+ uri = add_scheme(uri, Addressable::URI.parse(str).scheme)
175
+ uri.to_s == str
176
+ end
177
+
178
+ # @param uri [String]
179
+ # @return [Concurrent::Promises::Future] Gives the canonical URI if there is one
180
+ def with_canonical_uri(uri)
181
+ Concurrent::Promises.future do
182
+ agent = Mechanize.new
183
+ page = agent.get(uri)
184
+ ret = nil
185
+ raise NoAlternativeURIError, 'no canonical URI found' unless
186
+ page.canonical_uri &&
187
+ page.class == Mechanize::Page &&
188
+ page.canonical_uri != page.uri
189
+
190
+ if page.canonical_uri.relative?
191
+ u2 = URI.join(page.uri, page.canonical_uri)
192
+ ret = u2.to_s if !equivalent_uri?(u2, uri) &&
193
+ !equivalent_uri?(u2, page.uri)
194
+ else
195
+ u1 = page.canonical_uri
196
+ u1 = add_scheme(u1, 'http') unless u1.scheme
197
+ ret = u1.to_s if !equivalent_uri?(u1, uri) &&
198
+ !equivalent_uri?(u1, page.uri)
199
+ end
200
+
201
+ raise NoAlternativeURIError, 'no canonical URI found' unless ret
202
+
203
+ ret
204
+ end
205
+ end
206
+
207
+ # @param uri [String]
208
+ # @return [void]
209
+ def send_single_uri(uri)
210
+ @queues.each do |q|
211
+ q.enq Req.new(uri, @wait_secs, @max_retry)
212
+ end
213
+ end
214
+
215
+ # @param uri [String]
216
+ # @return [Concurrent::Promises::Future]
217
+ def send_uri(uri)
218
+ f0 = Concurrent::Promises.future{ send_single_uri(uri) }
219
+ f1 = with_canonical_uri(uri).then { |x| send_single_uri(x) } if @canonical_uri
220
+ f2 = with_redirect(uri).then { |x| send_single_uri(x) } if @redirect
221
+ f1 ||= Concurrent::Promises.future{}
222
+ f2 ||= Concurrent::Promises.future{}
223
+ f0.zip(f1).zip(f2)
224
+ end
225
+
226
+ def wait_for_queues
227
+ @queues.each(&:done_sending)
228
+ end
229
+ end
230
+
231
+ # @param str [String]
232
+ # @return [String]
233
+ def self.encode_non_ascii(str)
72
234
  if str =~ /[^[:ascii:]]/
73
235
  Addressable::URI.encode(str)
74
236
  else
@@ -76,51 +238,58 @@ module WebArchive
76
238
  end
77
239
  end
78
240
 
79
- def self.to_ascii_uri(str)
80
- uri = str.strip
81
- if %r/\.[a-z]{2,4}(\/|$)/.match(uri) && !uri.include?('://') && !uri.start_with?('http')
82
- uri = 'http://' + uri
241
+ # @param uri [String]
242
+ # @return [String]
243
+ def self.prepend_http(uri)
244
+ if %r/\.[a-z]{2,4}(\/|$)/.match(uri) && %r{(^http|://)}.match(uri).nil?
245
+ 'http://' + uri
246
+ else
247
+ uri
83
248
  end
249
+ end
84
250
 
251
+ # Encode non-ASCII components in the given string and make a URI instance from it
252
+ # @param str [String]
253
+ # @return [Addressable::URI]
254
+ def self.to_ascii_uri(str)
255
+ uri = prepend_http(str.strip)
85
256
  u = Addressable::URI.parse(uri)
86
257
  u.host = SimpleIDN.to_ascii(u.host)
87
- u.path = my_normalize(u.path)
88
- u.query = my_normalize(u.query)
89
- u.fragment = my_normalize(u.fragment)
258
+ u.path, u.query, u.fragment = [
259
+ u.path, u.query, u.fragment
260
+ ].map(&method(:encode_non_ascii))
90
261
  u
91
262
  end
92
263
 
93
- def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
94
- if redirect
95
- res = Net::HTTP.get_response(URI.parse(uri))
96
- yield res['location'] if res['location'] &&
97
- res['location'] != uri
98
- end
99
- if canonical
100
- agent = Mechanize.new
101
- page = agent.get(uri)
102
- yield page.canonical_uri.to_s if page.class == Mechanize::Page &&
103
- page.canonical_uri &&
104
- page.canonical_uri.to_s != uri &&
105
- page.canonical_uri != page.uri
106
- end
107
- rescue Net::HTTPClientError, Mechanize::ResponseCodeError
108
- # ignore since it will cause a warning later anyway
264
+ # Write debug content to a temporary file
265
+ # @param source [String]
266
+ # @param content [String]
267
+ # @return [void]
268
+ def self.debug_output(source, uri, content)
269
+ ts = Time.now.strftime('%Y%m%d%H%M%S')
270
+ filename = "#{self}-#{source}-#{uri.gsub(/\W+/, '_')[0..30]}-"
271
+ filename += Digest::SHA256.hexdigest(uri + ts)[0..8]
272
+ Tempfile.open(filename) do |f|
273
+ f.puts content
274
+ end
109
275
  end
110
276
 
111
277
  # completer for URLs
112
278
  class Completer
279
+ # @param history_file [String] path to the history file
113
280
  def initialize(history_file)
114
- @file = File.expand_path(history_file)
281
+ @file = history_file
115
282
  @trie = Trie.new
116
- self.reload
283
+ reload!
117
284
  end
118
285
 
119
- def update
120
- self.reload if File.stat(@file).mtime > @lastupdate
286
+ # @return [void]
287
+ def update!
288
+ reload! if File.stat(@file).mtime > @lastupdate
121
289
  end
122
290
 
123
- def reload
291
+ # @return [void]
292
+ def reload!
124
293
  if File.exist? @file
125
294
  File.open(@file, encoding: 'utf-8').each_line do |x|
126
295
  @trie.add x.strip
@@ -131,13 +300,16 @@ module WebArchive
131
300
  @lastupdate = Time.now
132
301
  end
133
302
 
303
+ # @return [Proc]
134
304
  def to_proc
135
305
  proc do |s|
136
- self.update
306
+ update!
137
307
  @trie.children(s)
138
308
  end
139
309
  end
140
310
 
311
+ # @param str [String]
312
+ # @return [void]
141
313
  def append_to_history(str)
142
314
  File.open(@file, mode: 'a', encoding: 'utf-8') do |f|
143
315
  f.puts str
@@ -147,65 +319,97 @@ module WebArchive
147
319
 
148
320
  HISTORY_FILE = '~/.webarchive_history'
149
321
 
150
- def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true, history: true)
151
- verbose = true if debug
322
+ # Launch the CLI
323
+ # @return [Concurrent::Promises::Future]
324
+ def self.launch(wait_secs: 1, max_retry: 3,
325
+ read_timeout_secs: 15,
326
+ redirect: false, canonical_uri: true,
327
+ history: true, debug: false, verbose: false)
328
+
329
+ args = method(__method__).parameters.map{ |k, v| [v, binding.local_variable_get(v)] }
330
+ puts "Arguments: #{args.inspect}" if verbose
331
+
332
+ wait_secs = 0 if wait_secs.negative?
333
+ max_retry = 0 if max_retry.negative?
334
+
152
335
  Thread.abort_on_exception = true
153
336
  completer = nil
154
337
  if history
155
- completer = Completer.new(HISTORY_FILE)
338
+ completer = Completer.new(File.expand_path(HISTORY_FILE))
156
339
  Readline.completion_proc = completer.to_proc
157
340
  Readline.completion_append_character = ''
158
341
  end
159
342
 
160
- queues = []
343
+ client = Client.new(wait_secs: wait_secs, max_retry: max_retry,
344
+ redirect: redirect, canonical_uri: canonical_uri)
161
345
 
162
- queues << ArchiveQueue.new('archive.org', wait_secs) do |uri|
163
- URI.parse("https://web.archive.org/save/#{uri}").open do |f|
164
- if f.meta['content-location'] && verbose
165
- puts "<https://web.archive.org#{f.meta['content-location']}>"
166
- elsif verbose
167
- puts f.meta.inspect
346
+ # prepare queues
347
+ client.add_queue(
348
+ ArchiveQueue.new('archive.org (logged out)', wait_secs) do |uri|
349
+ u = URI.parse('https://web.archive.org/save/' + uri)
350
+ u.open(read_timeout: read_timeout_secs) do |f|
351
+ if f.meta['content-location'] && verbose
352
+ puts "<https://web.archive.org#{f.meta['content-location']}>"
353
+ elsif verbose
354
+ puts f.meta.inspect
355
+ end
168
356
  end
169
357
  end
170
- end
358
+ )
171
359
 
172
- queues << ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
173
- agent = Mechanize.new
174
- page = agent.get('https://megalodon.jp/pc/?' +
175
- Addressable::URI.form_encode(url: uri))
176
- res = agent.submit(page.forms.first)
177
- if debug
178
- Tempfile.open("#{self}-#{uri.gsub(/\W/, '_')}") do |f|
179
- f.puts res.body
360
+ client.add_queue(
361
+ ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
362
+ agent = Mechanize.new
363
+ agent.read_timeout = read_timeout_secs
364
+ page = agent.get('https://megalodon.jp/pc/?' +
365
+ Addressable::URI.form_encode(url: uri))
366
+ form = page.forms.first
367
+ raise UnexpectedResponseError, page.inspect unless form
368
+
369
+ res = agent.submit(form)
370
+ if debug
371
+ debug_output('megalodonjp', uri, res.body)
180
372
  end
181
- end
182
- og = res.at('meta[property="og:url"]')
183
- uri = if og
184
- og[:content]
185
- else
186
- res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
187
- x =~ %r{megalodon\.jp/[\d-]+/}
373
+ og = res.at('meta[property="og:url"]')
374
+ uri = if og
375
+ og[:content]
376
+ else
377
+ res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
378
+ x =~ %r{megalodon\.jp/[\d-]+/}
379
+ end
188
380
  end
189
- end
190
- puts "<#{uri}>" if verbose
191
- agent.shutdown
192
- end
381
+ puts "<#{uri}>" if verbose
382
+ agent.shutdown
383
+ end
384
+ )
193
385
 
194
- queues << ArchiveQueue.new('archive.today', wait_secs) do |uri|
195
- agent = Mechanize.new
196
- agent.follow_meta_refresh = true
386
+ client.add_queue(
387
+ ArchiveQueue.new('archive.today', wait_secs) do |uri|
388
+ agent = Mechanize.new
389
+ agent.read_timeout = read_timeout_secs
390
+ agent.follow_meta_refresh = true
197
391
 
198
- page = agent.get('https://archive.today/')
199
- form = page.form_with(id: 'submiturl')
200
- form['anyway'] = '1'
201
- form.field_with(name: 'url').value = uri
202
- page = agent.submit(form)
203
- puts "<#{page.uri}>" if verbose
204
- agent.shutdown
205
- end
392
+ page = agent.get('https://archive.is/')
393
+ form = page.form_with(id: 'submiturl')
394
+ if debug
395
+ debug_output('archivetoday', uri, page.inspect)
396
+ end
397
+ raise UnexpectedResponseError, page.inspect unless form
398
+
399
+ form['anyway'] = '1'
400
+ form.field_with(name: 'url').value = uri
401
+ sleep 5.0 # not submit too fast
402
+ page = agent.submit(form)
403
+ puts "<#{page.uri}>" if verbose
404
+ agent.shutdown
405
+ end
406
+ )
407
+
408
+ # main loop
206
409
 
207
410
  uri_regexp = URI::DEFAULT_PARSER.make_regexp
208
- while line = Readline.readline("Q(#{queues.map(&:remaining).inject(:+)})> ", add_hist: true)
411
+ all = Concurrent::Promises.future{}
412
+ while line = Readline.readline("Q(#{client.queued_uris})> ", add_hist: true)
209
413
  uri = ''
210
414
  begin
211
415
  uri = to_ascii_uri(line).to_s
@@ -221,25 +425,22 @@ module WebArchive
221
425
  next
222
426
  end
223
427
 
224
- queues.each do |q|
225
- q << uri
226
- end
227
- begin
228
- with_canonical_uri_and_redirect(uri, canonical_uri, redirect) do |x|
229
- queues.each do |q|
230
- q << x
231
- end
232
- end
233
- rescue StandardError => e
234
- warn "skipping canonical/redirect for #{uri}: #{e.message}"
235
- end
236
-
237
- completer.append_to_history(uri) if completer
428
+ f = client.send_uri(uri).then {
429
+ completer&.append_to_history(uri)
430
+ }.on_rejection { |reason, _|
431
+ warn "skipping canonical/redirect for #{uri}: #{reason}" if
432
+ !x.is_a?(NoAlternativeURIError)
433
+ }
434
+ all = all.zip(f)
238
435
  end
239
436
 
240
- queues.each(&:done_sending)
437
+ all.wait
438
+ client.wait_for_queues
241
439
  # TODO: trap INT and ask for confirmation
440
+ all
242
441
  end
243
442
  end
244
443
 
245
- WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true) if $PROGRAM_NAME == __FILE__
444
+ if $PROGRAM_NAME == __FILE__
445
+ WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true)
446
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webarchive
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yusuke Matsubara
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-30 00:00:00.000000000 Z
11
+ date: 2025-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -16,14 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 2.6.0
19
+ version: 2.8.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 2.6.0
26
+ version: 2.8.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: concurrent-ruby
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.3.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.3.0
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: fast_trie
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -44,72 +58,114 @@ dependencies:
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: 2.7.6
61
+ version: 2.14.0
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: 2.7.6
68
+ version: 2.14.0
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: net-http-persistent
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: 3.1.0
75
+ version: 4.0.0
62
76
  type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
- version: 3.1.0
82
+ version: 4.0.0
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: simpleidn
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: 0.1.1
89
+ version: 0.2.1
76
90
  type: :runtime
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
94
  - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: 0.1.1
96
+ version: 0.2.1
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: bundler
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
101
  - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '1.17'
103
+ version: '2.1'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
108
  - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '1.17'
110
+ version: '2.1'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: rake
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '10.0'
117
+ version: '13.0'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '10.0'
124
+ version: '13.0'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: rspec
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.13'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.13'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rubocop
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '1.70'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.70'
153
+ - !ruby/object:Gem::Dependency
154
+ name: webmock
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.24'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.24'
167
+ - !ruby/object:Gem::Dependency
168
+ name: pry
113
169
  requirement: !ruby/object:Gem::Requirement
114
170
  requirements:
115
171
  - - ">="
@@ -123,7 +179,7 @@ dependencies:
123
179
  - !ruby/object:Gem::Version
124
180
  version: '0'
125
181
  - !ruby/object:Gem::Dependency
126
- name: webmock
182
+ name: pry-doc
127
183
  requirement: !ruby/object:Gem::Requirement
128
184
  requirements:
129
185
  - - ">="
@@ -154,7 +210,7 @@ licenses:
154
210
  metadata:
155
211
  homepage_uri: https://rubygems.org/gems/webarchive
156
212
  source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
157
- post_install_message:
213
+ post_install_message:
158
214
  rdoc_options: []
159
215
  require_paths:
160
216
  - lib
@@ -162,15 +218,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
162
218
  requirements:
163
219
  - - "~>"
164
220
  - !ruby/object:Gem::Version
165
- version: '2.0'
221
+ version: '3.0'
166
222
  required_rubygems_version: !ruby/object:Gem::Requirement
167
223
  requirements:
168
224
  - - ">="
169
225
  - !ruby/object:Gem::Version
170
226
  version: '0'
171
227
  requirements: []
172
- rubygems_version: 3.0.1
173
- signing_key:
228
+ rubygems_version: 3.4.19
229
+ signing_key:
174
230
  specification_version: 4
175
231
  summary: webarchive - CUI tool to archive URIs
176
232
  test_files: []