webarchive 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +25 -7
- data/bin/webarchive +26 -6
- data/lib/webarchive/version.rb +3 -1
- data/lib/webarchive.rb +302 -109
- metadata +75 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9fc5065342461039496f77e140585d7023878acf1053ac820bc262fdf6917db
|
4
|
+
data.tar.gz: d093202d5cf905a359d7abb7af6cbf67a88898217493a7239eef5d63015ae831
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3eedec1586a6a33d9a6a95b63dc98ef3b0fde2a000389743b807f2a074c74b05341b2870acc3d20ee7b994f3cfcbb931e9451f02d70a5645ec327bce6ac80679
|
7
|
+
data.tar.gz: 1c8b60d67cce9e228c8785a79df5bc056bf88141c7bf7dd7c83018aa0fc28d11ca101bfd78af4ea9289761797fee214dfa2831be7e6acf45a6d887356927c0ba
|
data/README.md
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
# Webarchive
|
2
2
|
|
3
|
-
This is a CUI tool for sending URIs to public web archiving tools such
|
3
|
+
This is a CUI tool for sending URIs to public web archiving tools such
|
4
|
+
as web.archive.org and archive.today.
|
4
5
|
|
5
6
|
Requests are throttled.
|
6
7
|
|
8
|
+
## Rationale
|
9
|
+
|
10
|
+
This tool's motivation is simple - increased availability by redundancy. Your favorite web archiving service might be down at some point in time, or blocked by certain websites. Use 2 or more services to archive something, and your archive will be safe if at least one of them remains available.
|
11
|
+
|
12
|
+
Browser extensions with similar functionality may exist, but this tool may be for you when you need to archive a large number of URLs, or simply if you prefer a CUI.
|
13
|
+
|
7
14
|
## Installation
|
8
15
|
|
9
16
|
Use this line to install it:
|
@@ -22,25 +29,36 @@ If you have a list of URIs in a file, use pipe.
|
|
22
29
|
|
23
30
|
$ cat list.txt | webarchive
|
24
31
|
|
32
|
+
Note that, by default, this program logs all the URIs you enter into
|
33
|
+
`~/.webarchive_history`.
|
34
|
+
|
25
35
|
It has optional command-line parameters:
|
26
36
|
|
27
37
|
$ webarchive -h
|
28
|
-
|
38
|
+
|
29
39
|
Usage: webarchive [options]
|
30
|
-
-w, --wait=
|
40
|
+
-w, --wait=SECONDS wait for SECONDS between requests [default: 5.0]
|
41
|
+
-r, --retry=N retry for N times when failed [default: 5]
|
42
|
+
--[no-]history record history [default: enabled]
|
31
43
|
-d, --debug add debug output, implies verbose
|
32
44
|
--verbose
|
33
|
-
|
45
|
+
-h, --help show help
|
34
46
|
|
35
47
|
## Development
|
36
48
|
|
37
|
-
After checking out the repo, run `bundle install` to install
|
49
|
+
After checking out the repo, run `bundle install` to install
|
50
|
+
dependencies. Then, run `rake spec` to run the tests.
|
38
51
|
|
39
|
-
To install this gem onto your local machine, run `bundle exec rake
|
52
|
+
To install this gem onto your local machine, run `bundle exec rake
|
53
|
+
install`. To release a new version, update the version number in
|
54
|
+
`version.rb`, and then run `bundle exec rake release`, which will
|
55
|
+
create a git tag for the version, push git commits and tags, and push
|
56
|
+
the `.gem` file to [rubygems.org](https://rubygems.org).
|
40
57
|
|
41
58
|
## Contributing
|
42
59
|
|
43
|
-
Bug reports and pull requests are welcome at
|
60
|
+
Bug reports and pull requests are welcome at
|
61
|
+
https://gitlab.com/yusuke.matsubara/webarchive.
|
44
62
|
|
45
63
|
## License
|
46
64
|
|
data/bin/webarchive
CHANGED
@@ -1,18 +1,34 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'webarchive'
|
4
5
|
require 'webarchive/version'
|
5
6
|
require 'optparse'
|
6
7
|
|
7
|
-
wait =
|
8
|
+
wait = 5.0
|
9
|
+
retry_ = 5
|
8
10
|
debug = false
|
9
11
|
verbose = false
|
10
12
|
history = true
|
11
13
|
Version = WebArchive::VERSION
|
14
|
+
|
12
15
|
OptionParser.new do |opt|
|
13
|
-
opt.on('-w', '--wait=
|
14
|
-
|
15
|
-
|
16
|
+
opt.on('-w', '--wait=SECONDS',
|
17
|
+
"wait for SECONDS between requests [default: #{wait}]") do |v|
|
18
|
+
wait = v.to_f
|
19
|
+
end
|
20
|
+
opt.on('-r', '--retry=N',
|
21
|
+
"retry for N times when failed [default: #{retry_}]") do |v|
|
22
|
+
retry_ = v.to_i
|
23
|
+
end
|
24
|
+
opt.on('--[no-]history',
|
25
|
+
"record history [default: #{history ? 'enabled' : 'disabled'}]") do |v|
|
26
|
+
history = v
|
27
|
+
end
|
28
|
+
opt.on('-d', '--debug', 'add debug output, implies verbose') do
|
29
|
+
debug = true
|
30
|
+
verbose = true
|
31
|
+
end
|
16
32
|
opt.on('--verbose') { verbose = true }
|
17
33
|
opt.on('-h', '--help', 'show help') do
|
18
34
|
puts opt
|
@@ -20,5 +36,9 @@ OptionParser.new do |opt|
|
|
20
36
|
end
|
21
37
|
end.parse!(ARGV)
|
22
38
|
|
23
|
-
|
24
|
-
|
39
|
+
tagline = "#{WebArchive} #{WebArchive::VERSION}"
|
40
|
+
tagline += ' (debug)' if debug
|
41
|
+
warn tagline if verbose
|
42
|
+
|
43
|
+
WebArchive.launch(wait_secs: wait, max_retry: retry_, history: history,
|
44
|
+
debug: debug, verbose: verbose)
|
data/lib/webarchive/version.rb
CHANGED
data/lib/webarchive.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
#
|
4
|
+
# This tool allows you to call multiple web archiving services. It
|
5
|
+
# works interactively from the command line. Internally, it maintains a
|
6
|
+
# throttled queue for each service provider supported.
|
7
|
+
|
8
|
+
# (This file itself is executable without installation - see the bottom.)
|
4
9
|
|
5
10
|
require 'open-uri'
|
6
11
|
require 'readline'
|
@@ -8,22 +13,40 @@ require 'tempfile'
|
|
8
13
|
require 'simpleidn'
|
9
14
|
require 'net/http'
|
10
15
|
require 'addressable/uri'
|
16
|
+
require 'digest/sha2'
|
11
17
|
require 'mechanize'
|
12
18
|
require 'trie'
|
19
|
+
require 'concurrent'
|
13
20
|
|
14
21
|
# classes and functions of webarchive package
|
15
22
|
module WebArchive
|
23
|
+
Req = Struct.new("Req", :uri, :wait, :max_retry)
|
24
|
+
|
25
|
+
# error class for unexpected response from archiving service
|
26
|
+
class UnexpectedResponseError < StandardError
|
27
|
+
def initialize(cause = nil)
|
28
|
+
super(cause)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# error class for when redirect/canonical uri is not found
|
33
|
+
class NoAlternativeURIError < StandardError
|
34
|
+
def initialize(cause = nil)
|
35
|
+
super(cause)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
16
39
|
begin
|
17
40
|
require 'libnotify'
|
18
|
-
def self.warn_archive_fail(
|
19
|
-
warn "Not archived: #{
|
20
|
-
Libnotify.show(summary: "Not archived: #{
|
41
|
+
def self.warn_archive_fail(req, archiver, body)
|
42
|
+
warn "Not archived: #{req} by #{archiver}; #{body}"
|
43
|
+
Libnotify.show(summary: "Not archived: #{req} by #{archiver}",
|
21
44
|
body: body, timeout: 3)
|
22
45
|
end
|
23
46
|
rescue LoadError
|
24
|
-
unless
|
25
|
-
def self.warn_archive_fail(
|
26
|
-
warn "Not archived: #{
|
47
|
+
unless respond_to? :warn_archive_fail
|
48
|
+
def self.warn_archive_fail(req, archiver, body)
|
49
|
+
warn "Not archived: #{req} by #{archiver}; #{body}"
|
27
50
|
end
|
28
51
|
end
|
29
52
|
end
|
@@ -31,44 +54,183 @@ module WebArchive
|
|
31
54
|
# Queue for sending URLs to a certain archiving web site
|
32
55
|
# The block given to constructor will be executed for each '<<'
|
33
56
|
class ArchiveQueue < Queue
|
34
|
-
|
57
|
+
# Create a new instance of ArchiveQueue
|
58
|
+
# @param name [String] name of the queue
|
59
|
+
# @param interval [Float] length of the wait between requests
|
60
|
+
# @yield [String] URI that the queue receives
|
61
|
+
def initialize(name, interval)
|
35
62
|
super()
|
36
63
|
@name = name
|
64
|
+
@interval = interval
|
37
65
|
@all_sent = false
|
38
|
-
@in_process = 0
|
66
|
+
@in_process = Concurrent::AtomicFixnum.new(0)
|
67
|
+
last_request_time = Time.now - interval
|
39
68
|
@consumer = Thread.new do
|
40
69
|
loop do
|
41
|
-
|
42
|
-
@in_process += 1
|
70
|
+
req = self.deq # deq blocks until non-empty
|
71
|
+
@in_process.value += 1
|
43
72
|
begin
|
44
|
-
|
73
|
+
sleep time_until_next_req(last_request_time, Time.now)
|
74
|
+
last_request_time = Time.now
|
75
|
+
yield req.uri
|
45
76
|
rescue StandardError => e
|
46
|
-
|
47
|
-
|
48
|
-
|
77
|
+
if retry?(e) && req.max_retry.positive?
|
78
|
+
buff = [].tap { |a| a << self.deq until self.empty? }
|
79
|
+
@in_process.value += buff.size + 1
|
80
|
+
Concurrent::ScheduledTask.execute(req.wait) do
|
81
|
+
@in_process.value -= buff.size + 1
|
82
|
+
self.enq Req.new(req.uri, req.wait * 2, req.max_retry - 1)
|
83
|
+
buff.each { |x| self.enq x }
|
84
|
+
end
|
85
|
+
else
|
86
|
+
WebArchive.warn_archive_fail(
|
87
|
+
req.uri, name, ([e.inspect] + e.backtrace).join("\n")
|
88
|
+
)
|
89
|
+
end
|
49
90
|
ensure
|
50
|
-
@in_process -= 1
|
51
|
-
break if @all_sent && self.
|
52
|
-
|
53
|
-
sleep wait
|
91
|
+
@in_process.value -= 1
|
92
|
+
break if @all_sent && self.remaining.zero?
|
54
93
|
end
|
55
94
|
end
|
56
95
|
end
|
57
96
|
end
|
58
97
|
|
98
|
+
def time_until_next_req(last_req, current)
|
99
|
+
elapsed = [current - last_req, 0].max
|
100
|
+
[@interval - elapsed, 0].max
|
101
|
+
end
|
102
|
+
|
103
|
+
# @param exc [Exception]
|
104
|
+
# @return [Boolean]
|
105
|
+
def retry?(exc)
|
106
|
+
[
|
107
|
+
Errno::ECONNRESET,
|
108
|
+
Errno::EHOSTUNREACH
|
109
|
+
].include?(exc.class) ||
|
110
|
+
(exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('429 ')) ||
|
111
|
+
(exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('502 ')) ||
|
112
|
+
(exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('503 ')) ||
|
113
|
+
(exc.is_a?(Mechanize::ResponseCodeError) && exc.response_code == '503')
|
114
|
+
end
|
115
|
+
|
59
116
|
# mark as 'sending done' and wait for items to be processed
|
117
|
+
# @return [Boolean]
|
60
118
|
def done_sending
|
61
119
|
@all_sent = true
|
62
|
-
@consumer.join if self.remaining
|
120
|
+
@consumer.join if self.remaining.positive?
|
63
121
|
end
|
64
122
|
|
65
123
|
# number of queued items (including those being processed)
|
124
|
+
# @return [Integer]
|
66
125
|
def remaining
|
67
|
-
self.size + @in_process
|
126
|
+
self.size + @in_process.value
|
68
127
|
end
|
69
128
|
end
|
70
129
|
|
71
|
-
|
130
|
+
# Client with multiple queues
|
131
|
+
class Client
|
132
|
+
def initialize(wait_secs: 1, max_retry: 3,
|
133
|
+
redirect: false, canonical_uri: true)
|
134
|
+
@wait_secs = wait_secs
|
135
|
+
@max_retry = max_retry
|
136
|
+
@redirect = redirect
|
137
|
+
@canonical_uri = canonical_uri
|
138
|
+
|
139
|
+
@wait_secs = 0 if @wait_secs.negative?
|
140
|
+
@max_retry = 0 if @max_retry.negative?
|
141
|
+
@queues = []
|
142
|
+
end
|
143
|
+
|
144
|
+
# @param queue [ArchiveQueue]
|
145
|
+
def add_queue(queue)
|
146
|
+
@queues << queue
|
147
|
+
end
|
148
|
+
|
149
|
+
def queued_uris
|
150
|
+
@queues.map(&:remaining).inject(:+)
|
151
|
+
end
|
152
|
+
|
153
|
+
# @param uri [String]
|
154
|
+
# @return [Concurrent::Promises::Future] Gives the target URI if redirected
|
155
|
+
def with_redirect(uri)
|
156
|
+
Concurrent::Promises.future do
|
157
|
+
res = Net::HTTP.get_response(Addressable::URI.parse(uri))
|
158
|
+
raise NoAlternativeURIError, 'no redirect found' if
|
159
|
+
!res['location'] || res['location'] == uri
|
160
|
+
|
161
|
+
res['location']
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def add_scheme(uri, scheme)
|
166
|
+
if uri.relative?
|
167
|
+
uri = uri.dup
|
168
|
+
uri.scheme = scheme
|
169
|
+
end
|
170
|
+
uri
|
171
|
+
end
|
172
|
+
|
173
|
+
def equivalent_uri?(uri, str)
|
174
|
+
uri = add_scheme(uri, Addressable::URI.parse(str).scheme)
|
175
|
+
uri.to_s == str
|
176
|
+
end
|
177
|
+
|
178
|
+
# @param uri [String]
|
179
|
+
# @return [Concurrent::Promises::Future] Gives the canonical URI if there is one
|
180
|
+
def with_canonical_uri(uri)
|
181
|
+
Concurrent::Promises.future do
|
182
|
+
agent = Mechanize.new
|
183
|
+
page = agent.get(uri)
|
184
|
+
ret = nil
|
185
|
+
raise NoAlternativeURIError, 'no canonical URI found' unless
|
186
|
+
page.canonical_uri &&
|
187
|
+
page.class == Mechanize::Page &&
|
188
|
+
page.canonical_uri != page.uri
|
189
|
+
|
190
|
+
if page.canonical_uri.relative?
|
191
|
+
u2 = URI.join(page.uri, page.canonical_uri)
|
192
|
+
ret = u2.to_s if !equivalent_uri?(u2, uri) &&
|
193
|
+
!equivalent_uri?(u2, page.uri)
|
194
|
+
else
|
195
|
+
u1 = page.canonical_uri
|
196
|
+
u1 = add_scheme(u1, 'http') unless u1.scheme
|
197
|
+
ret = u1.to_s if !equivalent_uri?(u1, uri) &&
|
198
|
+
!equivalent_uri?(u1, page.uri)
|
199
|
+
end
|
200
|
+
|
201
|
+
raise NoAlternativeURIError, 'no canonical URI found' unless ret
|
202
|
+
|
203
|
+
ret
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
# @param uri [String]
|
208
|
+
# @return [void]
|
209
|
+
def send_single_uri(uri)
|
210
|
+
@queues.each do |q|
|
211
|
+
q.enq Req.new(uri, @wait_secs, @max_retry)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
# @param uri [String]
|
216
|
+
# @return [Concurrent::Promises::Future]
|
217
|
+
def send_uri(uri)
|
218
|
+
f0 = Concurrent::Promises.future{ send_single_uri(uri) }
|
219
|
+
f1 = with_canonical_uri(uri).then { |x| send_single_uri(x) } if @canonical_uri
|
220
|
+
f2 = with_redirect(uri).then { |x| send_single_uri(x) } if @redirect
|
221
|
+
f1 ||= Concurrent::Promises.future{}
|
222
|
+
f2 ||= Concurrent::Promises.future{}
|
223
|
+
f0.zip(f1).zip(f2)
|
224
|
+
end
|
225
|
+
|
226
|
+
def wait_for_queues
|
227
|
+
@queues.each(&:done_sending)
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
# @param str [String]
|
232
|
+
# @return [String]
|
233
|
+
def self.encode_non_ascii(str)
|
72
234
|
if str =~ /[^[:ascii:]]/
|
73
235
|
Addressable::URI.encode(str)
|
74
236
|
else
|
@@ -76,51 +238,58 @@ module WebArchive
|
|
76
238
|
end
|
77
239
|
end
|
78
240
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
241
|
+
# @param uri [String]
|
242
|
+
# @return [String]
|
243
|
+
def self.prepend_http(uri)
|
244
|
+
if %r/\.[a-z]{2,4}(\/|$)/.match(uri) && %r{(^http|://)}.match(uri).nil?
|
245
|
+
'http://' + uri
|
246
|
+
else
|
247
|
+
uri
|
83
248
|
end
|
249
|
+
end
|
84
250
|
|
251
|
+
# Encode non-ASCII components in the given string and make a URI instance from it
|
252
|
+
# @param str [String]
|
253
|
+
# @return [Addressable::URI]
|
254
|
+
def self.to_ascii_uri(str)
|
255
|
+
uri = prepend_http(str.strip)
|
85
256
|
u = Addressable::URI.parse(uri)
|
86
257
|
u.host = SimpleIDN.to_ascii(u.host)
|
87
|
-
u.path
|
88
|
-
|
89
|
-
|
258
|
+
u.path, u.query, u.fragment = [
|
259
|
+
u.path, u.query, u.fragment
|
260
|
+
].map(&method(:encode_non_ascii))
|
90
261
|
u
|
91
262
|
end
|
92
263
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
page.canonical_uri.to_s != uri &&
|
105
|
-
page.canonical_uri != page.uri
|
106
|
-
end
|
107
|
-
rescue Net::HTTPClientError, Mechanize::ResponseCodeError
|
108
|
-
# ignore since it will cause a warning later anyway
|
264
|
+
# Write log to a file
|
265
|
+
# @param source [String]
|
266
|
+
# @param content [String]
|
267
|
+
# @return [void]
|
268
|
+
def self.debug_output(source, uri, content)
|
269
|
+
ts = Time.now.strftime('%Y%m%d%H%M%S')
|
270
|
+
filename = "#{self}-#{source}-#{uri.gsub(/\W+/, '_')[0..30]}-"
|
271
|
+
filename += Digest::SHA256.hexdigest(uri + ts)[0..8]
|
272
|
+
Tempfile.open(filename) do |f|
|
273
|
+
f.puts content
|
274
|
+
end
|
109
275
|
end
|
110
276
|
|
111
277
|
# completer for URLs
|
112
278
|
class Completer
|
279
|
+
# @param history_file [String] path to the history file
|
113
280
|
def initialize(history_file)
|
114
|
-
@file =
|
281
|
+
@file = history_file
|
115
282
|
@trie = Trie.new
|
116
|
-
|
283
|
+
reload!
|
117
284
|
end
|
118
285
|
|
119
|
-
|
120
|
-
|
286
|
+
# @return [void]
|
287
|
+
def update!
|
288
|
+
reload! if File.stat(@file).mtime > @lastupdate
|
121
289
|
end
|
122
290
|
|
123
|
-
|
291
|
+
# @return [void]
|
292
|
+
def reload!
|
124
293
|
if File.exist? @file
|
125
294
|
File.open(@file, encoding: 'utf-8').each_line do |x|
|
126
295
|
@trie.add x.strip
|
@@ -131,13 +300,16 @@ module WebArchive
|
|
131
300
|
@lastupdate = Time.now
|
132
301
|
end
|
133
302
|
|
303
|
+
# @return [Proc]
|
134
304
|
def to_proc
|
135
305
|
proc do |s|
|
136
|
-
|
306
|
+
update!
|
137
307
|
@trie.children(s)
|
138
308
|
end
|
139
309
|
end
|
140
310
|
|
311
|
+
# @param str [String]
|
312
|
+
# @return [void]
|
141
313
|
def append_to_history(str)
|
142
314
|
File.open(@file, mode: 'a', encoding: 'utf-8') do |f|
|
143
315
|
f.puts str
|
@@ -147,65 +319,89 @@ module WebArchive
|
|
147
319
|
|
148
320
|
HISTORY_FILE = '~/.webarchive_history'
|
149
321
|
|
150
|
-
|
151
|
-
|
322
|
+
# Launch the CLI
|
323
|
+
# @return [Concurrent::Promises::Future]
|
324
|
+
def self.launch(wait_secs: 1, max_retry: 3,
|
325
|
+
redirect: false, canonical_uri: true,
|
326
|
+
history: true, debug: false, verbose: false)
|
327
|
+
wait_secs = 0 if wait_secs.negative?
|
328
|
+
max_retry = 0 if max_retry.negative?
|
329
|
+
|
152
330
|
Thread.abort_on_exception = true
|
153
331
|
completer = nil
|
154
332
|
if history
|
155
|
-
completer = Completer.new(HISTORY_FILE)
|
333
|
+
completer = Completer.new(File.expand_path(HISTORY_FILE))
|
156
334
|
Readline.completion_proc = completer.to_proc
|
157
335
|
Readline.completion_append_character = ''
|
158
336
|
end
|
159
337
|
|
160
|
-
|
338
|
+
client = Client.new(wait_secs: wait_secs, max_retry: max_retry,
|
339
|
+
redirect: redirect, canonical_uri: canonical_uri)
|
161
340
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
341
|
+
# prepare queues
|
342
|
+
client.add_queue(
|
343
|
+
ArchiveQueue.new('archive.org (logged out)', wait_secs) do |uri|
|
344
|
+
URI.parse('https://web.archive.org/save/' + uri).open do |f|
|
345
|
+
if f.meta['content-location'] && verbose
|
346
|
+
puts "<https://web.archive.org#{f.meta['content-location']}>"
|
347
|
+
elsif verbose
|
348
|
+
puts f.meta.inspect
|
349
|
+
end
|
168
350
|
end
|
169
351
|
end
|
170
|
-
|
352
|
+
)
|
171
353
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
354
|
+
client.add_queue(
|
355
|
+
ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
|
356
|
+
agent = Mechanize.new
|
357
|
+
page = agent.get('https://megalodon.jp/pc/?' +
|
358
|
+
Addressable::URI.form_encode(url: uri))
|
359
|
+
form = page.forms.first
|
360
|
+
raise UnexpectedResponseError, page.inspect unless form
|
361
|
+
|
362
|
+
res = agent.submit(form)
|
363
|
+
if debug
|
364
|
+
debug_output('megalodonjp', uri, res.body)
|
180
365
|
end
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
366
|
+
og = res.at('meta[property="og:url"]')
|
367
|
+
uri = if og
|
368
|
+
og[:content]
|
369
|
+
else
|
370
|
+
res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
|
371
|
+
x =~ %r{megalodon\.jp/[\d-]+/}
|
372
|
+
end
|
188
373
|
end
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
374
|
+
puts "<#{uri}>" if verbose
|
375
|
+
agent.shutdown
|
376
|
+
end
|
377
|
+
)
|
193
378
|
|
194
|
-
|
195
|
-
|
196
|
-
|
379
|
+
client.add_queue(
|
380
|
+
ArchiveQueue.new('archive.today', wait_secs) do |uri|
|
381
|
+
agent = Mechanize.new
|
382
|
+
agent.follow_meta_refresh = true
|
197
383
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
384
|
+
page = agent.get('https://archive.is/')
|
385
|
+
form = page.form_with(id: 'submiturl')
|
386
|
+
if debug
|
387
|
+
debug_output('archivetoday', uri, page.inspect)
|
388
|
+
end
|
389
|
+
raise UnexpectedResponseError, page.inspect unless form
|
390
|
+
|
391
|
+
form['anyway'] = '1'
|
392
|
+
form.field_with(name: 'url').value = uri
|
393
|
+
sleep 5.0 # not submit too fast
|
394
|
+
page = agent.submit(form)
|
395
|
+
puts "<#{page.uri}>" if verbose
|
396
|
+
agent.shutdown
|
397
|
+
end
|
398
|
+
)
|
399
|
+
|
400
|
+
# main loop
|
206
401
|
|
207
402
|
uri_regexp = URI::DEFAULT_PARSER.make_regexp
|
208
|
-
|
403
|
+
all = Concurrent::Promises.future{}
|
404
|
+
while line = Readline.readline("Q(#{client.queued_uris})> ", add_hist: true)
|
209
405
|
uri = ''
|
210
406
|
begin
|
211
407
|
uri = to_ascii_uri(line).to_s
|
@@ -221,25 +417,22 @@ module WebArchive
|
|
221
417
|
next
|
222
418
|
end
|
223
419
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
end
|
232
|
-
end
|
233
|
-
rescue StandardError => e
|
234
|
-
warn "skipping canonical/redirect for #{uri}: #{e.message}"
|
235
|
-
end
|
236
|
-
|
237
|
-
completer.append_to_history(uri) if completer
|
420
|
+
f = client.send_uri(uri).then {
|
421
|
+
completer&.append_to_history(uri)
|
422
|
+
}.on_rejection { |reason, _|
|
423
|
+
warn "skipping canonical/redirect for #{uri}: #{reason}" if
|
424
|
+
!x.is_a?(NoAlternativeURIError)
|
425
|
+
}
|
426
|
+
all = all.zip(f)
|
238
427
|
end
|
239
428
|
|
240
|
-
|
429
|
+
all.wait
|
430
|
+
client.wait_for_queues
|
241
431
|
# TODO: trap INT and ask for confirmation
|
432
|
+
all
|
242
433
|
end
|
243
434
|
end
|
244
435
|
|
245
|
-
|
436
|
+
if $PROGRAM_NAME == __FILE__
|
437
|
+
WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true)
|
438
|
+
end
|
metadata
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webarchive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yusuke Matsubara
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2019-06-30 00:00:00.000000000 Z
|
@@ -16,14 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 2.
|
19
|
+
version: 2.8.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.
|
26
|
+
version: 2.8.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: concurrent-ruby
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.1.6
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.1.6
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: fast_trie
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,72 +58,114 @@ dependencies:
|
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: 2.
|
61
|
+
version: 2.8.0
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: 2.
|
68
|
+
version: 2.8.0
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: net-http-persistent
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
75
|
+
version: 4.0.0
|
62
76
|
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
82
|
+
version: 4.0.0
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: simpleidn
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - "~>"
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: 0.
|
89
|
+
version: 0.2.1
|
76
90
|
type: :runtime
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
94
|
- - "~>"
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: 0.
|
96
|
+
version: 0.2.1
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: bundler
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '1
|
103
|
+
version: '2.1'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '1
|
110
|
+
version: '2.1'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: rake
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
117
|
+
version: '13.0'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
124
|
+
version: '13.0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: rspec
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '3.10'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '3.10'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: rubocop
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.81.0
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.81.0
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: webmock
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '3.13'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '3.13'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: pry
|
113
169
|
requirement: !ruby/object:Gem::Requirement
|
114
170
|
requirements:
|
115
171
|
- - ">="
|
@@ -123,7 +179,7 @@ dependencies:
|
|
123
179
|
- !ruby/object:Gem::Version
|
124
180
|
version: '0'
|
125
181
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
182
|
+
name: pry-doc
|
127
183
|
requirement: !ruby/object:Gem::Requirement
|
128
184
|
requirements:
|
129
185
|
- - ">="
|
@@ -154,7 +210,7 @@ licenses:
|
|
154
210
|
metadata:
|
155
211
|
homepage_uri: https://rubygems.org/gems/webarchive
|
156
212
|
source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
|
157
|
-
post_install_message:
|
213
|
+
post_install_message:
|
158
214
|
rdoc_options: []
|
159
215
|
require_paths:
|
160
216
|
- lib
|
@@ -162,15 +218,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
218
|
requirements:
|
163
219
|
- - "~>"
|
164
220
|
- !ruby/object:Gem::Version
|
165
|
-
version: '
|
221
|
+
version: '3.0'
|
166
222
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
223
|
requirements:
|
168
224
|
- - ">="
|
169
225
|
- !ruby/object:Gem::Version
|
170
226
|
version: '0'
|
171
227
|
requirements: []
|
172
|
-
rubygems_version: 3.
|
173
|
-
signing_key:
|
228
|
+
rubygems_version: 3.3.7
|
229
|
+
signing_key:
|
174
230
|
specification_version: 4
|
175
231
|
summary: webarchive - CUI tool to archive URIs
|
176
232
|
test_files: []
|