webarchive 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -7
- data/bin/webarchive +32 -6
- data/lib/webarchive/version.rb +3 -1
- data/lib/webarchive.rb +310 -109
- metadata +76 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 16eba058e7574f02d9e444a6d72b2aefb41b55fb0acb9adc1b6eef492cef306d
|
4
|
+
data.tar.gz: 33411ba62ea808f72b930ef52a5f112f4df80b4febb1d2bd88c7017a62785550
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8787995fe6bdc059e2275371ed33eb332a8f1c99aa099b995a20805258ed2d05983507a8f2958e104edcc31e7259bc3be68d22f838c917294687533718a799c5
|
7
|
+
data.tar.gz: f9e73ef39c3cb49b6bd25569cc2549292982aa47e71e1c31ce735eb350e8a1fe0882fe8cb0a3d9ee23d3b6075d33ec149a951f4d70f9095d4b8f03bc8c306e6c
|
data/README.md
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
# Webarchive
|
2
2
|
|
3
|
-
This is a CUI tool for sending URIs to public web archiving tools such
|
3
|
+
This is a CUI tool for sending URIs to public web archiving tools such
|
4
|
+
as web.archive.org and archive.today.
|
4
5
|
|
5
6
|
Requests are throttled.
|
6
7
|
|
8
|
+
## Rationale
|
9
|
+
|
10
|
+
This tool's motivation is simple - increased availability by redundancy. Your favorite web archiving service might be down at some point in time, or blocked by certain websites. Use 2 or more services to archive something, and your archive will be safe if at least one of them remains available.
|
11
|
+
|
12
|
+
Browser extensions with similar functionalities might exist, but this tool might be for you when you need to archive a large number of URLs, and well, if you like CUI.
|
13
|
+
|
7
14
|
## Installation
|
8
15
|
|
9
16
|
Use this line to install it:
|
@@ -22,25 +29,37 @@ If you have a list of URIs in a file, use pipe.
|
|
22
29
|
|
23
30
|
$ cat list.txt | webarchive
|
24
31
|
|
32
|
+
Note that, by default, this program logs all the URIs you enter into
|
33
|
+
`~/.webarchive_history`.
|
34
|
+
|
25
35
|
It has optional command-line parameters:
|
26
36
|
|
27
37
|
$ webarchive -h
|
28
|
-
|
38
|
+
|
29
39
|
Usage: webarchive [options]
|
30
|
-
-w, --wait=
|
40
|
+
-w, --wait=SECONDS wait for SECONDS between requests [default: 5.0]
|
41
|
+
-r, --retry=N retry for N times when failed [default: 5]
|
42
|
+
-t, --timeout=SECONDS timeout after SECONDS [default: 60.0]
|
43
|
+
--[no-]history record history [default: enabled]
|
31
44
|
-d, --debug add debug output, implies verbose
|
32
45
|
--verbose
|
33
|
-
|
46
|
+
-h, --help show help
|
34
47
|
|
35
48
|
## Development
|
36
49
|
|
37
|
-
After checking out the repo, run `bundle install` to install
|
50
|
+
After checking out the repo, run `bundle install` to install
|
51
|
+
dependencies. Then, run `rake spec` to run the tests.
|
38
52
|
|
39
|
-
To install this gem onto your local machine, run `bundle exec rake
|
53
|
+
To install this gem onto your local machine, run `bundle exec rake
|
54
|
+
install`. To release a new version, update the version number in
|
55
|
+
`version.rb`, and then run `bundle exec rake release`, which will
|
56
|
+
create a git tag for the version, push git commits and tags, and push
|
57
|
+
the `.gem` file to [rubygems.org](https://rubygems.org).
|
40
58
|
|
41
59
|
## Contributing
|
42
60
|
|
43
|
-
Bug reports and pull requests are welcome at
|
61
|
+
Bug reports and pull requests are welcome at
|
62
|
+
https://gitlab.com/yusuke.matsubara/webarchive.
|
44
63
|
|
45
64
|
## License
|
46
65
|
|
data/bin/webarchive
CHANGED
@@ -1,18 +1,39 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'webarchive'
|
4
5
|
require 'webarchive/version'
|
5
6
|
require 'optparse'
|
6
7
|
|
7
|
-
wait =
|
8
|
+
wait = 5.0
|
9
|
+
retry_ = 5
|
10
|
+
timeout = 60.0
|
8
11
|
debug = false
|
9
12
|
verbose = false
|
10
13
|
history = true
|
11
14
|
Version = WebArchive::VERSION
|
15
|
+
|
12
16
|
OptionParser.new do |opt|
|
13
|
-
opt.on('-w', '--wait=
|
14
|
-
|
15
|
-
|
17
|
+
opt.on('-w', '--wait=SECONDS',
|
18
|
+
"wait for SECONDS between requests [default: #{wait}]") do |v|
|
19
|
+
wait = v.to_f
|
20
|
+
end
|
21
|
+
opt.on('-r', '--retry=N',
|
22
|
+
"retry for N times when failed [default: #{retry_}]") do |v|
|
23
|
+
retry_ = v.to_i
|
24
|
+
end
|
25
|
+
opt.on('-t', '--timeout=SECONDS',
|
26
|
+
"timeout after SECONDS [default: #{timeout}]") do |v|
|
27
|
+
timeout = v.to_i
|
28
|
+
end
|
29
|
+
opt.on('--[no-]history',
|
30
|
+
"record history [default: #{history ? 'enabled' : 'disabled'}]") do |v|
|
31
|
+
history = v
|
32
|
+
end
|
33
|
+
opt.on('-d', '--debug', 'add debug output, implies verbose') do
|
34
|
+
debug = true
|
35
|
+
verbose = true
|
36
|
+
end
|
16
37
|
opt.on('--verbose') { verbose = true }
|
17
38
|
opt.on('-h', '--help', 'show help') do
|
18
39
|
puts opt
|
@@ -20,5 +41,10 @@ OptionParser.new do |opt|
|
|
20
41
|
end
|
21
42
|
end.parse!(ARGV)
|
22
43
|
|
23
|
-
|
24
|
-
|
44
|
+
tagline = "#{WebArchive} #{WebArchive::VERSION}"
|
45
|
+
tagline += ' (debug)' if debug
|
46
|
+
warn tagline if verbose
|
47
|
+
|
48
|
+
WebArchive.launch(wait_secs: wait, max_retry: retry_, history: history,
|
49
|
+
read_timeout_secs: timeout,
|
50
|
+
debug: debug, verbose: verbose)
|
data/lib/webarchive/version.rb
CHANGED
data/lib/webarchive.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
#
|
4
|
+
# This tool allows you to call multiple web archiving services. It
|
5
|
+
# works from command-line interactively. In the inside, it maintains a
|
6
|
+
# throttled queue for each service provider supported.
|
7
|
+
|
8
|
+
# (This file itself is executable without installation - see the bottom.)
|
4
9
|
|
5
10
|
require 'open-uri'
|
6
11
|
require 'readline'
|
@@ -8,22 +13,40 @@ require 'tempfile'
|
|
8
13
|
require 'simpleidn'
|
9
14
|
require 'net/http'
|
10
15
|
require 'addressable/uri'
|
16
|
+
require 'digest/sha2'
|
11
17
|
require 'mechanize'
|
12
18
|
require 'trie'
|
19
|
+
require 'concurrent'
|
13
20
|
|
14
21
|
# classes and functions of webarchive package
|
15
22
|
module WebArchive
|
23
|
+
Req = Struct.new("Req", :uri, :wait, :max_retry)
|
24
|
+
|
25
|
+
# error class for unexpected response from archiving service
|
26
|
+
class UnexpectedResponseError < StandardError
|
27
|
+
def initialize(cause = nil)
|
28
|
+
super(cause)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# error class for when redirect/canonical uri is not found
|
33
|
+
class NoAlternativeURIError < StandardError
|
34
|
+
def initialize(cause = nil)
|
35
|
+
super(cause)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
16
39
|
begin
|
17
40
|
require 'libnotify'
|
18
|
-
def self.warn_archive_fail(
|
19
|
-
warn "Not archived: #{
|
20
|
-
Libnotify.show(summary: "Not archived: #{
|
41
|
+
def self.warn_archive_fail(req, archiver, body)
|
42
|
+
warn "Not archived: #{req} by #{archiver}; #{body}"
|
43
|
+
Libnotify.show(summary: "Not archived: #{req} by #{archiver}",
|
21
44
|
body: body, timeout: 3)
|
22
45
|
end
|
23
46
|
rescue LoadError
|
24
|
-
unless
|
25
|
-
def self.warn_archive_fail(
|
26
|
-
warn "Not archived: #{
|
47
|
+
unless respond_to? :warn_archive_fail
|
48
|
+
def self.warn_archive_fail(req, archiver, body)
|
49
|
+
warn "Not archived: #{req} by #{archiver}; #{body}"
|
27
50
|
end
|
28
51
|
end
|
29
52
|
end
|
@@ -31,44 +54,183 @@ module WebArchive
|
|
31
54
|
# Queue for sending URLs to a certain archiving web site
|
32
55
|
# The block given to constructor will be executed for each '<<'
|
33
56
|
class ArchiveQueue < Queue
|
34
|
-
|
57
|
+
# Create a new instance of ArchiveQueue
|
58
|
+
# @param name [String] name of the queue
|
59
|
+
# @param interval [Float] length of the wait between requests
|
60
|
+
# @yield [String] URI that the queue receives
|
61
|
+
def initialize(name, interval)
|
35
62
|
super()
|
36
63
|
@name = name
|
64
|
+
@interval = interval
|
37
65
|
@all_sent = false
|
38
|
-
@in_process = 0
|
66
|
+
@in_process = Concurrent::AtomicFixnum.new(0)
|
67
|
+
last_request_time = Time.now - interval
|
39
68
|
@consumer = Thread.new do
|
40
69
|
loop do
|
41
|
-
|
42
|
-
@in_process += 1
|
70
|
+
req = self.deq # deq blocks until non-empty
|
71
|
+
@in_process.value += 1
|
43
72
|
begin
|
44
|
-
|
73
|
+
sleep time_until_next_req(last_request_time, Time.now)
|
74
|
+
last_request_time = Time.now
|
75
|
+
yield req.uri
|
45
76
|
rescue StandardError => e
|
46
|
-
|
47
|
-
|
48
|
-
|
77
|
+
if retry?(e) && req.max_retry.positive?
|
78
|
+
buff = [].tap { |a| a << self.deq until self.empty? }
|
79
|
+
@in_process.value += buff.size + 1
|
80
|
+
Concurrent::ScheduledTask.execute(req.wait) do
|
81
|
+
@in_process.value -= buff.size + 1
|
82
|
+
self.enq Req.new(req.uri, req.wait * 2, req.max_retry - 1)
|
83
|
+
buff.each { |x| self.enq x }
|
84
|
+
end
|
85
|
+
else
|
86
|
+
WebArchive.warn_archive_fail(
|
87
|
+
req.uri, name, ([e.inspect] + e.backtrace).join("\n")
|
88
|
+
)
|
89
|
+
end
|
49
90
|
ensure
|
50
|
-
@in_process -= 1
|
51
|
-
break if @all_sent && self.
|
52
|
-
|
53
|
-
sleep wait
|
91
|
+
@in_process.value -= 1
|
92
|
+
break if @all_sent && self.remaining.zero?
|
54
93
|
end
|
55
94
|
end
|
56
95
|
end
|
57
96
|
end
|
58
97
|
|
98
|
+
def time_until_next_req(last_req, current)
|
99
|
+
elapsed = [current - last_req, 0].max
|
100
|
+
[@interval - elapsed, 0].max
|
101
|
+
end
|
102
|
+
|
103
|
+
# @param exc [Exception]
|
104
|
+
# @return [Boolean]
|
105
|
+
def retry?(exc)
|
106
|
+
[
|
107
|
+
Errno::ECONNRESET,
|
108
|
+
Errno::EHOSTUNREACH
|
109
|
+
].include?(exc.class) ||
|
110
|
+
(exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('429 ')) ||
|
111
|
+
(exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('502 ')) ||
|
112
|
+
(exc.is_a?(OpenURI::HTTPError) && exc.message.start_with?('503 ')) ||
|
113
|
+
(exc.is_a?(Mechanize::ResponseCodeError) && exc.response_code == '503')
|
114
|
+
end
|
115
|
+
|
59
116
|
# mark as 'sending done' and wait for items to be processed
|
117
|
+
# @return [Boolean]
|
60
118
|
def done_sending
|
61
119
|
@all_sent = true
|
62
|
-
@consumer.join if self.remaining
|
120
|
+
@consumer.join if self.remaining.positive?
|
63
121
|
end
|
64
122
|
|
65
123
|
# number of queued items (including those being processed)
|
124
|
+
# @return [Integer]
|
66
125
|
def remaining
|
67
|
-
self.size + @in_process
|
126
|
+
self.size + @in_process.value
|
68
127
|
end
|
69
128
|
end
|
70
129
|
|
71
|
-
|
130
|
+
# Client with multiple queues
|
131
|
+
class Client
|
132
|
+
def initialize(wait_secs: 1, max_retry: 3,
|
133
|
+
redirect: false, canonical_uri: true)
|
134
|
+
@wait_secs = wait_secs
|
135
|
+
@max_retry = max_retry
|
136
|
+
@redirect = redirect
|
137
|
+
@canonical_uri = canonical_uri
|
138
|
+
|
139
|
+
@wait_secs = 0 if @wait_secs.negative?
|
140
|
+
@max_retry = 0 if @max_retry.negative?
|
141
|
+
@queues = []
|
142
|
+
end
|
143
|
+
|
144
|
+
# @param queue [ArchiveQueue]
|
145
|
+
def add_queue(queue)
|
146
|
+
@queues << queue
|
147
|
+
end
|
148
|
+
|
149
|
+
def queued_uris
|
150
|
+
@queues.map(&:remaining).inject(:+)
|
151
|
+
end
|
152
|
+
|
153
|
+
# @param uri [String]
|
154
|
+
# @return [Concurrent::Promises::Future] Gives the target URI if redirected
|
155
|
+
def with_redirect(uri)
|
156
|
+
Concurrent::Promises.future do
|
157
|
+
res = Net::HTTP.get_response(Addressable::URI.parse(uri))
|
158
|
+
raise NoAlternativeURIError, 'no redirect found' if
|
159
|
+
!res['location'] || res['location'] == uri
|
160
|
+
|
161
|
+
res['location']
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def add_scheme(uri, scheme)
|
166
|
+
if uri.relative?
|
167
|
+
uri = uri.dup
|
168
|
+
uri.scheme = scheme
|
169
|
+
end
|
170
|
+
uri
|
171
|
+
end
|
172
|
+
|
173
|
+
def equivalent_uri?(uri, str)
|
174
|
+
uri = add_scheme(uri, Addressable::URI.parse(str).scheme)
|
175
|
+
uri.to_s == str
|
176
|
+
end
|
177
|
+
|
178
|
+
# @param uri [String]
|
179
|
+
# @return [Concurrent::Promises::Future] Gives the canonical URI if there is one
|
180
|
+
def with_canonical_uri(uri)
|
181
|
+
Concurrent::Promises.future do
|
182
|
+
agent = Mechanize.new
|
183
|
+
page = agent.get(uri)
|
184
|
+
ret = nil
|
185
|
+
raise NoAlternativeURIError, 'no canonical URI found' unless
|
186
|
+
page.canonical_uri &&
|
187
|
+
page.class == Mechanize::Page &&
|
188
|
+
page.canonical_uri != page.uri
|
189
|
+
|
190
|
+
if page.canonical_uri.relative?
|
191
|
+
u2 = URI.join(page.uri, page.canonical_uri)
|
192
|
+
ret = u2.to_s if !equivalent_uri?(u2, uri) &&
|
193
|
+
!equivalent_uri?(u2, page.uri)
|
194
|
+
else
|
195
|
+
u1 = page.canonical_uri
|
196
|
+
u1 = add_scheme(u1, 'http') unless u1.scheme
|
197
|
+
ret = u1.to_s if !equivalent_uri?(u1, uri) &&
|
198
|
+
!equivalent_uri?(u1, page.uri)
|
199
|
+
end
|
200
|
+
|
201
|
+
raise NoAlternativeURIError, 'no canonical URI found' unless ret
|
202
|
+
|
203
|
+
ret
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
# @param uri [String]
|
208
|
+
# @return [void]
|
209
|
+
def send_single_uri(uri)
|
210
|
+
@queues.each do |q|
|
211
|
+
q.enq Req.new(uri, @wait_secs, @max_retry)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
# @param uri [String]
|
216
|
+
# @return [Concurrent::Promises::Future]
|
217
|
+
def send_uri(uri)
|
218
|
+
f0 = Concurrent::Promises.future{ send_single_uri(uri) }
|
219
|
+
f1 = with_canonical_uri(uri).then { |x| send_single_uri(x) } if @canonical_uri
|
220
|
+
f2 = with_redirect(uri).then { |x| send_single_uri(x) } if @redirect
|
221
|
+
f1 ||= Concurrent::Promises.future{}
|
222
|
+
f2 ||= Concurrent::Promises.future{}
|
223
|
+
f0.zip(f1).zip(f2)
|
224
|
+
end
|
225
|
+
|
226
|
+
def wait_for_queues
|
227
|
+
@queues.each(&:done_sending)
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
# @param str [String]
|
232
|
+
# @return [String]
|
233
|
+
def self.encode_non_ascii(str)
|
72
234
|
if str =~ /[^[:ascii:]]/
|
73
235
|
Addressable::URI.encode(str)
|
74
236
|
else
|
@@ -76,51 +238,58 @@ module WebArchive
|
|
76
238
|
end
|
77
239
|
end
|
78
240
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
241
|
+
# @param str [String]
|
242
|
+
# @return [String]
|
243
|
+
def self.prepend_http(uri)
|
244
|
+
if %r/\.[a-z]{2,4}(\/|$)/.match(uri) && %r{(^http|://)}.match(uri).nil?
|
245
|
+
'http://' + uri
|
246
|
+
else
|
247
|
+
uri
|
83
248
|
end
|
249
|
+
end
|
84
250
|
|
251
|
+
# Encode non-ASCII components in the given string and make a URI instance from
|
252
|
+
# @param str [String]
|
253
|
+
# @return [Addressable::URI]
|
254
|
+
def self.to_ascii_uri(str)
|
255
|
+
uri = prepend_http(str.strip)
|
85
256
|
u = Addressable::URI.parse(uri)
|
86
257
|
u.host = SimpleIDN.to_ascii(u.host)
|
87
|
-
u.path
|
88
|
-
|
89
|
-
|
258
|
+
u.path, u.query, u.fragment = [
|
259
|
+
u.path, u.query, u.fragment
|
260
|
+
].map(&method(:encode_non_ascii))
|
90
261
|
u
|
91
262
|
end
|
92
263
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
page.canonical_uri.to_s != uri &&
|
105
|
-
page.canonical_uri != page.uri
|
106
|
-
end
|
107
|
-
rescue Net::HTTPClientError, Mechanize::ResponseCodeError
|
108
|
-
# ignore since it will cause a warning later anyway
|
264
|
+
# Write log to a file
|
265
|
+
# @param source [String]
|
266
|
+
# @param content [String]
|
267
|
+
# @return [void]
|
268
|
+
def self.debug_output(source, uri, content)
|
269
|
+
ts = Time.now.strftime('%Y%m%d%H%M%S')
|
270
|
+
filename = "#{self}-#{source}-#{uri.gsub(/\W+/, '_')[0..30]}-"
|
271
|
+
filename += Digest::SHA256.hexdigest(uri + ts)[0..8]
|
272
|
+
Tempfile.open(filename) do |f|
|
273
|
+
f.puts content
|
274
|
+
end
|
109
275
|
end
|
110
276
|
|
111
277
|
# completer for URLs
|
112
278
|
class Completer
|
279
|
+
# @param history_file [File]
|
113
280
|
def initialize(history_file)
|
114
|
-
@file =
|
281
|
+
@file = history_file
|
115
282
|
@trie = Trie.new
|
116
|
-
|
283
|
+
reload!
|
117
284
|
end
|
118
285
|
|
119
|
-
|
120
|
-
|
286
|
+
# @return [void]
|
287
|
+
def update!
|
288
|
+
reload! if File.stat(@file).mtime > @lastupdate
|
121
289
|
end
|
122
290
|
|
123
|
-
|
291
|
+
# @return [void]
|
292
|
+
def reload!
|
124
293
|
if File.exist? @file
|
125
294
|
File.open(@file, encoding: 'utf-8').each_line do |x|
|
126
295
|
@trie.add x.strip
|
@@ -131,13 +300,16 @@ module WebArchive
|
|
131
300
|
@lastupdate = Time.now
|
132
301
|
end
|
133
302
|
|
303
|
+
# @return [Proc]
|
134
304
|
def to_proc
|
135
305
|
proc do |s|
|
136
|
-
|
306
|
+
update!
|
137
307
|
@trie.children(s)
|
138
308
|
end
|
139
309
|
end
|
140
310
|
|
311
|
+
# @param str [String]
|
312
|
+
# @return [void]
|
141
313
|
def append_to_history(str)
|
142
314
|
File.open(@file, mode: 'a', encoding: 'utf-8') do |f|
|
143
315
|
f.puts str
|
@@ -147,65 +319,97 @@ module WebArchive
|
|
147
319
|
|
148
320
|
HISTORY_FILE = '~/.webarchive_history'
|
149
321
|
|
150
|
-
|
151
|
-
|
322
|
+
# Launch the CLI
|
323
|
+
# @return [Concurrent::Promises::Future]
|
324
|
+
def self.launch(wait_secs: 1, max_retry: 3,
|
325
|
+
read_timeout_secs: 15,
|
326
|
+
redirect: false, canonical_uri: true,
|
327
|
+
history: true, debug: false, verbose: false)
|
328
|
+
|
329
|
+
args = method(__method__).parameters.map{ |k, v| [v, binding.local_variable_get(v)] }
|
330
|
+
puts "Arguments: #{args.inspect}" if verbose
|
331
|
+
|
332
|
+
wait_secs = 0 if wait_secs.negative?
|
333
|
+
max_retry = 0 if max_retry.negative?
|
334
|
+
|
152
335
|
Thread.abort_on_exception = true
|
153
336
|
completer = nil
|
154
337
|
if history
|
155
|
-
completer = Completer.new(HISTORY_FILE)
|
338
|
+
completer = Completer.new(File.expand_path(HISTORY_FILE))
|
156
339
|
Readline.completion_proc = completer.to_proc
|
157
340
|
Readline.completion_append_character = ''
|
158
341
|
end
|
159
342
|
|
160
|
-
|
343
|
+
client = Client.new(wait_secs: wait_secs, max_retry: max_retry,
|
344
|
+
redirect: redirect, canonical_uri: canonical_uri)
|
161
345
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
346
|
+
# prepare queues
|
347
|
+
client.add_queue(
|
348
|
+
ArchiveQueue.new('archive.org (logged out)', wait_secs) do |uri|
|
349
|
+
u = URI.parse('https://web.archive.org/save/' + uri)
|
350
|
+
u.open(read_timeout: read_timeout_secs) do |f|
|
351
|
+
if f.meta['content-location'] && verbose
|
352
|
+
puts "<https://web.archive.org#{f.meta['content-location']}>"
|
353
|
+
elsif verbose
|
354
|
+
puts f.meta.inspect
|
355
|
+
end
|
168
356
|
end
|
169
357
|
end
|
170
|
-
|
358
|
+
)
|
171
359
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
360
|
+
client.add_queue(
|
361
|
+
ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
|
362
|
+
agent = Mechanize.new
|
363
|
+
agent.read_timeout = read_timeout_secs
|
364
|
+
page = agent.get('https://megalodon.jp/pc/?' +
|
365
|
+
Addressable::URI.form_encode(url: uri))
|
366
|
+
form = page.forms.first
|
367
|
+
raise UnexpectedResponseError, page.inspect unless form
|
368
|
+
|
369
|
+
res = agent.submit(form)
|
370
|
+
if debug
|
371
|
+
debug_output('megalodonjp', uri, res.body)
|
180
372
|
end
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
373
|
+
og = res.at('meta[property="og:url"]')
|
374
|
+
uri = if og
|
375
|
+
og[:content]
|
376
|
+
else
|
377
|
+
res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
|
378
|
+
x =~ %r{megalodon\.jp/[\d-]+/}
|
379
|
+
end
|
188
380
|
end
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
381
|
+
puts "<#{uri}>" if verbose
|
382
|
+
agent.shutdown
|
383
|
+
end
|
384
|
+
)
|
193
385
|
|
194
|
-
|
195
|
-
|
196
|
-
|
386
|
+
client.add_queue(
|
387
|
+
ArchiveQueue.new('archive.today', wait_secs) do |uri|
|
388
|
+
agent = Mechanize.new
|
389
|
+
agent.read_timeout = read_timeout_secs
|
390
|
+
agent.follow_meta_refresh = true
|
197
391
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
392
|
+
page = agent.get('https://archive.is/')
|
393
|
+
form = page.form_with(id: 'submiturl')
|
394
|
+
if debug
|
395
|
+
debug_output('archivetoday', uri, page.inspect)
|
396
|
+
end
|
397
|
+
raise UnexpectedResponseError, page.inspect unless form
|
398
|
+
|
399
|
+
form['anyway'] = '1'
|
400
|
+
form.field_with(name: 'url').value = uri
|
401
|
+
sleep 5.0 # not submit too fast
|
402
|
+
page = agent.submit(form)
|
403
|
+
puts "<#{page.uri}>" if verbose
|
404
|
+
agent.shutdown
|
405
|
+
end
|
406
|
+
)
|
407
|
+
|
408
|
+
# main loop
|
206
409
|
|
207
410
|
uri_regexp = URI::DEFAULT_PARSER.make_regexp
|
208
|
-
|
411
|
+
all = Concurrent::Promises.future{}
|
412
|
+
while line = Readline.readline("Q(#{client.queued_uris})> ", add_hist: true)
|
209
413
|
uri = ''
|
210
414
|
begin
|
211
415
|
uri = to_ascii_uri(line).to_s
|
@@ -221,25 +425,22 @@ module WebArchive
|
|
221
425
|
next
|
222
426
|
end
|
223
427
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
end
|
232
|
-
end
|
233
|
-
rescue StandardError => e
|
234
|
-
warn "skipping canonical/redirect for #{uri}: #{e.message}"
|
235
|
-
end
|
236
|
-
|
237
|
-
completer.append_to_history(uri) if completer
|
428
|
+
f = client.send_uri(uri).then {
|
429
|
+
completer&.append_to_history(uri)
|
430
|
+
}.on_rejection { |reason, _|
|
431
|
+
warn "skipping canonical/redirect for #{uri}: #{reason}" if
|
432
|
+
!x.is_a?(NoAlternativeURIError)
|
433
|
+
}
|
434
|
+
all = all.zip(f)
|
238
435
|
end
|
239
436
|
|
240
|
-
|
437
|
+
all.wait
|
438
|
+
client.wait_for_queues
|
241
439
|
# TODO: trap INT and ask for confirmation
|
440
|
+
all
|
242
441
|
end
|
243
442
|
end
|
244
443
|
|
245
|
-
|
444
|
+
if $PROGRAM_NAME == __FILE__
|
445
|
+
WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true)
|
446
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webarchive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yusuke Matsubara
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -16,14 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 2.
|
19
|
+
version: 2.8.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.
|
26
|
+
version: 2.8.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: concurrent-ruby
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.3.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.3.0
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: fast_trie
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,72 +58,114 @@ dependencies:
|
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: 2.
|
61
|
+
version: 2.14.0
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: 2.
|
68
|
+
version: 2.14.0
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: net-http-persistent
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
75
|
+
version: 4.0.0
|
62
76
|
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
82
|
+
version: 4.0.0
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: simpleidn
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - "~>"
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: 0.
|
89
|
+
version: 0.2.1
|
76
90
|
type: :runtime
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
94
|
- - "~>"
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: 0.
|
96
|
+
version: 0.2.1
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: bundler
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '1
|
103
|
+
version: '2.1'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '1
|
110
|
+
version: '2.1'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: rake
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
117
|
+
version: '13.0'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
124
|
+
version: '13.0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: rspec
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '3.13'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '3.13'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: rubocop
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '1.70'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '1.70'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: webmock
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '3.24'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '3.24'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: pry
|
113
169
|
requirement: !ruby/object:Gem::Requirement
|
114
170
|
requirements:
|
115
171
|
- - ">="
|
@@ -123,7 +179,7 @@ dependencies:
|
|
123
179
|
- !ruby/object:Gem::Version
|
124
180
|
version: '0'
|
125
181
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
182
|
+
name: pry-doc
|
127
183
|
requirement: !ruby/object:Gem::Requirement
|
128
184
|
requirements:
|
129
185
|
- - ">="
|
@@ -154,7 +210,7 @@ licenses:
|
|
154
210
|
metadata:
|
155
211
|
homepage_uri: https://rubygems.org/gems/webarchive
|
156
212
|
source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
|
157
|
-
post_install_message:
|
213
|
+
post_install_message:
|
158
214
|
rdoc_options: []
|
159
215
|
require_paths:
|
160
216
|
- lib
|
@@ -162,15 +218,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
218
|
requirements:
|
163
219
|
- - "~>"
|
164
220
|
- !ruby/object:Gem::Version
|
165
|
-
version: '
|
221
|
+
version: '3.0'
|
166
222
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
223
|
requirements:
|
168
224
|
- - ">="
|
169
225
|
- !ruby/object:Gem::Version
|
170
226
|
version: '0'
|
171
227
|
requirements: []
|
172
|
-
rubygems_version: 3.
|
173
|
-
signing_key:
|
228
|
+
rubygems_version: 3.4.19
|
229
|
+
signing_key:
|
174
230
|
specification_version: 4
|
175
231
|
summary: webarchive - CUI tool to archive URIs
|
176
232
|
test_files: []
|