webarchive 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/webarchive +22 -0
- data/lib/webarchive.rb +236 -0
- data/lib/webarchive/version.rb +3 -0
- metadata +160 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 90b93cbf35cd1c825adcac407f216e976c726b8e376074c3676411098a777ed4
|
4
|
+
data.tar.gz: c5015e92cb5eebaf598c1c9546197f82ff6362b0cd72213a5c62e5df7db8ffca
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f1464df34ea47318fb724b3d1a31b169c2b60aa712e9f70615f0eb87bd33c0ef8c9079f62dad880782bd60274b5878037aad49ad835b1e880fccdf62aeddf2d3
|
7
|
+
data.tar.gz: b4d50991096eb19d66ef15229efe5fceb320eae76203ea03782651c385089fc3f9f12eeaf41e56d7c051db26ede108520aba6b50da4b6fbc7e70cdbcd7ccbbb0
|
data/bin/webarchive
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#! /usr/bin/env ruby

# Command-line entry point of the webarchive gem: parses the options below and
# hands them to WebArchive.launch.

require 'webarchive'
require 'webarchive/version'
require 'optparse'

wait = 4.0
debug = false
verbose = false
# OptionParser reads the top-level Version constant when it builds `opt.ver`.
Version = WebArchive::VERSION

parser = OptionParser.new do |opt|
  # Float coercion lets OptionParser reject non-numeric values; String#to_f
  # would silently turn a typo into 0.0, i.e. no delay between requests.
  opt.on('-w', '--wait=SECS', Float, 'wait for SECS before sending a request') do |v|
    # A negative interval would later make Kernel#sleep raise ArgumentError.
    raise OptionParser::InvalidArgument, 'SECS must not be negative' if v.negative?

    wait = v
  end
  opt.on('-d', '--debug', 'add debug output, implies verbose') { debug = true }
  opt.on('--verbose', 'print each URI and the resulting archive locations') { verbose = true }
  opt.on('--version') do
    puts opt.ver
    exit
  end
end

begin
  parser.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Report bad options as a one-line message instead of a stack trace.
  abort "#{parser.program_name}: #{e.message}"
end

warn "#{WebArchive} #{WebArchive::VERSION}" if verbose
WebArchive.launch(wait_secs: wait, debug: debug, verbose: verbose)
|
data/lib/webarchive.rb
ADDED
@@ -0,0 +1,236 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
# (this file is also an executable - see the bottom)
|
4
|
+
|
5
|
+
require 'open-uri'
|
6
|
+
require 'readline'
|
7
|
+
require 'tempfile'
|
8
|
+
require 'simpleidn'
|
9
|
+
require 'net/http'
|
10
|
+
require 'addressable/uri'
|
11
|
+
require 'mechanize'
|
12
|
+
require 'trie'
|
13
|
+
|
14
|
+
# classes and functions of webarchive package
|
15
|
+
module WebArchive
|
16
|
+
# libnotify is an optional dependency: without it failures are only reported
# on standard error.
begin
  require 'libnotify'
rescue LoadError
  # desktop notifications unavailable; warn_archive_fail falls back to stderr
end

# Report that +uri+ could not be archived by +archiver+.
#
# uri::      the URI that failed
# archiver:: name of the archiving service
# body::     details of the failure (e.g. exception text and backtrace)
#
# The message always goes to standard error; when libnotify was loaded a
# desktop notification is shown as well.
def self.warn_archive_fail(uri, archiver, body)
  warn "Not archived: #{uri} by #{archiver}; #{body}"
  return unless defined?(Libnotify)

  Libnotify.show(summary: "Not archived: #{uri} by #{archiver}",
                 body: body, timeout: 3)
rescue StandardError => e
  # This method is itself called from error handlers, so a notification
  # problem is reported rather than raised.
  warn "Desktop notification failed: #{e.message}"
end
|
30
|
+
|
31
|
+
# Queue for sending URLs to a certain archiving web site.
# The block given to the constructor is executed for each URL pushed with
# '<<'; it runs on a consumer thread owned by the queue.
class ArchiveQueue < Queue
  # name:: label of the archiving site, passed to WebArchive.warn_archive_fail
  # wait:: minimum number of seconds between the end of one request and the
  #        start of the next
  #
  # Raises ArgumentError when no block is given.
  def initialize(name, wait, &handler)
    raise ArgumentError, 'a block handling each URL is required' unless handler

    super()
    @name = name
    @in_process = 0 # always <= 1
    @consumer = Thread.new { consume(wait, handler) }
  end

  # Mark as 'sending done' and wait for the queued items to be processed.
  # The queue is closed, so pushing afterwards raises ClosedQueueError.
  def done_sending
    # Closing makes `pop` return nil once the queue is drained, so the consumer
    # ends by itself. Joining unconditionally avoids deciding from `remaining`,
    # which is momentarily 0 between `pop` and the @in_process increment.
    close
    @consumer.join
  end

  # number of queued items (including those being processed)
  def remaining
    size + @in_process
  end

  private

  # Consumer loop: handles URLs one at a time until the queue is closed and
  # empty. The pause is taken before a request rather than after it, so an
  # idle consumer is blocked in `pop` and stops as soon as the queue closes.
  def consume(wait, handler)
    ready_at = nil # monotonic time before which the next request must not start
    while (uri = pop)
      @in_process += 1
      begin
        pause = ready_at ? ready_at - monotonic_now : 0
        sleep pause if pause.positive?
        handler.call(uri)
      rescue StandardError => e
        WebArchive.warn_archive_fail(uri, @name, ([e.inspect] + Array(e.backtrace)).join("\n"))
      ensure
        @in_process -= 1
        ready_at = monotonic_now + wait
      end
    end
  end

  # Seconds on the monotonic clock (unaffected by wall-clock adjustments).
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
|
68
|
+
|
69
|
+
# Return +str+ unchanged when it contains only ASCII characters (nil is
# returned as-is too); otherwise return it passed through
# Addressable::URI.encode.
def self.my_normalize(str)
  return str unless str =~ /[^[:ascii:]]/

  Addressable::URI.encode(str)
end
|
76
|
+
|
77
|
+
# Turn user input into an Addressable::URI whose components are ASCII.
#
# str:: raw text typed by the user; surrounding whitespace is ignored
#
# Input that looks like a bare host name ("example.com/path") gets an
# 'http://' prefix. The host is converted with SimpleIDN.to_ascii and path,
# query and fragment go through my_normalize.
#
# Returns the Addressable::URI; errors raised by Addressable::URI.parse
# propagate to the caller.
def self.to_ascii_uri(str)
  uri = str.strip
  looks_like_host = %r{\.[a-z]{2,4}(/|$)}.match?(uri)
  # Only an explicit scheme suppresses the prefix. Testing for a bare 'http'
  # prefix would also catch host names such as "httpbin.org".
  has_scheme = uri.include?('://') || uri.match?(/\Ahttps?:/)
  uri = "http://#{uri}" if looks_like_host && !has_scheme

  u = Addressable::URI.parse(uri)
  # Input without an authority part (e.g. an empty line) has no host to convert.
  u.host = SimpleIDN.to_ascii(u.host) if u.host
  u.path = my_normalize(u.path)
  u.query = my_normalize(u.query)
  u.fragment = my_normalize(u.fragment)
  u
end
|
90
|
+
|
91
|
+
# Yield alternative addresses of +uri+ to the given block.
#
# uri::       the address to inspect (String)
# canonical:: when true, fetch the page with Mechanize and yield its canonical
#             URI if it differs from both +uri+ and the fetched page's URI
# redirect::  when true, request +uri+ once and yield the 'Location' header
#             value if present and different from +uri+
#
# HTTP error responses are ignored here since archiving the address will
# produce a warning later anyway; other errors propagate.
def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
  if redirect
    location = Net::HTTP.get_response(URI.parse(uri))['location']
    yield location if location && location != uri
  end
  return unless canonical

  agent = Mechanize.new
  begin
    page = agent.get(uri)
    canonical_uri = page.canonical_uri if page.is_a?(Mechanize::Page)
    yield canonical_uri.to_s if canonical_uri &&
                                canonical_uri.to_s != uri &&
                                canonical_uri != page.uri
  ensure
    # release the agent, as the other Mechanize users in this file do
    agent.shutdown
  end
rescue Net::HTTPExceptions, Mechanize::ResponseCodeError
  # Net::HTTPExceptions is the module shared by net/http's exception classes;
  # Net::HTTPClientError is a response class and can never match in a rescue.
  nil
end
|
108
|
+
|
109
|
+
# Completer for URLs: offers the entries of a history file (one per line)
# that start with the text typed so far.
class Completer
  # history_file:: path of the history file; '~' is expanded. The file is
  #                created empty when it does not exist yet.
  def initialize(history_file)
    @file = File.expand_path(history_file)
    @trie = Trie.new
    reload
  end

  # Re-read the history file if it changed (or disappeared) since the last
  # reload.
  def update
    reload if !File.exist?(@file) || File.mtime(@file) > @lastupdate
  end

  # Add every line of the history file to the trie, creating the file first
  # when it is missing.
  def reload
    # Taken before reading so that lines appended while reading still make
    # the file look newer at the next update.
    stamp = Time.now
    if File.exist?(@file)
      # File.foreach closes the file when done; a bare File.open(...).each_line
      # would leave one handle open per reload.
      File.foreach(@file, encoding: 'utf-8') { |line| @trie.add(line.strip) }
    else
      File.write(@file, '')
    end
    @lastupdate = stamp
  end

  # Returns a proc suitable for Readline.completion_proc.
  def to_proc
    proc do |s|
      update
      @trie.children(s)
    end
  end
end
|
140
|
+
|
141
|
+
# Path of the file that stores every archived URI, one per line; it also feeds
# the completer. Frozen so the shared constant cannot be mutated by accident.
HISTORY_FILE = '~/.webarchive.rb_history'.freeze
|
142
|
+
|
143
|
+
# Run the interactive session: read URIs from the terminal and queue each of
# them for archive.org, megalodon.jp and archive.today until end of input,
# then wait for the queues to drain.
#
# wait_secs::     pause between two requests to the same site
# debug::         save the megalodon.jp response to a file; implies verbose
# verbose::       print each URI and the resulting archive locations
# redirect::      also archive the redirect target of each URI
# canonical_uri:: also archive the canonical URI declared by each page
def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true)
  verbose ||= debug
  Thread.abort_on_exception = true
  Readline.completion_proc = Completer.new(HISTORY_FILE).to_proc
  Readline.completion_append_character = ''

  queues = [
    archive_org_queue(wait_secs, verbose),
    megalodon_queue(wait_secs, debug, verbose),
    archive_today_queue(wait_secs, verbose)
  ]
  history = File.expand_path(HISTORY_FILE)
  uri_regexp = URI::DEFAULT_PARSER.make_regexp

  # The second argument of Readline.readline is a plain boolean ("add the line
  # to the history"), not a keyword argument.
  while (line = Readline.readline("Q(#{queues.sum(&:remaining)})> ", true))
    begin
      uri = to_ascii_uri(line).to_s
    rescue Addressable::URI::InvalidURIError => e
      warn_archive_fail(line.strip, '<>', e.message)
      next
    end
    next if uri.empty?

    puts uri if verbose

    unless uri_regexp.match?(uri)
      warn "invalid; skipping '#{uri}'"
      next
    end

    queues.each { |q| q << uri }
    begin
      with_canonical_uri_and_redirect(uri, canonical_uri, redirect) do |extra|
        queues.each { |q| q << extra }
      end
    rescue StandardError => e
      warn "skipping canonical/redirect for #{uri}: #{e.message}"
    end

    File.open(history, mode: 'a', encoding: 'utf-8') { |f| f.puts uri }
  end

  queues.each(&:done_sending)
  # TODO: trap INT and ask for confirmation
end

# Queue that submits each URI to the Wayback Machine's save endpoint.
def self.archive_org_queue(wait_secs, verbose)
  ArchiveQueue.new('archive.org', wait_secs) do |uri|
    URI.parse("https://web.archive.org/save/#{uri}").open do |f|
      if verbose
        location = f.meta['content-location']
        puts(location ? "<https://web.archive.org#{location}>" : f.meta.inspect)
      end
    end
  end
end

# Queue that submits each URI through the megalodon.jp web form.
def self.megalodon_queue(wait_secs, debug, verbose)
  ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
    agent = Mechanize.new
    begin
      page = agent.get('https://megalodon.jp/pc/?' +
                       Addressable::URI.form_encode(url: uri))
      res = agent.submit(page.forms.first)
      dump_debug_response(uri, res.body) if debug
      og = res.at('meta[property="og:url"]')
      archived = if og
                   og[:content]
                 else
                   # first link that looks like a snapshot, else the page itself
                   res.links.map(&:href).find(-> { res.uri.to_s }) do |href|
                     href =~ %r{megalodon\.jp/[\d-]+/}
                   end
                 end
      puts "<#{archived}>" if verbose
    ensure
      # release the agent even when a request or the parsing above raises
      agent.shutdown
    end
  end
end

# Queue that submits each URI through the archive.today web form.
def self.archive_today_queue(wait_secs, verbose)
  ArchiveQueue.new('archive.today', wait_secs) do |uri|
    agent = Mechanize.new
    begin
      agent.follow_meta_refresh = true
      form = agent.get('https://archive.today/').form_with(id: 'submiturl')
      form['anyway'] = '1'
      form.field_with(name: 'url').value = uri
      page = agent.submit(form)
      puts "<#{page.uri}>" if verbose
    ensure
      # release the agent even when a request or the form lookup raises
      agent.shutdown
    end
  end
end

# Save a response body for debugging and report where it went.
# Tempfile.create without a block leaves the file in place; a Tempfile object
# would be unlinked by its finalizer before anyone could look at it.
def self.dump_debug_response(uri, body)
  # the URI-derived part is truncated to keep the file name within OS limits
  file = Tempfile.create("#{self}-#{uri.gsub(/\W/, '_')[0, 64]}")
  file.puts body
  file.close
  warn "debug: response for #{uri} saved to #{file.path}"
end

private_class_method :archive_org_queue, :megalodon_queue, :archive_today_queue,
                     :dump_debug_response
|
234
|
+
end
|
235
|
+
|
236
|
+
# When this file is executed directly instead of being required, start a
# session with debug output and a one-second pause between requests.
if $PROGRAM_NAME == __FILE__
  WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true)
end
|
metadata
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webarchive
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yusuke Matsubara
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-06-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: addressable
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.6.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: fast_trie
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.5.1
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.5.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: mechanize
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.7.6
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.7.6
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: simpleidn
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.1.1
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.1.1
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.17'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.17'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '10.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '10.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: webmock
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description: CUI tool to archive URIs using web.archive.org, archive.today, and others
|
126
|
+
email: whym@whym.org
|
127
|
+
executables:
|
128
|
+
- webarchive
|
129
|
+
extensions: []
|
130
|
+
extra_rdoc_files: []
|
131
|
+
files:
|
132
|
+
- bin/webarchive
|
133
|
+
- lib/webarchive.rb
|
134
|
+
- lib/webarchive/version.rb
|
135
|
+
homepage: https://rubygems.org/gems/whym
|
136
|
+
licenses:
|
137
|
+
- MIT
|
138
|
+
metadata:
|
139
|
+
homepage_uri: https://rubygems.org/gems/whym
|
140
|
+
source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
|
141
|
+
post_install_message:
|
142
|
+
rdoc_options: []
|
143
|
+
require_paths:
|
144
|
+
- lib
|
145
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - ">="
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
|
+
requirements:
|
152
|
+
- - ">="
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: '0'
|
155
|
+
requirements: []
|
156
|
+
rubygems_version: 3.0.1
|
157
|
+
signing_key:
|
158
|
+
specification_version: 4
|
159
|
+
summary: webarchive - CUI tool to archive URIs
|
160
|
+
test_files: []
|