webarchive 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/webarchive +22 -0
- data/lib/webarchive.rb +236 -0
- data/lib/webarchive/version.rb +3 -0
- metadata +160 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 90b93cbf35cd1c825adcac407f216e976c726b8e376074c3676411098a777ed4
|
4
|
+
data.tar.gz: c5015e92cb5eebaf598c1c9546197f82ff6362b0cd72213a5c62e5df7db8ffca
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f1464df34ea47318fb724b3d1a31b169c2b60aa712e9f70615f0eb87bd33c0ef8c9079f62dad880782bd60274b5878037aad49ad835b1e880fccdf62aeddf2d3
|
7
|
+
data.tar.gz: b4d50991096eb19d66ef15229efe5fceb320eae76203ea03782651c385089fc3f9f12eeaf41e56d7c051db26ede108520aba6b50da4b6fbc7e70cdbcd7ccbbb0
|
data/bin/webarchive
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#! /usr/bin/env ruby

# Command-line entry point: parses the options below and then starts the
# interactive archiving session via WebArchive.launch.

require 'webarchive'
require 'webarchive/version'
require 'optparse'

wait = 4.0
debug = false
verbose = false
# OptionParser#ver picks the version up from the top-level ::Version constant.
Version = WebArchive::VERSION

parser = OptionParser.new do |opt|
  # Coerce through Float so that a malformed value is rejected instead of
  # silently becoming 0.0 (which is what String#to_f would produce).
  opt.on('-w', '--wait=SECS', Float, 'wait for SECS before sending a request') do |v|
    # The value is later handed to Kernel#sleep, which rejects negatives.
    raise OptionParser::InvalidArgument, 'must not be negative' if v < 0

    wait = v
  end
  opt.on('-d', '--debug', 'add debug output, implies verbose') { debug = true }
  opt.on('--verbose') { verbose = true }
  opt.on('--version') do
    puts opt.ver
    exit
  end
end

begin
  parser.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Show a short message plus usage instead of a backtrace.
  abort "#{e.message}\n#{parser.help}"
end

# --debug is documented as implying --verbose, so apply that before the banner.
verbose ||= debug
warn "#{WebArchive} #{WebArchive::VERSION}" if verbose
WebArchive.launch(wait_secs: wait, debug: debug, verbose: verbose)
|
data/lib/webarchive.rb
ADDED
@@ -0,0 +1,236 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
# (this file is also an executable - see the bottom)
|
4
|
+
|
5
|
+
require 'open-uri'
|
6
|
+
require 'readline'
|
7
|
+
require 'tempfile'
|
8
|
+
require 'simpleidn'
|
9
|
+
require 'net/http'
|
10
|
+
require 'addressable/uri'
|
11
|
+
require 'mechanize'
|
12
|
+
require 'trie'
|
13
|
+
|
14
|
+
# classes and functions of webarchive package
|
15
|
+
module WebArchive
|
16
|
+
# libnotify is an optional dependency: remember whether it could be loaded so
# that failures can additionally be shown as desktop notifications.
@libnotify_available = begin
  require 'libnotify'
  true
rescue LoadError
  false
end

# Reports that +uri+ could not be archived by +archiver+.
#
# The message always goes to stderr. When libnotify was loaded, a desktop
# notification is shown as well; that part is best-effort, because this
# method is itself called from error handlers and must not raise.
#
# uri      - the URI that failed
# archiver - name of the archiving service
# body     - details (e.g. exception text and backtrace)
def self.warn_archive_fail(uri, archiver, body)
  warn "Not archived: #{uri} by #{archiver}; #{body}"
  return unless @libnotify_available

  begin
    Libnotify.show(summary: "Not archived: #{uri} by #{archiver}",
                   body: body, timeout: 3)
  rescue StandardError => e
    warn "desktop notification failed: #{e.message}"
  end
end
|
30
|
+
|
31
|
+
# Queue for sending URLs to a certain archiving web site.
# The block given to the constructor is executed, on a dedicated consumer
# thread, once for every item pushed onto the queue.
class ArchiveQueue < Queue
  # name - label of the archiving site, used when reporting failures
  # wait - seconds to sleep between two consecutive items
  def initialize(name, wait)
    super()
    @name = name
    @all_sent = false
    @lock = Mutex.new
    # Items pushed but not yet fully processed. Counting at push time (rather
    # than after pop) leaves no moment at which an in-flight item is invisible
    # to #remaining, so #done_sending cannot skip waiting for it.
    @pending = 0
    @consumer = Thread.new do
      loop do
        uri = pop
        begin
          yield uri
        rescue StandardError => e
          WebArchive.warn_archive_fail(uri, name, [e.inspect, *e.backtrace].join("\n"))
        ensure
          @lock.synchronize { @pending -= 1 }
        end
        # Kept outside of `ensure` so that a non-StandardError exception is
        # neither swallowed by `break` nor delayed by `sleep`.
        break if @all_sent && empty?

        sleep wait
      end
    end
  end

  # Enqueues +item+ for the consumer thread and returns the queue itself.
  def push(item)
    @lock.synchronize { @pending += 1 }
    begin
      super
    rescue StandardError
      # The item never made it into the queue; undo the bookkeeping.
      @lock.synchronize { @pending -= 1 }
      raise
    end
  end

  # Same as #push.
  def <<(item)
    push(item)
  end

  # Same as #push.
  def enq(item)
    push(item)
  end

  # mark as 'sending done' and wait for items to be processed
  def done_sending
    @all_sent = true
    @consumer.join if remaining > 0
  end

  # number of queued items (including those being processed)
  def remaining
    @lock.synchronize { @pending }
  end
end
|
68
|
+
|
69
|
+
# Returns +str+ unchanged when it holds no non-ASCII character (nil is passed
# through as well); otherwise returns Addressable::URI.encode(str).
def self.my_normalize(str)
  return str unless str =~ /[^[:ascii:]]/

  Addressable::URI.encode(str)
end
|
76
|
+
|
77
|
+
# Turns a line of user input into an Addressable::URI.
#
# The input is stripped first. "http://" is prepended when the text contains
# something shaped like a dot plus 2-4 lowercase letters followed by "/" or
# end of line, has no "://" and does not already start with "http". The host
# is then passed through SimpleIDN.to_ascii, and path, query and fragment
# through my_normalize.
def self.to_ascii_uri(str)
  text = str.strip
  domain_like = %r/\.[a-z]{2,4}(\/|$)/.match(text)
  text = "http://#{text}" if domain_like && !(text.include?('://') || text.start_with?('http'))

  Addressable::URI.parse(text).tap do |parsed|
    parsed.host = SimpleIDN.to_ascii(parsed.host)
    parsed.path = my_normalize(parsed.path)
    parsed.query = my_normalize(parsed.query)
    parsed.fragment = my_normalize(parsed.fragment)
  end
end
|
90
|
+
|
91
|
+
# Yields alternative addresses of +uri+ that differ from +uri+ itself.
#
# uri       - the address to inspect (String)
# canonical - when truthy, fetch the page with Mechanize and yield its
#             canonical URI if the page declares one that differs from both
#             +uri+ and the URI the page was fetched from
# redirect  - when truthy, request +uri+ with Net::HTTP and yield the value of
#             the Location response header if present and different
#
# A Mechanize::ResponseCodeError is ignored here since the archiving itself
# will cause a warning later anyway. (The rescue list used to name
# Net::HTTPClientError too, but that is a response class, not an exception,
# so it could never match.) Other errors propagate to the caller.
def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
  if redirect
    location = Net::HTTP.get_response(URI.parse(uri))['location']
    yield location if location && location != uri
  end
  return unless canonical

  agent = Mechanize.new
  begin
    page = agent.get(uri)
    declared = page.canonical_uri if page.is_a?(Mechanize::Page)
    yield declared.to_s if declared && declared.to_s != uri && declared != page.uri
  ensure
    # Release the agent's connections, as the other Mechanize users in this
    # file do.
    agent.shutdown
  end
rescue Mechanize::ResponseCodeError
  # ignore since it will cause a warning later anyway
  nil
end
|
108
|
+
|
109
|
+
# Completer for URLs: offers the lines of a history file as completion
# candidates, looked up by prefix in a Trie.
class Completer
  # history_file - path of the history file; it is expanded with
  #                File.expand_path and created empty when it does not exist
  def initialize(history_file)
    @file = File.expand_path(history_file)
    @trie = Trie.new
    reload
  end

  # Reloads the history when the file is missing or was modified after the
  # last load.
  def update
    reload unless File.exist?(@file) && File.mtime(@file) <= @lastupdate
  end

  # Adds every (stripped) line of the history file to the trie, creating an
  # empty file first when there is none, and records the load time.
  def reload
    if File.exist?(@file)
      # File.foreach closes the file once iteration is over.
      File.foreach(@file, encoding: 'utf-8') { |line| @trie.add(line.strip) }
    else
      File.write(@file, '')
    end
    @lastupdate = Time.now
  end

  # Returns a proc that refreshes the history if needed and then returns the
  # trie's children for the given prefix.
  def to_proc
    proc do |s|
      update
      @trie.children(s)
    end
  end
end
|
140
|
+
|
141
|
+
HISTORY_FILE = '~/.webarchive.rb_history'
|
142
|
+
|
143
|
+
# Runs the interactive session: reads URIs line by line with Readline and
# hands each one to three archiving queues (archive.org, megalodon.jp and
# archive.today). Every accepted URI is appended to HISTORY_FILE. Once input
# ends, each queue is told that sending is done.
#
# wait_secs     - wait passed to every ArchiveQueue
# debug         - also turns verbose on; dumps the megalodon.jp response body
#                 into a Tempfile
# verbose       - print the normalized URI and the archivers' results
# redirect      - forwarded to with_canonical_uri_and_redirect
# canonical_uri - forwarded to with_canonical_uri_and_redirect
def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true)
  verbose ||= debug
  Thread.abort_on_exception = true
  Readline.completion_proc = Completer.new(HISTORY_FILE).to_proc
  Readline.completion_append_character = ''

  wayback = ArchiveQueue.new('archive.org', wait_secs) do |uri|
    URI.parse("https://web.archive.org/save/#{uri}").open do |response|
      next unless verbose

      location = response.meta['content-location']
      if location
        puts "<https://web.archive.org#{location}>"
      else
        puts response.meta.inspect
      end
    end
  end

  megalodon = ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
    agent = Mechanize.new
    start_page = agent.get("https://megalodon.jp/pc/?#{Addressable::URI.form_encode(url: uri)}")
    result = agent.submit(start_page.forms.first)
    Tempfile.open("#{self}-#{uri.gsub(/\W/, '_')}") { |dump| dump.puts result.body } if debug

    og_url = result.at('meta[property="og:url"]')
    archived = if og_url
                 og_url[:content]
               else
                 # fall back to the URI of the result page when no link matches
                 fallback = -> { result.uri.to_s }
                 result.links.map(&:href).find(fallback) { |href| href =~ %r{megalodon\.jp/[\d-]+/} }
               end
    puts "<#{archived}>" if verbose
    agent.shutdown
  end

  archive_today = ArchiveQueue.new('archive.today', wait_secs) do |uri|
    agent = Mechanize.new
    agent.follow_meta_refresh = true

    submit_form = agent.get('https://archive.today/').form_with(id: 'submiturl')
    submit_form['anyway'] = '1'
    submit_form.field_with(name: 'url').value = uri
    result = agent.submit(submit_form)
    puts "<#{result.uri}>" if verbose
    agent.shutdown
  end

  queues = [wayback, megalodon, archive_today]
  enqueue = ->(address) { queues.each { |queue| queue << address } }

  uri_regexp = URI::DEFAULT_PARSER.make_regexp
  while (line = Readline.readline("Q(#{queues.sum(&:remaining)})> ", add_hist: true))
    uri = begin
      to_ascii_uri(line).to_s
    rescue Addressable::URI::InvalidURIError => e
      warn_archive_fail(line.strip, '<>', e.message)
      ''
    end
    next if uri.empty?

    puts uri if verbose

    unless uri =~ uri_regexp
      warn "invalid; skipping '#{uri}'"
      next
    end

    enqueue.call(uri)
    begin
      with_canonical_uri_and_redirect(uri, canonical_uri, redirect) { |alternative| enqueue.call(alternative) }
    rescue StandardError => e
      warn "skipping canonical/redirect for #{uri}: #{e.message}"
    end

    File.open(File.expand_path(HISTORY_FILE), mode: 'a', encoding: 'utf-8') { |history| history.puts uri }
  end

  queues.each(&:done_sending)
  # TODO: trap INT and ask for confirmation
end
|
234
|
+
end
|
235
|
+
|
236
|
+
# When this file is run directly (rather than required), start a session
# with a 1-second wait and with verbose and debug output enabled.
WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true) if $PROGRAM_NAME == __FILE__
|
metadata
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webarchive
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yusuke Matsubara
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-06-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: addressable
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.6.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: fast_trie
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.5.1
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.5.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: mechanize
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.7.6
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.7.6
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: simpleidn
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.1.1
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.1.1
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.17'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.17'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '10.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '10.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: webmock
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description: CUI tool to archive URIs using web.archive.org, archive.today, and others
|
126
|
+
email: whym@whym.org
|
127
|
+
executables:
|
128
|
+
- webarchive
|
129
|
+
extensions: []
|
130
|
+
extra_rdoc_files: []
|
131
|
+
files:
|
132
|
+
- bin/webarchive
|
133
|
+
- lib/webarchive.rb
|
134
|
+
- lib/webarchive/version.rb
|
135
|
+
homepage: https://rubygems.org/gems/whym
|
136
|
+
licenses:
|
137
|
+
- MIT
|
138
|
+
metadata:
|
139
|
+
homepage_uri: https://rubygems.org/gems/whym
|
140
|
+
source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
|
141
|
+
post_install_message:
|
142
|
+
rdoc_options: []
|
143
|
+
require_paths:
|
144
|
+
- lib
|
145
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - ">="
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
|
+
requirements:
|
152
|
+
- - ">="
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: '0'
|
155
|
+
requirements: []
|
156
|
+
rubygems_version: 3.0.1
|
157
|
+
signing_key:
|
158
|
+
specification_version: 4
|
159
|
+
summary: webarchive - CUI tool to archive URIs
|
160
|
+
test_files: []
|