webarchive 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 90b93cbf35cd1c825adcac407f216e976c726b8e376074c3676411098a777ed4
4
+ data.tar.gz: c5015e92cb5eebaf598c1c9546197f82ff6362b0cd72213a5c62e5df7db8ffca
5
+ SHA512:
6
+ metadata.gz: f1464df34ea47318fb724b3d1a31b169c2b60aa712e9f70615f0eb87bd33c0ef8c9079f62dad880782bd60274b5878037aad49ad835b1e880fccdf62aeddf2d3
7
+ data.tar.gz: b4d50991096eb19d66ef15229efe5fceb320eae76203ea03782651c385089fc3f9f12eeaf41e56d7c051db26ede108520aba6b50da4b6fbc7e70cdbcd7ccbbb0
@@ -0,0 +1,22 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'webarchive'
4
+ require 'webarchive/version'
5
+ require 'optparse'
6
+
7
# Command-line entry point: parse the options, then start the interactive
# archiving prompt provided by WebArchive.launch.
wait = 4.0 # seconds passed to WebArchive.launch as wait_secs
debug = false
verbose = false

# OptionParser falls back to the top-level ::Version constant when it builds
# the string returned by `opt.ver`.
Version = WebArchive::VERSION

OptionParser.new do |opt|
  # Coerce through Float so that a non-numeric SECS is rejected with
  # OptionParser::InvalidArgument instead of silently becoming 0.0.
  opt.on('-w', '--wait=SECS', Float, 'wait for SECS before sending a request') { |v| wait = v }
  opt.on('-d', '--debug', 'add debug output, implies verbose') do
    debug = true
    # Keep the banner below consistent with the documented "implies verbose".
    verbose = true
  end
  opt.on('--verbose') { verbose = true }
  opt.on('--version') do
    puts opt.ver
    exit
  end
end.parse!(ARGV)

warn "#{WebArchive} #{WebArchive::VERSION}" if verbose
WebArchive.launch(wait_secs: wait, debug: debug, verbose: verbose)
@@ -0,0 +1,236 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ # (this file is also an executable - see the bottom)
4
+
5
+ require 'open-uri'
6
+ require 'readline'
7
+ require 'tempfile'
8
+ require 'simpleidn'
9
+ require 'net/http'
10
+ require 'addressable/uri'
11
+ require 'mechanize'
12
+ require 'trie'
13
+
14
+ # classes and functions of webarchive package
15
+ module WebArchive
16
# Defines warn_archive_fail(uri, archiver, body), which reports a URI that
# could not be archived. The report always goes to standard error; when the
# optional 'libnotify' library can be loaded, a desktop notification is shown
# as well.
begin
  require 'libnotify'
  libnotify_loaded = true
rescue LoadError
  libnotify_loaded = false
end

if libnotify_loaded
  def self.warn_archive_fail(uri, archiver, body)
    warn "Not archived: #{uri} by #{archiver}; #{body}"
    Libnotify.show(summary: "Not archived: #{uri} by #{archiver}",
                   body: body, timeout: 3)
  end
elsif !defined?(warn_archive_fail)
  # Plain fallback; an already existing definition is left untouched.
  def self.warn_archive_fail(uri, archiver, body)
    warn "Not archived: #{uri} by #{archiver}; #{body}"
  end
end
30
+
31
# Queue for sending URLs to a certain archiving web site.
#
# The block given to the constructor is executed, on a dedicated consumer
# thread, once for every item pushed onto the queue. A StandardError raised by
# the block is reported through WebArchive.warn_archive_fail and does not stop
# the consumer.
class ArchiveQueue < Queue
  # name:: label of the archiving site, used when reporting failures
  # wait:: seconds to sleep after each processed item
  def initialize(name, wait)
    super()
    @name = name
    @all_sent = false
    # Items are counted from the moment they are pushed until the block has
    # finished with them, so #remaining never misses an item that has been
    # popped but whose processing has not yet been accounted for.
    @lock = Mutex.new
    @pending = 0
    @consumer = Thread.new do
      loop do
        uri = pop
        begin
          yield uri
        rescue StandardError => e
          WebArchive.warn_archive_fail(uri, name, ([e.inspect] + Array(e.backtrace)).join("\n"))
        ensure
          # Only bookkeeping lives here: control flow inside `ensure` would
          # discard an exception that is propagating out of the block.
          @lock.synchronize { @pending -= 1 }
        end
        break if @all_sent && empty?

        sleep wait
      end
    end
  end

  # Enqueue an item for the consumer thread. Returns self.
  def push(item)
    @lock.synchronize { @pending += 1 }
    super
  end

  # Queue's built-in aliases still point at Queue#push, so route them through
  # the counting version explicitly.
  def <<(item)
    push(item)
  end

  def enq(item)
    push(item)
  end

  # mark as 'sending done' and wait for items to be processed
  def done_sending
    @all_sent = true
    @consumer.join if remaining > 0
  end

  # number of queued items (including those being processed)
  def remaining
    @lock.synchronize { @pending }
  end
end
68
+
69
# Returns +str+ percent-encoded by Addressable::URI.encode when it contains at
# least one non-ASCII character; any other value (including nil) is returned
# unchanged.
def self.my_normalize(str)
  return str unless str =~ /[^[:ascii:]]/

  Addressable::URI.encode(str)
end
76
+
77
# Parses user input into an Addressable::URI whose components are ASCII-only.
#
# Surrounding whitespace is stripped. Input that looks like a bare host name
# (a dot followed by 2-4 lowercase letters, then "/" or end of line) and has
# no scheme gets "http://" prepended. The host is converted with
# SimpleIDN.to_ascii; path, query and fragment go through my_normalize.
#
# Returns the Addressable::URI object. Errors raised by Addressable::URI.parse
# are not handled here.
def self.to_ascii_uri(str)
  uri = str.strip
  looks_like_host = uri =~ %r{\.[a-z]{2,4}(/|$)}
  # Test for a real scheme prefix rather than the bare string "http", so that
  # hosts such as "httpbin.org" still receive the default scheme.
  has_scheme = uri.include?('://') || uri =~ /\Ahttps?:/i
  uri = 'http://' + uri if looks_like_host && !has_scheme

  u = Addressable::URI.parse(uri)
  u.host = SimpleIDN.to_ascii(u.host)
  u.path = my_normalize(u.path)
  u.query = my_normalize(u.query)
  u.fragment = my_normalize(u.fragment)
  u
end
90
+
91
# Yields additional URIs related to +uri+ that differ from +uri+ itself:
#
# * when +redirect+ is truthy, the target of the Location header returned for
#   a GET of +uri+ (resolved against +uri+ when it is relative);
# * when +canonical+ is truthy, the canonical URI declared by the fetched page,
#   provided it also differs from the URI the page was served from.
#
# A Mechanize::ResponseCodeError is swallowed here; other errors propagate to
# the caller.
def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
  if redirect
    res = Net::HTTP.get_response(URI.parse(uri))
    location = res['location']
    if location
      # A Location header may be relative; resolve it so that an absolute URI
      # is yielded. Fall back to the raw header if it cannot be parsed.
      target = begin
        URI.join(uri, location).to_s
      rescue URI::Error
        location
      end
      yield target if target != uri
    end
  end
  if canonical
    agent = Mechanize.new
    begin
      page = agent.get(uri)
      if page.is_a?(Mechanize::Page) &&
         page.canonical_uri &&
         page.canonical_uri.to_s != uri &&
         page.canonical_uri != page.uri
        yield page.canonical_uri.to_s
      end
    ensure
      # Release the agent's connections even when the fetch or the block fails.
      agent.shutdown
    end
  end
rescue Mechanize::ResponseCodeError
  # ignore since it will cause a warning later anyway
  # (Net::HTTPClientError was dropped from this list: it is a response class,
  # not an exception, so it could never match.)
end
108
+
109
# Completer for URLs.
#
# Loads every line of a history file into a trie and offers the trie's
# children of the typed prefix as completion candidates. The file is read
# again whenever its modification time is newer than the last load.
class Completer
  # history_file:: path of the history file; it is expanded with
  #                File.expand_path and created empty when it does not exist
  def initialize(history_file)
    @file = File.expand_path(history_file)
    @trie = Trie.new
    reload
  end

  # Reload when the file changed since the last load, or when it has
  # disappeared (in which case reload creates it again instead of raising).
  def update
    reload if !File.exist?(@file) || File.mtime(@file) > @lastupdate
  end

  # Add every stripped line of the history file to the trie, creating an empty
  # file when there is none yet.
  def reload
    if File.exist?(@file)
      # File.foreach closes the file when done; a bare File.open(...).each_line
      # would leave the handle open.
      File.foreach(@file, encoding: 'utf-8') { |line| @trie.add(line.strip) }
    else
      File.open(@file, 'w', encoding: 'utf-8') {}
    end
    @lastupdate = Time.now
  end

  # Returns a proc that refreshes the trie if needed and then returns the
  # trie's children of the given prefix.
  def to_proc
    proc do |s|
      update
      @trie.children(s)
    end
  end
end
140
+
141
# Path of the URL history file; callers expand it with File.expand_path.
HISTORY_FILE = '~/.webarchive.rb_history'.freeze
142
+
143
# Runs the interactive prompt: reads URIs line by line with Readline and
# submits each one to three archiving sites (archive.org, megalodon.jp and
# archive.today), each served by its own ArchiveQueue.
#
# wait_secs::     passed to every ArchiveQueue as its wait between items
# debug::         dump the megalodon.jp response body to a temp file;
#                 also turns on verbose
# verbose::       print normalized input URIs and the resulting archive URLs
# redirect::      also queue the redirect target of each URI
# canonical_uri:: also queue the canonical URI declared by each page
#
# Every accepted URI is appended to HISTORY_FILE. When input ends, the method
# waits for all queues to finish their remaining items.
def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true)
  verbose = true if debug
  Thread.abort_on_exception = true
  Readline.completion_proc = Completer.new(HISTORY_FILE).to_proc
  Readline.completion_append_character = ''

  archive_org = ArchiveQueue.new('archive.org', wait_secs) do |uri|
    URI.parse("https://web.archive.org/save/#{uri}").open do |f|
      if f.meta['content-location'] && verbose
        puts "<https://web.archive.org#{f.meta['content-location']}>"
      elsif verbose
        puts f.meta.inspect
      end
    end
  end

  megalodon = ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
    agent = Mechanize.new
    begin
      page = agent.get('https://megalodon.jp/pc/?' +
                       Addressable::URI.form_encode(url: uri))
      res = agent.submit(page.forms.first)
      if debug
        Tempfile.open("#{self}-#{uri.gsub(/\W/, '_')}") do |f|
          f.puts res.body
        end
      end
      og = res.at('meta[property="og:url"]')
      archived = if og
                   og[:content]
                 else
                   # Fall back to the first link that looks like an archive
                   # URL, or to the response's own URI when there is none.
                   res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
                     x =~ %r{megalodon\.jp/[\d-]+/}
                   end
                 end
      puts "<#{archived}>" if verbose
    ensure
      # Shut the agent down even when a request above raised.
      agent.shutdown
    end
  end

  archive_today = ArchiveQueue.new('archive.today', wait_secs) do |uri|
    agent = Mechanize.new
    begin
      agent.follow_meta_refresh = true

      page = agent.get('https://archive.today/')
      form = page.form_with(id: 'submiturl')
      form['anyway'] = '1'
      form.field_with(name: 'url').value = uri
      page = agent.submit(form)
      puts "<#{page.uri}>" if verbose
    ensure
      agent.shutdown
    end
  end

  queues = [archive_org, megalodon, archive_today]

  uri_regexp = URI::DEFAULT_PARSER.make_regexp
  loop do
    # The prompt shows how many items are still queued or in progress.
    line = Readline.readline("Q(#{queues.sum(&:remaining)})> ", true)
    break unless line

    uri = begin
      to_ascii_uri(line).to_s
    rescue Addressable::URI::InvalidURIError => e
      warn_archive_fail(line.strip, '<>', e.message)
      ''
    end
    next if uri == ''

    puts uri if verbose

    if uri !~ uri_regexp
      warn "invalid; skipping '#{uri}'"
      next
    end

    queues.each { |q| q << uri }
    begin
      with_canonical_uri_and_redirect(uri, canonical_uri, redirect) do |x|
        queues.each { |q| q << x }
      end
    rescue StandardError => e
      warn "skipping canonical/redirect for #{uri}: #{e.message}"
    end

    File.open(File.expand_path(HISTORY_FILE), mode: 'a', encoding: 'utf-8') do |f|
      f.puts uri
    end
  end

  queues.each(&:done_sending)
  # TODO: trap INT and ask for confirmation
end
234
+ end
235
+
236
# When this file is executed directly (rather than required), start the
# interactive prompt with verbose and debug output enabled.
if $PROGRAM_NAME == __FILE__
  WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true)
end
@@ -0,0 +1,3 @@
1
module WebArchive
  # Version string of the webarchive package.
  VERSION = '0.1.0'.freeze
end
metadata ADDED
@@ -0,0 +1,160 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webarchive
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Yusuke Matsubara
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-06-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.6.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.6.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: fast_trie
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.5.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.5.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: mechanize
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 2.7.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 2.7.6
55
+ - !ruby/object:Gem::Dependency
56
+ name: simpleidn
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.1.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.1.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.17'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.17'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: webmock
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: CUI tool to archive URIs using web.archive.org, archive.today, and others
126
+ email: whym@whym.org
127
+ executables:
128
+ - webarchive
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - bin/webarchive
133
+ - lib/webarchive.rb
134
+ - lib/webarchive/version.rb
135
+ homepage: https://rubygems.org/gems/whym
136
+ licenses:
137
+ - MIT
138
+ metadata:
139
+ homepage_uri: https://rubygems.org/gems/whym
140
+ source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubygems_version: 3.0.1
157
+ signing_key:
158
+ specification_version: 4
159
+ summary: webarchive - CUI tool to archive URIs
160
+ test_files: []