webarchive 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 90b93cbf35cd1c825adcac407f216e976c726b8e376074c3676411098a777ed4
4
+ data.tar.gz: c5015e92cb5eebaf598c1c9546197f82ff6362b0cd72213a5c62e5df7db8ffca
5
+ SHA512:
6
+ metadata.gz: f1464df34ea47318fb724b3d1a31b169c2b60aa712e9f70615f0eb87bd33c0ef8c9079f62dad880782bd60274b5878037aad49ad835b1e880fccdf62aeddf2d3
7
+ data.tar.gz: b4d50991096eb19d66ef15229efe5fceb320eae76203ea03782651c385089fc3f9f12eeaf41e56d7c051db26ede108520aba6b50da4b6fbc7e70cdbcd7ccbbb0
@@ -0,0 +1,22 @@
1
#! /usr/bin/env ruby

# Command-line entry point: parses the options and starts the interactive
# archiving prompt provided by WebArchive.launch.

require 'webarchive'
require 'webarchive/version'
require 'optparse'

wait = 4.0
debug = false
verbose = false
# OptionParser#ver falls back to the top-level ::Version constant.
Version = WebArchive::VERSION
OptionParser.new do |opt|
  # Float coercion makes OptionParser reject non-numeric values instead of
  # letting String#to_f silently turn them into 0.0.
  opt.on('-w', '--wait=SECS', Float, 'wait for SECS before sending a request') do |v|
    # the value is later handed to Kernel#sleep, which raises on negatives
    raise OptionParser::InvalidArgument, v.to_s if v < 0

    wait = v
  end
  opt.on('-d', '--debug', 'add debug output, implies verbose') { debug = true }
  opt.on('--verbose') { verbose = true }
  opt.on('--version') do
    puts opt.ver
    exit
  end
end.parse!(ARGV)

warn "#{WebArchive} #{WebArchive::VERSION}" if verbose
WebArchive.launch(wait_secs: wait, debug: debug, verbose: verbose)
@@ -0,0 +1,236 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ # (this file is also an executable - see the bottom)
4
+
5
+ require 'open-uri'
6
+ require 'readline'
7
+ require 'tempfile'
8
+ require 'simpleidn'
9
+ require 'net/http'
10
+ require 'addressable/uri'
11
+ require 'mechanize'
12
+ require 'trie'
13
+
14
+ # classes and functions of webarchive package
15
+ module WebArchive
16
# Probe for the optional libnotify gem first, then define the reporter
# that matches what is available.
libnotify_loaded =
  begin
    require 'libnotify'
    true
  rescue LoadError
    false
  end

if libnotify_loaded
  # Reports a failed archive attempt on stderr and as a desktop notification.
  def self.warn_archive_fail(uri, archiver, body)
    warn "Not archived: #{uri} by #{archiver}; #{body}"
    Libnotify.show(summary: "Not archived: #{uri} by #{archiver}",
                   body: body, timeout: 3)
  end
elsif !defined?(warn_archive_fail)
  # Reports a failed archive attempt on stderr only (libnotify unavailable).
  def self.warn_archive_fail(uri, archiver, body)
    warn "Not archived: #{uri} by #{archiver}; #{body}"
  end
end
30
+
31
# Queue for sending URLs to a certain archiving web site.
# The block given to the constructor is executed, on a dedicated consumer
# thread, once for each item pushed with '<<' / push / enq.
class ArchiveQueue < Queue
  # name:: label of the archiving site, used in failure reports
  # wait:: seconds to sleep between two consecutive items
  def initialize(name, wait)
    super()
    @name = name
    @all_sent = false
    @lock = Mutex.new
    @pending = 0 # items pushed but not yet completely processed
    @consumer = Thread.new do
      loop do
        uri = pop
        begin
          yield uri
        rescue StandardError => e
          WebArchive.warn_archive_fail(uri, name, ([e.inspect] + Array(e.backtrace)).join("\n"))
        ensure
          @lock.synchronize { @pending -= 1 }
        end
        # Kept outside of 'ensure': a 'break' inside 'ensure' would silently
        # discard exceptions that are not StandardError.
        break if @all_sent && empty?

        sleep wait
      end
    end
  end

  # Enqueues an item. It is counted as pending before it becomes visible
  # to the consumer, so 'remaining' never misses an item that has been
  # popped but not processed yet.
  def push(item)
    @lock.synchronize { @pending += 1 }
    super
  end
  alias_method :<<, :push
  alias_method :enq, :push

  # mark as 'sending done' and wait for items to be processed
  def done_sending
    @all_sent = true
    @consumer.join if remaining > 0
  end

  # number of queued items (including those being processed)
  def remaining
    @lock.synchronize { @pending }
  end
end
68
+
69
# Percent-encodes +str+ via Addressable when it contains any non-ASCII
# character; anything else (including nil) is returned unchanged.
def self.my_normalize(str)
  return str unless str =~ /[^[:ascii:]]/

  Addressable::URI.encode(str)
end
76
+
77
# Parses user input into an Addressable::URI whose host is converted with
# SimpleIDN.to_ascii and whose path, query and fragment are passed through
# my_normalize.
#
# Input that looks like a bare host name (contains ".tld" followed by "/"
# or end of line) and carries no scheme is prefixed with "http://".
# Raises whatever Addressable::URI.parse raises for unparsable input.
def self.to_ascii_uri(str)
  uri = str.strip
  looks_like_host = %r{\.[a-z]{2,4}(/|$)}.match(uri)
  # Test for an actual http/https scheme rather than any leading "http",
  # so that hosts such as "httpbin.org" still get a scheme prepended.
  has_scheme = uri.include?('://') || uri =~ /\Ahttps?:/i
  uri = 'http://' + uri if looks_like_host && !has_scheme

  u = Addressable::URI.parse(uri)
  u.host = SimpleIDN.to_ascii(u.host) if u.host
  u.path = my_normalize(u.path)
  u.query = my_normalize(u.query)
  u.fragment = my_normalize(u.fragment)
  u
end
90
+
91
# Yields alternative addresses of +uri+ to the given block:
# * when +redirect+ is truthy, the 'location' response header of a GET
#   request, if present and different from +uri+;
# * when +canonical+ is truthy, the canonical URI reported by Mechanize for
#   the page, if present and different from both +uri+ and the page's URI.
#
# HTTP error responses are swallowed here; other errors propagate to the
# caller.
def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
  if redirect
    location = Net::HTTP.get_response(URI.parse(uri))['location']
    yield location if location && location != uri
  end
  if canonical
    agent = Mechanize.new
    begin
      page = agent.get(uri)
      if page.is_a?(Mechanize::Page)
        canon = page.canonical_uri
        yield canon.to_s if canon && canon.to_s != uri && canon != page.uri
      end
    ensure
      # release the agent's connections even when the fetch fails
      agent.shutdown
    end
  end
rescue Net::ProtoServerError, Mechanize::ResponseCodeError
  # Net::ProtoServerError is the exception superclass raised for 4xx
  # responses (Net::HTTPClientError is a response class, not an exception,
  # so rescuing it could never match).
  # ignore since it will cause a warning later anyway
end
108
+
109
# Completer for URLs, backed by a trie that is filled from a history file
# containing one entry per line.
class Completer
  # history_file:: path of the history file ("~" is expanded)
  def initialize(history_file)
    @file = File.expand_path(history_file)
    @trie = Trie.new
    reload
  end

  # Reloads the history when the file is missing or was modified after the
  # last load.
  def update
    reload if !File.exist?(@file) || File.stat(@file).mtime > @lastupdate
  end

  # Adds every stripped line of the history file to the trie; creates an
  # empty history file when none exists yet.
  def reload
    if File.exist?(@file)
      # File.foreach closes the file when done; the previous
      # File.open(...).each_line left the handle open
      File.foreach(@file, encoding: 'utf-8') { |line| @trie.add(line.strip) }
    else
      File.open(@file, 'w', encoding: 'utf-8') {}
    end
    @lastupdate = Time.now
  end

  # Returns a proc that refreshes the history if needed and then looks up
  # the trie entries for the given prefix.
  def to_proc
    proc do |s|
      update
      @trie.children(s)
    end
  end
end
140
+
141
+ HISTORY_FILE = '~/.webarchive.rb_history'
142
+
143
# Internal helper of launch: requests the web.archive.org save endpoint for
# +uri+ and, when +verbose+, prints the archived location (or the response
# metadata when no 'content-location' is reported).
def self.save_to_archive_org(uri, verbose)
  URI.parse("https://web.archive.org/save/#{uri}").open do |f|
    next unless verbose

    location = f.meta['content-location']
    if location
      puts "<https://web.archive.org#{location}>"
    else
      puts f.meta.inspect
    end
  end
end

# Internal helper of launch: submits +uri+ through the megalodon.jp form.
# With +debug+ the response body is written to a temp file; with +verbose+
# the resulting address is printed.
def self.save_to_megalodon(uri, debug, verbose)
  agent = Mechanize.new
  begin
    page = agent.get('https://megalodon.jp/pc/?' +
                     Addressable::URI.form_encode(url: uri))
    res = agent.submit(page.forms.first)
    if debug
      # NOTE(review): depending on the Ruby version the temp file may be
      # removed as soon as the block ends - confirm the dump is retrievable.
      Tempfile.open("#{self}-#{uri.gsub(/\W/, '_')}") do |f|
        f.puts res.body
      end
    end
    og = res.at('meta[property="og:url"]')
    archived = if og
                 og[:content]
               else
                 # fall back to the first matching link, or to the URI of
                 # the response itself when no link matches
                 res.links.map(&:href).find(-> { res.uri.to_s }) do |x|
                   x =~ %r{megalodon\.jp/[\d-]+/}
                 end
               end
    puts "<#{archived}>" if verbose
  ensure
    # shut the agent down even when the submission raised
    agent.shutdown
  end
end

# Internal helper of launch: submits +uri+ through the archive.today form
# and, when +verbose+, prints the URI of the resulting page.
def self.save_to_archive_today(uri, verbose)
  agent = Mechanize.new
  begin
    agent.follow_meta_refresh = true

    page = agent.get('https://archive.today/')
    form = page.form_with(id: 'submiturl')
    form['anyway'] = '1'
    form.field_with(name: 'url').value = uri
    page = agent.submit(form)
    puts "<#{page.uri}>" if verbose
  ensure
    # shut the agent down even when the submission raised
    agent.shutdown
  end
end

# Runs the interactive prompt: every line read is normalized, validated and
# pushed to one queue per archiving site; valid lines are appended to
# HISTORY_FILE. Returns once input ends and the queues are drained.
#
# wait_secs::     pause between two requests to the same site
# debug::         dump the megalodon.jp response to a temp file; implies verbose
# verbose::       print normalized input and archived addresses
# redirect::      also archive the target of an HTTP redirect
# canonical_uri:: also archive the page's canonical URI
def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true)
  verbose = true if debug
  Thread.abort_on_exception = true
  Readline.completion_proc = Completer.new(HISTORY_FILE).to_proc
  Readline.completion_append_character = ''

  queues = [
    ArchiveQueue.new('archive.org', wait_secs) { |uri| save_to_archive_org(uri, verbose) },
    ArchiveQueue.new('megalodon.jp', wait_secs) { |uri| save_to_megalodon(uri, debug, verbose) },
    ArchiveQueue.new('archive.today', wait_secs) { |uri| save_to_archive_today(uri, verbose) }
  ]

  uri_regexp = URI::DEFAULT_PARSER.make_regexp
  loop do
    # the second argument is a positional boolean (add to history)
    line = Readline.readline("Q(#{queues.map(&:remaining).inject(:+)})> ", true)
    break unless line

    uri = begin
      to_ascii_uri(line).to_s
    rescue Addressable::URI::InvalidURIError => e
      warn_archive_fail(line.strip, '<>', e.message)
      ''
    end
    next if uri == ''

    puts uri if verbose

    if uri !~ uri_regexp
      warn "invalid; skipping '#{uri}'"
      next
    end

    queues.each { |q| q << uri }
    begin
      with_canonical_uri_and_redirect(uri, canonical_uri, redirect) do |x|
        queues.each { |q| q << x }
      end
    rescue StandardError => e
      warn "skipping canonical/redirect for #{uri}: #{e.message}"
    end

    File.open(File.expand_path(HISTORY_FILE), mode: 'a', encoding: 'utf-8') do |f|
      f.puts uri
    end
  end

  queues.each(&:done_sending)
  # TODO: trap INT and ask for confirmation
end
234
+ end
235
+
236
+ WebArchive.launch(wait_secs: 1.0, verbose: true, debug: true) if $PROGRAM_NAME == __FILE__
@@ -0,0 +1,3 @@
1
module WebArchive
  # Version string of the webarchive gem; frozen so that it cannot be
  # mutated in place.
  VERSION = '0.1.0'.freeze
end
metadata ADDED
@@ -0,0 +1,160 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webarchive
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Yusuke Matsubara
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-06-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.6.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.6.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: fast_trie
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.5.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.5.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: mechanize
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 2.7.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 2.7.6
55
+ - !ruby/object:Gem::Dependency
56
+ name: simpleidn
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.1.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.1.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.17'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.17'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: webmock
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: CUI tool to archive URIs using web.archive.org, archive.today, and others
126
+ email: whym@whym.org
127
+ executables:
128
+ - webarchive
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - bin/webarchive
133
+ - lib/webarchive.rb
134
+ - lib/webarchive/version.rb
135
+ homepage: https://rubygems.org/gems/whym
136
+ licenses:
137
+ - MIT
138
+ metadata:
139
+ homepage_uri: https://rubygems.org/gems/whym
140
+ source_code_uri: https://gitlab.com/yusuke.matsubara/webarchive
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubygems_version: 3.0.1
157
+ signing_key:
158
+ specification_version: 4
159
+ summary: webarchive - CUI tool to archive URIs
160
+ test_files: []