w3m-autopagerize 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/next.cgi +31 -0
- data/bin/w3m-autopagerize-server.rb +489 -0
- data/config.sample.rb +130 -0
- data/readme.html +252 -0
- data/readme.org +65 -0
- data/test/test-w3m-autopagerize.rb +166 -0
- metadata +58 -0
data/bin/next.cgi
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/local/bin/ruby19 -Ku
|
2
|
+
require 'drb'
|
3
|
+
require 'w3m-localcgi'
|
4
|
+
require 'kconv'
|
5
|
+
|
6
|
+
# Local CGI entry point: ask the w3m-autopagerize dRuby server for the page
# that follows the one w3m is currently showing (or, when QUERY_STRING is
# 'crop', for a cropped version of the current page) and print the reply as
# a Local CGI response: header lines, a blank line, then an optional body.
url = ENV['W3M_URL']

DRb.start_service
public :print                   # HACK to work w3mctl
srv = DRbObject.new_with_uri "druby://:9322"
# This object is passed by reference (DRbUndumped) so the server can call
# #print on it; see the `public :print` hack above.
client = self.extend(DRbUndumped)

begin
  hash = if ENV['QUERY_STRING'] == 'crop'
           srv.crop_this_page(url, ENV['W3M_SOURCEFILE'], ENV['W3M_CHARSET'], client)
         else
           srv.nextpage(url, ENV['W3M_SOURCEFILE'], ENV['W3M_CHARSET'], client)
         end
rescue DRb::DRbConnError => e
  # The server must be started by hand before next.cgi is usable; without
  # this handler w3m would receive no response at all.
  detail = "#{e.class}: #{e.message}".gsub('&', '&amp;').gsub('<', '&lt;')
  puts "Content-Type: text/html"
  puts
  puts "<pre>w3m-autopagerize: cannot reach w3m-autopagerize-server.rb at druby://:9322"
  puts "Start w3m-autopagerize-server.rb and try again."
  puts "#{detail}</pre>"
  exit
end

if hash[:html]
  # The server produced a page: send it together with the XPaths it used.
  puts "Content-Type: text/html"
  puts "W3m-AutoPagerize-NextLink: #{hash[:nextLink]}" if hash[:nextLink]
  puts "W3m-AutoPagerize-PageElement: #{hash[:pageElement]}" if hash[:pageElement]
  puts
  puts hash[:html]
else
  # No page body: optionally tell w3m to leave this buffer and visit the
  # location directly, then terminate the header section.
  if hash[:location]
    puts "W3m-Control: BACK"
    puts "W3m-Control: GOTO #{hash[:location]}"
    puts "W3m-AutoPagerize-NextLink: #{hash[:nextLink]}" if hash[:nextLink]
  end
  puts
end
|
@@ -0,0 +1,489 @@
|
|
1
|
+
#!/usr/local/bin/ruby19
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# (executable-interpret "ruby19 /m/home/rubikitch/w3m/cgi-bin/w3m-autopagerize/test-w3m-autopagerize.rb --no-use-color ")
|
4
|
+
# (executable-interpret "rm -f /log/w3m-autopagerize.log; w3m-autopagerize-server.rb -r")
|
5
|
+
start_time = Time.now
|
6
|
+
#Encoding.default_internal = "UTF-8"
|
7
|
+
require 'kconv'
|
8
|
+
require 'uri'
|
9
|
+
require 'rubygems'
|
10
|
+
require 'nokogiri'
|
11
|
+
require 'pp'
|
12
|
+
require 'logger'
|
13
|
+
require 'tmpdir'
|
14
|
+
require 'json'
|
15
|
+
|
16
|
+
###########################################################################
|
17
|
+
# Configurable Variables #
|
18
|
+
###########################################################################
|
19
|
+
$W3M_EXTRA_OPTIONS = ""
|
20
|
+
# see http://www.opera-wiki.com/index.php?FAQ%2F5.%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA#k7bb0c80
|
21
|
+
$FALLBACK_PATTERNS = %w[次へ 次頁 次ページ 次項 次の 次を 先へ つぎへ つぎの
|
22
|
+
進む next もっと見る ]
|
23
|
+
$FALLBACK_WORDS = %w[次 つぎ 続きます keep\ reading [→] 次一覧 Older\ Entries]
|
24
|
+
$FALLBACK_START_WORDS = %w[> > 次 つぎ Next NEXT next →]
|
25
|
+
$SITEINFO_IMPORT_URLS = %w[
|
26
|
+
http://wedata.net/databases/AutoPagerize/items.json
|
27
|
+
]
|
28
|
+
$EXCLUDE_URLS = %w[
|
29
|
+
^https?:\/\/.
|
30
|
+
]
|
31
|
+
$LOG_FILE = $stderr
|
32
|
+
|
33
|
+
###########################################################################
|
34
|
+
# DSL for nexturl #
|
35
|
+
###########################################################################
|
36
|
+
$TEST_MODE = false
|
37
|
+
$SITEINFO = []
|
38
|
+
$client = nil
|
39
|
+
# One rule describing how to find the next page of a site.
#
# Members:
#   nextLink     - XPath selecting the link/form/input that leads to the next page
#   pageElement  - XPath selecting the part of the page to keep when cropping
#   insertBefore, exampleUrl - stored from SITEINFO; not used by this class
#   block        - optional callable run (via instance_exec) with (uri, match)
#                  before the XPath lookup; its result is the next URL when
#                  no nextLink is set
#   match        - result of matching the current URL against the rule's pattern
class SiteData < Struct.new(:nextLink, :insertBefore, :exampleUrl, :pageElement,
                            :block, :match)
  # Build an XPath boolean expression that is true when +text+ equals one of
  # +words+ or contains one of +patterns+.  Returns "" when both lists are
  # empty (or nil).
  def self.fallback_predicate1(text, words=$FALLBACK_WORDS, patterns=$FALLBACK_PATTERNS)
    exact   = Array(words).map { |w| %Q!#{text}="#{w}"! }
    partial = Array(patterns).map { |w| %Q!contains(#{text},"#{w}")! }
    (exact + partial).join(' or ')
  end

  # Build an XPath boolean expression that is true when +text+ starts with
  # one of +start_words+.  Returns "" when the list is empty (or nil).
  def self.fallback_predicate2(text, start_words=$FALLBACK_START_WORDS)
    Array(start_words).map { |w| %Q!starts-with(#{text},"#{w}")! }.join(' or ')
  end

  # link to next
  #
  # Heuristic rules built from the $FALLBACK_* word lists, memoized on first
  # use.  The returned array is extended with FallbackSetup.  A rule is only
  # created when its predicate is non-empty, because an empty predicate would
  # yield the invalid XPath "//a[]".
  def self.fallbacks
    @fallbacks ||= begin
      rules = []
      unless fallback_predicate1('.').empty?
        rules << new("//a[#{fallback_predicate1('.')}]")
        rules << new("//form[descendant::input[#{fallback_predicate1('@value')}]]")
      end
      unless fallback_predicate2('.').empty?
        rules << new("//a[#{fallback_predicate2('.')}]")
        rules << new("//form[descendant::input[#{fallback_predicate2('@value')}]]")
      end
      rules.extend(FallbackSetup)
    end
  end

  # Mixed into the array returned by SiteData.fallbacks.
  module FallbackSetup
    # Append every fallback rule to $SITEINFO under the catch-all pattern /./.
    def setup!
      each {|fallback| $SITEINFO << [/./, fallback]}
    end
  end

  # Make the DSL pretty!
  #
  # Each member accessor doubles as a setter when given a truthy argument, so
  # a rule body can say `nextLink '//a'` instead of `self.nextLink = '//a'`.
  members.each do |m|
    undef_method m
    define_method(m) do |v = nil|
      if v
        self[m] = v
      else
        self[m]
      end
    end
  end

  # Compute the URL of the page following +uri+.
  #
  # Runs +block+ first when present (it may set nextLink through the DSL).
  # With a nextLink XPath, the first matching node's href/action/value is
  # resolved against +uri+; nil is returned when nothing matches.  Without a
  # nextLink, the block's result is resolved against +uri+.
  def next_url(uri)
    uri = URI(uri.to_s)
    result = instance_exec(uri, match, &block) if block
    xpath = nextLink
    unless xpath
      $logger.info "#{__method__}: result = #{result}"
      return uri.merge(result)
    end

    nokogiri = $nokogiri_cache[uri.to_s]
    $logger.info "#{__method__}: use xpath #{xpath}"
    nodes = nokogiri.xpath(xpath)
    node = nodes.first
    $logger.debug "#{__method__}: nodes.length = #{nodes.length}"
    # No matching node simply means "no next link".
    nexturl = node && (node["href"] || node["action"] || node["value"])
    $logger.info "#{__method__}: nexturl = #{nexturl or 'NOT FOUND'}"
    return nil unless nexturl

    # for some buggy sites not encoding spaces
    uri.merge(nexturl.tr(' ', '+'))
  end
end
|
113
|
+
|
114
|
+
# Config DSL: register a custom rule for +url_or_pattern+.
#
# The given block is wrapped so that its use is logged, and the wrapper runs
# the block through instance_exec with the (url, match) pair it receives.
# When +nexturl+ is given, defnext_ uses that value and ignores the block.
def defnext(url_or_pattern, nexturl=nil, &b)
  logged = proc do |u, m|
    $logger.info "Use defnext for #{url_or_pattern}"
    instance_exec(u, m, &b)
  end
  defnext_(url_or_pattern, nexturl, &logged)
end
|
120
|
+
|
121
|
+
# Low-level rule registration shared by the config DSL helpers.
#
# Creates a SiteData whose +block+ is either a lambda returning the fixed
# +nexturl+ (when one is given) or the supplied block, and appends the
# [url_or_pattern, sitedata] pair to $SITEINFO.
def defnext_(url_or_pattern, nexturl=nil, &block)
  sd = SiteData.new
  sd.block = nexturl ? lambda{|u,m| nexturl } : block
  $SITEINFO << [ url_or_pattern, sd ]
end
|
130
|
+
|
131
|
+
|
132
|
+
# Config DSL: the next page of a URL matching +url_or_pattern+ is that URL
# with +string+ appended.
def addstring(url_or_pattern, string)
  appender = proc do |u, _m|
    $logger.info "Use addstring for #{url_or_pattern}"
    u.to_s + string
  end
  defnext_(url_or_pattern, &appender)
end
|
138
|
+
|
139
|
+
# Config DSL: the next page of a URL matching +url_or_pattern+ is the same
# URL with the number captured by the pattern's first group increased by +n+.
# A capture that starts with "0" keeps its zero-padded width.
def increment(url_or_pattern, n=1)
  defnext_(url_or_pattern) do |u, m|
    $logger.info "Use increment for #{url_or_pattern}"
    url = u.to_s
    digits = m[1]
    bumped = digits.to_i + n
    replacement = if digits =~ /^0/
                    format("%0#{digits.length}d", bumped)
                  else
                    bumped.to_s
                  end
    # Overwrite exactly the span matched by the first capture group.
    url[ m.begin(1) ... m.end(1) ] = replacement
    url
  end
end
|
152
|
+
|
153
|
+
# Config DSL: send w3m control headers to the current client ($client).
#
# Each truthy argument is printed as a "W3m-Control:" header line; the value
# +true+ prints a header terminator instead.  nil/false arguments are
# skipped.  Always returns nil.
def w3mctl(*strings)
  strings.each do |str|
    next unless str
    $client.print(str == true ? "\r\n\r\n" : "W3m-Control: #{str}\r\n")
  end
  nil
end
|
165
|
+
|
166
|
+
###########################################################################
|
167
|
+
# File.zread #
|
168
|
+
###########################################################################
|
169
|
+
require 'zlib'
|
170
|
+
|
171
|
+
Zlib::GZIP_MAGIC = "\037\213"
|
172
|
+
Zlib::GZIP_MAGIC.force_encoding("ASCII-8BIT") if RUBY_VERSION >= "1.9"
|
173
|
+
|
174
|
+
# Read +file+ and return its contents, transparently decompressing it when
# it starts with the gzip magic bytes.
#
# The file is opened with File.open in binary mode: Kernel#open would treat
# a name starting with "|" as a command to run, and text mode can corrupt
# compressed data on platforms that translate line endings.
def File.zread(file)
  gzip_magic = [0x1f, 0x8b].pack("C*")   # binary string, safe to compare with f.read(2)
  File.open(file, "rb") do |f|
    magic = f.read(2)                     # nil for an empty file
    f.rewind
    if magic == gzip_magic
      Zlib::GzipReader.wrap(f) {|gz| gz.read }
    else
      f.read
    end
  end
end
|
187
|
+
|
188
|
+
###########################################################################
|
189
|
+
# content cache #
|
190
|
+
###########################################################################
|
191
|
+
TMPFILE = Dir.tmpdir + "/w3m-autopagerize.tmp.html"
|
192
|
+
# url => [raw page source (binary), charset name].
# A missing entry is fetched on demand and stored.
$content_cache = Hash.new do |cache, url|
  $logger.debug "cache miss: set $content_cache[#{url.inspect}]"
  # use w3m to pass cookie
  header, source = get_header_and_content(url)
  $logger.debug "cache miss: source is html? = #{source =~ /<body/i and true}"
  # Prefer the charset named in the response header; otherwise guess it.
  declared = header[/charset=(.+)$/,1]
  charset = normalize_charset(declared || Kconv.guess(source))
  source.force_encoding("ASCII-8BIT")
  cache.store(url, [source, charset])
end
|
201
|
+
|
202
|
+
# BUG: libxml2 cannot handle id() function without doctype.
|
203
|
+
# http://labs.gmo.jp/blog/ku/2008/07/libxmlhtmlxpathid.html
|
204
|
+
DOCTYPE = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
|
205
|
+
# url => parsed Nokogiri document.
# A missing entry is built from $content_cache, with DOCTYPE prepended to
# the source before parsing, and then stored.
$nokogiri_cache = Hash.new do |cache, url|
  $logger.debug "cache miss: set $nokogiri_cache[#{url.inspect}]"
  html, encoding = $content_cache[url]
  parsed = Nokogiri::HTML(DOCTYPE+html, nil, encoding)
  cache.store(url, parsed)
end
|
210
|
+
|
211
|
+
|
212
|
+
###########################################################################
|
213
|
+
# Utilities #
|
214
|
+
###########################################################################
|
215
|
+
require 'shellwords'
require 'stringio'

# Fetch +url+ with w3m (so that w3m's cookies are used) and return
# [header, source]: the response header text and the body, gunzipped when
# necessary, both as binary strings.  +header+ is nil when w3m printed
# nothing; +source+ is "" when there is no body.
#
# w3m is started without a shell.  The URL usually comes from a fetched web
# page, so it must never be interpreted by a shell.  $W3M_EXTRA_OPTIONS is
# split into words with shell-like quoting rules, but is not subject to
# variable or glob expansion.
def get_header_and_content(url)
  command = ["w3m", *Shellwords.split($W3M_EXTRA_OPTIONS.to_s),
             "-dump_both", "-o", "accept_encoding=gzip", url.to_s]
  output = IO.popen(command, "rb") {|io| io.read }
  header, source = output.split(/\n\n/, 2)
  [header, gunzip_w3m_body(source || "")]
end

# Return +data+ gunzipped when it starts with the gzip magic bytes, otherwise
# unchanged.  Works in memory, so concurrent fetches (request thread and
# prefetch thread) cannot clobber each other through a shared temporary file.
def gunzip_w3m_body(data)
  return data unless data[0, 2] == [0x1f, 0x8b].pack("C*")
  Zlib::GzipReader.wrap(StringIO.new(data)) {|gz| gz.read }.force_encoding("ASCII-8BIT")
end
|
224
|
+
|
225
|
+
# Return the content of +url+ as a binary string.  http(s) URLs are fetched
# through get_header_and_content; anything else is treated as a local file
# path (gzip-compressed files are read transparently by File.zread).
def get_content(url)
  unless url =~ /^https?:/
    # local file
    return File.zread(File.expand_path(url)).force_encoding "ASCII-8BIT"
  end
  _header, body = get_header_and_content(url)
  body
end
|
232
|
+
|
233
|
+
# Turn a charset label (String, Encoding, or nil) into the name passed on to
# the HTML parser.
#
# Surrounding whitespace and quotes are removed, since the label may come
# straight from a response header line.  Every Shift_JIS label is mapped to
# "cp932"; other names are returned as given.
#
# FIXME I do not know other charsets than Japanese.
def normalize_charset(charset)
  name = charset.to_s.strip.delete(%q('")).strip
  case name.downcase
  when "shift_jis", "shift-jis", "sjis", "x-sjis", "ms_kanji",
       "csshiftjis", "windows-31j", "ms932"
    "cp932"
  else
    name
  end
end
|
238
|
+
|
239
|
+
|
240
|
+
# Forget every registered rule by pointing $SITEINFO at a new empty array.
def reinit
  $SITEINFO = Array.new
end
|
243
|
+
|
244
|
+
# unless "".respond_to? :force_encoding # for ruby 1.8
|
245
|
+
# class String
|
246
|
+
# def force_encoding(args); self end
|
247
|
+
# def encoding; Kconv.guess(self) end
|
248
|
+
# end
|
249
|
+
# end
|
250
|
+
|
251
|
+
###########################################################################
|
252
|
+
# Entry Points #
|
253
|
+
###########################################################################
|
254
|
+
# dRuby front object of w3m-autopagerize-server.rb.  next.cgi calls
# #nextpage and #crop_this_page; the main script calls #load_config_file and
# #load_siteinfo; `-r` calls #restart.
class Server
  # Return the first $SITEINFO rule whose pattern matches +url+, with its
  # +match+ member set to the match result (a MatchData for Regexp patterns,
  # true for an exact string match).
  #
  # Raises RuntimeError when no rule matches.
  def sitedata(url)
    url = url.to_s
    match = nil
    entry = $SITEINFO.find do |pattern, _rule|
      match = pattern.is_a?(Regexp) ? url.match(pattern) : url == pattern.to_s
    end
    raise "no SITEINFO entry matches #{url.inspect}" unless entry
    rule = entry[1]
    rule.match = match
    rule
  end
  private :sitedata

  HTML_OUTPUT_FILE = "/tmp/w3m-autopagerize-tmp.html"

  # Build the page shown to the user for +location+: the nodes selected by
  # sitedata.pageElement wrapped in a minimal document that records the
  # original and previous URLs.  When cropping fails (no pageElement, bad
  # XPath, empty result) the whole <body> is used and a notice is prepended.
  #
  # Returns a Hash with :html, :location, :pageElement and :nextLink.
  def crop_html(location, prev_url, sitedata)
    $logger.debug "#{__method__}: url = #{location}"
    location = location.to_s
    nokogiri = $nokogiri_cache[location]
    title_node = nokogiri.at("//title")
    title = title_node ? title_node.to_html : "<title></title>"
    begin
      $logger.info "#{__method__}: use xpath #{sitedata.pageElement}"
      nodes = nokogiri.xpath(sitedata.pageElement)
      $logger.debug "#{__method__}: nodes.length = #{nodes.length}"
      html_piece = nodes.to_html
      raise "pageElement selected nothing" if html_piece.strip.empty?
    rescue
      $logger.error "#{__method__}: failed to crop!"
      html_piece = nokogiri.at("body").to_html
      errmsg = %{<p>w3m-autopagerize failed to crop html but next url is found.<br />
xpath = #{sitedata.pageElement || 'pageElement not found'}
</p>
<hr>}
    else
      errmsg = ""
    end
    # BUG: Nokogiri emits superfluous "&#13;" (carriage-return entities).
    # NOTE(review): the pattern was unreadable in the text this file was
    # recovered from (it appeared as a single space, which would delete every
    # space in the page); "&#13;" is the presumed original - confirm.
    html_piece = html_piece.gsub('&#13;', '') # hack
    # BUG: w3m cannot handle <script />, so replace it with <script></script>
    html_piece = html_piece.gsub(%r!(<script.+?)/>!, '\1></script>') # hack
    html = %{<html>
<head><base href="#{location}" />#{title}
<link rel="w3m-autopagerize-orig" href="#{location}" />
<link rel="w3m-autopagerize-prev" href="#{prev_url}" />
</head>
<body>
Original URL: <a href="#{location}">#{location}</a><br>
#{errmsg}
#{html_piece}
</body></html> }

    { :html => html, :location => location,
      :pageElement => sitedata.pageElement, :nextLink => sitedata.nextLink}
  end
  private :crop_html

  # Warm the caches for the page after +location+ in a background thread,
  # then trim both caches to bound memory use.
  def prefetch_next_location(location, sitedata)
    Thread.start do
      begin
        $logger.debug "#{__method__}: #{location}"
        newloc = sitedata.next_url(location)
        $logger.debug "#{__method__}: new location: #{newloc}"
        # Nothing to prefetch when there is no next page.
        $nokogiri_cache[newloc.to_s] if newloc
      rescue => e
        # Prefetching is best effort; record the failure instead of letting
        # the thread die silently.
        $logger.error "#{__method__}: #{e.class}: #{e.message}"
      end
    end
    # It uses Ordered Hash in Ruby 1.9
    # (so hash.first is the oldest entry).
    [$nokogiri_cache, $content_cache].each do |hash|
      hash.delete hash.first[0] if hash.length > 3
    end
  end
  private :prefetch_next_location

  # Common request setup: log the request, remember +client+ in $client (used
  # by w3mctl) and work out the real page URL.
  #
  # When the request comes from a page generated by this Local CGI, the
  # original URL is taken from the <base href> of +srcfile+; otherwise the
  # source w3m already has is stored in $content_cache under +url+.
  #
  # Returns [src, url].  Raises RuntimeError when the URL must be recovered
  # from the source file but none was given.
  def prepare(url, srcfile, charset, client, method)
    $logger.info "=================================================="
    $logger.info "#{method}: entered url=#{url} charset=#{charset}"
    $logger.debug "#{method}: W3M_SOURCEFILE = #{srcfile}" if srcfile
    $client = client

    src = File.zread(srcfile).force_encoding("ASCII-8BIT") if srcfile
    if url =~ /^file:.*\/cgi-bin\//   # from Local CGI
      raise "#{method}: W3M_SOURCEFILE is required for a Local CGI page" unless src
      url = src[%r!<base href=['"](.+?)['"]!, 1] # '"
      $logger.info "#{method}: base url=#{url}"
    else                        # from W3M_SOURCEFILE
      $logger.debug "#{method}: set $content_cache[#{url.inspect}] from W3M_SOURCEFILE"
      $logger.debug "#{method}: source is html? = #{src =~ /<body/i and true}"
      $content_cache[url] = [src, normalize_charset(charset)] if src
    end
    [ src, url ]
  end

  # Entry point for next.cgi?crop: crop the current page itself and start
  # prefetching the page after the next one.
  def crop_this_page(url, srcfile, charset, client)
    _src, url = prepare(url, srcfile, charset, client, __method__)

    site = sitedata(url)
    begin
      crop_html url, nil, site
    ensure
      prefetch_next_location site.next_url(url), site
    end
  end

  # Entry point for next.cgi: find the page after +url+ and return it cropped.
  #
  # The rule matching +url+ is tried first, then every heuristic fallback
  # rule in order.  Any failure, including "no next page found", is reported
  # as a Hash whose :html is an error page.
  def nextpage(url, srcfile, charset, client)
    src, url = prepare(url, srcfile, charset, client, __method__)
    site = sitedata(url)
    location = site.next_url(url)

    if location
      if site.pageElement
        $logger.debug "#{__method__}: location and pageElement found."
      else
        $logger.debug "#{__method__}: location found."
      end
      return crop_and_prefetch(location, url, site)
    end

    # `find` returns nil when no fallback rule yields a URL, so the
    # "no location" branch below is reachable.
    fallback_nexturl = nil
    fallback = SiteData.fallbacks.find do |candidate|
      fallback_nexturl = candidate.next_url(url)
    end
    unless fallback
      $logger.debug "#{__method__}: no location."
      raise "no location!"
    end
    $logger.info "#{__method__}: fallback"
    crop_and_prefetch(fallback_nexturl, url, fallback)
  rescue
    # `site` is still nil when the failure happened before a rule was found.
    html = %{<pre>Error!
xpath = #{(site && site.nextLink) || 'nextLink not found'}
#{$!}
#{$@.pretty_inspect}
src_encoding=#{guess_source_encoding(src, url)}
</pre>
}
    $logger.error "#{__method__}: error!"
    $logger.error "#{__method__}: #$!"
    $logger.error "#{__method__}: #{$@.pretty_inspect}"
    {:html => html}
  end

  # Crop +location+ and, whether or not that succeeded, start prefetching
  # the page that follows it.
  def crop_and_prefetch(location, prev_url, site)
    crop_html location, prev_url, site
  ensure
    prefetch_next_location location, site
  end
  private :crop_and_prefetch

  # Encoding guess for the error page; must never raise, because it runs
  # inside nextpage's error handler.
  def guess_source_encoding(src, url)
    Kconv.guess(src || $content_cache[url].first)
  rescue StandardError
    "unknown"
  end
  private :guess_source_encoding

  # (executable-interpret "ruby19 -r w3m-autopagerize-server -e '$logger=Logger.new(); load_siteinfo'")
  # (executable-interpret "ruby18 -r w3m-autopagerize-server -e '$logger=Logger.new(); load_siteinfo'")
  #
  # Read every SITEINFO JSON document listed in $SITEINFO_IMPORT_URLS and
  # append one [Regexp, SiteData] pair to $SITEINFO per usable entry.
  # Entries without a url, entries listed in $EXCLUDE_URLS and entries whose
  # url is not a valid Ruby regexp are skipped.
  def load_siteinfo
    $SITEINFO_IMPORT_URLS.each do |siteinfo_url|
      JSON.parse(get_content(siteinfo_url).toutf8).each do |hash|
        data = hash["data"]
        next unless data
        url = data["url"]
        next if url.nil? || $EXCLUDE_URLS.include?(url)
        begin
          pattern = Regexp.new(url)
        rescue RegexpError => e
          # One broken pattern must not abort the whole import.
          $logger.error "#{__method__}: skipped #{url.inspect}: #{e.message}"
          next
        end
        sd = SiteData.new data["nextLink"], data["insertBefore"],
                          data["exampleUrl"], data["pageElement"]
        $SITEINFO << [pattern, sd]
      end
    end
    $logger.info "#{__method__}: loaded"
  end

  # Replace the running server process with a fresh one.
  def restart
    exec $0
  end

  # Load the user's config file (a Ruby script using the config DSL).
  # +config_file+ is a path, resolved against this script's directory when
  # relative, or :ignore to skip loading.  Problems are reported on stderr
  # and never raised; signals and exit requests are not intercepted.
  def load_config_file(config_file)
    if config_file == :ignore
      $stderr.puts "load_config_file: config file is ignored!"
      return
    end

    config_file = File.expand_path(config_file, File.dirname(__FILE__))
    if File.file? config_file
      load(config_file)
      $stderr.puts "load_config_file: loaded #{config_file}"
    else
      $stderr.puts "load_config_file: config file #{config_file} not found!"
    end
  rescue StandardError, ScriptError => e
    # ScriptError covers syntax errors in the config file.
    $stderr.puts "load_config_file: error loading #{config_file}!"
    $stderr.puts "load_config_file: #{e.class}: #{e.message}"
  end
end
|
446
|
+
|
447
|
+
|
448
|
+
# Server start-up: parse options, load the config file and SITEINFO, then
# publish a Server object over dRuby on druby://:9322 and serve until killed.
if __FILE__==$0
  require 'optparse'
  require 'drb'
  conf = Struct.new(:log_file, :siteinfo_url, :config_file).new
  conf.config_file = File.expand_path "~/.w3m-autopagerize.rb"
  ARGV.options {|o|
    o.on("-l", "--log LOGFILE",
         "Use log file.") {|x| conf.log_file = File.expand_path(x, File.dirname(__FILE__)) }
    o.on("-s", "--siteinfo URL",
         "URL of JSON data (SITEINFO).") {|x| conf.siteinfo_url = x}
    o.on("-c", "--config CONFIG", "Use config file.") {|x| conf.config_file = x }
    o.on("-f", "Ignore config file.") {|x| conf.config_file = :ignore }
    o.on("-r", "--restart", "--reload",
         "Restart the server.") {|x|
      # The running server replaces itself with exec, so the call normally
      # ends in a connection error; that (and "no server running") is ignored.
      begin
        DRbObject.new_with_uri(%q!druby://:9322!).restart
      rescue StandardError
        nil
      end
      puts "w3m-autopagerize-server restarted."
      exit
    }
    o.parse!
  }

  srv = Server.new
  srv.load_config_file(conf.config_file)
  $LOG_FILE = conf.log_file || $LOG_FILE
  # A log file name set in the config file is resolved against this script's
  # directory, as config.sample.rb documents and as -l already does.
  # IO objects such as $stderr are left untouched.
  $LOG_FILE = File.expand_path($LOG_FILE, File.dirname(__FILE__)) if $LOG_FILE.respond_to?(:to_str)
  $stderr.puts "startup: log file = #{$LOG_FILE.inspect}"
  $logger = Logger.new($LOG_FILE)
  $SITEINFO_IMPORT_URLS = [ conf.siteinfo_url ] if conf.siteinfo_url
  $logger.info "$SITEINFO_IMPORT_URLS = #{$SITEINFO_IMPORT_URLS.inspect}"

  srv.load_siteinfo
  GC.start
  # Fallback rules go last so that config and SITEINFO rules take precedence.
  SiteData.fallbacks.setup!

  $stderr.puts "start w3m-autopagerize-server.rb (#{Time.now-start_time} secs)"

  # Periodic garbage collection for the long-running server.
  Thread.start do
    loop { sleep 300; GC.start }
  end

  DRb.start_service("druby://:9322", srv)
  DRb.thread.join
end
|
data/config.sample.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# This file shows default setting. If you customize w3m-autopagerize,
|
3
|
+
# copy this file to ~/.w3m-autopagerize.rb.
|
4
|
+
|
5
|
+
# Extra options of w3m to fetch web page.
|
6
|
+
$W3M_EXTRA_OPTIONS = ""
|
7
|
+
|
8
|
+
# SITEINFO location. Set URL or filename of SITEINFO JSON data.
|
9
|
+
$SITEINFO_IMPORT_URLS = %w[
|
10
|
+
http://wedata.net/databases/AutoPagerize/items.json
|
11
|
+
]
|
12
|
+
# Disable SITEINFO entries. The default is to ignore `"url": "^https?:\/\/."' entry.
|
13
|
+
$EXCLUDE_URLS = %w[
|
14
|
+
^https?:\/\/.
|
15
|
+
]
|
16
|
+
|
17
|
+
# Log file location
|
18
|
+
# =================
|
19
|
+
#
|
20
|
+
# The default destination of the log is stderr.
|
21
|
+
$LOG_FILE = $stderr
|
22
|
+
# If you use a log file, uncomment this. Note that the default
|
23
|
+
# directory of log file is the directory of w3m-autopagerize-server.rb.
|
24
|
+
|
25
|
+
# $LOG_FILE = "w3m-autopagerize.log"
|
26
|
+
|
27
|
+
# Fallback patterns
|
28
|
+
# =================
|
29
|
+
#
|
30
|
+
# If w3m-autopagerize cannot find next location, ie, wrong/no SITEINFO
|
31
|
+
# entry, w3m-autopagerize uses heuristic method to find next location
|
32
|
+
# with $FALLBACK_* variables. It is like FastForward of Opera.
|
33
|
+
#
|
34
|
+
# Links/buttons whose text is "next" or "keep reading" (full match) are
|
35
|
+
# considered as next location.
|
36
|
+
$FALLBACK_WORDS = %w[次 つぎ 続きます keep\ reading [→] 次一覧 Older\ Entries next Next NEXT]
|
37
|
+
# Links/buttons whose text starts with ">" (prefix match) are
|
38
|
+
# considered as next location.
|
39
|
+
$FALLBACK_START_WORDS = %w[> >]
|
40
|
+
# Links/buttons whose text contains one of these words (partial match) are considered
|
41
|
+
# as next location.
|
42
|
+
$FALLBACK_PATTERNS = %w[次へ 次頁 次ページ 次項 次の 次を 先へ つぎへ つぎの 進む もっと見る ]
|
43
|
+
|
44
|
+
# Custom Location
|
45
|
+
# ===============
|
46
|
+
#
|
47
|
+
# You write `next' pages by URL rule. Use `addstring' and `increment'
|
48
|
+
# function. It is handy method to specify next location.
|
49
|
+
# It requires NO XPATH KNOWLEDGE, but some Regexp knowledge:-)
|
50
|
+
#
|
51
|
+
# Custom locations take precedence over SITEINFO. It means that even
|
52
|
+
# if SITEINFO defines the configuration of a site, use custom
|
53
|
+
# location,
|
54
|
+
#
|
55
|
+
# For example, The next page of "http://www.dotup.org/" is
|
56
|
+
# "http://www.dotup.org/2.html". Use simply `addstring' function.
|
57
|
+
#
|
58
|
+
# The next page of "http://www.dotup.org/2.html" is
|
59
|
+
# "http://www.dotup.org/3.html". Use `increment' function with Regexp.
|
60
|
+
# The digits captured by the first group "(\d+)" are replaced with the next number.
|
61
|
+
# Note that writing a URL Regexp by %r!URL Regexp! is handy.
|
62
|
+
addstring "http://www.dotup.org/", "2.html"
|
63
|
+
increment %r!http://www.dotup.org/(\d+).html$!
|
64
|
+
#
|
65
|
+
# `increment' can add any integer. For example,
|
66
|
+
# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20"
|
67
|
+
# to
|
68
|
+
# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=40".
|
69
|
+
increment %r!^http://images.google.(?:co.jp|com)/.*start=(\d+)!, 20
|
70
|
+
#
|
71
|
+
# `addstring' function can accept Regexp. For example,
|
72
|
+
# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja"
|
73
|
+
# to
|
74
|
+
# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20".
|
75
|
+
#
|
76
|
+
# Note that `increment' of google image search must be defined BEFORE
|
77
|
+
# `addstring'. If `addstring' is before `increment', w3m-autopagerize
|
78
|
+
# considers the next page of
|
79
|
+
# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20"
|
80
|
+
# as
|
81
|
+
# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20&start=20".
|
82
|
+
# It is because the URL matches both
|
83
|
+
# %r!^http://images.google.(?:co.jp|com)/! and
|
84
|
+
# %r!^http://images.google.(?:co.jp|com)/.*start=(\d+)!.
|
85
|
+
addstring %r!^http://images.google.(?:co.jp|com)/!, '&start=20'
|
86
|
+
|
87
|
+
# Custom Action
|
88
|
+
# =============
|
89
|
+
#
|
90
|
+
# You can execute any w3m commands for certain URL. For example, I
|
91
|
+
# (rubikitch) login hatena and open my hatena diary, execute
|
92
|
+
# "GOTO http://d.hatena.ne.jp/rubikitch/" and "DELETE_PREVBUF"
|
93
|
+
# three times. Use `defnext' and `w3mctl'.
|
94
|
+
#
|
95
|
+
# This is a good example of login and goto action. Note that when you
|
96
|
+
# use login and goto, you must set your login/password to
|
97
|
+
# ~/.w3m/pre_form file.
|
98
|
+
defnext "https://www.hatena.ne.jp/login" do
|
99
|
+
w3mctl "GOTO http://d.hatena.ne.jp/rubikitch/", "DELETE_PREVBUF", "DELETE_PREVBUF", "DELETE_PREVBUF"
|
100
|
+
end
|
101
|
+
|
102
|
+
# Custom SITEINFO
|
103
|
+
# ===============
|
104
|
+
#
|
105
|
+
# If you have your original SITEINFO for AutoPagerize, you can simply
|
106
|
+
# add the URL or filename into the top of $SITEINFO_IMPORT_URLS.
|
107
|
+
#
|
108
|
+
# The SITEINFO can be defined in Ruby DSL.
|
109
|
+
#
|
110
|
+
# In JSON:
|
111
|
+
#
|
112
|
+
# {
|
113
|
+
# "name": "(.~) what a quiet stiff (~.)",
|
114
|
+
# "data": {
|
115
|
+
# "insertBefore": "",
|
116
|
+
# "pageElement": "id(\"pixflow\")",
|
117
|
+
# "url": "^http:\/\/whytheluckystiff\\.net\/quiet\/",
|
118
|
+
# "nextLink": "id(\"header\")\/a[last()]",
|
119
|
+
# "exampleUrl": "http:\/\/whytheluckystiff.net\/quiet\/"
|
120
|
+
# }
|
121
|
+
# }
|
122
|
+
#
|
123
|
+
# In Ruby:
|
124
|
+
#
|
125
|
+
# defnext %r!^http://whytheluckystiff\.net/quiet/! do
|
126
|
+
# insertBefore ''
|
127
|
+
# pageElement 'id("pixflow")'
|
128
|
+
# nextLink 'id("header")/a[last()]'
|
129
|
+
# exampleUrl 'http://whytheluckystiff.net/quiet/'
|
130
|
+
# end
|
data/readme.html
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
2
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
3
|
+
<html xmlns="http://www.w3.org/1999/xhtml"
|
4
|
+
lang="en" xml:lang="en">
|
5
|
+
<head>
|
6
|
+
<title>AutoPagerize for w3m</title>
|
7
|
+
<meta http-equiv="Content-Type" content="text/html;charset=euc-jp"/>
|
8
|
+
<meta name="generator" content="Org-mode"/>
|
9
|
+
<meta name="generated" content="2009-01-30"/>
|
10
|
+
<meta name="author" content="rubikitch"/>
|
11
|
+
<style type="text/css">
|
12
|
+
<!--/*--><![CDATA[/*><!--*/
|
13
|
+
html { font-family: Times, serif; font-size: 12pt; }
|
14
|
+
.title { text-align: center; }
|
15
|
+
.todo { color: red; }
|
16
|
+
.done { color: green; }
|
17
|
+
.tag { background-color:lightblue; font-weight:normal }
|
18
|
+
.target { }
|
19
|
+
.timestamp { color: grey }
|
20
|
+
.timestamp-kwd { color: CadetBlue }
|
21
|
+
p.verse { margin-left: 3% }
|
22
|
+
pre {
|
23
|
+
border: 1pt solid #AEBDCC;
|
24
|
+
background-color: #F3F5F7;
|
25
|
+
padding: 5pt;
|
26
|
+
font-family: courier, monospace;
|
27
|
+
font-size: 90%;
|
28
|
+
overflow:auto;
|
29
|
+
}
|
30
|
+
table { border-collapse: collapse; }
|
31
|
+
td, th { vertical-align: top; }
|
32
|
+
dt { font-weight: bold; }
|
33
|
+
div.figure { padding: 0.5em; }
|
34
|
+
div.figure p { text-align: center; }
|
35
|
+
.linenr { font-size:smaller }
|
36
|
+
.code-highlighted {background-color:#ffff00;}
|
37
|
+
.org-info-js_info-navigation { border-style:none; }
|
38
|
+
#org-info-js_console-label { font-size:10px; font-weight:bold;
|
39
|
+
white-space:nowrap; }
|
40
|
+
.org-info-js_search-highlight {background-color:#ffff00; color:#000000;
|
41
|
+
font-weight:bold; }
|
42
|
+
/*]]>*/-->
|
43
|
+
</style>
|
44
|
+
<script type="text/javascript">
|
45
|
+
<!--/*--><![CDATA[/*><!--*/
|
46
|
+
function CodeHighlightOn(elem, id)
|
47
|
+
{
|
48
|
+
var target = document.getElementById(id);
|
49
|
+
if(null != target) {
|
50
|
+
elem.cacheClassElem = elem.className;
|
51
|
+
elem.cacheClassTarget = target.className;
|
52
|
+
target.className = "code-highlighted";
|
53
|
+
elem.className = "code-highlighted";
|
54
|
+
}
|
55
|
+
}
|
56
|
+
function CodeHighlightOff(elem, id)
|
57
|
+
{
|
58
|
+
var target = document.getElementById(id);
|
59
|
+
if(elem.cacheClassElem)
|
60
|
+
elem.className = elem.cacheClassElem;
|
61
|
+
if(elem.cacheClassTarget)
|
62
|
+
target.className = elem.cacheClassTarget;
|
63
|
+
}
|
64
|
+
/*]]>*/-->
|
65
|
+
</script>
|
66
|
+
</head><body>
|
67
|
+
<h1 class="title">AutoPagerize for w3m</h1>
|
68
|
+
|
69
|
+
<p>AutoPagerize for w3m <a href="http://rubikitchrb.rubyforge.org/">http://rubikitchrb.rubyforge.org/</a>
|
70
|
+
</p>
|
71
|
+
<p>
|
72
|
+
Copyright (c) 2009 rubikitch &lt;rubikitch@ruby-lang.org&gt; <a href="http://www.rubyist.net/~rubikitch/">http://www.rubyist.net/~rubikitch/</a>
|
73
|
+
</p>
|
74
|
+
<p>
|
75
|
+
Use and distribution subject to the terms of the Ruby license.
|
76
|
+
</p>
|
77
|
+
<div id="table-of-contents">
|
78
|
+
<h2>Table of Contents</h2>
|
79
|
+
<div id="text-table-of-contents">
|
80
|
+
<ul>
|
81
|
+
<li><a href="#sec-1">1 Overview </a></li>
|
82
|
+
<li><a href="#sec-2">2 Programs </a>
|
83
|
+
<ul>
|
84
|
+
<li><a href="#sec-2.1">2.1 w3m-autopagerize-server.rb </a></li>
|
85
|
+
<li><a href="#sec-2.2">2.2 next.cgi </a></li>
|
86
|
+
<li><a href="#sec-2.3">2.3 config.sample.rb </a></li>
|
87
|
+
</ul>
|
88
|
+
</li>
|
89
|
+
<li><a href="#sec-3">3 Installation </a>
|
90
|
+
<ul>
|
91
|
+
<li><a href="#sec-3.1">3.1 Install AutoPagerize for w3m </a></li>
|
92
|
+
<li><a href="#sec-3.2">3.2 Copy config file </a></li>
|
93
|
+
<li><a href="#sec-3.3">3.3 Local CGI setup </a></li>
|
94
|
+
<li><a href="#sec-3.4">3.4 Key bind </a></li>
|
95
|
+
</ul>
|
96
|
+
</li>
|
97
|
+
<li><a href="#sec-4">4 Usage </a></li>
|
98
|
+
<li><a href="#sec-5">5 License </a></li>
|
99
|
+
</ul>
|
100
|
+
</div>
|
101
|
+
</div>
|
102
|
+
|
103
|
+
<div id="outline-container-1" class="outline-2">
|
104
|
+
<h2 id="sec-1">1 Overview </h2>
|
105
|
+
<div id="text-1">
|
106
|
+
|
107
|
+
<p>AutoPagerize for w3m finds next link and extracts page contents. It
|
108
|
+
consists of dRuby server program (w3m-autopagerize-server.rb) and w3m
|
109
|
+
Local CGI program (next.cgi).
|
110
|
+
</p>
|
111
|
+
</div>
|
112
|
+
|
113
|
+
</div>
|
114
|
+
|
115
|
+
<div id="outline-container-2" class="outline-2">
|
116
|
+
<h2 id="sec-2">2 Programs </h2>
|
117
|
+
<div id="text-2">
|
118
|
+
|
119
|
+
|
120
|
+
</div>
|
121
|
+
|
122
|
+
<div id="outline-container-2.1" class="outline-3">
|
123
|
+
<h3 id="sec-2.1">2.1 w3m-autopagerize-server.rb </h3>
|
124
|
+
<div id="text-2.1">
|
125
|
+
|
126
|
+
<p>AutoPagerize for w3m uses dRuby server w3m-autopagerize-server.rb
|
127
|
+
because initializing site data is time-consuming. Before using
|
128
|
+
AutoPagerize for w3m, you have to invoke w3m-autopagerize-server.rb!
|
129
|
+
w3m-autopagerize-server.rb loads config file (~/.w3m-autopagerize.rb)
|
130
|
+
if any and reads AutoPagerize SITEINFO data from wedata.net by
|
131
|
+
default.
|
132
|
+
</p>
|
133
|
+
</div>
|
134
|
+
|
135
|
+
</div>
|
136
|
+
|
137
|
+
<div id="outline-container-2.2" class="outline-3">
|
138
|
+
<h3 id="sec-2.2">2.2 next.cgi </h3>
|
139
|
+
<div id="text-2.2">
|
140
|
+
|
141
|
+
<p>next.cgi is a Local CGI program that asks w3m-autopagerize-server.rb for the next page.
|
142
|
+
</p>
|
143
|
+
</div>
|
144
|
+
|
145
|
+
</div>
|
146
|
+
|
147
|
+
<div id="outline-container-2.3" class="outline-3">
|
148
|
+
<h3 id="sec-2.3">2.3 config.sample.rb </h3>
|
149
|
+
<div id="text-2.3">
|
150
|
+
|
151
|
+
<p>The sample config file to customize. See <a href="config.sample.rb">file:config.sample.rb</a> for detail.
|
152
|
+
</p>
|
153
|
+
</div>
|
154
|
+
</div>
|
155
|
+
|
156
|
+
</div>
|
157
|
+
|
158
|
+
<div id="outline-container-3" class="outline-2">
|
159
|
+
<h2 id="sec-3">3 Installation </h2>
|
160
|
+
<div id="text-3">
|
161
|
+
|
162
|
+
|
163
|
+
</div>
|
164
|
+
|
165
|
+
<div id="outline-container-3.1" class="outline-3">
|
166
|
+
<h3 id="sec-3.1">3.1 Install AutoPagerize for w3m </h3>
|
167
|
+
<div id="text-3.1">
|
168
|
+
|
169
|
+
<p>AutoPagerize for w3m works with Ruby 1.9 only! So, you have to install
|
170
|
+
Ruby 1.9.x. Then simply issue:
|
171
|
+
</p>
|
172
|
+
<pre class="example">
|
173
|
+
$ sudo ruby1.9 -S gem install w3m-autopagerize
|
174
|
+
</pre>
|
175
|
+
|
176
|
+
</div>
|
177
|
+
|
178
|
+
</div>
|
179
|
+
|
180
|
+
<div id="outline-container-3.2" class="outline-3">
|
181
|
+
<h3 id="sec-3.2">3.2 Copy config file </h3>
|
182
|
+
<div id="text-3.2">
|
183
|
+
|
184
|
+
<p>If you customize AutoPagerize for w3m, copy config.sample.rb to
|
185
|
+
~/.w3m-autopagerize.rb and edit it.
|
186
|
+
</p>
|
187
|
+
</div>
|
188
|
+
|
189
|
+
</div>
|
190
|
+
|
191
|
+
<div id="outline-container-3.3" class="outline-3">
|
192
|
+
<h3 id="sec-3.3">3.3 Local CGI setup </h3>
|
193
|
+
<div id="text-3.3">
|
194
|
+
|
195
|
+
<p>Local CGI program next.cgi is installed at
|
196
|
+
/usr/local/bin/next.cgi. You have to make w3m find it. Add
|
197
|
+
/usr/local/bin to your Local CGI path (cgi_bin), or make a symlink.
|
198
|
+
</p>
|
199
|
+
<pre class="example">
|
200
|
+
$ cd ~/w3m/cgi-bin; ln -s /usr/local/bin/next.cgi
|
201
|
+
</pre>
|
202
|
+
|
203
|
+
</div>
|
204
|
+
|
205
|
+
</div>
|
206
|
+
|
207
|
+
<div id="outline-container-3.4" class="outline-3">
|
208
|
+
<h3 id="sec-3.4">3.4 Key bind </h3>
|
209
|
+
<div id="text-3.4">
|
210
|
+
|
211
|
+
<p>Bind AutoPagerize for w3m to your favorite key. Edit ~/.w3m/keymap and add this line.
|
212
|
+
</p>
|
213
|
+
<pre class="example">
|
214
|
+
keymap x GOTO file:/cgi-bin/next.cgi
|
215
|
+
</pre>
|
216
|
+
|
217
|
+
</div>
|
218
|
+
</div>
|
219
|
+
|
220
|
+
</div>
|
221
|
+
|
222
|
+
<div id="outline-container-4" class="outline-2">
|
223
|
+
<h2 id="sec-4">4 Usage </h2>
|
224
|
+
<div id="text-4">
|
225
|
+
|
226
|
+
<ul>
|
227
|
+
<li>
|
228
|
+
Press `x' key to go to next page.
|
229
|
+
</li>
|
230
|
+
<li>
|
231
|
+
Press `=' key to see information, eg. XPath to get next page.
|
232
|
+
|
233
|
+
</li>
|
234
|
+
</ul>
|
235
|
+
</div>
|
236
|
+
|
237
|
+
</div>
|
238
|
+
|
239
|
+
<div id="outline-container-5" class="outline-2">
|
240
|
+
<h2 id="sec-5">5 License </h2>
|
241
|
+
<div id="text-5">
|
242
|
+
|
243
|
+
<p>AutoPagerize for w3m is licensed under the same terms as Ruby.
|
244
|
+
</p></div>
|
245
|
+
</div>
|
246
|
+
<div id="postamble"><p class="author"> Author: rubikitch
|
247
|
+
<a href="mailto:rubikitch@ruby-lang.org">&lt;rubikitch@ruby-lang.org&gt;</a>
|
248
|
+
</p>
|
249
|
+
<p class="date"> Date: 2009-01-30</p>
|
250
|
+
<p>HTML generated by org-mode 6.18 in emacs 22</p>
|
251
|
+
</div></body>
|
252
|
+
</html>
|
data/readme.org
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
#+TITLE: AutoPagerize for w3m
|
2
|
+
#+AUTHOR: rubikitch
|
3
|
+
#+EMAIL: rubikitch@ruby-lang.org
|
4
|
+
#+DATE: 2009-01-30
|
5
|
+
#+LANGUAGE: en
|
6
|
+
#+OPTIONS: H:3 num:t toc:t \n:nil @:t ::t |:t ^:t -:t f:t *:t TeX:t LaTeX:nil skip:nil d:nil todo:t pri:nil tags:not-in-toc
|
7
|
+
#+INFOJS_OPT: view:nil toc:nil ltoc:t mouse:underline buttons:0 path:http://orgmode.org/org-info.js
|
8
|
+
#+EXPORT_SELECT_TAGS: export
|
9
|
+
#+EXPORT_EXCLUDE_TAGS: noexport
|
10
|
+
#+LINK_UP:
|
11
|
+
#+LINK_HOME:
|
12
|
+
AutoPagerize for w3m http://rubikitchrb.rubyforge.org/
|
13
|
+
|
14
|
+
Copyright (c) 2009 rubikitch <rubikitch@ruby-lang.org> http://www.rubyist.net/~rubikitch/
|
15
|
+
|
16
|
+
Use and distribution subject to the terms of the Ruby license.
|
17
|
+
* Overview
|
18
|
+
AutoPagerize for w3m finds the next link and extracts the page contents. It
|
19
|
+
consists of a dRuby server program (w3m-autopagerize-server.rb) and a w3m
|
20
|
+
Local CGI program (next.cgi).
|
21
|
+
|
22
|
+
* Programs
|
23
|
+
** w3m-autopagerize-server.rb
|
24
|
+
AutoPagerize for w3m uses a dRuby server, w3m-autopagerize-server.rb,
|
25
|
+
because initializing site data is time-consuming. Before using
|
26
|
+
AutoPagerize for w3m, you have to invoke w3m-autopagerize-server.rb!
|
27
|
+
w3m-autopagerize-server.rb loads the config file (~/.w3m-autopagerize.rb)
|
28
|
+
if any and reads AutoPagerize SITEINFO data from wedata.net by
|
29
|
+
default.
|
30
|
+
|
31
|
+
** next.cgi
|
32
|
+
next.cgi is the Local CGI program that asks w3m-autopagerize-server.rb to get the next page.
|
33
|
+
|
34
|
+
** config.sample.rb
|
35
|
+
The sample config file to customize. See file:config.sample.rb for details.
|
36
|
+
|
37
|
+
* Installation
|
38
|
+
** Install AutoPagerize for w3m
|
39
|
+
AutoPagerize for w3m works with Ruby 1.9 only! So, you have to install
|
40
|
+
Ruby 1.9.x. Then simply issue:
|
41
|
+
|
42
|
+
: $ sudo ruby1.9 -S gem install w3m-autopagerize
|
43
|
+
|
44
|
+
** Copy config file
|
45
|
+
If you want to customize AutoPagerize for w3m, copy config.sample.rb to
|
46
|
+
~/.w3m-autopagerize.rb and edit it.
|
47
|
+
|
48
|
+
** Local CGI setup
|
49
|
+
The Local CGI program next.cgi is installed at
|
50
|
+
/usr/local/bin/next.cgi. You have to make w3m find it. Add
|
51
|
+
/usr/local/bin to your Local CGI path (cgi_bin), or make a symlink.
|
52
|
+
|
53
|
+
: $ cd ~/w3m/cgi-bin; ln -s /usr/local/bin/next.cgi
|
54
|
+
|
55
|
+
** Key bind
|
56
|
+
Bind AutoPagerize for w3m to your favorite key. Edit ~/.w3m/keymap and add this line.
|
57
|
+
|
58
|
+
: keymap x GOTO file:/cgi-bin/next.cgi
|
59
|
+
|
60
|
+
* Usage
|
61
|
+
- Press the `x' key to go to the next page.
|
62
|
+
- Press the `=' key to see information, e.g. the XPath used to get the next page.
|
63
|
+
|
64
|
+
* License
|
65
|
+
AutoPagerize for w3m is licensed under the same terms as Ruby.
|
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/local/bin/ruby19
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# (executable-interpret "ruby19 /m/home/rubikitch/w3m/cgi-bin/w3m-autopagerize/test-w3m-autopagerize.rb --no-use-color ")
|
4
|
+
require 'fileutils'
|
5
|
+
FileUtils.rm_f "test.log"
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'open-uri'
|
9
|
+
require 'script'
|
10
|
+
require 'w3m-autopagerize-server' # !> method redefined; discarding old debug_with_time
|
11
|
+
|
12
|
+
$TEST_MODE = true
|
13
|
+
$W3M_EXTRA_OPTIONS = "-o http_proxy=http://127.0.0.1:8339/"
|
14
|
+
$logger = Logger.new "test.log"
|
15
|
+
class TestAutoPagerize < Test::Unit::TestCase
|
16
|
+
def test_hatena_success
|
17
|
+
$logger.info "Test: #{__method__}"
|
18
|
+
reinit
|
19
|
+
defnext %r{^https?:\/\/(?:d2?|[^.]+\.g)\.hatena\.ne\.jp\/} do
|
20
|
+
exampleUrl %{http://os0x.g.hatena.ne.jp/os0x/}
|
21
|
+
pageElement %{id("days")}
|
22
|
+
nextLink %{//a[@rel="prev"]}
|
23
|
+
end
|
24
|
+
|
25
|
+
nexturl = "http://d.hatena.ne.jp/rubikitch/20090110/1231524557"
|
26
|
+
origurl = "http://d.hatena.ne.jp/rubikitch/20090113/1231844047"
|
27
|
+
np = Server.new.nextpage(origurl, nil, nil, Object.new)
|
28
|
+
assert_equal nexturl, np[:location]
|
29
|
+
assert_match(/<base/, np[:html])
|
30
|
+
assert_equal %{id("days")}, np[:pageElement]
|
31
|
+
assert_equal %{//a[@rel="prev"]}, np[:nextLink]
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_hatena_fail
|
35
|
+
$logger.info "Test: #{__method__}"
|
36
|
+
reinit
|
37
|
+
defnext %r{^https?:\/\/(?:d2?|[^.]+\.g)\.hatena\.ne\.jp\/} do
|
38
|
+
exampleUrl %{http://os0x.g.hatena.ne.jp/os0x/}
|
39
|
+
pageElement %{id("noelement")}
|
40
|
+
nextLink %{//a[@rel="prev"]}
|
41
|
+
end
|
42
|
+
|
43
|
+
nexturl = "http://d.hatena.ne.jp/rubikitch/20090110/1231524557"
|
44
|
+
origurl = "http://d.hatena.ne.jp/rubikitch/20090113/1231844047"
|
45
|
+
np = Server.new.nextpage(origurl, nil, nil, Object.new)
|
46
|
+
assert_equal nexturl, np[:location]
|
47
|
+
assert_match(/failed to crop html/, np[:html])
|
48
|
+
assert_equal %{id("noelement")}, np[:pageElement]
|
49
|
+
assert_equal %{//a[@rel="prev"]}, np[:nextLink]
|
50
|
+
end
|
51
|
+
|
52
|
+
def test_google_addstring
|
53
|
+
$logger.info "Test: #{__method__}"
|
54
|
+
reinit
|
55
|
+
addstring %r!^http://www.google.(?:co.jp|com)/search!, '&start=100'
|
56
|
+
|
57
|
+
nexturl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=100"
|
58
|
+
origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100"
|
59
|
+
np = Server.new.nextpage(origurl, nil, nil, Object.new)
|
60
|
+
assert_equal nexturl, np[:location]
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_google_increment
|
64
|
+
reinit
|
65
|
+
increment %r!^http://www.google.(?:co.jp|com)/search.*start=(\d+)!, 100
|
66
|
+
|
67
|
+
nexturl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=200"
|
68
|
+
origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=100"
|
69
|
+
np = Server.new.nextpage(origurl, nil, nil, Object.new)
|
70
|
+
assert_equal nexturl, np[:location]
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_google_fallback_link
|
74
|
+
$logger.info "Test: #{__method__}"
|
75
|
+
reinit
|
76
|
+
$FALLBACK_PATTERNS = %w[次へ]
|
77
|
+
$FALLBACK_WORDS = %w[次へ]
|
78
|
+
$SITEINFO = [[ /./, SiteData.fallbacks[0] ]]
|
79
|
+
|
80
|
+
nexturl = "http://www.google.com/search?num=100&hl=ja&pwst=1&q=ruby&start=100&sa=N"
|
81
|
+
origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100"
|
82
|
+
np = Server.new.nextpage(origurl, nil, "UTF-8", Object.new)
|
83
|
+
assert_equal nexturl, np[:location]
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_futaba_fallback_form
|
87
|
+
$logger.info "Test: #{__method__}"
|
88
|
+
reinit
|
89
|
+
$FALLBACK_WORDS = %w[次のページ]
|
90
|
+
$SITEINFO = [[ /./, SiteData.fallbacks[1] ]]
|
91
|
+
|
92
|
+
nexturl = "http://may.2chan.net/27/1.htm"
|
93
|
+
origurl = "http://may.2chan.net/27/futaba.htm"
|
94
|
+
np = Server.new.nextpage(origurl, nil, "cp932", Object.new)
|
95
|
+
assert_equal nexturl, np[:location]
|
96
|
+
end
|
97
|
+
|
98
|
+
def test_futaba_fallback_by_wrong_sitedata
|
99
|
+
$logger.info "Test: #{__method__}"
|
100
|
+
reinit
|
101
|
+
|
102
|
+
defnext %r{2chan} do
|
103
|
+
pageElement %{id("noelement")}
|
104
|
+
nextLink %{//a[@rel="prev"]}
|
105
|
+
end
|
106
|
+
|
107
|
+
$FALLBACK_WORDS = %w[次のページ]
|
108
|
+
SiteData.fallbacks.setup!
|
109
|
+
|
110
|
+
nexturl = "http://may.2chan.net/27/1.htm"
|
111
|
+
origurl = "http://may.2chan.net/27/futaba.htm"
|
112
|
+
np = Server.new.nextpage(origurl, nil, "cp932", Object.new)
|
113
|
+
assert_equal nexturl, np[:location]
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
class TestFallBackPredicate < Test::Unit::TestCase
|
118
|
+
def test_1
|
119
|
+
assert_equal '.="tugi" or contains(.,"Next")',
|
120
|
+
SiteData.fallback_predicate1(".", %w[tugi], %w[Next])
|
121
|
+
end
|
122
|
+
def test_2
|
123
|
+
assert_equal '.="tugi"', SiteData.fallback_predicate1(".", %w[tugi], [])
|
124
|
+
end
|
125
|
+
def test_3
|
126
|
+
assert_equal 'contains(.,"Next")', SiteData.fallback_predicate1(".", [], %w[Next])
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
class TestFallBackSiteData < Test::Unit::TestCase
|
131
|
+
def setup
|
132
|
+
SiteData.instance_eval { @fallbacks = nil }
|
133
|
+
end
|
134
|
+
|
135
|
+
def test_1
|
136
|
+
$FALLBACK_PATTERNS = %w[次へ]
|
137
|
+
$FALLBACK_WORDS = %w[次へ]
|
138
|
+
$FALLBACK_START_WORDS = %w[tugi]
|
139
|
+
assert_equal 4, SiteData.fallbacks.length
|
140
|
+
end
|
141
|
+
|
142
|
+
def test_2
|
143
|
+
$FALLBACK_PATTERNS = %w[次へ]
|
144
|
+
$FALLBACK_WORDS = %w[次へ]
|
145
|
+
$FALLBACK_START_WORDS = []
|
146
|
+
assert_equal 2, SiteData.fallbacks.length
|
147
|
+
end
|
148
|
+
|
149
|
+
def test_3
|
150
|
+
$FALLBACK_PATTERNS = %w[次へ]
|
151
|
+
$FALLBACK_WORDS = %w[次へ]
|
152
|
+
$FALLBACK_START_WORDS = %w[tugi]
|
153
|
+
assert SiteData.fallbacks.respond_to? :setup!
|
154
|
+
end
|
155
|
+
|
156
|
+
|
157
|
+
end
|
158
|
+
|
159
|
+
|
160
|
+
# >> Loaded suite -
|
161
|
+
# >> Started
|
162
|
+
# >> .....
|
163
|
+
# >>
|
164
|
+
# >> Finished in 1.154570634 seconds.
|
165
|
+
# >>
|
166
|
+
# >> 5 tests, 14 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: w3m-autopagerize
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- rubikitch
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-01-30 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: AutoPagerize for w3m
|
17
|
+
email: rubikitch@ruby-lang.org
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- readme.org
|
26
|
+
- readme.html
|
27
|
+
- config.sample.rb
|
28
|
+
- bin/w3m-autopagerize-server.rb
|
29
|
+
- bin/next.cgi
|
30
|
+
- test/test-w3m-autopagerize.rb
|
31
|
+
has_rdoc: false
|
32
|
+
homepage: http://www.rubyist.net/~rubikitch/
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: "0"
|
43
|
+
version:
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
rubyforge_project: rubikitchrb
|
53
|
+
rubygems_version: 1.3.1
|
54
|
+
signing_key:
|
55
|
+
specification_version: 2
|
56
|
+
summary: AutoPagerize for w3m
|
57
|
+
test_files: []
|
58
|
+
|