RubyGems - w3m-autopagerize - Versions diffs - 1.0.0 - Mend

w3m-autopagerize 1.0.0

Files changed (7) hide show

data/bin/next.cgi +31 -0
data/bin/w3m-autopagerize-server.rb +489 -0
data/config.sample.rb +130 -0
data/readme.html +252 -0
data/readme.org +65 -0
data/test/test-w3m-autopagerize.rb +166 -0
metadata +58 -0

data/bin/next.cgi ADDED

@@ -0,0 +1,31 @@
+#!/usr/local/bin/ruby19 -Ku
+require 'drb'
+require 'w3m-localcgi'
+require 'kconv'
+# Local CGI body: ask the dRuby server for the next (or cropped) page and
+# print the answer as CGI headers followed by an optional HTML body.
+# W3M_URL, W3M_SOURCEFILE, W3M_CHARSET and QUERY_STRING are read from the
+# environment.
+url = ENV['W3M_URL']
+DRb.start_service
+public :print                   # HACK to work w3mctl
+# Remote handle to the object served on dRuby port 9322.
+srv = DRbObject.new_with_uri "druby://:9322"
+# In both branches the top-level object is extended with DRbUndumped and
+# passed as the last argument; the server's w3mctl prints through it.
+# A query string of "crop" selects crop_this_page, anything else nextpage.
+if ENV['QUERY_STRING'] == 'crop'
+  hash = srv.crop_this_page(url, ENV['W3M_SOURCEFILE'], ENV['W3M_CHARSET'], self.extend(DRbUndumped))
+else
+  hash = srv.nextpage(url, ENV['W3M_SOURCEFILE'], ENV['W3M_CHARSET'], self.extend(DRbUndumped))
+end
+# A result with :html is emitted as an HTML document, preceded by the
+# optional W3m-AutoPagerize-* headers.
+if hash[:html]
+  puts "Content-Type: text/html"
+  puts "W3m-AutoPagerize-NextLink: #{hash[:nextLink]}" if hash[:nextLink]
+  puts "W3m-AutoPagerize-PageElement: #{hash[:pageElement]}" if hash[:pageElement]
+  puts
+  puts hash[:html]
+else
+  # Without :html, a :location is turned into W3m-Control headers
+  # (BACK, then GOTO the location).
+  if hash[:location]
+    puts "W3m-Control: BACK"
+    puts "W3m-Control: GOTO #{hash[:location]}"
+    puts "W3m-AutoPagerize-NextLink: #{hash[:nextLink]}" if hash[:nextLink]
+  end
+  # The blank line terminates the header section.
+  puts
+end

data/bin/w3m-autopagerize-server.rb ADDED

@@ -0,0 +1,489 @@
+#!/usr/local/bin/ruby19
+# -*- coding: utf-8 -*-
+# (executable-interpret "ruby19 /m/home/rubikitch/w3m/cgi-bin/w3m-autopagerize/test-w3m-autopagerize.rb --no-use-color ")
+# (executable-interpret "rm -f /log/w3m-autopagerize.log; w3m-autopagerize-server.rb -r")
+# Taken first so that the startup message can report the elapsed time.
+start_time = Time.now
+#Encoding.default_internal = "UTF-8"
+require 'kconv'
+require 'uri'
+require 'rubygems'
+require 'nokogiri'
+require 'pp'
+require 'logger'
+require 'tmpdir'
+require 'json'
+###########################################################################
+# Configurable Variables                                                  #
+###########################################################################
+# Extra command line text inserted into the w3m invocation of
+# get_header_and_content.
+$W3M_EXTRA_OPTIONS = ""
+# see http://www.opera-wiki.com/index.php?FAQ%2F5.%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA#k7bb0c80
+# Substrings tested with XPath contains() by SiteData.fallback_predicate1.
+$FALLBACK_PATTERNS = %w[次へ 次頁 次ページ 次項 次の 次を 先へ つぎへ つぎの
+                         進む next  もっと見る  ]
+# Texts tested for exact equality by SiteData.fallback_predicate1.
+$FALLBACK_WORDS = %w[次 つぎ 続きます keep\ reading [→] 次一覧 Older\ Entries]
+# Prefixes tested with XPath starts-with() by SiteData.fallback_predicate2.
+$FALLBACK_START_WORDS = %w[> ＞ 次 つぎ Next NEXT next →]
+# Locations of SITEINFO JSON read by Server#load_siteinfo.
+$SITEINFO_IMPORT_URLS = %w[
+  http://wedata.net/databases/AutoPagerize/items.json
+]
+# SITEINFO entries whose "url" string equals one of these are skipped by
+# Server#load_siteinfo.
+$EXCLUDE_URLS = %w[
+  ^https?:\/\/.
+]
+# Destination handed to Logger.new at startup.
+$LOG_FILE = $stderr
+###########################################################################
+# DSL for nexturl                                                         #
+###########################################################################
+# Set here and by the test suite; not read anywhere in the code shown.
+$TEST_MODE = false
+# Ordered list of [url_or_pattern, SiteData] pairs; Server#sitedata uses the
+# first entry that matches.
+$SITEINFO = []
+# Object that w3mctl prints to; assigned by Server#prepare.
+$client = nil
+# A single SITEINFO rule. The members hold the rule's XPaths and metadata
+# (nextLink, insertBefore, exampleUrl, pageElement), an optional block that
+# next_url runs, and the match result that is handed to that block.
+class SiteData < Struct.new(:nextLink, :insertBefore, :exampleUrl, :pageElement,
+    :block, :match)
+  # Builds an XPath predicate string: "text" equals any of "words", or "text"
+  # contains any of "patterns". The two groups are joined with " or "; an
+  # empty group is dropped.
+  def self.fallback_predicate1(text, words=$FALLBACK_WORDS, patterns=$FALLBACK_PATTERNS)
+    a = [
+      words.map{|w| %Q!#{text}="#{w}"!}.join(' or '),
+      patterns.map{|w| %Q!contains(#{text},"#{w}")!}.join(' or '),
+    ]
+    a.delete ""
+    a.join " or "
+  end
+  # Builds an XPath predicate string: "text" starts with any of "start_words".
+  def self.fallback_predicate2(text, start_words=$FALLBACK_START_WORDS)
+    start_words.map{|w| %Q!starts-with(#{text},"#{w}")!}.join(' or ')
+  end
+  # link to next
+  # Memoized array of heuristic rules (only nextLink is set): anchors and forms
+  # matching fallback_predicate1 and, when $FALLBACK_START_WORDS is not empty,
+  # fallback_predicate2. The array is extended with FallbackSetup.
+  def self.fallbacks
+    @fallbacks ||= lambda do
+      a = [
+        new("//a[#{fallback_predicate1('.')}]"),
+        new("//form[descendant::input[#{fallback_predicate1('@value')}]]"),
+      ]
+      if $FALLBACK_START_WORDS.to_a.length > 0
+        a.concat [
+          new("//a[#{fallback_predicate2('.')}]"),
+          new("//form[descendant::input[#{fallback_predicate2('@value')}]]"),
+        ]
+      end
+      a.extend(FallbackSetup)
+    end.call
+  end
+  # Mixed into the fallbacks array by SiteData.fallbacks.
+  module FallbackSetup
+    # Appends every fallback rule to $SITEINFO under the catch-all pattern /./.
+    def setup!
+      each {|fallback| $SITEINFO << [/./, fallback]}
+    end
+  end
+  # Make the DSL pretty!
+  # Each Struct reader is replaced by a combined accessor: with an argument it
+  # assigns the member, without one it reads it. A nil or false argument reads
+  # instead of assigning.
+  members.each do |m|
+    undef_method m
+    module_eval <<-EOC         # hack for ruby-mode.el
+      #{'def'} #{m}(v=nil)
+        if v
+          self[:#{m}] = v
+        else
+          self[:#{m}]
+        end
+      end
+    EOC
+  end
+  # Computes the URL that follows "uri".
+  # The block, if any, runs first via instance_exec with (uri, match), so it
+  # may set members such as nextLink. When nextLink is set, the first node
+  # matching that XPath in $nokogiri_cache[uri] supplies its href, action or
+  # value attribute, which is merged into uri; nil is returned when no such
+  # attribute is found. When nextLink is not set, the block's result is merged
+  # into uri instead.
+  def next_url(uri)
+    uri = URI(uri.to_s)
+    result = instance_exec(uri, match, &block) if block
+    xpath = nextLink
+    if xpath
+      nokogiri = $nokogiri_cache[uri.to_s]
+      $logger.info "#{__method__}: use xpath #{xpath}"
+      nodes = nokogiri.xpath(xpath)
+      node = nodes.first
+      $logger.debug "#{__method__}: nodes.length = #{nodes.length}"
+      # The rescue modifier turns a missing node (nil) into "not found".
+      nexturl = (node["href"] || node["action"] || node["value"]) rescue nil
+      #  nexturl = nokogiri.xpath("#{xpath}/@href").first.content rescue nil
+      $logger.info "#{__method__}: nexturl = #{nexturl or 'NOT FOUND'}"
+      if nexturl
+        nexturl.gsub!(/ /, '+') # for some buggy sites not encoding spaces
+        uri.merge nexturl
+      end
+    else
+      $logger.info "#{__method__}: result = #{result}"
+      uri.merge result
+    end
+  end
+end
+# Registers a rule for url_or_pattern through defnext_. The given block is
+# wrapped so that its use is logged before it runs with instance_exec(u, m).
+# When nexturl is given, defnext_ uses that value and the wrapper is ignored.
+def defnext(url_or_pattern, nexturl=nil, &b)
+  defnext_ url_or_pattern, nexturl do |u,m|
+    $logger.info "Use defnext for #{url_or_pattern}"
+    instance_exec(u, m, &b)
+  end
+end
+# Appends [url_or_pattern, SiteData] to $SITEINFO. The SiteData's block is a
+# lambda returning nexturl when nexturl is given, otherwise the supplied block.
+def defnext_(url_or_pattern, nexturl=nil, &block)
+  sd = SiteData.new
+  if nexturl
+    sd.block = lambda{|u,m| nexturl }
+  else
+    sd.block = block
+  end
+  $SITEINFO << [ url_or_pattern, sd ]
+end
+# Registers a rule whose next URL is the current URL with "string" appended.
+def addstring(url_or_pattern, string)
+  defnext_(url_or_pattern) {|u,m|
+    $logger.info "Use addstring for #{url_or_pattern}"
+    u.to_s + string
+  }
+end
+# Registers a rule whose next URL is the current URL with the text of capture
+# group 1 replaced by that number plus n. The block uses m[1], m.begin(1) and
+# m.end(1), so the rule's pattern needs a capture group.
+def increment(url_or_pattern, n=1)
+  defnext_(url_or_pattern) {|u,m|
+    $logger.info "Use increment for #{url_or_pattern}"
+    url=u.to_s
+    nextvar = m[1].to_i + n
+    # A captured number with a leading zero keeps its zero-padded width.
+    url[ m.begin(1) ... m.end(1) ] = if m[1] =~ /^0/
+                                       format("%0#{m[1].length}d", nextvar)
+                                     else
+                                       nextvar.to_s
+                                     end
+    url
+  }
+end
+# Prints W3m-Control header lines to $client, one per argument.
+# nil and false arguments are skipped; true prints "\r\n\r\n" instead of a
+# header. Always returns nil.
+def w3mctl(*strings)
+  strings.each do |str|
+    if str
+      if str==true
+        $client.print "\r\n\r\n"
+      else
+        $client.print "W3m-Control: #{str}\r\n"
+      end
+    end
+  end
+  nil
+end
+###########################################################################
+# File.zread                                                              #
+###########################################################################
+require 'zlib'
+# First two bytes of a gzip stream, kept as a binary string on Ruby 1.9+.
+Zlib::GZIP_MAGIC = "\037\213"
+Zlib::GZIP_MAGIC.force_encoding("ASCII-8BIT") if RUBY_VERSION >= "1.9"
+# Reads "file" and returns its content, gunzipped when the first two bytes are
+# the gzip magic number and unchanged otherwise.
+def File.zread(file)
+  # The block runs with Object as self, so "open" here is Kernel#open rather
+  # than File.open.
+  # NOTE(review): presumably deliberate (Kernel#open accepts more than plain
+  # paths) - confirm before simplifying.
+  Object.module_eval do
+    open(file) do |f|
+      magic = f.read(2)
+      f.rewind
+      if magic == Zlib::GZIP_MAGIC
+        Zlib::GzipReader.wrap(f) {|gz| gz.read }
+      else
+        f.read
+      end
+    end
+  end
+end
+###########################################################################
+# content cache                                                           #
+###########################################################################
+# Scratch file used by get_header_and_content.
+TMPFILE = Dir.tmpdir + "/w3m-autopagerize.tmp.html"
+# url => [source, charset]. On a miss the page is fetched with
+# get_header_and_content; the charset comes from the "charset=" part of the
+# header, or from Kconv.guess when the header has none.
+$content_cache = Hash.new do |h,url|
+  $logger.debug "cache miss: set $content_cache[#{url.inspect}]"
+  # use w3m to pass cookie
+  header, source = get_header_and_content(url)
+  $logger.debug "cache miss: source is html? = #{source =~ /<body/i and true}"
+  charset = normalize_charset(header[/charset=(.+)$/,1] || Kconv.guess(source))
+  source.force_encoding("ASCII-8BIT")
+  h[url] = [source, charset]
+end
+# BUG: libxml2 cannot handle id() function without doctype.
+# http://labs.gmo.jp/blog/ku/2008/07/libxmlhtmlxpathid.html
+DOCTYPE = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
+# url => parsed Nokogiri document. On a miss the source and charset are taken
+# from $content_cache and parsed with DOCTYPE prepended to the source.
+$nokogiri_cache = Hash.new do |h,url|
+  $logger.debug "cache miss: set $nokogiri_cache[#{url.inspect}]"
+  source, charset = $content_cache[url]
+  h[url] = Nokogiri::HTML(DOCTYPE+source, nil, charset)
+end
+###########################################################################
+# Utilities                                                               #
+###########################################################################
+def get_header_and_content(url)
+  output = `w3m #$W3M_EXTRA_OPTIONS -dump_both -o accept_encoding='gzip' "#{url}"`
+  header, source = output.force_encoding("ASCII-8BIT").split(/\n\n/, 2)
+  open(TMPFILE,"wb"){|f| f.write source}
+  source = File.zread(TMPFILE).force_encoding "ASCII-8BIT"
+  [header, source]
+ensure
+  File.unlink TMPFILE if File.exist? TMPFILE
+end
+# Returns the content behind "url" as a binary string: http(s) URLs are
+# fetched with get_header_and_content (header discarded), anything else is
+# read as a local file path with File.zread.
+def get_content(url)
+  if url =~ /^https?:/
+    get_header_and_content(url)[1]
+  else                          # local file
+    File.zread(File.expand_path(url)).force_encoding "ASCII-8BIT"
+  end
+end
+# Converts "charset" to a string and maps "shift_jis" (compared
+# case-insensitively) to "cp932"; every other name is returned unchanged.
+def normalize_charset(charset)
+  charset = charset.to_s
+  # FIXME I do not know other charsets than Japanese.
+  charset.downcase == "shift_jis" ? "cp932" : charset
+end
+# Discards every registered rule by replacing $SITEINFO with an empty array.
+def reinit
+  $SITEINFO = []
+end
+# unless "".respond_to? :force_encoding # for ruby 1.8
+#   class String
+#     def force_encoding(args); self end
+#     def encoding; Kconv.guess(self) end
+#   end
+# end
+###########################################################################
+# Entry Points                                                            #
+###########################################################################
+class Server
+  def sitedata(url)
+    url = url.to_s
+    match = nil
+    sitedata = $SITEINFO.find{|re, block|
+      match = case re
+              when Regexp
+                url.match(re)
+              else
+                url == re.to_s
+              end
+    }[1]
+    sitedata.match = match
+    sitedata
+  end
+  private :sitedata
+  HTML_OUTPUT_FILE = "/tmp/w3m-autopagerize-tmp.html"
+  def crop_html(location, prev_url, sitedata)
+    $logger.debug "#{__method__}: url = #{location}"
+    location = location.to_s
+    nokogiri = $nokogiri_cache[location]
+    title = nokogiri.at("//title").to_html rescue "<title></title>"
+    begin
+      $logger.info "#{__method__}: use xpath #{sitedata.pageElement}"
+      nodes = nokogiri.xpath(sitedata.pageElement)
+      $logger.debug "#{__method__}: nodes.length = #{nodes.length}"
+      html_piece = nodes.to_html
+      raise if html_piece.strip.empty?
+    rescue
+      $logger.error "#{__method__}: failed to crop!"
+      html_piece = nokogiri.at("body").to_html
+      errmsg = %{<p>w3m-autopagerize failed to crop html but next url is found.<br />
+                 xpath = #{sitedata.pageElement || 'pageElement not found'}
+                 </p>
+                 <hr>}
+    else
+        errmsg = ""
+    end
+    # BUG: Nokogiri emits superfluous &#13;.
+    html_piece.gsub! /&#13;/, ''        # hack
+    # BUG: w3m cannot handle <script />, so replace it with <script></script>
+    html_piece.gsub! %r!(<script.+?)/>!, '\1></script>' # hack
+    %w[location title prev_url sitedata.pageElement sitedata.nextLink errmsg html_piece].each do |e|
+      # $logger.debug "#{__method__}: #{e}.encoding = #{eval('e').to_s.encoding}"
+    end
+    html = %{<html>
+             <head><base href="#{location}" />#{title}
+             <link rel="w3m-autopagerize-orig" href="#{location}" />
+             <link rel="w3m-autopagerize-prev" href="#{prev_url}" />
+             </head>
+             <body>
+             Original URL: <a href="#{location}">#{location}</a><br>
+             #{errmsg}
+             #{html_piece}
+             </body></html> }
+    { :html => html, :location => location,
+      :pageElement => sitedata.pageElement, :nextLink => sitedata.nextLink}
+  end
+  private :crop_html
+  def prefetch_next_location(location, sitedata)
+    Thread.start do
+      #        sleep 1
+      $logger.debug "#{__method__}: #{location}"
+      # sitedata = sitedata location
+      newloc = sitedata.next_url(location)
+      $logger.debug "#{__method__}: new location: #{newloc}"
+      $nokogiri_cache[newloc.to_s]
+    end
+    # It uses Ordered Hash in Ruby 1.9
+    [$nokogiri_cache, $content_cache].each do |hash|
+      hash.delete hash.first[0] if hash.length > 3
+    end
+  end
+  private :prefetch_next_location
+  def prepare(url, srcfile, charset, client, method)
+    $logger.info "=================================================="
+    $logger.info "#{method}: entered url=#{url} charset=#{charset}"
+    $logger.debug "#{method}: W3M_SOURCEFILE = #{srcfile}" if srcfile
+    $client = client
+    src = File.zread(srcfile).force_encoding("ASCII-8BIT") if srcfile
+    if url =~ /^file:.*\/cgi-bin\// # from Local CGI
+      url = src.force_encoding("ASCII-8BIT")[%r!<base href=['"](.+?)['"]!, 1] # '"
+      $logger.info "#{method}: base url=#{url}"
+    else                        # from W3M_SOURCEFILE
+      $logger.debug "#{method}: set $content_cache[#{url.inspect}] from W3M_SOURCEFILE"
+      $logger.debug "#{method}: source is html? = #{src =~ /<body/i and true}"
+      $content_cache[url] = [src.force_encoding("ASCII-8BIT"), normalize_charset(charset)] if src
+    end
+    [ src, url ]
+  end
+  def crop_this_page(url, srcfile, charset, client)
+    src, url = prepare(url, srcfile, charset, client, __method__)
+    sitedata = sitedata url
+    begin
+      crop_html url, nil, sitedata
+    ensure
+      prefetch_next_location sitedata.next_url(url), sitedata
+    end
+  end
+  def nextpage(url, srcfile, charset, client)
+    src, url = prepare(url, srcfile, charset, client, __method__)
+    sitedata = sitedata url
+    location = sitedata.next_url(url)
+    if location
+      if sitedata.pageElement
+        $logger.debug "#{__method__}: location and pageElement found."
+      else
+        $logger.debug "#{__method__}: location found."
+      end
+      begin
+        crop_html location, url, sitedata
+      ensure
+        prefetch_next_location location, sitedata
+      end
+    else
+      fallback_nexturl = for fallback in SiteData.fallbacks
+                           u = fallback.next_url(url) and break u
+                         end
+      if fallback_nexturl
+        $logger.info "#{__method__}: fallback"
+        begin
+          crop_html fallback_nexturl, url, fallback
+        ensure
+          prefetch_next_location fallback_nexturl, fallback
+        end
+      else
+        $logger.debug "#{__method__}: no location."
+        raise "no location!"
+      end
+    end
+  rescue
+    html = %{<pre>Error!
+xpath = #{sitedata.nextLink || 'nextLink not found'}
+#{$!}
+#{$@.pretty_inspect}
+src_encoding=#{Kconv.guess(src || $content_cache[url].first)}
+</pre>
+    }
+    $logger.error "#{__method__}: error!"
+    $logger.error "#{__method__}: #$!"
+    $logger.error "#{__method__}: #{$@.pretty_inspect}"
+    {:html => html}
+  end
+  # (executable-interpret "ruby19 -r w3m-autopagerize-server -e '$logger=Logger.new(); load_siteinfo'")
+  # (executable-interpret "ruby18 -r w3m-autopagerize-server -e '$logger=Logger.new(); load_siteinfo'")
+  def load_siteinfo
+    keys = %w[exampleUrl insertBefore pageElement nextLink]
+    $SITEINFO_IMPORT_URLS.each do |siteinfo_url|
+      JSON.parse(get_content(siteinfo_url).toutf8).each do |hash|
+        data = hash["data"]
+        if url = data["url"] and not $EXCLUDE_URLS.include? url
+          sd = SiteData.new data["nextLink"], data["insertBefore"],
+                 data["exampleUrl"], data["pageElement"]
+          $SITEINFO << [Regexp.new(url), sd]
+        end
+      end
+    end
+    $logger.info "#{__method__}: loaded"
+  end
+  def restart
+    exec $0
+  end
+  def load_config_file(config_file)
+    if config_file == :ignore
+      $stderr.puts "load_config_file: config file is ignored!"
+    else
+      config_file = File.expand_path(config_file, File.dirname(__FILE__))
+      if File.file? config_file
+        load(config_file)
+        $stderr.puts "load_config_file: loaded #{config_file}"
+      else
+        $stderr.puts "load_config_file: config file #{config_file} not found!"
+      end
+    end
+  rescue Exception
+    $stderr.puts "load_config_file: error loading #{config_file}!"
+  end
+end
+# Run as a script: parse the options, load the config file and the SITEINFO
+# data, then serve a Server instance over dRuby until the DRb thread ends.
+if __FILE__==$0
+  require 'optparse'
+  require 'drb'
+  conf = Struct.new(:log_file, :siteinfo_url, :config_file).new
+  conf.config_file = File.expand_path "~/.w3m-autopagerize.rb"
+  ARGV.options {|o|
+    # A log file given with -l is resolved relative to this script's directory.
+    o.on("-l", "--log LOGFILE",
+      "Use log file.") {|x| conf.log_file = File.expand_path(x, File.dirname(__FILE__)) }
+    o.on("-s", "--siteinfo URL",
+      "URL of JSON data (SITEINFO).") {|x| conf.siteinfo_url = x}
+    o.on("-c", "--config CONFIG", "Use config file.") {|x| conf.config_file = x }
+    o.on("-f", "Ignore config file.") {|x| conf.config_file = :ignore }
+    # Asks an already running server to restart and exits. Errors of the
+    # remote call are discarded by "rescue nil", so the message is printed
+    # whether or not a server was reached.
+    o.on("-r", "--restart", "--reload",
+      "Restart the server.") {|x|
+      DRbObject.new_with_uri(%q!druby://:9322!).restart rescue nil
+      puts "w3m-autopagerize-server restarted."
+      exit
+    }
+    o.parse!
+  }
+  srv = Server.new
+  srv.load_config_file(conf.config_file)
+  # The -l option wins over a $LOG_FILE set by the config file.
+  $LOG_FILE = conf.log_file || $LOG_FILE
+  $stderr.puts "startup: log file = #{$LOG_FILE.inspect}"
+  $logger = Logger.new($LOG_FILE)
+  $SITEINFO_IMPORT_URLS = [ conf.siteinfo_url ] if conf.siteinfo_url
+  $logger.info "$SITEINFO_IMPORT_URLS = #{$SITEINFO_IMPORT_URLS.inspect}"
+  srv.load_siteinfo
+  GC.start
+  # Fallback rules are appended last, after the config file's rules and the
+  # imported SITEINFO.
+  SiteData.fallbacks.setup!
+  $stderr.puts "start w3m-autopagerize-server.rb (#{Time.now-start_time} secs)"
+  # Force a garbage collection every 300 seconds.
+  Thread.start do
+    loop { sleep 300; GC.start }
+  end
+  # NOTE(review): the URI has an empty host, so the service presumably
+  # accepts connections on every interface - confirm whether it should be
+  # limited to localhost.
+  DRb.start_service("druby://:9322", srv)
+  DRb.thread.join
+end

data/config.sample.rb ADDED

@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# This file shows default setting. If you customize w3m-autopagerize,
+# copy this file to ~/.w3m-autopagerize.rb.
+# Extra options of w3m to fetch web page.
+$W3M_EXTRA_OPTIONS = ""
+# SITEINFO location. Set URL or filename of SITEINFO JSON data.
+$SITEINFO_IMPORT_URLS = %w[
+  http://wedata.net/databases/AutoPagerize/items.json
+]
+# Disable SITEINFO entries. The default is to ignore `"url": "^https?:\/\/."' entry.
+$EXCLUDE_URLS = %w[
+  ^https?:\/\/.
+]
+# Log file location
+# =================
+#
+# The default destination of the log is stderr.
+$LOG_FILE = $stderr
+# If you use a log file, uncomment this. Note that the default
+# directory of log file is the directory of w3m-autopagerize-server.rb.
+#  $LOG_FILE = "w3m-autopagerize.log"
+# Fallback patterns
+# =================
+#
+# If w3m-autopagerize cannot find next location, ie, wrong/no SITEINFO
+# entry, w3m-autopagerize uses heuristic method to find next location
+# with $FALLBACK_* variables. It is like FastForward of Opera.
+#
+# Links/buttons whose text is "next" or "keep reading" (full match) are
+# considered as next location.
+$FALLBACK_WORDS = %w[次 つぎ 続きます keep\ reading [→] 次一覧 Older\ Entries next Next NEXT]
+# Links/buttons whose text starts with ">" (prefix match) are
+# considered as next location.
+$FALLBACK_START_WORDS = %w[> ＞]
+# Links/buttons whose text contains one of these patterns, e.g. "次へ"
+# (partial match), are considered as next location.
+$FALLBACK_PATTERNS = %w[次へ 次頁 次ページ 次項 次の 次を 先へ つぎへ つぎの 進む もっと見る  ]
+# Custom Location
+# ===============
+#
+# You write `next' pages by URL rule. Use `addstring' and `increment'
+# function. It is handy method to specify next location.
+# It requires NO XPATH KNOWLEDGE, but some Regexp knowledge:-)
+#
+# Custom locations take precedence over SITEINFO: even if SITEINFO
+# defines the configuration of a site, the custom location is used
+# instead.
+#
+# For example, The next page of "http://www.dotup.org/" is
+# "http://www.dotup.org/2.html". Use simply `addstring' function.
+#
+# The next page of "http://www.dotup.org/2.html" is
+# "http://www.dotup.org/3.html". Use `increment' function with Regexp.
+# The digits captured by the first group "(\d+)" are replaced with the next number.
+# Note that writing a URL Regexp by %r!URL Regexp! is handy.
+addstring "http://www.dotup.org/", "2.html"
+increment %r!http://www.dotup.org/(\d+).html$!
+#
+# `increment' can add any integer. For example,
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20"
+# to
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=40".
+increment %r!^http://images.google.(?:co.jp|com)/.*start=(\d+)!, 20
+#
+# `addstring' function can accept Regexp. For example,
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja"
+# to
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20".
+#
+# Note that `increment' of google image search must be defined BEFORE
+# `addstring'. If `addstring' is before `increment', w3m-autopagerize
+# considers the next page of
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20"
+# as
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20&start=20".
+# It is because the URL matches both
+# %r!^http://images.google.(?:co.jp|com)/! and
+# %r!^http://images.google.(?:co.jp|com)/.*start=(\d+)!.
+addstring %r!^http://images.google.(?:co.jp|com)/!, '&start=20'
+# Custom Action
+# =============
+#
+# You can execute any w3m commands for a certain URL. For example, after
+# I (rubikitch) log in to Hatena, I open my Hatena diary by executing
+# "GOTO http://d.hatena.ne.jp/rubikitch/" followed by "DELETE_PREVBUF"
+# three times. Use `defnext' and `w3mctl'.
+#
+# This is a good example of login and goto action. Note that when you
+# use login and goto, you must set your login/password to
+# ~/.w3m/pre_form file.
+defnext "https://www.hatena.ne.jp/login" do
+  w3mctl "GOTO http://d.hatena.ne.jp/rubikitch/", "DELETE_PREVBUF", "DELETE_PREVBUF", "DELETE_PREVBUF"
+end
+# Custom SITEINFO
+# ===============
+#
+# If you have your original SITEINFO for AutoPagerize, you can simply
+# add the URL or filename into the top of $SITEINFO_IMPORT_URLS.
+#
+# The SITEINFO can be defined in Ruby DSL.
+#
+# In JSON:
+#
+#   {
+#     "name": "(.~) what a quiet stiff (~.)",
+#     "data": {
+#       "insertBefore": "",
+#       "pageElement": "id(\"pixflow\")",
+#       "url": "^http:\/\/whytheluckystiff\\.net\/quiet\/",
+#       "nextLink": "id(\"header\")\/a[last()]",
+#       "exampleUrl": "http:\/\/whytheluckystiff.net\/quiet\/"
+#     }
+#   }
+#
+# In Ruby:
+#
+#   defnext %r!^http://whytheluckystiff\.net/quiet/! do
+#     insertBefore ''
+#     pageElement  'id("pixflow")'
+#     nextLink     'id("header")/a[last()]'
+#     exampleUrl   'http://whytheluckystiff.net/quiet/'
+#   end

data/readme.html ADDED

@@ -0,0 +1,252 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+               "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml"
+lang="en" xml:lang="en">
+<head>
+<title>AutoPagerize for w3m</title>
+<meta http-equiv="Content-Type" content="text/html;charset=euc-jp"/>
+<meta name="generator" content="Org-mode"/>
+<meta name="generated" content="2009-01-30"/>
+<meta name="author" content="rubikitch"/>
+<style type="text/css">
+ <!--/*--><![CDATA[/*><!--*/
+  html { font-family: Times, serif; font-size: 12pt; }
+  .title  { text-align: center; }
+  .todo   { color: red; }
+  .done   { color: green; }
+  .tag    { background-color:lightblue; font-weight:normal }
+  .target { }
+  .timestamp { color: grey }
+  .timestamp-kwd { color: CadetBlue }
+  p.verse { margin-left: 3% }
+  pre {
+	border: 1pt solid #AEBDCC;
+	background-color: #F3F5F7;
+	padding: 5pt;
+	font-family: courier, monospace;
+        font-size: 90%;
+        overflow:auto;
+  }
+  table { border-collapse: collapse; }
+  td, th { vertical-align: top; }
+  dt { font-weight: bold; }
+  div.figure { padding: 0.5em; }
+  div.figure p { text-align: center; }
+  .linenr { font-size:smaller }
+  .code-highlighted {background-color:#ffff00;}
+  .org-info-js_info-navigation { border-style:none; }
+  #org-info-js_console-label { font-size:10px; font-weight:bold;
+                               white-space:nowrap; }
+  .org-info-js_search-highlight {background-color:#ffff00; color:#000000;
+                                 font-weight:bold; }
+  /*]]>*/-->
+</style>
+<script type="text/javascript">
+<!--/*--><![CDATA[/*><!--*/
+ function CodeHighlightOn(elem, id)
+ {
+   var target = document.getElementById(id);
+   if(null != target) {
+     elem.cacheClassElem = elem.className;
+     elem.cacheClassTarget = target.className;
+     target.className = "code-highlighted";
+     elem.className   = "code-highlighted";
+   }
+ }
+ function CodeHighlightOff(elem, id)
+ {
+   var target = document.getElementById(id);
+   if(elem.cacheClassElem)
+     elem.className = elem.cacheClassElem;
+   if(elem.cacheClassTarget)
+     target.className = elem.cacheClassTarget;
+ }
+/*]]>*/-->
+</script>
+</head><body>
+<h1 class="title">AutoPagerize for w3m</h1>
+<p>AutoPagerize for w3m <a href="http://rubikitchrb.rubyforge.org/">http://rubikitchrb.rubyforge.org/</a>
+</p>
+<p>
+Copyright (c) 2009 rubikitch &lt;rubikitch@ruby-lang.org&gt; <a href="http://www.rubyist.net/~rubikitch/">http://www.rubyist.net/~rubikitch/</a>
+</p>
+<p>
+Use and distribution subject to the terms of the Ruby license.
+</p>
+<div id="table-of-contents">
+<h2>Table of Contents</h2>
+<div id="text-table-of-contents">
+<ul>
+<li><a href="#sec-1">1 Overview </a></li>
+<li><a href="#sec-2">2 Programs </a>
+<ul>
+<li><a href="#sec-2.1">2.1 w3m-autopagerize-server.rb </a></li>
+<li><a href="#sec-2.2">2.2 next.cgi </a></li>
+<li><a href="#sec-2.3">2.3 config.sample.rb </a></li>
+</ul>
+</li>
+<li><a href="#sec-3">3 Installation </a>
+<ul>
+<li><a href="#sec-3.1">3.1 Install AutoPagerize for w3m </a></li>
+<li><a href="#sec-3.2">3.2 Copy config file </a></li>
+<li><a href="#sec-3.3">3.3 Local CGI setup </a></li>
+<li><a href="#sec-3.4">3.4 Key bind </a></li>
+</ul>
+</li>
+<li><a href="#sec-4">4 Usage </a></li>
+<li><a href="#sec-5">5 License </a></li>
+</ul>
+</div>
+</div>
+<div id="outline-container-1" class="outline-2">
+<h2 id="sec-1">1 Overview </h2>
+<div id="text-1">
+<p>AutoPagerize for w3m finds next link and extracts page contents. It
+consists of dRuby server program (w3m-autopagerize-server.rb) and w3m
+Local CGI program (next.cgi).
+</p>
+</div>
+</div>
+<div id="outline-container-2" class="outline-2">
+<h2 id="sec-2">2 Programs </h2>
+<div id="text-2">
+</div>
+<div id="outline-container-2.1" class="outline-3">
+<h3 id="sec-2.1">2.1 w3m-autopagerize-server.rb </h3>
+<div id="text-2.1">
+<p>AutoPagerize for w3m uses dRuby server w3m-autopagerize-server.rb
+because initializing site data is time-consuming. Before using
+AutoPagerize for w3m, you have to invoke w3m-autopagerize-server.rb!
+w3m-autopagerize-server.rb loads config file (~/.w3m-autopagerize.rb)
+if any and reads AutoPagerize SITEINFO data from wedata.net by
+default.
+</p>
+</div>
+</div>
+<div id="outline-container-2.2" class="outline-3">
+<h3 id="sec-2.2">2.2 next.cgi </h3>
+<div id="text-2.2">
+<p>next.cgi is the Local CGI program that asks w3m-autopagerize-server.rb for the next page.
+</p>
+</div>
+</div>
+<div id="outline-container-2.3" class="outline-3">
+<h3 id="sec-2.3">2.3 config.sample.rb </h3>
+<div id="text-2.3">
+<p>The sample config file to customize. See <a href="config.sample.rb">file:config.sample.rb</a> for detail.
+</p>
+</div>
+</div>
+</div>
+<div id="outline-container-3" class="outline-2">
+<h2 id="sec-3">3 Installation </h2>
+<div id="text-3">
+</div>
+<div id="outline-container-3.1" class="outline-3">
+<h3 id="sec-3.1">3.1 Install AutoPagerize for w3m </h3>
+<div id="text-3.1">
+<p>AutoPagerize for w3m works with Ruby 1.9 only! So, you have to install
+Ruby 1.9.x. Then simply issue:
+</p>
+<pre class="example">
+ $ sudo ruby1.9 -S gem install w3m-autopagerize
+</pre>
+</div>
+</div>
+<div id="outline-container-3.2" class="outline-3">
+<h3 id="sec-3.2">3.2 Copy config file </h3>
+<div id="text-3.2">
+<p>If you customize AutoPagerize for w3m, copy config.sample.rb to
+~/.w3m-autopagerize.rb and edit it.
+</p>
+</div>
+</div>
+<div id="outline-container-3.3" class="outline-3">
+<h3 id="sec-3.3">3.3 Local CGI setup </h3>
+<div id="text-3.3">
+<p>Local CGI program next.cgi is installed at
+/usr/local/bin/next.cgi. You have to make w3m find it. Add
+/usr/local/bin to your Local CGI path (cgi_bin), or make a symlink.
+</p>
+<pre class="example">
+ $ cd ~/w3m/cgi-bin; ln -s /usr/local/bin/next.cgi
+</pre>
+</div>
+</div>
+<div id="outline-container-3.4" class="outline-3">
+<h3 id="sec-3.4">3.4 Key bind </h3>
+<div id="text-3.4">
+<p>Bind AutoPagerize for w3m to your favorite key. Edit ~/.w3m/keymap and add this line.
+</p>
+<pre class="example">
+ keymap x GOTO file:/cgi-bin/next.cgi
+</pre>
+</div>
+</div>
+</div>
+<div id="outline-container-4" class="outline-2">
+<h2 id="sec-4">4 Usage </h2>
+<div id="text-4">
+<ul>
+<li>
+Press `x' key to go to next page.
+</li>
+<li>
+Press `=' key to see information, eg. XPath to get next page.
+</li>
+</ul>
+</div>
+</div>
+<div id="outline-container-5" class="outline-2">
+<h2 id="sec-5">5 License </h2>
+<div id="text-5">
+<p>AutoPagerize for w3m is licensed under the same terms as Ruby.
+</p></div>
+</div>
+<div id="postamble"><p class="author"> Author: rubikitch
+<a href="mailto:rubikitch@ruby-lang.org">&lt;rubikitch@ruby-lang.org&gt;</a>
+</p>
+<p class="date"> Date: 2009-01-30</p>
+<p>HTML generated by org-mode 6.18 in emacs 22</p>
+</div></body>
+</html>

data/readme.org ADDED

@@ -0,0 +1,65 @@
+#+TITLE:     AutoPagerize for w3m
+#+AUTHOR:    rubikitch
+#+EMAIL:     rubikitch@ruby-lang.org
+#+DATE:      2009-01-30
+#+LANGUAGE:  en
+#+OPTIONS:   H:3 num:t toc:t \n:nil @:t ::t |:t ^:t -:t f:t *:t TeX:t LaTeX:nil skip:nil d:nil todo:t pri:nil tags:not-in-toc
+#+INFOJS_OPT: view:nil toc:nil ltoc:t mouse:underline buttons:0 path:http://orgmode.org/org-info.js
+#+EXPORT_SELECT_TAGS: export
+#+EXPORT_EXCLUDE_TAGS: noexport
+#+LINK_UP:
+#+LINK_HOME:
+AutoPagerize for w3m http://rubikitchrb.rubyforge.org/
+Copyright (c) 2009 rubikitch <rubikitch@ruby-lang.org> http://www.rubyist.net/~rubikitch/
+Use and distribution subject to the terms of the Ruby license.
+* Overview
+AutoPagerize for w3m finds next link and extracts page contents. It
+consists of dRuby server program (w3m-autopagerize-server.rb) and w3m
+Local CGI program (next.cgi).
+* Programs
+** w3m-autopagerize-server.rb
+AutoPagerize for w3m uses dRuby server w3m-autopagerize-server.rb
+because initializing site data is time-consuming. Before using
+AutoPagerize for w3m, you have to invoke w3m-autopagerize-server.rb!
+w3m-autopagerize-server.rb loads config file (~/.w3m-autopagerize.rb)
+if any and reads AutoPagerize SITEINFO data from wedata.net by
+default.
+** next.cgi
+next.cgi is the Local CGI program that asks w3m-autopagerize-server.rb for the next page.
+** config.sample.rb
+The sample config file to customize. See file:config.sample.rb for detail.
+* Installation
+** Install AutoPagerize for w3m
+AutoPagerize for w3m works with Ruby 1.9 only! So, you have to install
+Ruby 1.9.x. Then simply issue:
+: $ sudo ruby1.9 -S gem install w3m-autopagerize
+** Copy config file
+If you customize AutoPagerize for w3m, copy config.sample.rb to
+~/.w3m-autopagerize.rb and edit it.
+** Local CGI setup
+Local CGI program next.cgi is installed at
+/usr/local/bin/next.cgi. You have to make w3m find it. Add
+/usr/local/bin to your Local CGI path (cgi_bin), or make symlink.
+: $ cd ~/w3m/cgi-bin; ln -s /usr/local/bin/next.cgi
+** Key bind
+Bind AutoPagerize for w3m to your favorite key. Edit ~/.w3m/keymap and add this line.
+: keymap x GOTO file:/cgi-bin/next.cgi
+* Usage
+- Press `x' key to go to next page.
+- Press `=' key to see information, eg. XPath to get next page.
+* License
+AutoPagerize for w3m is licensed under the same terms as Ruby.

data/test/test-w3m-autopagerize.rb ADDED

@@ -0,0 +1,166 @@
+#!/usr/local/bin/ruby19
+# -*- coding: utf-8 -*-
+# (executable-interpret "ruby19 /m/home/rubikitch/w3m/cgi-bin/w3m-autopagerize/test-w3m-autopagerize.rb --no-use-color ")
+require 'fileutils'
+FileUtils.rm_f "test.log" # remove any log left by an earlier run; $logger below writes to this same file
+require 'test/unit'
+require 'open-uri'
+require 'script' # NOTE(review): not a standard library name; presumably an author/project helper -- confirm
+require 'w3m-autopagerize-server' # !> method redefined; discarding old debug_with_time
+$TEST_MODE = true # assigned after the server file is loaded; NOTE(review): presumably read by the server code -- confirm
+$W3M_EXTRA_OPTIONS = "-o http_proxy=http://127.0.0.1:8339/" # w3m option string naming an HTTP proxy on 127.0.0.1:8339; NOTE(review): presumably a local proxy must be listening there -- confirm
+$logger = Logger.new "test.log" # the tests write a "Test: <method>" marker through this logger
+class TestAutoPagerize < Test::Unit::TestCase
+  def test_hatena_success
+    $logger.info "Test: #{__method__}"
+    reinit
+    defnext %r{^https?:\/\/(?:d2?|[^.]+\.g)\.hatena\.ne\.jp\/} do
+      exampleUrl   %{http://os0x.g.hatena.ne.jp/os0x/}
+      pageElement  %{id("days")}
+      nextLink     %{//a[@rel="prev"]}
+    end
+    nexturl = "http://d.hatena.ne.jp/rubikitch/20090110/1231524557"
+    origurl = "http://d.hatena.ne.jp/rubikitch/20090113/1231844047"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+    assert_match(/<base/, np[:html])
+    assert_equal %{id("days")}, np[:pageElement]
+    assert_equal %{//a[@rel="prev"]}, np[:nextLink]
+  end
+  def test_hatena_fail
+    $logger.info "Test: #{__method__}"
+    reinit
+    defnext %r{^https?:\/\/(?:d2?|[^.]+\.g)\.hatena\.ne\.jp\/} do
+      exampleUrl   %{http://os0x.g.hatena.ne.jp/os0x/}
+      pageElement  %{id("noelement")}
+      nextLink     %{//a[@rel="prev"]}
+    end
+    nexturl = "http://d.hatena.ne.jp/rubikitch/20090110/1231524557"
+    origurl = "http://d.hatena.ne.jp/rubikitch/20090113/1231844047"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+    assert_match(/failed to crop html/, np[:html])
+    assert_equal %{id("noelement")}, np[:pageElement]
+    assert_equal %{//a[@rel="prev"]}, np[:nextLink]
+  end
+  def test_google_addstring
+    $logger.info "Test: #{__method__}"
+    reinit
+    addstring %r!^http://www.google.(?:co.jp|com)/search!, '&start=100'
+    nexturl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=100"
+    origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_google_increment
+    reinit
+    increment %r!^http://www.google.(?:co.jp|com)/search.*start=(\d+)!, 100
+    nexturl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=200"
+    origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=100"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_google_fallback_link
+    $logger.info "Test: #{__method__}"
+    reinit
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $SITEINFO = [[ /./, SiteData.fallbacks[0] ]]
+    nexturl = "http://www.google.com/search?num=100&hl=ja&pwst=1&q=ruby&start=100&sa=N"
+    origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100"
+    np = Server.new.nextpage(origurl, nil, "UTF-8", Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_futaba_fallback_form
+    $logger.info "Test: #{__method__}"
+    reinit
+    $FALLBACK_WORDS = %w[次のページ]
+    $SITEINFO = [[ /./, SiteData.fallbacks[1] ]]
+    nexturl = "http://may.2chan.net/27/1.htm"
+    origurl = "http://may.2chan.net/27/futaba.htm"
+    np = Server.new.nextpage(origurl, nil, "cp932", Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_futaba_fallback_by_wrong_sitedata
+    $logger.info "Test: #{__method__}"
+    reinit
+    defnext %r{2chan} do
+      pageElement  %{id("noelement")}
+      nextLink     %{//a[@rel="prev"]}
+    end
+    $FALLBACK_WORDS = %w[次のページ]
+    SiteData.fallbacks.setup!
+    nexturl = "http://may.2chan.net/27/1.htm"
+    origurl = "http://may.2chan.net/27/futaba.htm"
+    np = Server.new.nextpage(origurl, nil, "cp932", Object.new)
+    assert_equal nexturl, np[:location]
+  end
+end
+class TestFallBackPredicate < Test::Unit::TestCase
+  def test_1
+    assert_equal '.="tugi" or contains(.,"Next")',
+      SiteData.fallback_predicate1(".", %w[tugi], %w[Next])
+  end
+  def test_2
+    assert_equal '.="tugi"', SiteData.fallback_predicate1(".", %w[tugi], [])
+  end
+  def test_3
+    assert_equal 'contains(.,"Next")', SiteData.fallback_predicate1(".", [], %w[Next])
+  end
+end
+class TestFallBackSiteData < Test::Unit::TestCase
+  def setup
+    SiteData.instance_eval { @fallbacks = nil }
+  end
+  def test_1
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $FALLBACK_START_WORDS = %w[tugi]
+    assert_equal 4, SiteData.fallbacks.length
+  end
+  def test_2
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $FALLBACK_START_WORDS = []
+    assert_equal 2, SiteData.fallbacks.length
+  end
+  def test_3
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $FALLBACK_START_WORDS = %w[tugi]
+    assert SiteData.fallbacks.respond_to? :setup!
+  end
+end
+# >> Loaded suite -
+# >> Started
+# >> .....
+# >>
+# >> Finished in 1.154570634 seconds.
+# >>
+# >> 5 tests, 14 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications

metadata ADDED

@@ -0,0 +1,58 @@
+--- !ruby/object:Gem::Specification
+name: w3m-autopagerize
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- rubikitch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-01-30 00:00:00 +09:00
+default_executable:
+dependencies: []
+description: AutoPagerize for w3m
+email: rubikitch@ruby-lang.org
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- readme.org
+- readme.html
+- config.sample.rb
+- bin/w3m-autopagerize-server.rb
+- bin/next.cgi
+- test/test-w3m-autopagerize.rb
+has_rdoc: false
+homepage: http://www.rubyist.net/~rubikitch/
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: rubikitchrb
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: AutoPagerize for w3m
+test_files: []