RubyGems - w3m-autopagerize - Versions diffs - 1.0.0 - Mend

w3m-autopagerize 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/bin/next.cgi +31 -0
data/bin/w3m-autopagerize-server.rb +489 -0
data/config.sample.rb +130 -0
data/readme.html +252 -0
data/readme.org +65 -0
data/test/test-w3m-autopagerize.rb +166 -0
metadata +58 -0

data/bin/next.cgi ADDED

@@ -0,0 +1,31 @@
+#!/usr/local/bin/ruby19 -Ku
+# w3m Local CGI front end for w3m-autopagerize.
+# Asks the dRuby server at druby://:9322 for the page that follows the one
+# named by W3M_URL (or, when QUERY_STRING is 'crop', for a cropped version of
+# that page) and prints the reply as a CGI response on stdout.
+require 'drb'
+require 'w3m-localcgi'
+require 'kconv'
+url = ENV['W3M_URL']
+DRb.start_service
+public :print                   # HACK to work w3mctl
+srv = DRbObject.new_with_uri "druby://:9322"
+# self is handed to the server by reference (DRbUndumped) as its `client`
+# argument; the server's w3mctl calls `print` on it, hence the hack above.
+if ENV['QUERY_STRING'] == 'crop'
+  hash = srv.crop_this_page(url, ENV['W3M_SOURCEFILE'], ENV['W3M_CHARSET'], self.extend(DRbUndumped))
+else
+  hash = srv.nextpage(url, ENV['W3M_SOURCEFILE'], ENV['W3M_CHARSET'], self.extend(DRbUndumped))
+end
+if hash[:html]
+  # The server produced HTML: emit it, preceded by the AutoPagerize headers
+  # for whichever of :nextLink / :pageElement are present in the reply.
+  puts "Content-Type: text/html"
+  puts "W3m-AutoPagerize-NextLink: #{hash[:nextLink]}" if hash[:nextLink]
+  puts "W3m-AutoPagerize-PageElement: #{hash[:pageElement]}" if hash[:pageElement]
+  puts
+  puts hash[:html]
+else
+  # No HTML in the reply: when a location is given, send W3m-Control headers
+  # (BACK, then GOTO the location). The blank line ends the header section.
+  if hash[:location]
+    puts "W3m-Control: BACK"
+    puts "W3m-Control: GOTO #{hash[:location]}"
+    puts "W3m-AutoPagerize-NextLink: #{hash[:nextLink]}" if hash[:nextLink]
+  end
+  puts
+end

data/bin/w3m-autopagerize-server.rb ADDED

@@ -0,0 +1,489 @@
+#!/usr/local/bin/ruby19
+# -*- coding: utf-8 -*-
+# (executable-interpret "ruby19 /m/home/rubikitch/w3m/cgi-bin/w3m-autopagerize/test-w3m-autopagerize.rb --no-use-color ")
+# (executable-interpret "rm -f /log/w3m-autopagerize.log; w3m-autopagerize-server.rb -r")
+start_time = Time.now
+#Encoding.default_internal = "UTF-8"
+require 'kconv'
+require 'uri'
+require 'rubygems'
+require 'nokogiri'
+require 'pp'
+require 'logger'
+require 'tmpdir'
+require 'json'
+require 'shellwords'
+###########################################################################
+# Configurable Variables                                                  #
+###########################################################################
+$W3M_EXTRA_OPTIONS = ""
+# see http://www.opera-wiki.com/index.php?FAQ%2F5.%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA#k7bb0c80
+$FALLBACK_PATTERNS = %w[次へ 次頁 次ページ 次項 次の 次を 先へ つぎへ つぎの
+                         進む next  もっと見る  ]
+$FALLBACK_WORDS = %w[次 つぎ 続きます keep\ reading [→] 次一覧 Older\ Entries]
+$FALLBACK_START_WORDS = %w[> ＞ 次 つぎ Next NEXT next →]
+$SITEINFO_IMPORT_URLS = %w[
+  http://wedata.net/databases/AutoPagerize/items.json
+]
+$EXCLUDE_URLS = %w[
+  ^https?:\/\/.
+]
+$LOG_FILE = $stderr
+###########################################################################
+# DSL for nexturl                                                         #
+###########################################################################
+$TEST_MODE = false
+$SITEINFO = []
+$client = nil
+# One SITEINFO entry: how to find the next page (a nextLink XPath and/or a
+# Ruby block) and which part of a page to keep (the pageElement XPath).
+# Entries are stored in $SITEINFO as [url_or_pattern, SiteData] pairs.
+class SiteData < Struct.new(:nextLink, :insertBefore, :exampleUrl, :pageElement,
+    :block, :match)
+  # Builds an XPath predicate that is true when `text` equals one of `words`
+  # or contains one of `patterns`. A group that is empty contributes nothing.
+  def self.fallback_predicate1(text, words=$FALLBACK_WORDS, patterns=$FALLBACK_PATTERNS)
+    a = [
+      words.map{|w| %Q!#{text}="#{w}"!}.join(' or '),
+      patterns.map{|w| %Q!contains(#{text},"#{w}")!}.join(' or '),
+    ]
+    a.delete ""
+    a.join " or "
+  end
+  # Builds an XPath predicate that is true when `text` starts with one of
+  # `start_words`.
+  def self.fallback_predicate2(text, start_words=$FALLBACK_START_WORDS)
+    start_words.map{|w| %Q!starts-with(#{text},"#{w}")!}.join(' or ')
+  end
+  # link to next
+  # Returns the memoized list of heuristic entries (only nextLink is set):
+  # links and forms matched by fallback_predicate1, followed by those matched
+  # by fallback_predicate2 when $FALLBACK_START_WORDS is not empty.
+  # The list is extended with FallbackSetup.
+  def self.fallbacks
+    @fallbacks ||= lambda do
+      a = [
+        new("//a[#{fallback_predicate1('.')}]"),
+        new("//form[descendant::input[#{fallback_predicate1('@value')}]]"),
+      ]
+      if $FALLBACK_START_WORDS.to_a.length > 0
+        a.concat [
+          new("//a[#{fallback_predicate2('.')}]"),
+          new("//form[descendant::input[#{fallback_predicate2('@value')}]]"),
+        ]
+      end
+      a.extend(FallbackSetup)
+    end.call
+  end
+  # Mixed into the array returned by SiteData.fallbacks.
+  module FallbackSetup
+    # Appends every fallback to $SITEINFO under the pattern /./.
+    def setup!
+      each {|fallback| $SITEINFO << [/./, fallback]}
+    end
+  end
+  # Make the DSL pretty!
+  # Each Struct reader is replaced by a method that reads the member when
+  # called without an argument and assigns it when given a truthy argument,
+  # so a config block can say `nextLink '//a'` instead of `self.nextLink = ...`.
+  members.each do |m|
+    undef_method m
+    module_eval <<-EOC         # hack for ruby-mode.el
+      #{'def'} #{m}(v=nil)
+        if v
+          self[:#{m}] = v
+        else
+          self[:#{m}]
+        end
+      end
+    EOC
+  end
+  # Returns the location of the page that follows `uri`.
+  #
+  # The entry's block, if any, is run first with self set to this entry, so it
+  # may set nextLink/pageElement. If nextLink is set afterwards, it is applied
+  # as an XPath to the document cached for `uri` and the first node's href,
+  # action or value is merged into `uri`; nil is returned when nothing is
+  # found. Without a nextLink, the block's result is merged into `uri`.
+  def next_url(uri)
+    uri = URI(uri.to_s)
+    result = instance_exec(uri, match, &block) if block
+    xpath = nextLink
+    if xpath
+      nokogiri = $nokogiri_cache[uri.to_s]
+      $logger.info "#{__method__}: use xpath #{xpath}"
+      nodes = nokogiri.xpath(xpath)
+      node = nodes.first
+      $logger.debug "#{__method__}: nodes.length = #{nodes.length}"
+      # node is nil when the XPath matched nothing; the rescue turns that
+      # (and any other error raised here) into nil.
+      nexturl = (node["href"] || node["action"] || node["value"]) rescue nil
+      #  nexturl = nokogiri.xpath("#{xpath}/@href").first.content rescue nil
+      $logger.info "#{__method__}: nexturl = #{nexturl or 'NOT FOUND'}"
+      if nexturl
+        nexturl.gsub!(/ /, '+') # for some buggy sites not encoding spaces
+        uri.merge nexturl
+      end
+    else
+      $logger.info "#{__method__}: result = #{result}"
+      uri.merge result
+    end
+  end
+end
+def defnext(url_or_pattern, nexturl=nil, &b)
+  defnext_ url_or_pattern, nexturl do |u,m|
+    $logger.info "Use defnext for #{url_or_pattern}"
+    instance_exec(u, m, &b)
+  end
+end
+def defnext_(url_or_pattern, nexturl=nil, &block)
+  sd = SiteData.new
+  if nexturl
+    sd.block = lambda{|u,m| nexturl }
+  else
+    sd.block = block
+  end
+  $SITEINFO << [ url_or_pattern, sd ]
+end
+def addstring(url_or_pattern, string)
+  defnext_(url_or_pattern) {|u,m|
+    $logger.info "Use addstring for #{url_or_pattern}"
+    u.to_s + string
+  }
+end
+def increment(url_or_pattern, n=1)
+  defnext_(url_or_pattern) {|u,m|
+    $logger.info "Use increment for #{url_or_pattern}"
+    url=u.to_s
+    nextvar = m[1].to_i + n
+    url[ m.begin(1) ... m.end(1) ] = if m[1] =~ /^0/
+                                       format("%0#{m[1].length}d", nextvar)
+                                     else
+                                       nextvar.to_s
+                                     end
+    url
+  }
+end
+def w3mctl(*strings)
+  strings.each do |str|
+    if str
+      if str==true
+        $client.print "\r\n\r\n"
+      else
+        $client.print "W3m-Control: #{str}\r\n"
+      end
+    end
+  end
+  nil
+end
+###########################################################################
+# File.zread                                                              #
+###########################################################################
+require 'zlib'
+Zlib::GZIP_MAGIC = "\037\213"
+Zlib::GZIP_MAGIC.force_encoding("ASCII-8BIT") if RUBY_VERSION >= "1.9"
+def File.zread(file)
+  Object.module_eval do
+    open(file) do |f|
+      magic = f.read(2)
+      f.rewind
+      if magic == Zlib::GZIP_MAGIC
+        Zlib::GzipReader.wrap(f) {|gz| gz.read }
+      else
+        f.read
+      end
+    end
+  end
+end
+###########################################################################
+# content cache                                                           #
+###########################################################################
+# Scratch file used by get_header_and_content to hold a fetched body.
+TMPFILE = Dir.tmpdir + "/w3m-autopagerize.tmp.html"
+# url => [source, charset]. On a miss the page is fetched with
+# get_header_and_content; the charset comes from a "charset=" in the header
+# or, failing that, from Kconv.guess, and is passed through normalize_charset.
+$content_cache = Hash.new do |h,url|
+  $logger.debug "cache miss: set $content_cache[#{url.inspect}]"
+  # use w3m to pass cookie
+  header, source = get_header_and_content(url)
+  $logger.debug "cache miss: source is html? = #{source =~ /<body/i and true}"
+  charset = normalize_charset(header[/charset=(.+)$/,1] || Kconv.guess(source))
+  source.force_encoding("ASCII-8BIT")
+  h[url] = [source, charset]
+end
+# BUG: libxml2 cannot handle id() function without doctype.
+# http://labs.gmo.jp/blog/ku/2008/07/libxmlhtmlxpathid.html
+DOCTYPE = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
+# url => parsed Nokogiri document. On a miss the source and charset are taken
+# from $content_cache and parsed with DOCTYPE prepended (see the BUG note).
+$nokogiri_cache = Hash.new do |h,url|
+  $logger.debug "cache miss: set $nokogiri_cache[#{url.inspect}]"
+  source, charset = $content_cache[url]
+  h[url] = Nokogiri::HTML(DOCTYPE+source, nil, charset)
+end
+###########################################################################
+# Utilities                                                               #
+###########################################################################
+def get_header_and_content(url)
+  output = `w3m #$W3M_EXTRA_OPTIONS -dump_both -o accept_encoding='gzip' "#{url}"`
+  header, source = output.force_encoding("ASCII-8BIT").split(/\n\n/, 2)
+  open(TMPFILE,"wb"){|f| f.write source}
+  source = File.zread(TMPFILE).force_encoding "ASCII-8BIT"
+  [header, source]
+ensure
+  File.unlink TMPFILE if File.exist? TMPFILE
+end
+def get_content(url)
+  if url =~ /^https?:/
+    get_header_and_content(url)[1]
+  else                          # local file
+    File.zread(File.expand_path(url)).force_encoding "ASCII-8BIT"
+  end
+end
+def normalize_charset(charset)
+  charset = charset.to_s
+  # FIXME I do not know other charsets than Japanese.
+  charset.downcase == "shift_jis" ? "cp932" : charset
+end
+# Discards every registered rule by resetting $SITEINFO to an empty list.
+def reinit
+  $SITEINFO = []
+end
+# unless "".respond_to? :force_encoding # for ruby 1.8
+#   class String
+#     def force_encoding(args); self end
+#     def encoding; Kconv.guess(self) end
+#   end
+# end
+###########################################################################
+# Entry Points                                                            #
+###########################################################################
+# dRuby front object used by next.cgi: finds the SITEINFO entry for a URL,
+# resolves the next page, crops it to the configured pageElement and returns
+# the result as a Hash.
+class Server
+  # Finds the first $SITEINFO entry whose key matches `url` and stores the
+  # match result on it (MatchData for Regexp keys, true for String keys).
+  #
+  # Returns the matching SiteData.
+  # Raises RuntimeError when no entry matches.
+  def sitedata(url)
+    url = url.to_s
+    match = nil
+    entry = $SITEINFO.find{|re, _sd|
+      match = case re
+              when Regexp
+                url.match(re)
+              else
+                url == re.to_s
+              end
+    }
+    raise "no SITEINFO entry matches #{url}" unless entry
+    sitedata = entry[1]
+    sitedata.match = match
+    sitedata
+  end
+  private :sitedata
+  HTML_OUTPUT_FILE = "/tmp/w3m-autopagerize-tmp.html"
+  # Builds the reply for `location`: the nodes selected by
+  # sitedata.pageElement wrapped in a small HTML page whose <base> and <link>
+  # elements record `location` and `prev_url`. When cropping fails or yields
+  # nothing, the whole <body> is used and an error note is prepended.
+  #
+  # Returns a Hash with :html, :location, :pageElement and :nextLink.
+  def crop_html(location, prev_url, sitedata)
+    $logger.debug "#{__method__}: url = #{location}"
+    location = location.to_s
+    nokogiri = $nokogiri_cache[location]
+    title = nokogiri.at("//title").to_html rescue "<title></title>"
+    begin
+      $logger.info "#{__method__}: use xpath #{sitedata.pageElement}"
+      nodes = nokogiri.xpath(sitedata.pageElement)
+      $logger.debug "#{__method__}: nodes.length = #{nodes.length}"
+      html_piece = nodes.to_html
+      raise if html_piece.strip.empty?
+    rescue
+      $logger.error "#{__method__}: failed to crop!"
+      html_piece = nokogiri.at("body").to_html
+      errmsg = %{<p>w3m-autopagerize failed to crop html but next url is found.<br />
+                 xpath = #{sitedata.pageElement || 'pageElement not found'}
+                 </p>
+                 <hr>}
+    else
+        errmsg = ""
+    end
+    # BUG: Nokogiri emits superfluous &#13;.
+    html_piece.gsub!(/&#13;/, '')        # hack
+    # BUG: w3m cannot handle <script />, so replace it with <script></script>
+    html_piece.gsub!(%r!(<script.+?)/>!, '\1></script>') # hack
+    html = %{<html>
+             <head><base href="#{location}" />#{title}
+             <link rel="w3m-autopagerize-orig" href="#{location}" />
+             <link rel="w3m-autopagerize-prev" href="#{prev_url}" />
+             </head>
+             <body>
+             Original URL: <a href="#{location}">#{location}</a><br>
+             #{errmsg}
+             #{html_piece}
+             </body></html> }
+    { :html => html, :location => location,
+      :pageElement => sitedata.pageElement, :nextLink => sitedata.nextLink}
+  end
+  private :crop_html
+  # Starts a background thread that resolves the page after `location` and
+  # loads it into the caches, then drops the oldest entry of each cache that
+  # holds more than three pages.
+  def prefetch_next_location(location, sitedata)
+    Thread.start do
+      #        sleep 1
+      $logger.debug "#{__method__}: #{location}"
+      # sitedata = sitedata location
+      newloc = sitedata.next_url(location)
+      $logger.debug "#{__method__}: new location: #{newloc}"
+      $nokogiri_cache[newloc.to_s]
+    end
+    # It uses Ordered Hash in Ruby 1.9
+    [$nokogiri_cache, $content_cache].each do |hash|
+      hash.delete hash.first[0] if hash.length > 3
+    end
+  end
+  private :prefetch_next_location
+  # Common entry work for crop_this_page and nextpage: logs the request,
+  # remembers `client` in $client and reads `srcfile` when one is given.
+  # For a request coming from a Local CGI page the real URL is taken from the
+  # page's <base href>; otherwise the source is stored in $content_cache.
+  #
+  # Returns [src, url].
+  def prepare(url, srcfile, charset, client, method)
+    $logger.info "=================================================="
+    $logger.info "#{method}: entered url=#{url} charset=#{charset}"
+    $logger.debug "#{method}: W3M_SOURCEFILE = #{srcfile}" if srcfile
+    $client = client
+    src = File.zread(srcfile).force_encoding("ASCII-8BIT") if srcfile
+    if url =~ /^file:.*\/cgi-bin\// # from Local CGI
+      url = src.force_encoding("ASCII-8BIT")[%r!<base href=['"](.+?)['"]!, 1] # '"
+      $logger.info "#{method}: base url=#{url}"
+    else                        # from W3M_SOURCEFILE
+      $logger.debug "#{method}: set $content_cache[#{url.inspect}] from W3M_SOURCEFILE"
+      # src is nil when no source file was given; to_s keeps the match safe.
+      $logger.debug "#{method}: source is html? = #{src.to_s =~ /<body/i and true}"
+      $content_cache[url] = [src.force_encoding("ASCII-8BIT"), normalize_charset(charset)] if src
+    end
+    [ src, url ]
+  end
+  # Returns the current page cropped to its pageElement (see crop_html) and
+  # starts prefetching the page that follows it.
+  def crop_this_page(url, srcfile, charset, client)
+    src, url = prepare(url, srcfile, charset, client, __method__)
+    sitedata = sitedata url
+    begin
+      crop_html url, nil, sitedata
+    ensure
+      prefetch_next_location sitedata.next_url(url), sitedata
+    end
+  end
+  # Returns the cropped page that follows `url` (see crop_html).
+  #
+  # The matching SITEINFO entry is tried first; when it yields no location the
+  # heuristic fallbacks are tried in order. Any error, including "no location
+  # found", is logged and reported as a Hash that holds only an :html error
+  # page.
+  def nextpage(url, srcfile, charset, client)
+    src, url = prepare(url, srcfile, charset, client, __method__)
+    sitedata = sitedata url
+    location = sitedata.next_url(url)
+    if location
+      if sitedata.pageElement
+        $logger.debug "#{__method__}: location and pageElement found."
+      else
+        $logger.debug "#{__method__}: location found."
+      end
+      begin
+        crop_html location, url, sitedata
+      ensure
+        prefetch_next_location location, sitedata
+      end
+    else
+      # First fallback that yields a next URL; both stay nil when none does.
+      fallback_nexturl = nil
+      fallback = SiteData.fallbacks.find {|candidate|
+        fallback_nexturl = candidate.next_url(url)
+      }
+      if fallback
+        $logger.info "#{__method__}: fallback"
+        begin
+          crop_html fallback_nexturl, url, fallback
+        ensure
+          prefetch_next_location fallback_nexturl, fallback
+        end
+      else
+        $logger.debug "#{__method__}: no location."
+        raise "no location!"
+      end
+    end
+  rescue => e
+    # sitedata is still nil when the failure happened before it was found.
+    xpath = (sitedata && sitedata.nextLink) || 'nextLink not found'
+    # Guessing the encoding may need the page source, which can fail again;
+    # that must not hide the error being reported.
+    src_encoding = begin
+                     Kconv.guess(src || $content_cache[url].first)
+                   rescue StandardError
+                     'unknown'
+                   end
+    html = %{<pre>Error!
+xpath = #{xpath}
+#{e}
+#{e.backtrace.pretty_inspect}
+src_encoding=#{src_encoding}
+</pre>
+    }
+    $logger.error "#{__method__}: error!"
+    $logger.error "#{__method__}: #{e}"
+    $logger.error "#{__method__}: #{e.backtrace.pretty_inspect}"
+    {:html => html}
+  end
+  # (executable-interpret "ruby19 -r w3m-autopagerize-server -e '$logger=Logger.new(); load_siteinfo'")
+  # (executable-interpret "ruby18 -r w3m-autopagerize-server -e '$logger=Logger.new(); load_siteinfo'")
+  # Reads every source in $SITEINFO_IMPORT_URLS as JSON and appends one
+  # [Regexp, SiteData] pair to $SITEINFO per item. Items without a "url", with
+  # a "url" listed in $EXCLUDE_URLS, or whose "url" is not a valid regular
+  # expression are skipped (the last case is logged as a warning).
+  def load_siteinfo
+    $SITEINFO_IMPORT_URLS.each do |siteinfo_url|
+      JSON.parse(get_content(siteinfo_url).toutf8).each do |hash|
+        data = hash["data"]
+        url = data["url"]
+        next unless url
+        next if $EXCLUDE_URLS.include? url
+        begin
+          pattern = Regexp.new(url)
+        rescue RegexpError => e
+          $logger.warn "#{__method__}: skipped invalid url pattern #{url.inspect} (#{e.message})"
+          next
+        end
+        sd = SiteData.new data["nextLink"], data["insertBefore"],
+               data["exampleUrl"], data["pageElement"]
+        $SITEINFO << [pattern, sd]
+      end
+    end
+    $logger.info "#{__method__}: loaded"
+  end
+  # Replaces the running process with a fresh run of this script.
+  def restart
+    exec $0
+  end
+  # Loads the Ruby config file `config_file` (resolved relative to this
+  # script's directory), or does nothing but report when it is :ignore or
+  # missing. Errors raised by the config file, syntax and load errors
+  # included, are reported on stderr instead of stopping the server;
+  # SystemExit and signals pass through.
+  def load_config_file(config_file)
+    if config_file == :ignore
+      $stderr.puts "load_config_file: config file is ignored!"
+    else
+      config_file = File.expand_path(config_file, File.dirname(__FILE__))
+      if File.file? config_file
+        load(config_file)
+        $stderr.puts "load_config_file: loaded #{config_file}"
+      else
+        $stderr.puts "load_config_file: config file #{config_file} not found!"
+      end
+    end
+  rescue StandardError, ScriptError => e
+    $stderr.puts "load_config_file: error loading #{config_file}!"
+    $stderr.puts "load_config_file: #{e.class}: #{e.message}"
+  end
+end
+# Command-line entry point: parse options, load the config file, set up the
+# logger, import SITEINFO, register the fallbacks and serve the Server object
+# over dRuby at druby://:9322.
+if __FILE__==$0
+  require 'optparse'
+  require 'drb'
+  conf = Struct.new(:log_file, :siteinfo_url, :config_file).new
+  conf.config_file = File.expand_path "~/.w3m-autopagerize.rb"
+  ARGV.options {|o|
+    o.on("-l", "--log LOGFILE",
+      "Use log file.") {|x| conf.log_file = File.expand_path(x, File.dirname(__FILE__)) }
+    o.on("-s", "--siteinfo URL",
+      "URL of JSON data (SITEINFO).") {|x| conf.siteinfo_url = x}
+    o.on("-c", "--config CONFIG", "Use config file.") {|x| conf.config_file = x }
+    o.on("-f", "Ignore config file.") {|x| conf.config_file = :ignore }
+    # -r asks an already running server to restart itself, ignoring any error
+    # from that call, and then exits without starting a new server here.
+    o.on("-r", "--restart", "--reload",
+      "Restart the server.") {|x|
+      DRbObject.new_with_uri(%q!druby://:9322!).restart rescue nil
+      puts "w3m-autopagerize-server restarted."
+      exit
+    }
+    o.parse!
+  }
+  srv = Server.new
+  # The config file is loaded first so that its rules precede the imported
+  # SITEINFO entries in $SITEINFO.
+  srv.load_config_file(conf.config_file)
+  # Command-line values override what the config file set.
+  $LOG_FILE = conf.log_file || $LOG_FILE
+  $stderr.puts "startup: log file = #{$LOG_FILE.inspect}"
+  $logger = Logger.new($LOG_FILE)
+  $SITEINFO_IMPORT_URLS = [ conf.siteinfo_url ] if conf.siteinfo_url
+  $logger.info "$SITEINFO_IMPORT_URLS = #{$SITEINFO_IMPORT_URLS.inspect}"
+  srv.load_siteinfo
+  GC.start
+  # Fallback heuristics go last, under /./, so they match any remaining URL.
+  SiteData.fallbacks.setup!
+  $stderr.puts "start w3m-autopagerize-server.rb (#{Time.now-start_time} secs)"
+  # Run the garbage collector every 300 seconds in the background.
+  Thread.start do
+    loop { sleep 300; GC.start }
+  end
+  DRb.start_service("druby://:9322", srv)
+  DRb.thread.join
+end

data/config.sample.rb ADDED

@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# This file shows default setting. If you customize w3m-autopagerize,
+# copy this file to ~/.w3m-autopagerize.rb.
+# Extra options of w3m to fetch web page.
+$W3M_EXTRA_OPTIONS = ""
+# SITEINFO location. Set URL or filename of SITEINFO JSON data.
+$SITEINFO_IMPORT_URLS = %w[
+  http://wedata.net/databases/AutoPagerize/items.json
+]
+# Disable SITEINFO entries. The default is to ignore `"url": "^https?:\/\/."' entry.
+$EXCLUDE_URLS = %w[
+  ^https?:\/\/.
+]
+# Log file location
+# =================
+#
+# The default destination of the log is stderr.
+$LOG_FILE = $stderr
+# If you use a log file, uncomment this. Note that the default
+# directory of log file is the directory of w3m-autopagerize-server.rb.
+#  $LOG_FILE = "w3m-autopagerize.log"
+# Fallback patterns
+# =================
+#
+# If w3m-autopagerize cannot find the next location (i.e. the SITEINFO
+# entry is wrong or missing), it falls back to a heuristic search driven
+# by the $FALLBACK_* variables. It is like Opera's FastForward.
+#
+# Links/buttons whose text is "next" or "keep reading" (full match) are
+# considered as next location.
+$FALLBACK_WORDS = %w[次 つぎ 続きます keep\ reading [→] 次一覧 Older\ Entries next Next NEXT]
+# Links/buttons whose text starts with ">" (prefix match) are
+# considered as next location.
+$FALLBACK_START_WORDS = %w[> ＞]
+# Links/buttons whose text contains one of these strings (partial match),
+# e.g. "次へ", are considered as next location.
+$FALLBACK_PATTERNS = %w[次へ 次頁 次ページ 次項 次の 次を 先へ つぎへ つぎの 進む もっと見る  ]
+# Custom Location
+# ===============
+#
+# You can define the `next' page of a site by a URL rule, using the
+# `addstring' and `increment' functions. This is a handy way to specify
+# the next location.
+# It requires NO XPATH KNOWLEDGE, but some Regexp knowledge:-)
+#
+# Custom locations take precedence over SITEINFO: even if SITEINFO
+# defines the configuration of a site, the custom location is used.
+#
+# For example, The next page of "http://www.dotup.org/" is
+# "http://www.dotup.org/2.html". Use simply `addstring' function.
+#
+# The next page of "http://www.dotup.org/2.html" is
+# "http://www.dotup.org/3.html". Use `increment' function with Regexp.
+# The digits captured by the first group "(\d+)" are replaced with the next number.
+# Note that writing a URL Regexp by %r!URL Regexp! is handy.
+addstring "http://www.dotup.org/", "2.html"
+increment %r!http://www.dotup.org/(\d+).html$!
+#
+# `increment' can add any integer. For example,
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20"
+# to
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=40".
+increment %r!^http://images.google.(?:co.jp|com)/.*start=(\d+)!, 20
+#
+# `addstring' function can accept Regexp. For example,
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja"
+# to
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20".
+#
+# Note that `increment' of google image search must be defined BEFORE
+# `addstring'. If `addstring' is before `increment', w3m-autopagerize
+# considers the next page of
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20"
+# as
+# "http://images.google.co.jp/images?q=ruby&ie=Shift_JIS&hl=ja&start=20&start=20".
+# It is because the URL matches both
+# %r!^http://images.google.(?:co.jp|com)/! and
+# %r!^http://images.google.(?:co.jp|com)/.*start=(\d+)!.
+addstring %r!^http://images.google.(?:co.jp|com)/!, '&start=20'
+# Custom Action
+# =============
+#
+# You can execute any w3m commands for a certain URL. For example, to
+# log in to hatena and then open my (rubikitch's) hatena diary, I execute
+# "GOTO http://d.hatena.ne.jp/rubikitch/" once and "DELETE_PREVBUF"
+# three times. Use `defnext' and `w3mctl'.
+#
+# This is a good example of login and goto action. Note that when you
+# use login and goto, you must set your login/password to
+# ~/.w3m/pre_form file.
+defnext "https://www.hatena.ne.jp/login" do
+  w3mctl "GOTO http://d.hatena.ne.jp/rubikitch/", "DELETE_PREVBUF", "DELETE_PREVBUF", "DELETE_PREVBUF"
+end
+# Custom SITEINFO
+# ===============
+#
+# If you have your original SITEINFO for AutoPagerize, you can simply
+# add the URL or filename into the top of $SITEINFO_IMPORT_URLS.
+#
+# The SITEINFO can be defined in Ruby DSL.
+#
+# In JSON:
+#
+#   {
+#     "name": "(.~) what a quiet stiff (~.)",
+#     "data": {
+#       "insertBefore": "",
+#       "pageElement": "id(\"pixflow\")",
+#       "url": "^http:\/\/whytheluckystiff\\.net\/quiet\/",
+#       "nextLink": "id(\"header\")\/a[last()]",
+#       "exampleUrl": "http:\/\/whytheluckystiff.net\/quiet\/"
+#     }
+#   }
+#
+# In Ruby:
+#
+#   defnext %r!^http://whytheluckystiff\.net/quiet/! do
+#     insertBefore ''
+#     pageElement  'id("pixflow")'
+#     nextLink     'id("header")/a[last()]'
+#     exampleUrl   'http://whytheluckystiff.net/quiet/'
+#   end

data/readme.html ADDED

@@ -0,0 +1,252 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+               "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml"
+lang="en" xml:lang="en">
+<head>
+<title>AutoPagerize for w3m</title>
+<meta http-equiv="Content-Type" content="text/html;charset=euc-jp"/>
+<meta name="generator" content="Org-mode"/>
+<meta name="generated" content="2009-01-30"/>
+<meta name="author" content="rubikitch"/>
+<style type="text/css">
+ <!--/*--><![CDATA[/*><!--*/
+  html { font-family: Times, serif; font-size: 12pt; }
+  .title  { text-align: center; }
+  .todo   { color: red; }
+  .done   { color: green; }
+  .tag    { background-color:lightblue; font-weight:normal }
+  .target { }
+  .timestamp { color: grey }
+  .timestamp-kwd { color: CadetBlue }
+  p.verse { margin-left: 3% }
+  pre {
+	border: 1pt solid #AEBDCC;
+	background-color: #F3F5F7;
+	padding: 5pt;
+	font-family: courier, monospace;
+        font-size: 90%;
+        overflow:auto;
+  }
+  table { border-collapse: collapse; }
+  td, th { vertical-align: top; }
+  dt { font-weight: bold; }
+  div.figure { padding: 0.5em; }
+  div.figure p { text-align: center; }
+  .linenr { font-size:smaller }
+  .code-highlighted {background-color:#ffff00;}
+  .org-info-js_info-navigation { border-style:none; }
+  #org-info-js_console-label { font-size:10px; font-weight:bold;
+                               white-space:nowrap; }
+  .org-info-js_search-highlight {background-color:#ffff00; color:#000000;
+                                 font-weight:bold; }
+  /*]]>*/-->
+</style>
+<script type="text/javascript">
+<!--/*--><![CDATA[/*><!--*/
+ function CodeHighlightOn(elem, id)
+ {
+   var target = document.getElementById(id);
+   if(null != target) {
+     elem.cacheClassElem = elem.className;
+     elem.cacheClassTarget = target.className;
+     target.className = "code-highlighted";
+     elem.className   = "code-highlighted";
+   }
+ }
+ function CodeHighlightOff(elem, id)
+ {
+   var target = document.getElementById(id);
+   if(elem.cacheClassElem)
+     elem.className = elem.cacheClassElem;
+   if(elem.cacheClassTarget)
+     target.className = elem.cacheClassTarget;
+ }
+/*]]>*/-->
+</script>
+</head><body>
+<h1 class="title">AutoPagerize for w3m</h1>
+<p>AutoPagerize for w3m <a href="http://rubikitchrb.rubyforge.org/">http://rubikitchrb.rubyforge.org/</a>
+</p>
+<p>
+Copyright (c) 2009 rubikitch &lt;rubikitch@ruby-lang.org&gt; <a href="http://www.rubyist.net/~rubikitch/">http://www.rubyist.net/~rubikitch/</a>
+</p>
+<p>
+Use and distribution subject to the terms of the Ruby license.
+</p>
+<div id="table-of-contents">
+<h2>Table of Contents</h2>
+<div id="text-table-of-contents">
+<ul>
+<li><a href="#sec-1">1 Overview </a></li>
+<li><a href="#sec-2">2 Programs </a>
+<ul>
+<li><a href="#sec-2.1">2.1 w3m-autopagerize-server.rb </a></li>
+<li><a href="#sec-2.2">2.2 next.cgi </a></li>
+<li><a href="#sec-2.3">2.3 config.sample.rb </a></li>
+</ul>
+</li>
+<li><a href="#sec-3">3 Installation </a>
+<ul>
+<li><a href="#sec-3.1">3.1 Install AutoPagerize for w3m </a></li>
+<li><a href="#sec-3.2">3.2 Copy config file </a></li>
+<li><a href="#sec-3.3">3.3 Local CGI setup </a></li>
+<li><a href="#sec-3.4">3.4 Key bind </a></li>
+</ul>
+</li>
+<li><a href="#sec-4">4 Usage </a></li>
+<li><a href="#sec-5">5 License </a></li>
+</ul>
+</div>
+</div>
+<div id="outline-container-1" class="outline-2">
+<h2 id="sec-1">1 Overview </h2>
+<div id="text-1">
+<p>AutoPagerize for w3m finds the next link and extracts the page contents. It
+consists of a dRuby server program (w3m-autopagerize-server.rb) and a w3m
+Local CGI program (next.cgi).
+</p>
+</div>
+</div>
+<div id="outline-container-2" class="outline-2">
+<h2 id="sec-2">2 Programs </h2>
+<div id="text-2">
+</div>
+<div id="outline-container-2.1" class="outline-3">
+<h3 id="sec-2.1">2.1 w3m-autopagerize-server.rb </h3>
+<div id="text-2.1">
+<p>AutoPagerize for w3m uses dRuby server w3m-autopagerize-server.rb
+because initializing site data is time-consuming. Before using
+AutoPagerize for w3m, you have to invoke w3m-autopagerize-server.rb!
+w3m-autopagerize-server.rb loads config file (~/.w3m-autopagerize.rb)
+if any and reads AutoPagerize SITEINFO data from wedata.net by
+default.
+</p>
+</div>
+</div>
+<div id="outline-container-2.2" class="outline-3">
+<h3 id="sec-2.2">2.2 next.cgi </h3>
+<div id="text-2.2">
+<p>next.cgi is the Local CGI program that asks w3m-autopagerize-server.rb for the next page.
+</p>
+</div>
+</div>
+<div id="outline-container-2.3" class="outline-3">
+<h3 id="sec-2.3">2.3 config.sample.rb </h3>
+<div id="text-2.3">
+<p>The sample config file to customize. See <a href="config.sample.rb">file:config.sample.rb</a> for detail.
+</p>
+</div>
+</div>
+</div>
+<div id="outline-container-3" class="outline-2">
+<h2 id="sec-3">3 Installation </h2>
+<div id="text-3">
+</div>
+<div id="outline-container-3.1" class="outline-3">
+<h3 id="sec-3.1">3.1 Install AutoPagerize for w3m </h3>
+<div id="text-3.1">
+<p>AutoPagerize for w3m works with Ruby 1.9 only! So, you have to install
+Ruby 1.9.x. Then simply issue:
+</p>
+<pre class="example">
+ $ sudo ruby1.9 -S gem install w3m-autopagerize
+</pre>
+</div>
+</div>
+<div id="outline-container-3.2" class="outline-3">
+<h3 id="sec-3.2">3.2 Copy config file </h3>
+<div id="text-3.2">
+<p>If you customize AutoPagerize for w3m, copy config.sample.rb to
+~/.w3m-autopagerize.rb and edit it.
+</p>
+</div>
+</div>
+<div id="outline-container-3.3" class="outline-3">
+<h3 id="sec-3.3">3.3 Local CGI setup </h3>
+<div id="text-3.3">
+<p>Local CGI program next.cgi is installed at
+/usr/local/bin/next.cgi. You have to make w3m find it. Add
+/usr/local/bin to your Local CGI path (cgi<sub>bin</sub>), or make symlink.
+</p>
+<pre class="example">
+ $ cd ~/w3m/cgi-bin; ln -s /usr/local/bin/next.cgi
+</pre>
+</div>
+</div>
+<div id="outline-container-3.4" class="outline-3">
+<h3 id="sec-3.4">3.4 Key bind </h3>
+<div id="text-3.4">
+<p>Bind AutoPagerize for w3m to your favorite key. Edit ~/.w3m/keymap and add this line.
+</p>
+<pre class="example">
+ keymap x GOTO file:/cgi-bin/next.cgi
+</pre>
+</div>
+</div>
+</div>
+<div id="outline-container-4" class="outline-2">
+<h2 id="sec-4">4 Usage </h2>
+<div id="text-4">
+<ul>
+<li>
+Press `x' key to go to next page.
+</li>
+<li>
+Press `=' key to see information, eg. XPath to get next page.
+</li>
+</ul>
+</div>
+</div>
+<div id="outline-container-5" class="outline-2">
+<h2 id="sec-5">5 License </h2>
+<div id="text-5">
+<p>AutoPagerize for w3m is licensed under the same terms as Ruby.
+</p></div>
+</div>
+<div id="postamble"><p class="author"> Author: rubikitch
+<a href="mailto:rubikitch@ruby-lang.org">&lt;rubikitch@ruby-lang.org&gt;</a>
+</p>
+<p class="date"> Date: 2009-01-30</p>
+<p>HTML generated by org-mode 6.18 in emacs 22</p>
+</div></body>
+</html>

data/readme.org ADDED

@@ -0,0 +1,65 @@
+#+TITLE:     AutoPagerize for w3m
+#+AUTHOR:    rubikitch
+#+EMAIL:     rubikitch@ruby-lang.org
+#+DATE:      2009-01-30
+#+LANGUAGE:  en
+#+OPTIONS:   H:3 num:t toc:t \n:nil @:t ::t |:t ^:t -:t f:t *:t TeX:t LaTeX:nil skip:nil d:nil todo:t pri:nil tags:not-in-toc
+#+INFOJS_OPT: view:nil toc:nil ltoc:t mouse:underline buttons:0 path:http://orgmode.org/org-info.js
+#+EXPORT_SELECT_TAGS: export
+#+EXPORT_EXCLUDE_TAGS: noexport
+#+LINK_UP:
+#+LINK_HOME:
+AutoPagerize for w3m http://rubikitchrb.rubyforge.org/
+Copyright (c) 2009 rubikitch <rubikitch@ruby-lang.org> http://www.rubyist.net/~rubikitch/
+Use and distribution subject to the terms of the Ruby license.
+* Overview
+AutoPagerize for w3m finds the next link and extracts the page contents. It
+consists of a dRuby server program (w3m-autopagerize-server.rb) and a w3m
+Local CGI program (next.cgi).
+* Programs
+** w3m-autopagerize-server.rb
+AutoPagerize for w3m uses dRuby server w3m-autopagerize-server.rb
+because initializing site data is time-consuming. Before using
+AutoPagerize for w3m, you have to invoke w3m-autopagerize-server.rb!
+w3m-autopagerize-server.rb loads config file (~/.w3m-autopagerize.rb)
+if any and reads AutoPagerize SITEINFO data from wedata.net by
+default.
+** next.cgi
+next.cgi is the Local CGI program that asks w3m-autopagerize-server.rb for the next page.
+** config.sample.rb
+The sample config file to customize. See file:config.sample.rb for detail.
+* Installation
+** Install AutoPagerize for w3m
+AutoPagerize for w3m works with Ruby 1.9 only! So, you have to install
+Ruby 1.9.x. Then simply issue:
+: $ sudo ruby1.9 -S gem install w3m-autopagerize
+** Copy config file
+If you customize AutoPagerize for w3m, copy config.sample.rb to
+~/.w3m-autopagerize.rb and edit it.
+** Local CGI setup
+Local CGI program next.cgi is installed at
+/usr/local/bin/next.cgi. You have to make w3m find it. Add
+/usr/local/bin to your Local CGI path (cgi_bin), or make symlink.
+: $ cd ~/w3m/cgi-bin; ln -s /usr/local/bin/next.cgi
+** Key bind
+Bind AutoPagerize for w3m to your favorite key. Edit ~/.w3m/keymap and add this line.
+: keymap x GOTO file:/cgi-bin/next.cgi
+* Usage
+- Press `x' key to go to next page.
+- Press `=' key to see information, eg. XPath to get next page.
+* License
+AutoPagerize for w3m is licensed under the same terms as Ruby.

data/test/test-w3m-autopagerize.rb ADDED

@@ -0,0 +1,166 @@
+#!/usr/local/bin/ruby19
+# -*- coding: utf-8 -*-
+# (executable-interpret "ruby19 /m/home/rubikitch/w3m/cgi-bin/w3m-autopagerize/test-w3m-autopagerize.rb --no-use-color ")
+# Test preamble: remove any previous test.log, load the test framework and
+# the server under test, then set the globals used by the tests below.
+require 'fileutils'
+FileUtils.rm_f "test.log"
+require 'test/unit'
+require 'open-uri'
+require 'script'
+require 'w3m-autopagerize-server' # !> method redefined; discarding old debug_with_time
+# NOTE(review): presumably read by the server code to enable test-only
+# behaviour -- confirm against w3m-autopagerize-server.rb.
+$TEST_MODE = true
+# Extra w3m option pointing http_proxy at 127.0.0.1:8339.
+# NOTE(review): presumably a local proxy that must be running so the tests do
+# not depend on the live sites -- confirm.
+$W3M_EXTRA_OPTIONS = "-o http_proxy=http://127.0.0.1:8339/"
+# The tests below write a "Test: <method name>" line to this log.
+$logger = Logger.new "test.log"
+class TestAutoPagerize < Test::Unit::TestCase
+  def test_hatena_success
+    $logger.info "Test: #{__method__}"
+    reinit
+    defnext %r{^https?:\/\/(?:d2?|[^.]+\.g)\.hatena\.ne\.jp\/} do
+      exampleUrl   %{http://os0x.g.hatena.ne.jp/os0x/}
+      pageElement  %{id("days")}
+      nextLink     %{//a[@rel="prev"]}
+    end
+    nexturl = "http://d.hatena.ne.jp/rubikitch/20090110/1231524557"
+    origurl = "http://d.hatena.ne.jp/rubikitch/20090113/1231844047"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+    assert_match(/<base/, np[:html])
+    assert_equal %{id("days")}, np[:pageElement]
+    assert_equal %{//a[@rel="prev"]}, np[:nextLink]
+  end
+  def test_hatena_fail
+    $logger.info "Test: #{__method__}"
+    reinit
+    defnext %r{^https?:\/\/(?:d2?|[^.]+\.g)\.hatena\.ne\.jp\/} do
+      exampleUrl   %{http://os0x.g.hatena.ne.jp/os0x/}
+      pageElement  %{id("noelement")}
+      nextLink     %{//a[@rel="prev"]}
+    end
+    nexturl = "http://d.hatena.ne.jp/rubikitch/20090110/1231524557"
+    origurl = "http://d.hatena.ne.jp/rubikitch/20090113/1231844047"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+    assert_match(/failed to crop html/, np[:html])
+    assert_equal %{id("noelement")}, np[:pageElement]
+    assert_equal %{//a[@rel="prev"]}, np[:nextLink]
+  end
+  def test_google_addstring
+    $logger.info "Test: #{__method__}"
+    reinit
+    addstring %r!^http://www.google.(?:co.jp|com)/search!, '&start=100'
+    nexturl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=100"
+    origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_google_increment
+    reinit
+    increment %r!^http://www.google.(?:co.jp|com)/search.*start=(\d+)!, 100
+    nexturl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=200"
+    origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100&start=100"
+    np = Server.new.nextpage(origurl, nil, nil, Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_google_fallback_link
+    $logger.info "Test: #{__method__}"
+    reinit
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $SITEINFO = [[ /./, SiteData.fallbacks[0] ]]
+    nexturl = "http://www.google.com/search?num=100&hl=ja&pwst=1&q=ruby&start=100&sa=N"
+    origurl = "http://www.google.com/search?q=ruby&hl=ja&num=100"
+    np = Server.new.nextpage(origurl, nil, "UTF-8", Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_futaba_fallback_form
+    $logger.info "Test: #{__method__}"
+    reinit
+    $FALLBACK_WORDS = %w[次のページ]
+    $SITEINFO = [[ /./, SiteData.fallbacks[1] ]]
+    nexturl = "http://may.2chan.net/27/1.htm"
+    origurl = "http://may.2chan.net/27/futaba.htm"
+    np = Server.new.nextpage(origurl, nil, "cp932", Object.new)
+    assert_equal nexturl, np[:location]
+  end
+  def test_futaba_fallback_by_wrong_sitedata
+    $logger.info "Test: #{__method__}"
+    reinit
+    defnext %r{2chan} do
+      pageElement  %{id("noelement")}
+      nextLink     %{//a[@rel="prev"]}
+    end
+    $FALLBACK_WORDS = %w[次のページ]
+    SiteData.fallbacks.setup!
+    nexturl = "http://may.2chan.net/27/1.htm"
+    origurl = "http://may.2chan.net/27/futaba.htm"
+    np = Server.new.nextpage(origurl, nil, "cp932", Object.new)
+    assert_equal nexturl, np[:location]
+  end
+end
+class TestFallBackPredicate < Test::Unit::TestCase
+  def test_1
+    assert_equal '.="tugi" or contains(.,"Next")',
+      SiteData.fallback_predicate1(".", %w[tugi], %w[Next])
+  end
+  def test_2
+    assert_equal '.="tugi"', SiteData.fallback_predicate1(".", %w[tugi], [])
+  end
+  def test_3
+    assert_equal 'contains(.,"Next")', SiteData.fallback_predicate1(".", [], %w[Next])
+  end
+end
+class TestFallBackSiteData < Test::Unit::TestCase
+  def setup
+    SiteData.instance_eval { @fallbacks = nil }
+  end
+  def test_1
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $FALLBACK_START_WORDS = %w[tugi]
+    assert_equal 4, SiteData.fallbacks.length
+  end
+  def test_2
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $FALLBACK_START_WORDS = []
+    assert_equal 2, SiteData.fallbacks.length
+  end
+  def test_3
+    $FALLBACK_PATTERNS = %w[次へ]
+    $FALLBACK_WORDS = %w[次へ]
+    $FALLBACK_START_WORDS = %w[tugi]
+    assert SiteData.fallbacks.respond_to? :setup!
+  end
+end
+# >> Loaded suite -
+# >> Started
+# >> .....
+# >>
+# >> Finished in 1.154570634 seconds.
+# >>
+# >> 5 tests, 14 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications

metadata ADDED

@@ -0,0 +1,58 @@
+--- !ruby/object:Gem::Specification
+name: w3m-autopagerize
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- rubikitch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-01-30 00:00:00 +09:00
+default_executable:
+dependencies: []
+description: AutoPagerize for w3m
+email: rubikitch@ruby-lang.org
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- readme.org
+- readme.html
+- config.sample.rb
+- bin/w3m-autopagerize-server.rb
+- bin/next.cgi
+- test/test-w3m-autopagerize.rb
+has_rdoc: false
+homepage: http://www.rubyist.net/~rubikitch/
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: rubikitchrb
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: AutoPagerize for w3m
+test_files: []