RubyGems - ircbot - Versions diffs - 0.1.5 → 0.2.0 - Mend

ircbot 0.1.5 → 0.2.0

Files changed (43) hide show

data/.gitignore +5 -0
data/Gemfile +3 -0
data/Gemfile.lock +71 -0
data/README +72 -3
data/bin/ircbot +3 -0
data/config/samples/postgres.yml +19 -0
data/config/{sama-zu.yml → samples/sama-zu.yml} +1 -1
data/config/{yml.erb → samples/yml.erb} +0 -0
data/ircbot.gemspec +13 -0
data/lib/ircbot.rb +3 -1
data/lib/ircbot/client.rb +6 -0
data/lib/ircbot/client/config.rb +9 -0
data/lib/ircbot/client/plugins.rb +14 -1
data/lib/ircbot/core_ext/message.rb +4 -1
data/lib/ircbot/plugin.rb +17 -0
data/lib/ircbot/plugins.rb +68 -13
data/lib/ircbot/utils/html_parser.rb +26 -0
data/lib/ircbot/utils/watcher.rb +36 -0
data/lib/ircbot/version.rb +1 -1
data/old/plugins/summary.cpi +267 -0
data/plugins/plugins.rb +1 -1
data/plugins/reminder.rb +79 -175
data/plugins/summary/ch2.rb +272 -0
data/plugins/summary/engines.rb +30 -0
data/plugins/summary/engines/base.rb +105 -0
data/plugins/summary/engines/ch2.rb +14 -0
data/plugins/summary/engines/https.rb +6 -0
data/plugins/summary/engines/none.rb +10 -0
data/plugins/summary/engines/twitter.rb +16 -0
data/plugins/summary/spec/ch2_spec.rb +64 -0
data/plugins/summary/spec/spec_helper.rb +19 -0
data/plugins/summary/spec/summarizers_none_spec.rb +15 -0
data/plugins/summary/spec/summarizers_spec.rb +23 -0
data/plugins/summary/summary.rb +58 -0
data/plugins/watchdog/db.rb +80 -0
data/plugins/watchdog/exceptions.rb +4 -0
data/plugins/watchdog/updater.rb +21 -0
data/plugins/watchdog/watchdog.rb +82 -0
data/spec/plugin_spec.rb +11 -0
data/spec/plugins_spec.rb +35 -1
data/spec/utils/html_parser_spec.rb +30 -0
data/spec/utils/spec_helper.rb +1 -0
metadata +190 -13

@@ -0,0 +1,272 @@
+#!/usr/bin/env ruby
+# vim:encoding=UTF-8:
+# original: net-irc-0.0.9/examples/2ch.rb
+$KCODE = "u" if RUBY_VERSION < "1.9" # json use this
+require 'rubygems'
+require 'uri'
+require 'net/http'
+require 'stringio'
+require 'zlib'
+require 'nkf'
+require 'ircbot'
+module Ch2
+  class Dat
+    class UnknownThread < StandardError; end
+    attr_accessor :uri
+    attr_accessor :last_modified, :size
+    Line = Struct.new(:n, :name, :mail, :misc, :body, :opts, :id) do
+      def to_s
+        [name, body, misc, opts].compact.join(" ")
+      end
+      def <=>(other)
+        body.to_s.size <=> other.body.to_s.size
+      end
+      def aa?
+        body = self.body
+        return false if body.count("\n") < 3
+        significants = body.scan(/[>\n0-9a-z０-９A-Zａ-ｚＡ-Ｚぁ-んァ-ン一-龠]/u).size.to_f
+        body_length  = body.scan(/./u).size
+        is_aa = 1 - significants / body_length
+        is_aa > 0.6
+      end
+    end
+    attr_reader :board, :num, :arg
+    delegate :host, :port, :to => "@uri"
+    def initialize(thread_uri)
+      @uri = URI(thread_uri)
+      _, _, _, @board, @num, = *@uri.path.split('/')
+      @dat = []
+      case @uri.path
+      when %r{^/test/read\.cgi/(.*?)/(\d+)(/(.+))?}
+        @arg = $4
+      end
+      @valid = !! (@board && @num)
+    end
+    def valid?
+      @valid
+    end
+    def length
+      @dat.length
+    end
+    def subject
+      retrieve(true) if @dat.size.zero?
+      self[1].opts || ""
+    end
+    def [](n)
+      l = @dat[n - 1]
+      return nil unless l
+      name, mail, misc, body, opts = * l.split(/<>/)
+      id = misc[/ID:([^\s]+)/, 1]
+      body.gsub!(/<br>/, "\n")
+      body.gsub!(/<[^>]+>/, "")
+      body.gsub!(/^\s+|\s+$/, "")
+      body.gsub!(/&(gt|lt|amp|nbsp);/) {|s|
+        { 'gt' => ">", 'lt' => "<", 'amp' => "&", 'nbsp' => " " }[$1]
+      }
+      Line.new(n, name, mail, misc, body, opts, id)
+    end
+    def dat
+      @num
+    end
+    def retrieve(force=false)
+      @dat = [] if @force
+      res = Net::HTTP.start(@uri.host, @uri.port) do |http|
+        req = Net::HTTP::Get.new('/%s/dat/%d.dat' % [@board, @num])
+        req['User-Agent']        = 'Monazilla/1.00 (2ig.rb/0.0e)'
+        req['Accept-Encoding']   = 'gzip' unless @size
+        unless force
+          req['If-Modified-Since'] = @last_modified if @last_modified
+          req['Range']             = "bytes=%d-" % @size if @size
+        end
+        http.request(req)
+      end
+      ret = nil
+      case res.code.to_i
+      when 200, 206
+        body = res.body
+        if res['Content-Encoding'] == 'gzip'
+          body = StringIO.open(body, 'rb') {|io| Zlib::GzipReader.new(io).read }
+        end
+        @last_modified = res['Last-Modified']
+        if res.code == '206'
+          @size += body.size
+        else
+          @size  = body.size
+        end
+        body = NKF.nkf('-w', body)
+        curr = @dat.size + 1
+        @dat.concat(body.split(/\n/))
+        last = @dat.size
+        (curr..last).map {|n|
+          self[n]
+        }
+      when 416 # たぶん削除が発生
+        p ['416']
+        retrieve(true)
+        []
+      when 304 # Not modified
+        []
+      when 302 # dat 落ち
+        p ['302', res['Location']]
+        raise UnknownThread
+      else
+        p ['Unknown Status:', res.code]
+        []
+      end
+    end
+    def canonicalize_subject(subject)
+      subject.gsub(/[Ａ-Ｚａ-ｚ０-９]/u) {|c|
+        c.unpack("U*").map {|i| i - 65248 }.pack("U*")
+      }
+    end
+    def guess_next_thread
+      res = Net::HTTP.start(@uri.host, @uri.port) do |http|
+        req = Net::HTTP::Get.new('/%s/subject.txt' % @board)
+        req['User-Agent']        = 'Monazilla/1.00 (2ig.rb/0.0e)'
+        http.request(req)
+      end
+      recent_posted_threads = (900..999).inject({}) {|r,i|
+        line = self[i]
+        line.body.scan(%r|ttp://#{@uri.host}/test/read.cgi/[^/]+/\d+/|).each do |uri|
+          r["h#{uri}"] = i
+        end if line
+        r
+      }
+      current_subject    = canonicalize_subject(self.subject)
+      current_thread_rev = current_subject.scan(/\d+/).map {|d| d.to_i }
+      current            = current_subject.scan(/./u)
+      body = NKF.nkf('-w', res.body)
+      threads = body.split(/\n/).map {|l|
+        dat, rest = *l.split(/<>/)
+        dat.sub!(/\.dat$/, "")
+        uri = "http://#{@uri.host}/test/read.cgi/#{@board}/#{dat}/"
+        subject, n = */(.+?) \((\d+)\)/.match(rest).captures
+        canonical_subject = canonicalize_subject(subject)
+        thread_rev     = canonical_subject[/\d+/].to_i
+        distance       = (dat     == self.dat)     ? Float::MAX :
+        (subject == self.subject) ? 0 :
+        levenshtein(canonical_subject.scan(/./u), current)
+        continuous_num = current_thread_rev.find {|rev| rev == thread_rev - 1 }
+        appear_recent  = recent_posted_threads[uri]
+        score = distance
+        score -= 10 if continuous_num
+        score -= 10 if appear_recent
+        score += 10 if dat.to_i < self.dat.to_i
+        {
+          :uri            => uri,
+          :dat            => dat,
+          :subject        => subject,
+          :distance       => distance,
+          :continuous_num => continuous_num,
+          :appear_recent  => appear_recent,
+          :score          => score.to_f
+        }
+      }.sort_by {|o|
+        o[:score]
+      }
+      threads
+    end
+    def levenshtein(a, b)
+      case
+      when a.empty?
+        b.length
+      when b.empty?
+        a.length
+      when a == b
+        0
+      else
+        d = Array.new(a.length + 1) { |s|
+          Array.new(b.length + 1, 0)
+        }
+        (0..a.length).each do |i|
+          d[i][0] = i
+        end
+        (0..b.length).each do |j|
+          d[0][j] = j
+        end
+        (1..a.length).each do |i|
+          (1..b.length).each do |j|
+            cost = (a[i - 1] == b[j - 1]) ? 0 : 1
+            d[i][j] = [
+                       d[i-1][j  ] + 1,
+                       d[i  ][j-1] + 1,
+                       d[i-1][j-1] + cost
+                      ].min
+          end
+        end
+        d[a.length][b.length]
+      end
+    end
+    def dat_url
+      "http://%s/%s/dat/%d.dat" % [host, board, num]
+    end
+    def summarize
+      retrieve
+      lead = "[%s] " % subject
+      case (arg||"").scan(/[\d-]/).join
+      when /^\d+$/                # exact id
+        range = (arg.to_i .. arg.to_i)
+      when /^(\d+)-(\d+)$/
+        range = ($1.to_i .. $2.to_i)
+      when /^(\d+)-$/
+        range = ($1.to_i .. 1000)
+      when /^-(\d+)$/
+        range = (1 .. $2.to_i)
+      else
+        range = (1 .. 1000)
+        sort  = true
+      end
+      lines = range.map{|i| self[i]}.compact
+      bodies = lines.map(&:body)
+      bodies.sort!{|a,b| b.size <=> a.size} if sort
+      return lead + bodies.join(" ").gsub(/\s+/, ' ')
+    end
+  end
+end

data/plugins/summary/engines.rb ADDED

@@ -0,0 +1,30 @@
+require 'dsl_accessor'
+require 'extlib'
+module Engines
+  Mapping = []
+  class NotImplementedError < NotImplementedError; end
+  class Nop < NotImplementedError; end
+  def self.create(url)
+    for pattern, klass in Mapping
+      return klass.new(url) if pattern =~ url
+    end
+    raise NotImplementedError, "[BUG] Not supported URL: %s" % url
+  end
+  # load ruby library and register its url
+  def self.register(name)
+    load File.dirname(__FILE__) + "/engines/#{name}.rb"
+    klass = instance_eval(Extlib::Inflection.camelize(name))
+    Mapping.unshift [klass.url, klass] unless klass == Base
+  end
+  register("base")
+  register("none")
+  register("https")
+  register("ch2")
+  register("twitter")
+end

data/plugins/summary/engines/base.rb ADDED

@@ -0,0 +1,105 @@
+require 'open3'
+require 'cgi'
+module Engines
+  class Base
+    dsl_accessor :url
+    MaxContentLength = 512 * 1024
+    def initialize(url)
+      @url = url
+    end
+    def head(url)
+      # HTTP/1.1 200 OK
+      # Content-Type: text/html; charset=utf-8
+      # Date: Sun, 08 Apr 2012 18:08:45 GMT
+      # Content-Length: 245091
+      # Server: GSE
+      curl_options = ["--head", "--location", "--user-agent", "Mozilla"]
+      Open3.popen3(*["curl", curl_options, url].flatten) {|i,o,e| o.read }
+    end
+    def text?(url)
+      head(url).to_s =~ %r{^Content-Type:.*text/}
+    end
+    def fetch(url)
+      curl_options = [
+                      "--location", "--compressed",
+                      "--user-agent", "Mozilla",
+                      "--max-filesize", "%d" % MaxContentLength,
+                     ]
+      Open3.popen3(*["curl", curl_options, url].flatten) {|i,o,e| o.read }
+    end
+    def trim_tags(html)
+      html.gsub!(%r{<head[^>]*>.*?</head>}mi, '')
+      html.gsub!(%r{<script.*?>.*?</script>}mi, '')
+      html.gsub!(%r{<style.*?>.*?</style>}mi, '')
+      html.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
+      html.gsub!(%r{</?.*?>}, '')
+      html.gsub!(%r{<\!--.*?-->}mi, '')
+      html.gsub!(%r{<\!\w.*?>}mi, '')
+      html.gsub!(%r{\s+}m, ' ')
+      html.strip!
+      html = CGI.unescapeHTML(html)
+      return html
+    end
+    def get_title(html)
+      if %r{<title>(.*?)</title>}mi =~ html
+        title = $1.strip
+        title.gsub!(%r{<.*?>}m, '')
+        title.gsub!(%r{\s+}m, ' ')
+        NKF.nkf("-w -Z3 --numchar-input --no-cp932", title)
+      else
+        ""
+      end
+    end
+    def get_body(html)
+      if /<body.*?>(.*?)<\/body>/im =~ html
+        body = $1
+      else
+        raise Nop, "No Body Found"
+      end
+      body.gsub!(%r{<!--.*?-->}im, '')
+      body.gsub!(%r{<\!\w.*?>}mi, '')
+      #body.gsub!(%r{<head.*?>.*?<\/head>}mi, '')
+      body.gsub!(%r{<head[^>]*>.*?<\/head>}mi, '')
+      body.gsub!(%r{<script.*?>.*?<\/script>}mi, '')
+      body.gsub!(%r{<style.*?>.*?<\/style>}mi, '')
+      body.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
+      body.gsub!(%r{(:?<a.*?>|<\/a>)}mi, '')
+      body.gsub!(%r{(:?<font.*?>|<\/font>)}mi, '')
+      body.gsub!(%r{<img.*?/?>}mi, '')
+      body.gsub!(%r{(:?<b>|<\/b>|<i>|<\/i>|<u>|<\/u>|<p>|<\/p>|<\/li>)}mi,'')
+      body.gsub!(%r{(<(:?br)(:?\s+/)?>)}mi,'')
+      body.gsub!(%r{(:?<\/?h[1-6]>)}mi, ' ')
+      body.gsub!(%r{<li>}mi, ' * ')
+      elements = body.split(/<.*?>/mi)
+      elements.each { |item| item.gsub!(/\s+/, ' ') }
+      elements.each { |item| item.strip! }
+      elements.reject! { |item| item.empty? }
+      summary = elements.max_by {|e| e.size }
+      NKF.nkf("-w -Z3 --numchar-input --no-cp932", summary||"")
+    end
+    def parse(html)
+      title = get_title(html)
+      body = get_body(html)
+      return title, body
+    end
+    def execute
+      raise Nop, "Not Text" unless text?(@url)
+      html = fetch(@url)
+      html = NKF.nkf("-w -Z1 --no-cp932", html)
+      title, body = parse(html)
+      return "[%s] %s" % [title, body]
+    end
+  end
+end

data/plugins/summary/engines/ch2.rb ADDED

@@ -0,0 +1,14 @@
+require 'ch2'
+module Engines
+  class Ch2 < Base
+    url %r{^http://[^./]+\.2ch\.net}
+    def execute
+      dat = ::Ch2::Dat.new(@url)
+      dat.valid? or raise Nop
+      return trim_tags(dat.summarize)
+    end
+  end
+end

data/plugins/summary/engines/https.rb ADDED

@@ -0,0 +1,6 @@
+module Engines
+  class Https < Base
+    url %r{^https://}
+  end
+end

data/plugins/summary/engines/none.rb ADDED

@@ -0,0 +1,10 @@
+module Engines
+  class None < Base
+    url %r{}
+    def execute
+      raise Nop
+    end
+  end
+end

data/plugins/summary/engines/twitter.rb ADDED

@@ -0,0 +1,16 @@
+module Engines
+  class Twitter < Base
+    url %r{twitter\.com}
+    def initialize(url)
+      super
+      @url = normalize_url(@url)
+    end
+    def normalize_url(url)
+      return url.sub(%r{#!/}, '').sub(%r{//(?:\w+\.)?(twitter.com/)}, "//mobile.\\1")
+    end
+  end
+end