ircbot 0.1.5 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/.gitignore +5 -0
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +71 -0
  4. data/README +72 -3
  5. data/bin/ircbot +3 -0
  6. data/config/samples/postgres.yml +19 -0
  7. data/config/{sama-zu.yml → samples/sama-zu.yml} +1 -1
  8. data/config/{yml.erb → samples/yml.erb} +0 -0
  9. data/ircbot.gemspec +13 -0
  10. data/lib/ircbot.rb +3 -1
  11. data/lib/ircbot/client.rb +6 -0
  12. data/lib/ircbot/client/config.rb +9 -0
  13. data/lib/ircbot/client/plugins.rb +14 -1
  14. data/lib/ircbot/core_ext/message.rb +4 -1
  15. data/lib/ircbot/plugin.rb +17 -0
  16. data/lib/ircbot/plugins.rb +68 -13
  17. data/lib/ircbot/utils/html_parser.rb +26 -0
  18. data/lib/ircbot/utils/watcher.rb +36 -0
  19. data/lib/ircbot/version.rb +1 -1
  20. data/old/plugins/summary.cpi +267 -0
  21. data/plugins/plugins.rb +1 -1
  22. data/plugins/reminder.rb +79 -175
  23. data/plugins/summary/ch2.rb +272 -0
  24. data/plugins/summary/engines.rb +30 -0
  25. data/plugins/summary/engines/base.rb +105 -0
  26. data/plugins/summary/engines/ch2.rb +14 -0
  27. data/plugins/summary/engines/https.rb +6 -0
  28. data/plugins/summary/engines/none.rb +10 -0
  29. data/plugins/summary/engines/twitter.rb +16 -0
  30. data/plugins/summary/spec/ch2_spec.rb +64 -0
  31. data/plugins/summary/spec/spec_helper.rb +19 -0
  32. data/plugins/summary/spec/summarizers_none_spec.rb +15 -0
  33. data/plugins/summary/spec/summarizers_spec.rb +23 -0
  34. data/plugins/summary/summary.rb +58 -0
  35. data/plugins/watchdog/db.rb +80 -0
  36. data/plugins/watchdog/exceptions.rb +4 -0
  37. data/plugins/watchdog/updater.rb +21 -0
  38. data/plugins/watchdog/watchdog.rb +82 -0
  39. data/spec/plugin_spec.rb +11 -0
  40. data/spec/plugins_spec.rb +35 -1
  41. data/spec/utils/html_parser_spec.rb +30 -0
  42. data/spec/utils/spec_helper.rb +1 -0
  43. metadata +190 -13
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env ruby
2
+ # vim:encoding=UTF-8:
3
+
4
+ # original: net-irc-0.0.9/examples/2ch.rb
5
+
6
+ $KCODE = "u" if RUBY_VERSION < "1.9" # json use this
7
+
8
+ require 'rubygems'
9
+ require 'uri'
10
+ require 'net/http'
11
+ require 'stringio'
12
+ require 'zlib'
13
+ require 'nkf'
14
+ require 'ircbot'
15
+
16
+ module Ch2
17
+ class Dat
18
+ class UnknownThread < StandardError; end
19
+
20
+ attr_accessor :uri
21
+ attr_accessor :last_modified, :size
22
+
23
# One post of a thread, as built by Dat#[] from a single "<>"-separated
# dat line.
#
# Members: n (post number passed to Dat#[]), name, mail, misc, body, opts
# (the 1st-5th columns of the dat line) and id (the token following "ID:"
# in misc).
Line = Struct.new(:n, :name, :mail, :misc, :body, :opts, :id) do
  # Joins the non-nil name, body, misc and opts with single spaces.
  def to_s
    [name, body, misc, opts].compact.join(" ")
  end

  # Orders posts by body length; a nil body counts as empty.
  def <=>(other)
    body.to_s.size <=> other.body.to_s.size
  end

  # Heuristic ASCII-art detector: true when the body has at least three
  # newlines and more than 60% of it consists of characters other than
  # ">", newline, ASCII / fullwidth alphanumerics, kana and kanji.
  def aa?
    text = body.to_s
    return false if text.count("\n") < 3

    # \uFF10-\uFF19, \uFF21-\uFF3A and \uFF41-\uFF5A are the fullwidth digits
    # and letters; they are written as escapes so the ranges cannot be
    # silently folded into their ASCII look-alikes.
    significants = text.scan(/[>\n0-9a-zA-Z\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5Aぁ-んァ-ン一-龠]/u).size.to_f
    body_length = text.scan(/./u).size
    noise_ratio = 1 - significants / body_length

    noise_ratio > 0.6
  end
end
43
+
44
+ attr_reader :board, :num, :arg
45
+ delegate :host, :port, :to => "@uri"
46
+
47
# Parses a thread URL.
#
# @board and @num are taken from the 4th and 5th "/"-separated segments
# of the path; @arg is the trailing part after the thread number when the
# path has the /test/read.cgi/<board>/<num>/<arg> shape. The object is
# valid only when both @board and @num were found.
def initialize(thread_uri)
  @uri = URI(thread_uri)
  @dat = []

  segments = @uri.path.split('/')
  @board = segments[3]
  @num = segments[4]

  matched = %r{^/test/read\.cgi/(.*?)/(\d+)(/(.+))?}.match(@uri.path)
  @arg = matched[4] if matched

  @valid = (@board && @num) ? true : false
end
58
+
59
# Reports the validity flag computed by the constructor (true when both a
# board and a thread number were extracted from the URL).
def valid?
  @valid
end
62
+
63
# Number of raw dat lines currently held in memory.
def length
  @dat.size
end
66
+
67
# Thread title, read from the opts column of post 1.
#
# Forces a retrieval first when no dat lines are held yet. Returns "" when
# post 1 is still unavailable afterwards or carries no title, instead of
# raising NoMethodError on nil.
def subject
  retrieve(true) if @dat.empty?
  first_post = self[1]
  (first_post && first_post.opts) || ""
end
71
+
72
# Returns post number n (1-based) as a Line, or nil when no such dat line
# is held.
#
# The raw line is split on "<>" into name, mail, misc, body and opts. The
# body has <br> turned into newlines, remaining tags removed, per-line
# leading/trailing whitespace stripped and the gt/lt/amp/nbsp entities
# decoded. Columns missing from a short or blank line stay nil rather than
# raising.
def [](n)
  raw = @dat[n - 1]
  return nil unless raw

  name, mail, misc, body, opts = raw.split(/<>/)
  id = misc && misc[/ID:([^\s]+)/, 1]

  if body
    body.gsub!(/<br>/, "\n")
    body.gsub!(/<[^>]+>/, "")
    body.gsub!(/^\s+|\s+$/, "")
    body.gsub!(/&(gt|lt|amp|nbsp);/) {
      { 'gt' => ">", 'lt' => "<", 'amp' => "&", 'nbsp' => " " }[$1]
    }
  end

  Line.new(n, name, mail, misc, body, opts, id)
end
87
+
88
# Thread number as extracted from the URL; guess_next_thread compares it
# with the dat names listed in subject.txt.
def dat
  @num
end
91
+
92
# Downloads the thread's dat file and appends its lines to @dat.
#
# Unless +force+ is true, a conditional / ranged request is sent
# (If-Modified-Since from the previous Last-Modified, Range from the
# previously seen size) so only new data is transferred. With +force+ the
# cached lines are dropped first and the whole dat is fetched again.
#
# Returns the newly added posts as Line objects; [] when nothing new was
# stored. Raises UnknownThread on a 302 response.
def retrieve(force=false)
  # Drop cached lines on a forced reload; otherwise the full response below
  # would be appended to the old lines and every post would appear twice.
  @dat = [] if force

  res = Net::HTTP.start(@uri.host, @uri.port) do |http|
    req = Net::HTTP::Get.new('/%s/dat/%d.dat' % [@board, @num])
    req['User-Agent'] = 'Monazilla/1.00 (2ig.rb/0.0e)'
    # gzip is only requested for the initial full download
    req['Accept-Encoding'] = 'gzip' unless @size
    unless force
      req['If-Modified-Since'] = @last_modified if @last_modified
      req['Range'] = "bytes=%d-" % @size if @size
    end

    http.request(req)
  end

  case res.code.to_i
  when 200, 206
    body = res.body
    if res['Content-Encoding'] == 'gzip'
      body = StringIO.open(body, 'rb') {|io| Zlib::GzipReader.new(io).read }
    end

    @last_modified = res['Last-Modified']
    if res.code == '206'
      @size += body.size
    else
      @size = body.size
    end

    body = NKF.nkf('-w', body)

    curr = @dat.size + 1
    @dat.concat(body.split(/\n/))
    last = @dat.size

    (curr..last).map {|n|
      self[n]
    }
  when 416 # probably posts were deleted, so the stored offset is stale
    p ['416']
    retrieve(true)
    []
  when 304 # Not modified
    []
  when 302 # "dat ochi": the thread is no longer served from this path
    p ['302', res['Location']]
    raise UnknownThread
  else
    p ['Unknown Status:', res.code]
    []
  end
end
145
+
146
# Converts fullwidth letters and digits in +subject+ to their ASCII
# counterparts so that thread titles can be compared.
#
# Only the fullwidth ranges (U+FF21-FF3A, U+FF41-FF5A, U+FF10-FF19) are
# matched; each sits exactly 65248 (0xFEE0) code points above its ASCII
# twin. Matching ASCII characters here would produce negative code points
# and make pack("U*") raise RangeError.
def canonicalize_subject(subject)
  subject.gsub(/[\uFF21-\uFF3A\uFF41-\uFF5A\uFF10-\uFF19]/u) {|c|
    c.unpack("U*").map {|i| i - 65248 }.pack("U*")
  }
end
151
+
152
# Ranks the threads listed in the board's subject.txt by how likely each
# one is the successor of this thread.
#
# Returns an array of hashes (:uri, :dat, :subject, :distance,
# :continuous_num, :appear_recent, :score) sorted by ascending :score, so
# the best candidate comes first. The score starts from the edit distance
# between the canonicalized titles (0 for an identical title, Float::MAX
# for this thread itself) and is lowered by 10 when the candidate's first
# number is one above a number in the current title, lowered by 10 when
# its URL was posted in posts 900-999 of this thread, and raised by 10
# when its dat number is lower than this thread's.
#
# Lines of subject.txt that do not have the "<dat><>title (count)" shape
# are skipped instead of raising.
def guess_next_thread
  res = Net::HTTP.start(@uri.host, @uri.port) do |http|
    req = Net::HTTP::Get.new('/%s/subject.txt' % @board)
    req['User-Agent'] = 'Monazilla/1.00 (2ig.rb/0.0e)'
    http.request(req)
  end

  # URLs mentioned near the end of the thread, keyed by full URL. The
  # pattern starts at "ttp://" (so it also matches links written without
  # the leading "h"); the "h" is put back when building the key.
  recent_posted_threads = {}
  (900..999).each do |i|
    line = self[i]
    next unless line && line.body
    line.body.scan(%r|ttp://#{@uri.host}/test/read.cgi/[^/]+/\d+/|).each do |uri|
      recent_posted_threads["h#{uri}"] = i
    end
  end

  own_subject = self.subject
  own_dat = self.dat
  current_subject = canonicalize_subject(own_subject)
  current_thread_rev = current_subject.scan(/\d+/).map {|d| d.to_i }
  current = current_subject.scan(/./u)

  body = NKF.nkf('-w', res.body)
  threads = body.split(/\n/).map {|l|
    dat, rest = *l.split(/<>/)
    matched = rest && /(.+?) \((\d+)\)/.match(rest)
    next unless dat && matched # malformed line

    dat = dat.sub(/\.dat$/, "")
    uri = "http://#{@uri.host}/test/read.cgi/#{@board}/#{dat}/"

    title = matched[1]
    canonical_subject = canonicalize_subject(title)
    thread_rev = canonical_subject[/\d+/].to_i

    distance = (dat == own_dat) ? Float::MAX :
               (title == own_subject) ? 0 :
               levenshtein(canonical_subject.scan(/./u), current)
    continuous_num = current_thread_rev.find {|rev| rev == thread_rev - 1 }
    appear_recent = recent_posted_threads[uri]

    score = distance
    score -= 10 if continuous_num
    score -= 10 if appear_recent
    score += 10 if dat.to_i < own_dat.to_i
    {
      :uri => uri,
      :dat => dat,
      :subject => title,
      :distance => distance,
      :continuous_num => continuous_num,
      :appear_recent => appear_recent,
      :score => score.to_f
    }
  }.compact

  threads.sort_by {|o| o[:score] }
end
207
+
208
# Edit distance between two sequences: the minimum number of single-element
# insertions, deletions and substitutions turning +a+ into +b+.
#
# Both arguments must respond to empty?, length and integer [] (the caller
# passes arrays of characters).
def levenshtein(a, b)
  return b.length if a.empty?
  return a.length if b.empty?
  return 0 if a == b

  rows = a.length
  cols = b.length

  # table[i][j] holds the distance between the first i elements of a and
  # the first j elements of b; row 0 and column 0 are the base cases.
  table = (0..rows).map {|i| [i] + [0] * cols }
  (0..cols).each {|j| table[0][j] = j }

  1.upto(rows) do |i|
    1.upto(cols) do |j|
      substitution = table[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1)
      deletion     = table[i - 1][j] + 1
      insertion    = table[i][j - 1] + 1
      table[i][j] = [deletion, insertion, substitution].min
    end
  end

  table[rows][cols]
end
243
+
244
# URL of the raw dat file for this thread, built from the host, the board
# and the thread number (formatted as an integer).
def dat_url
  format("http://%s/%s/dat/%d.dat", host, board, num)
end
247
+
248
# Builds a one-line summary: "[<subject>] " followed by the bodies of the
# selected posts, joined with spaces and with whitespace runs collapsed.
#
# The selection comes from the digits and dashes of the URL's trailing
# argument:
#   "N"    only post N
#   "N-M"  posts N..M
#   "N-"   posts N..1000
#   "-N"   posts 1..N
#   other  posts 1..1000, longest body first
def summarize
  retrieve
  lead = "[%s] " % subject

  longest_first = false
  spec = (arg || "").scan(/[\d-]/).join
  range =
    case spec
    when /^\d+$/ # exact id
      (arg.to_i .. arg.to_i)
    when /^(\d+)-(\d+)$/
      ($1.to_i .. $2.to_i)
    when /^(\d+)-$/
      ($1.to_i .. 1000)
    when /^-(\d+)$/
      # this pattern has a single capture group, so the bound is $1
      (1 .. $1.to_i)
    else
      longest_first = true
      (1 .. 1000)
    end

  bodies = range.map {|i| self[i] }.compact.map {|line| line.body }
  bodies.sort! {|a, b| b.to_s.size <=> a.to_s.size } if longest_first
  lead + bodies.join(" ").gsub(/\s+/, ' ')
end
271
+ end
272
+ end
@@ -0,0 +1,30 @@
1
+ require 'dsl_accessor'
2
+ require 'extlib'
3
+
4
+ module Engines
5
+ Mapping = []
6
+
7
+ class NotImplementedError < NotImplementedError; end
8
+ class Nop < NotImplementedError; end
9
+
10
+ def self.create(url)
11
+ for pattern, klass in Mapping
12
+ return klass.new(url) if pattern =~ url
13
+ end
14
+ raise NotImplementedError, "[BUG] Not supported URL: %s" % url
15
+ end
16
+
17
+ # load ruby library and register its url
18
+ def self.register(name)
19
+ load File.dirname(__FILE__) + "/engines/#{name}.rb"
20
+ klass = instance_eval(Extlib::Inflection.camelize(name))
21
+ Mapping.unshift [klass.url, klass] unless klass == Base
22
+ end
23
+
24
+ register("base")
25
+ register("none")
26
+ register("https")
27
+ register("ch2")
28
+ register("twitter")
29
+ end
30
+
@@ -0,0 +1,105 @@
1
+ require 'open3'
2
+ require 'cgi'
3
+
4
+ module Engines
5
+ class Base
6
+ dsl_accessor :url
7
+
8
+ MaxContentLength = 512 * 1024
9
+
10
+ def initialize(url)
11
+ @url = url
12
+ end
13
+
14
# Runs `curl --head --location` against +url+ and returns whatever curl
# wrote to stdout, i.e. the response header text, for example:
#
#   HTTP/1.1 200 OK
#   Content-Type: text/html; charset=utf-8
#   Date: Sun, 08 Apr 2012 18:08:45 GMT
#   Content-Length: 245091
#   Server: GSE
def head(url)
  command = ["curl", "--head", "--location", "--user-agent", "Mozilla", url].flatten
  Open3.popen3(*command) {|_stdin, stdout, _stderr| stdout.read }
end
24
+
25
# Truthy when the final response for +url+ declares a text/* content type.
#
# head follows redirects, so its output may contain one header block per
# hop; only the last non-empty block describes the document itself. The
# header name is matched case-insensitively because HTTP header names are
# case-insensitive and curl may print them in lowercase.
#
# Returns the match offset (truthy) or nil.
def text?(url)
  header_blocks = head(url).to_s.split(/\r?\n\r?\n/)
  final_headers = header_blocks.reject {|block| block.strip.empty? }.last.to_s
  final_headers =~ %r{^Content-Type:.*text/}i
end
28
+
29
# Downloads +url+ with curl (following redirects, accepting compressed
# transfer, refusing files larger than MaxContentLength) and returns what
# curl wrote to stdout.
def fetch(url)
  size_limit = "%d" % MaxContentLength
  command = [
    "curl",
    "--location", "--compressed",
    "--user-agent", "Mozilla",
    "--max-filesize", size_limit,
    url
  ].flatten
  Open3.popen3(*command) {|_stdin, stdout, _stderr| stdout.read }
end
37
+
38
# Returns +html+ reduced to plain text: comments, declarations, the <head>,
# <script>, <style> and <noscript> elements and all remaining tags are
# removed, whitespace runs are collapsed to one space, the result is
# stripped and HTML entities are decoded.
#
# Works on a copy, so the caller's string is left untouched and frozen
# strings are accepted.
def trim_tags(html)
  text = html.dup
  # Comments and declarations go first: the generic tag pattern further
  # down stops at the first ">" and would otherwise cut a comment such as
  # "<!-- a > b -->" in half, leaving " b -->" in the output.
  text.gsub!(%r{<\!--.*?-->}mi, '')
  text.gsub!(%r{<\!\w.*?>}mi, '')
  text.gsub!(%r{<head[^>]*>.*?</head>}mi, '')
  text.gsub!(%r{<script.*?>.*?</script>}mi, '')
  text.gsub!(%r{<style.*?>.*?</style>}mi, '')
  text.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
  text.gsub!(%r{</?.*?>}, '')
  text.gsub!(%r{\s+}m, ' ')
  text.strip!
  CGI.unescapeHTML(text)
end
51
+
52
# Extracts the document title from +html+.
#
# The <title> start tag may carry attributes. Tags inside the title are
# removed, whitespace runs collapse to one space and the result is passed
# through NKF ("-w -Z3 --numchar-input --no-cp932"). Returns "" when there
# is no title element.
def get_title(html)
  return "" unless %r{<title(?:\s[^>]*)?>(.*?)</title>}mi =~ html

  title = $1.strip
  title.gsub!(%r{<.*?>}m, '')
  title.gsub!(%r{\s+}m, ' ')
  NKF.nkf("-w -Z3 --numchar-input --no-cp932", title)
end
62
+
63
# Picks a summary line out of the <body> of +html+.
#
# Comments, declarations and head/script/style/noscript elements are
# dropped; inline markup (a, font, img, b, i, u, p, </li>, br) is removed
# without splitting the text; headings become spaces and <li> becomes
# " * ". The remainder is split at every other tag and the longest
# non-empty fragment is returned after NKF normalization.
#
# Raises Nop when the document has no <body> element.
def get_body(html)
  if /<body.*?>(.*?)<\/body>/im =~ html
    body = $1
  else
    raise Nop, "No Body Found"
  end
  body.gsub!(%r{<!--.*?-->}im, '')
  body.gsub!(%r{<\!\w.*?>}mi, '')
  body.gsub!(%r{<head[^>]*>.*?<\/head>}mi, '')
  body.gsub!(%r{<script.*?>.*?<\/script>}mi, '')
  body.gsub!(%r{<style.*?>.*?<\/style>}mi, '')
  body.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
  # The groups below are non-capturing "(?:"; written as "(:?" they would
  # also swallow a literal colon in front of the tag ("Name:<b>x" -> "Namex").
  body.gsub!(%r{(?:<a.*?>|<\/a>)}mi, '')
  body.gsub!(%r{(?:<font.*?>|<\/font>)}mi, '')
  body.gsub!(%r{<img.*?/?>}mi, '')
  body.gsub!(%r{(?:<b>|<\/b>|<i>|<\/i>|<u>|<\/u>|<p>|<\/p>|<\/li>)}mi, '')
  # accepts <br>, <br/> and <br />
  body.gsub!(%r{<br\s*/?>}mi, '')
  body.gsub!(%r{(?:<\/?h[1-6]>)}mi, ' ')
  body.gsub!(%r{<li>}mi, ' * ')

  elements = body.split(/<.*?>/mi)
  elements.each { |item| item.gsub!(/\s+/, ' ') }
  elements.each { |item| item.strip! }
  elements.reject! { |item| item.empty? }
  summary = elements.max_by {|e| e.size }
  NKF.nkf("-w -Z3 --numchar-input --no-cp932", summary||"")
end
90
+
91
# Returns the pair [title, body summary] extracted from +html+.
def parse(html)
  [get_title(html), get_body(html)]
end
96
+
97
# Summarizes @url as "[<title>] <body summary>".
#
# Raises Nop when the URL does not serve text. The fetched page is
# converted with NKF ("-w -Z1 --no-cp932") before parsing.
def execute
  raise Nop, "Not Text" unless text?(@url)

  converted = NKF.nkf("-w -Z1 --no-cp932", fetch(@url))
  title, body = parse(converted)
  format("[%s] %s", title, body)
end
104
+ end
105
+ end
@@ -0,0 +1,14 @@
1
+ require 'ch2'
2
+
3
module Engines
  # Engine for http://<name>.2ch.net URLs: summarizes the thread through
  # ::Ch2::Dat instead of fetching the HTML page.
  class Ch2 < Base
    url %r{^http://[^./]+\.2ch\.net}

    # Returns the thread summary with markup stripped.
    # Raises Nop when the URL does not identify a board and thread.
    def execute
      thread = ::Ch2::Dat.new(@url)
      raise Nop unless thread.valid?
      trim_tags(thread.summarize)
    end
  end
end
14
+
@@ -0,0 +1,6 @@
1
module Engines
  # Engine for https:// URLs. It only declares its URL pattern; fetching and
  # parsing are inherited from Base unchanged.
  class Https < Base
    url %r{^https://}
  end
end
6
+
@@ -0,0 +1,10 @@
1
module Engines
  # Engine whose empty pattern matches every URL and which never produces a
  # summary.
  class None < Base
    url %r{}

    # Always raises Nop.
    def execute
      raise Nop
    end
  end
end
10
+
@@ -0,0 +1,16 @@
1
+ module Engines
2
+ class Twitter < Base
3
+ url %r{twitter\.com}
4
+
5
# Stores +url+ through the parent constructor, then replaces @url with its
# normalized form (see normalize_url).
def initialize(url)
  super(url)
  @url = normalize_url(@url)
end
9
+
10
# Rewrites a twitter.com URL to its mobile.twitter.com equivalent.
#
# The first "#!/" fragment marker is dropped and the host part
# "//twitter.com/" (optionally with one subdomain label such as "www.") is
# replaced by "//mobile.twitter.com/". The dot in the host is escaped so
# that look-alike hosts such as "twitterXcom" are left alone.
def normalize_url(url)
  url.sub(%r{#!/}, '').sub(%r{//(?:\w+\.)?(twitter\.com/)}, "//mobile.\\1")
end
13
+ end
14
+ end
15
+
16
+