ircbot 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/.gitignore +5 -0
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +71 -0
  4. data/README +72 -3
  5. data/bin/ircbot +3 -0
  6. data/config/samples/postgres.yml +19 -0
  7. data/config/{sama-zu.yml → samples/sama-zu.yml} +1 -1
  8. data/config/{yml.erb → samples/yml.erb} +0 -0
  9. data/ircbot.gemspec +13 -0
  10. data/lib/ircbot.rb +3 -1
  11. data/lib/ircbot/client.rb +6 -0
  12. data/lib/ircbot/client/config.rb +9 -0
  13. data/lib/ircbot/client/plugins.rb +14 -1
  14. data/lib/ircbot/core_ext/message.rb +4 -1
  15. data/lib/ircbot/plugin.rb +17 -0
  16. data/lib/ircbot/plugins.rb +68 -13
  17. data/lib/ircbot/utils/html_parser.rb +26 -0
  18. data/lib/ircbot/utils/watcher.rb +36 -0
  19. data/lib/ircbot/version.rb +1 -1
  20. data/old/plugins/summary.cpi +267 -0
  21. data/plugins/plugins.rb +1 -1
  22. data/plugins/reminder.rb +79 -175
  23. data/plugins/summary/ch2.rb +272 -0
  24. data/plugins/summary/engines.rb +30 -0
  25. data/plugins/summary/engines/base.rb +105 -0
  26. data/plugins/summary/engines/ch2.rb +14 -0
  27. data/plugins/summary/engines/https.rb +6 -0
  28. data/plugins/summary/engines/none.rb +10 -0
  29. data/plugins/summary/engines/twitter.rb +16 -0
  30. data/plugins/summary/spec/ch2_spec.rb +64 -0
  31. data/plugins/summary/spec/spec_helper.rb +19 -0
  32. data/plugins/summary/spec/summarizers_none_spec.rb +15 -0
  33. data/plugins/summary/spec/summarizers_spec.rb +23 -0
  34. data/plugins/summary/summary.rb +58 -0
  35. data/plugins/watchdog/db.rb +80 -0
  36. data/plugins/watchdog/exceptions.rb +4 -0
  37. data/plugins/watchdog/updater.rb +21 -0
  38. data/plugins/watchdog/watchdog.rb +82 -0
  39. data/spec/plugin_spec.rb +11 -0
  40. data/spec/plugins_spec.rb +35 -1
  41. data/spec/utils/html_parser_spec.rb +30 -0
  42. data/spec/utils/spec_helper.rb +1 -0
  43. metadata +190 -13
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env ruby
2
+ # vim:encoding=UTF-8:
3
+
4
+ # original: net-irc-0.0.9/examples/2ch.rb
5
+
6
+ $KCODE = "u" if RUBY_VERSION < "1.9" # json use this
7
+
8
+ require 'rubygems'
9
+ require 'uri'
10
+ require 'net/http'
11
+ require 'stringio'
12
+ require 'zlib'
13
+ require 'nkf'
14
+ require 'ircbot'
15
+
16
+ module Ch2
17
+ class Dat
18
+ class UnknownThread < StandardError; end
19
+
20
+ attr_accessor :uri
21
+ attr_accessor :last_modified, :size
22
+
23
# One post of a thread, as produced by Dat#[].
#
# Members: n (post number), name, mail, misc (the field the poster ID is
# extracted from), body (post text), opts (fifth dat field; Dat#subject reads
# the thread title from post 1's opts) and id (poster ID).
Line = Struct.new(:n, :name, :mail, :misc, :body, :opts, :id) do
  # Joins the non-nil name, body, misc and opts with single spaces.
  def to_s
    [name, body, misc, opts].compact.join(" ")
  end

  # Orders posts by the length of their body text (nil counts as empty).
  def <=>(other)
    body.to_s.size <=> other.body.to_s.size
  end

  # Heuristic "is this post ASCII art?" check.
  #
  # A post with fewer than three newlines is never AA. Otherwise the share of
  # "ordinary" characters (">", newline, ASCII and full-width alphanumerics,
  # hiragana, katakana, kanji) is measured against the number of non-newline
  # characters; the post is AA when more than 60% of it is something else.
  #
  # @return [Boolean]
  def aa?
    text = body.to_s # a missing body is simply "not AA"
    return false if text.count("\n") < 3

    # \uFF10-\uFF19, \uFF21-\uFF3A and \uFF41-\uFF5A are the full-width
    # digits and letters; they are written as escapes so they cannot be
    # folded into their ASCII look-alikes by text normalisation.
    significants = text.scan(/[>\n0-9a-zA-Z\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5Aぁ-んァ-ン一-龠]/u).size.to_f
    # "." does not match newlines, so this counts non-newline characters only.
    body_length = text.scan(/./u).size
    is_aa = 1 - significants / body_length

    is_aa > 0.6
  end
end
43
+
44
+ attr_reader :board, :num, :arg
45
+ delegate :host, :port, :to => "@uri"
46
+
47
# Parses a thread URL of the form http://HOST/test/read.cgi/BOARD/NUM/ARG.
#
# The board and thread number are the 4th and 5th path segments; the
# optional trailing argument (ARG) is recognised only for /test/read.cgi/
# URLs. The object is valid only when a board is present and the thread
# number is purely numeric, because #retrieve and #dat_url format the number
# with "%d" and would raise on anything else.
#
# @param thread_uri [String] URL of the thread
def initialize(thread_uri)
  @uri = URI(thread_uri)
  @board, @num = @uri.path.split('/')[3, 2]
  @dat = []
  @arg = @uri.path[%r{^/test/read\.cgi/(.*?)/(\d+)(/(.+))?}, 4]
  @valid = !!(@board && /\A\d+\z/ =~ @num.to_s)
end
58
+
59
# True when the constructor accepted the thread URL (see #initialize).
def valid?
  return @valid
end
62
+
63
# Number of raw dat lines (posts) fetched so far.
def length
  @dat.size
end
66
+
67
# Thread title, read from the opts field of post 1.
#
# Forces a full download first when nothing has been fetched yet. Returns ""
# when post 1 is unavailable (for example when the download produced no
# lines) or carries no title, instead of failing on nil.
#
# @return [String]
def subject
  retrieve(true) if @dat.empty?
  first = self[1]
  (first && first.opts) || ""
end
71
+
72
# Returns post number n as a Line, or nil when that line has not been fetched.
#
# A dat line is "name<>mail<>misc<>body<>opts". The body has <br> turned into
# newlines, remaining tags removed, per-line leading/trailing whitespace
# trimmed and the entities gt/lt/amp/nbsp decoded. Blank or truncated lines
# yield a Line with an empty body and nil id rather than raising.
#
# @param n [Integer] post number (1 is the first post)
# @return [Line, nil]
def [](n)
  raw = @dat[n - 1]
  return nil unless raw

  name, mail, misc, body, opts = raw.split(/<>/)
  id = misc.to_s[/ID:([^\s]+)/, 1]

  entities = { 'gt' => ">", 'lt' => "<", 'amp' => "&", 'nbsp' => " " }
  text = body.to_s.gsub(/<br>/, "\n")
  text = text.gsub(/<[^>]+>/, "")
  text = text.gsub(/^\s+|\s+$/, "")
  # Entities are decoded last and in one pass, so "&amp;gt;" becomes "&gt;".
  text = text.gsub(/&(gt|lt|amp|nbsp);/) { entities[$1] }

  Line.new(n, name, mail, misc, text, opts, id)
end
87
+
88
# Identifier of this thread's dat file, i.e. the thread number from the URL.
def dat
  return @num
end
91
+
92
# Downloads the thread's dat file and returns the newly received posts.
#
# Without +force+ the request is incremental: it sends If-Modified-Since
# and a Range starting at the number of bytes already received. With +force+
# the cached lines are discarded first and the whole file is requested, so a
# forced reload replaces the cache instead of appending to it.
#
# @param force [Boolean] drop the cache and download everything again
# @return [Array<Line>] posts added by this call; [] for 304, 416 and
#   unrecognised statuses
# @raise [UnknownThread] when the server answers 302
def retrieve(force=false)
  @dat = [] if force

  res = Net::HTTP.start(@uri.host, @uri.port) do |http|
    req = Net::HTTP::Get.new('/%s/dat/%d.dat' % [@board, @num])
    req['User-Agent'] = 'Monazilla/1.00 (2ig.rb/0.0e)'
    # gzip is requested only while no byte count is known, i.e. never
    # together with a Range request.
    req['Accept-Encoding'] = 'gzip' unless @size
    unless force
      req['If-Modified-Since'] = @last_modified if @last_modified
      req['Range'] = "bytes=%d-" % @size if @size
    end

    http.request(req)
  end

  case res.code.to_i
  when 200, 206
    body = res.body
    if res['Content-Encoding'] == 'gzip'
      body = StringIO.open(body, 'rb') {|io| Zlib::GzipReader.new(io).read }
    end

    @last_modified = res['Last-Modified']
    if res.code == '206'
      @size += body.size # partial content: extend the known byte count
    else
      @size = body.size
    end

    body = NKF.nkf('-w', body) # convert to UTF-8

    curr = @dat.size + 1
    @dat.concat(body.split(/\n/))
    last = @dat.size

    (curr..last).map {|n|
      self[n]
    }
  when 416 # probably a deletion occurred
    p ['416']
    retrieve(true)
    []
  when 304 # Not modified
    []
  when 302 # thread has been archived ("dat-ochi")
    p ['302', res['Location']]
    raise UnknownThread
  else
    p ['Unknown Status:', res.code]
    []
  end
end
145
+
146
# Returns +subject+ with full-width digits and Latin letters replaced by
# their ASCII equivalents, so thread titles can be compared and their
# numbers parsed. All other characters, including ASCII ones, are unchanged.
#
# @param subject [String] thread title
# @return [String] a new string
def canonicalize_subject(subject)
  # U+FF10-FF19, U+FF21-FF3A and U+FF41-FF5A are the full-width 0-9, A-Z and
  # a-z; each sits exactly 65248 (0xFEE0) above its ASCII counterpart.
  subject.tr("\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A", "0-9A-Za-z")
end
151
+
152
# Ranks the board's threads by how likely each is to be the successor of
# this thread.
#
# Downloads the board's subject.txt and scores every listed thread; lower
# is better. The base score is the edit distance between the canonicalized
# titles (0 for an identical title, Float::MAX for this thread itself).
# 10 is subtracted when the candidate's first number is one more than a
# number in this thread's title, 10 more when its URL was posted in posts
# 900-999 of this thread, and 10 is added when its dat number is lower than
# this thread's. Blank or unparsable subject.txt lines are skipped.
#
# @return [Array<Hash>] hashes with :uri, :dat, :subject, :distance,
#   :continuous_num, :appear_recent and :score, sorted by ascending score
def guess_next_thread
  res = Net::HTTP.start(@uri.host, @uri.port) do |http|
    req = Net::HTTP::Get.new('/%s/subject.txt' % @board)
    req['User-Agent'] = 'Monazilla/1.00 (2ig.rb/0.0e)'
    http.request(req)
  end

  # The pattern starts at "ttp://", so links written with or without the
  # leading "h" are both found; the "h" is re-added for the lookup key.
  thread_link = %r|ttp://#{Regexp.escape(@uri.host)}/test/read\.cgi/[^/]+/\d+/|
  recent_posted_threads = {}
  (900..999).each do |i|
    line = self[i]
    next unless line
    line.body.scan(thread_link) {|link| recent_posted_threads["h#{link}"] = i }
  end

  own_subject = self.subject
  current_subject = canonicalize_subject(own_subject)
  current_thread_rev = current_subject.scan(/\d+/).map {|d| d.to_i }
  current = current_subject.scan(/./u)

  body = NKF.nkf('-w', res.body)
  candidates = body.split(/\n/).map do |l|
    # Each line looks like "NUMBER.dat<>TITLE (POSTCOUNT)".
    dat_name, rest = l.split(/<>/)
    parsed = /(.+?) \((\d+)\)/.match(rest.to_s)
    next unless dat_name && parsed

    dat = dat_name.sub(/\.dat$/, "")
    title = parsed[1]
    uri = "http://#{@uri.host}/test/read.cgi/#{@board}/#{dat}/"

    canonical_subject = canonicalize_subject(title)
    thread_rev = canonical_subject[/\d+/].to_i

    distance =
      if dat == self.dat
        Float::MAX
      elsif title == own_subject
        0
      else
        levenshtein(canonical_subject.scan(/./u), current)
      end
    continuous_num = current_thread_rev.find {|rev| rev == thread_rev - 1 }
    appear_recent = recent_posted_threads[uri]

    score = distance
    score -= 10 if continuous_num
    score -= 10 if appear_recent
    score += 10 if dat.to_i < self.dat.to_i
    {
      :uri => uri,
      :dat => dat,
      :subject => title,
      :distance => distance,
      :continuous_num => continuous_num,
      :appear_recent => appear_recent,
      :score => score.to_f
    }
  end

  candidates.compact.sort_by {|o| o[:score] }
end
207
+
208
# Edit distance between two indexable sequences (arrays of characters here):
# the minimum number of insertions, deletions and substitutions that turn
# +a+ into +b+.
#
# @param a [#empty?, #length, #[]]
# @param b [#empty?, #length, #[]]
# @return [Integer]
def levenshtein(a, b)
  return b.length if a.empty?
  return a.length if b.empty?
  return 0 if a == b

  # Only the previous row of the distance table is needed to build the next.
  prev_row = (0..b.length).to_a
  (1..a.length).each do |i|
    row = [i]
    (1..b.length).each do |j|
      substitution = prev_row[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1)
      deletion = prev_row[j] + 1
      insertion = row[j - 1] + 1
      row << [deletion, insertion, substitution].min
    end
    prev_row = row
  end

  prev_row[b.length]
end
243
+
244
# URL of the raw dat file for this thread, built from host, board and num.
def dat_url
  format("http://%s/%s/dat/%d.dat", host, board, num)
end
247
+
248
# Builds a one-line summary: "[subject] " followed by the selected post
# bodies joined with spaces and with whitespace runs collapsed.
#
# The posts are chosen from the URL argument after everything except digits
# and "-" has been dropped from it:
#   "N"    only post N
#   "N-M"  posts N through M
#   "N-"   post N onwards
#   "-M"   posts 1 through M
#   other  every post, longest body first
#
# @return [String]
def summarize
  retrieve
  lead = "[%s] " % subject

  sort = false
  range =
    case (arg||"").scan(/[\d-]/).join
    when /^\d+$/ # exact id
      (arg.to_i .. arg.to_i)
    when /^(\d+)-(\d+)$/
      ($1.to_i .. $2.to_i)
    when /^(\d+)-$/
      ($1.to_i .. 1000)
    when /^-(\d+)$/
      # This pattern has a single capture group, so the bound is $1.
      (1 .. $1.to_i)
    else
      sort = true
      (1 .. 1000)
    end

  lines = range.map{|i| self[i]}.compact
  bodies = lines.map(&:body)
  bodies.sort!{|a,b| b.size <=> a.size} if sort
  lead + bodies.join(" ").gsub(/\s+/, ' ')
end
271
+ end
272
+ end
@@ -0,0 +1,30 @@
1
+ require 'dsl_accessor'
2
+ require 'extlib'
3
+
4
+ module Engines
5
+ Mapping = []
6
+
7
+ class NotImplementedError < NotImplementedError; end
8
+ class Nop < NotImplementedError; end
9
+
10
# Instantiates the first engine in Mapping whose URL pattern matches +url+.
#
# @param url [String]
# @return [Object] a new instance of the matching engine class
# @raise [NotImplementedError] when no registered pattern matches
def self.create(url)
  _pattern, engine = Mapping.find {|pattern, _klass| pattern =~ url }
  return engine.new(url) if engine

  raise NotImplementedError, "[BUG] Not supported URL: %s" % url
end
16
+
17
# Loads engines/<name>.rb from this file's directory and registers the engine
# class it defines.
#
# The class is found by camelizing +name+ and looking the constant up by
# name, rather than evaluating the string as code. Its URL pattern is put at
# the front of Mapping, so engines registered later are tried first by
# .create. Base itself is loaded but never registered.
#
# @param name [String] file name without ".rb", e.g. "twitter"
def self.register(name)
  load File.dirname(__FILE__) + "/engines/#{name}.rb"
  klass = const_get(Extlib::Inflection.camelize(name))
  Mapping.unshift [klass.url, klass] unless klass == Base
end
23
+
24
+ register("base")
25
+ register("none")
26
+ register("https")
27
+ register("ch2")
28
+ register("twitter")
29
+ end
30
+
@@ -0,0 +1,105 @@
1
+ require 'open3'
2
+ require 'cgi'
3
+
4
+ module Engines
5
+ class Base
6
+ dsl_accessor :url
7
+
8
+ MaxContentLength = 512 * 1024
9
+
10
# Stores the URL that #execute will summarize.
#
# @param url [String]
def initialize(url)
  @url = url
end
13
+
14
# Runs `curl --head --location` on +url+ and returns what curl wrote to
# stdout, i.e. the response headers. Example of the output:
#
#   HTTP/1.1 200 OK
#   Content-Type: text/html; charset=utf-8
#   Date: Sun, 08 Apr 2012 18:08:45 GMT
#   Content-Length: 245091
#   Server: GSE
#
# @param url [String]
# @return [String] curl's stdout
def head(url)
  curl_options = ["--head", "--location", "--user-agent", "Mozilla"]
  # capture3 drains stdout and stderr concurrently; reading only stdout from
  # popen3 can block forever once the child fills the stderr pipe.
  stdout, _stderr, _status = Open3.capture3("curl", *curl_options, url)
  stdout
end
24
+
25
# Checks whether the headers returned by #head declare a text/* content type.
#
# Header names are case-insensitive (curl prints them in lower case for
# HTTP/2 responses), so the match ignores case.
#
# @param url [String]
# @return [Integer, nil] match position (truthy) or nil when no text/*
#   Content-Type header is present
def text?(url)
  head(url).to_s =~ %r{^Content-Type:.*text/}i
end
28
+
29
# Downloads +url+ with curl, following redirects and accepting compressed
# responses, and returns what curl wrote to stdout (the response body).
# curl is told to refuse files larger than MaxContentLength bytes.
#
# @param url [String]
# @return [String] curl's stdout
def fetch(url)
  curl_options = [
    "--location", "--compressed",
    "--user-agent", "Mozilla",
    "--max-filesize", "%d" % MaxContentLength,
  ]
  # capture3 drains stdout and stderr concurrently; reading only stdout from
  # popen3 can block forever once the child fills the stderr pipe.
  stdout, _stderr, _status = Open3.capture3("curl", *curl_options, url)
  stdout
end
37
+
38
# Reduces an HTML fragment to plain text.
#
# Removes head/script/style/noscript elements with their content, then
# comments and <!...> declarations, then every remaining tag; collapses
# whitespace runs to one space, strips the ends and decodes HTML entities.
# Comments are removed before the generic tag pass so that a ">" inside a
# comment cannot leave part of the comment behind. The argument is not
# modified.
#
# @param html [String]
# @return [String] a new string
def trim_tags(html)
  text = html.gsub(%r{<head[^>]*>.*?</head>}mi, '')
  text = text.gsub(%r{<script.*?>.*?</script>}mi, '')
  text = text.gsub(%r{<style.*?>.*?</style>}mi, '')
  text = text.gsub(%r{<noscript.*?>.*?</noscript>}mi, '')
  text = text.gsub(%r{<\!--.*?-->}mi, '')
  text = text.gsub(%r{<\!\w.*?>}mi, '')
  text = text.gsub(%r{</?.*?>}, '')
  text = text.gsub(%r{\s+}m, ' ').strip
  CGI.unescapeHTML(text)
end
51
+
52
# Extracts the page title from +html+.
#
# The first <title> element is used (attributes on the tag are allowed);
# nested tags are removed and whitespace runs are collapsed to one space
# before the text is passed through NKF with the options below.
#
# @param html [String]
# @return [String] the title, or "" when the page has no <title> element
def get_title(html)
  return "" unless %r{<title\b[^>]*>(.*?)</title>}mi =~ html

  title = $1.strip
  title = title.gsub(%r{<.*?>}m, '')
  title = title.gsub(%r{\s+}m, ' ')
  NKF.nkf("-w -Z3 --numchar-input --no-cp932", title)
end
62
+
63
# Picks the longest run of plain text from the page's <body>.
#
# Comments, declarations and head/script/style/noscript elements are
# removed; inline markup (a, font, img, b, i, u, p, </li>, br) is deleted so
# the text around it stays in one piece; headings become a space and <li>
# becomes " * ". What is left is split at every remaining tag, each piece is
# whitespace-normalised, and the longest piece is returned after being
# passed through NKF with the options below.
#
# @param html [String]
# @return [String] the longest text piece, or "" when the body has no text
# @raise [Nop] when +html+ contains no <body>...</body>
def get_body(html)
  raise Nop, "No Body Found" unless /<body.*?>(.*?)<\/body>/im =~ html

  body = $1
  body.gsub!(%r{<!--.*?-->}im, '')
  body.gsub!(%r{<\!\w.*?>}mi, '')
  body.gsub!(%r{<head[^>]*>.*?<\/head>}mi, '')
  body.gsub!(%r{<script.*?>.*?<\/script>}mi, '')
  body.gsub!(%r{<style.*?>.*?<\/style>}mi, '')
  body.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
  # The groups below are non-capturing "(?:"; written as "(:?" they would
  # also swallow a literal colon in front of the tag.
  body.gsub!(%r{(?:<a.*?>|<\/a>)}mi, '')
  body.gsub!(%r{(?:<font.*?>|<\/font>)}mi, '')
  body.gsub!(%r{<img.*?/?>}mi, '')
  body.gsub!(%r{(?:<b>|<\/b>|<i>|<\/i>|<u>|<\/u>|<p>|<\/p>|<\/li>)}mi, '')
  body.gsub!(%r{<br(?:\s*/)?>}mi, '') # <br>, <br/> and <br />
  body.gsub!(%r{(?:<\/?h[1-6]>)}mi, ' ')
  body.gsub!(%r{<li>}mi, ' * ')

  elements = body.split(/<.*?>/mi)
  elements = elements.map {|item| item.gsub(/\s+/, ' ').strip }
  elements = elements.reject {|item| item.empty? }
  summary = elements.max_by {|e| e.size }
  NKF.nkf("-w -Z3 --numchar-input --no-cp932", summary||"")
end
90
+
91
# Splits a page into its title and its main text.
#
# @param html [String]
# @return [Array] [get_title(html), get_body(html)]
def parse(html)
  [get_title(html), get_body(html)]
end
96
+
97
# Summarizes the page at @url as "[title] body".
#
# The page is fetched only if #text? accepts the URL; the HTML is passed
# through NKF with the options below before being parsed.
#
# @return [String]
# @raise [Nop] when #text? rejects the URL
def execute
  unless text?(@url)
    raise Nop, "Not Text"
  end

  utf8_html = NKF.nkf("-w -Z1 --no-cp932", fetch(@url))
  title, body = parse(utf8_html)
  format("[%s] %s", title, body)
end
104
+ end
105
+ end
@@ -0,0 +1,14 @@
1
+ require 'ch2'
2
+
3
+ module Engines
4
+ class Ch2 < Base
5
+ url %r{^http://[^./]+\.2ch\.net}
6
+
7
# Summarizes a 2ch thread through Ch2::Dat and strips leftover markup.
#
# @return [String]
# @raise [Nop] when the URL is not a valid thread URL
def execute
  thread = ::Ch2::Dat.new(@url)
  raise Nop unless thread.valid?

  trim_tags(thread.summarize)
end
12
+ end
13
+ end
14
+
@@ -0,0 +1,6 @@
1
module Engines
  # Engine for https:// URLs. It only declares its URL pattern; everything
  # else is inherited from Base.
  class Https < Base
    url %r{^https://}
  end
end
6
+
@@ -0,0 +1,10 @@
1
+ module Engines
2
+ class None < Base
3
+ url %r{}
4
+
5
# This engine never produces a summary: it always raises Nop.
#
# @raise [Nop]
def execute
  raise Nop
end
8
+ end
9
+ end
10
+
@@ -0,0 +1,16 @@
1
+ module Engines
2
+ class Twitter < Base
3
+ url %r{twitter\.com}
4
+
5
# Stores the URL via the parent constructor, then replaces it with the form
# produced by #normalize_url.
#
# @param url [String]
def initialize(url)
  super(url)
  @url = normalize_url(@url)
end
9
+
10
# Rewrites a Twitter URL to its mobile.twitter.com form.
#
# The first "#!/" is dropped, then the first "//twitter.com/" (with or
# without a single subdomain such as "www.") becomes "//mobile.twitter.com/".
# The dot in "twitter.com" is matched literally, so look-alike hosts such as
# "twitterXcom" are left alone.
#
# @param url [String]
# @return [String] a new string
def normalize_url(url)
  url.sub(%r{#!/}, '').sub(%r{//(?:\w+\.)?(twitter\.com/)}, "//mobile.\\1")
end
13
+ end
14
+ end
15
+
16
+