ircbot 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +71 -0
- data/README +72 -3
- data/bin/ircbot +3 -0
- data/config/samples/postgres.yml +19 -0
- data/config/{sama-zu.yml → samples/sama-zu.yml} +1 -1
- data/config/{yml.erb → samples/yml.erb} +0 -0
- data/ircbot.gemspec +13 -0
- data/lib/ircbot.rb +3 -1
- data/lib/ircbot/client.rb +6 -0
- data/lib/ircbot/client/config.rb +9 -0
- data/lib/ircbot/client/plugins.rb +14 -1
- data/lib/ircbot/core_ext/message.rb +4 -1
- data/lib/ircbot/plugin.rb +17 -0
- data/lib/ircbot/plugins.rb +68 -13
- data/lib/ircbot/utils/html_parser.rb +26 -0
- data/lib/ircbot/utils/watcher.rb +36 -0
- data/lib/ircbot/version.rb +1 -1
- data/old/plugins/summary.cpi +267 -0
- data/plugins/plugins.rb +1 -1
- data/plugins/reminder.rb +79 -175
- data/plugins/summary/ch2.rb +272 -0
- data/plugins/summary/engines.rb +30 -0
- data/plugins/summary/engines/base.rb +105 -0
- data/plugins/summary/engines/ch2.rb +14 -0
- data/plugins/summary/engines/https.rb +6 -0
- data/plugins/summary/engines/none.rb +10 -0
- data/plugins/summary/engines/twitter.rb +16 -0
- data/plugins/summary/spec/ch2_spec.rb +64 -0
- data/plugins/summary/spec/spec_helper.rb +19 -0
- data/plugins/summary/spec/summarizers_none_spec.rb +15 -0
- data/plugins/summary/spec/summarizers_spec.rb +23 -0
- data/plugins/summary/summary.rb +58 -0
- data/plugins/watchdog/db.rb +80 -0
- data/plugins/watchdog/exceptions.rb +4 -0
- data/plugins/watchdog/updater.rb +21 -0
- data/plugins/watchdog/watchdog.rb +82 -0
- data/spec/plugin_spec.rb +11 -0
- data/spec/plugins_spec.rb +35 -1
- data/spec/utils/html_parser_spec.rb +30 -0
- data/spec/utils/spec_helper.rb +1 -0
- metadata +190 -13
@@ -0,0 +1,272 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# vim:encoding=UTF-8:
|
3
|
+
|
4
|
+
# original: net-irc-0.0.9/examples/2ch.rb
|
5
|
+
|
6
|
+
$KCODE = "u" if RUBY_VERSION < "1.9" # json use this
|
7
|
+
|
8
|
+
require 'rubygems'
|
9
|
+
require 'uri'
|
10
|
+
require 'net/http'
|
11
|
+
require 'stringio'
|
12
|
+
require 'zlib'
|
13
|
+
require 'nkf'
|
14
|
+
require 'ircbot'
|
15
|
+
|
16
|
+
module Ch2
|
17
|
+
class Dat
|
18
|
+
class UnknownThread < StandardError; end
|
19
|
+
|
20
|
+
attr_accessor :uri
|
21
|
+
attr_accessor :last_modified, :size
|
22
|
+
|
23
|
+
# One post of a thread, built by Dat#[] from a single "<>"-separated dat line.
#
# Members: n (post number), name, mail, misc, body, opts (the trailing field;
# Dat#subject reads it from post 1) and id (the "ID:..." token found in misc).
Line = Struct.new(:n, :name, :mail, :misc, :body, :opts, :id) do
  # Joins the non-nil name, body, misc and opts fields with single spaces.
  def to_s
    [name, body, misc, opts].compact.join(" ")
  end

  # Orders posts by body length; a nil body counts as empty.
  def <=>(other)
    body.to_s.size <=> other.body.to_s.size
  end

  # Heuristic ASCII-art detector.
  #
  # Returns true when the body has at least three line breaks and more than
  # 60% of it consists of characters outside the "significant" set
  # (">", newline, ASCII and full-width alphanumerics, kana, kanji).
  # A nil body is treated as empty and is never art.
  def aa?
    text = body.to_s
    return false if text.count("\n") < 3

    # NOTE(review): the previous character class listed the ASCII ranges
    # twice; the duplicates are assumed to have been the full-width forms.
    significants = text.scan(/[>\n0-9a-zA-Z０-９Ａ-Ｚａ-ｚぁ-んァ-ン一-龠]/u).size.to_f
    # "." does not match newlines, so line breaks count as significant
    # characters but not towards the length (unchanged from before).
    body_length = text.scan(/./u).size
    is_aa = 1 - significants / body_length

    is_aa > 0.6
  end
end
|
43
|
+
|
44
|
+
attr_reader :board, :num, :arg
|
45
|
+
delegate :host, :port, :to => "@uri"
|
46
|
+
|
47
|
+
# Parses a thread URL of the form
# http://host/test/read.cgi/<board>/<thread number>[/<range>].
#
# The 4th and 5th path segments become @board and @num; the optional
# trailing part after the thread number becomes @arg. No network access
# happens here; @dat starts empty.
def initialize(thread_uri)
  @uri = URI(thread_uri)
  @dat = []

  segments = @uri.path.split('/')
  @board = segments[3]
  @num = segments[4]

  matched = %r{^/test/read\.cgi/(.*?)/(\d+)(/(.+))?}.match(@uri.path)
  @arg = matched[4] if matched

  @valid = (@board && @num) ? true : false
end
|
58
|
+
|
59
|
+
# True when the URL given to the constructor contained both a board and a
# thread number (the flag is computed once in initialize).
def valid?
  return @valid
end
|
62
|
+
|
63
|
+
# Number of raw dat lines (posts) currently held in memory.
def length
  @dat.size
end
|
66
|
+
|
67
|
+
# Thread title, taken from the trailing field (opts) of the first post.
#
# Forces a download when nothing has been retrieved yet. Returns "" when the
# first post is still unavailable afterwards (e.g. the server answered with
# 304 or an unexpected status) or carries no title.
def subject
  retrieve(true) if @dat.size.zero?
  first = self[1]
  (first && first.opts) || ""
end
|
71
|
+
|
72
|
+
# Returns post number n (1-based) as a Line, or nil when no such post is
# loaded.
#
# A dat line is "name<>mail<>misc<>body<>opts". The body is converted to
# plain text: <br> becomes a newline, other tags are dropped, each line is
# trimmed and the gt/lt/amp/nbsp entities are decoded. The poster id is the
# "ID:..." token of the misc field. Missing fields of a malformed line are
# tolerated (id is nil, body is "").
def [](n)
  # @dat[n - 1] would wrap around to the end of the array for n < 1.
  return nil if n < 1

  raw = @dat[n - 1]
  return nil unless raw

  name, mail, misc, body, opts = *raw.split(/<>/)
  id = misc && misc[/ID:([^\s]+)/, 1]

  entities = { 'gt' => ">", 'lt' => "<", 'amp' => "&", 'nbsp' => " " }
  body = (body || "").
    gsub(/<br>/, "\n").
    gsub(/<[^>]+>/, "").
    gsub(/^\s+|\s+$/, "").
    gsub(/&(gt|lt|amp|nbsp);/) { entities[$1] }

  Line.new(n, name, mail, misc, body, opts, id)
end
|
87
|
+
|
88
|
+
# Identifier of this thread's dat file, i.e. the thread number taken from
# the URL (a String).
def dat
  return @num
end
|
91
|
+
|
92
|
+
# Downloads the thread's dat file and appends the new posts to @dat.
#
# force - when true, drop everything fetched so far and download the whole
#         file again; otherwise send If-Modified-Since / Range so that only
#         the bytes after the last known size are transferred.
#
# Returns the newly added posts as Line objects, or [] when nothing new was
# added (304, 416, unexpected status).
# Raises UnknownThread when the server redirects (302).
def retrieve(force=false)
  # Must test the parameter: an instance variable named @force is never set,
  # so a forced reload used to append duplicates to the stale lines.
  @dat = [] if force

  res = Net::HTTP.start(@uri.host, @uri.port) do |http|
    req = Net::HTTP::Get.new('/%s/dat/%d.dat' % [@board, @num])
    req['User-Agent'] = 'Monazilla/1.00 (2ig.rb/0.0e)'
    # gzip is only requested for the very first download.
    req['Accept-Encoding'] = 'gzip' unless @size
    unless force
      req['If-Modified-Since'] = @last_modified if @last_modified
      req['Range'] = "bytes=%d-" % @size if @size
    end

    http.request(req)
  end

  case res.code.to_i
  when 200, 206
    body = res.body
    if res['Content-Encoding'] == 'gzip'
      body = StringIO.open(body, 'rb') {|io| Zlib::GzipReader.new(io).read }
    end

    @last_modified = res['Last-Modified']
    # The Range header counts bytes, so track the byte size of the raw body.
    if res.code == '206'
      @size = (@size || 0) + body.bytesize
    else
      @size = body.bytesize
    end

    body = NKF.nkf('-w', body)

    curr = @dat.size + 1
    @dat.concat(body.split(/\n/))
    last = @dat.size

    (curr..last).map {|n| self[n] }
  when 416 # range not satisfiable: posts were probably deleted, so reload everything
    p ['416']
    retrieve(true)
    []
  when 304 # Not modified
    []
  when 302 # the thread has been archived ("dat-ochi")
    p ['302', res['Location']]
    raise UnknownThread
  else
    p ['Unknown Status:', res.code]
    []
  end
end
|
145
|
+
|
146
|
+
# Returns a copy of subject with full-width alphanumerics (U+FF10-FF19,
# U+FF21-FF3A, U+FF41-FF5A) replaced by their ASCII counterparts, so that
# thread titles can be compared and their numbers extracted.
#
# The character class has to match the full-width forms: the fixed offset
# below is only valid for them, and applying it to ASCII characters yields
# negative code points, which pack("U*") rejects.
def canonicalize_subject(subject)
  subject.gsub(/[Ａ-Ｚａ-ｚ０-９]/u) {|c|
    # 65248 (0xFEE0) is the distance between a full-width form and ASCII.
    c.unpack("U*").map {|i| i - 65248 }.pack("U*")
  }
end
|
151
|
+
|
152
|
+
# Ranks every thread listed in the board's subject.txt by how likely it is
# to be the successor of this thread.
#
# Returns an array of hashes (:uri, :dat, :subject, :distance,
# :continuous_num, :appear_recent, :score) sorted by ascending :score, i.e.
# best candidate first. The score starts from the edit distance between the
# canonicalized titles and is adjusted by 10 for each hint:
#   * lower when the title carries a number one above a number in ours,
#   * lower when the thread is linked from posts 900-999 of this thread,
#   * higher when its dat number is smaller than ours.
# This thread itself gets Float::MAX as distance. Lines of subject.txt that
# are not of the form "<dat>.dat<>title (count)" are skipped.
def guess_next_thread
  res = Net::HTTP.start(@uri.host, @uri.port) do |http|
    req = Net::HTTP::Get.new('/%s/subject.txt' % @board)
    req['User-Agent'] = 'Monazilla/1.00 (2ig.rb/0.0e)'
    http.request(req)
  end

  # Links are matched from "ttp://" so that both "http://" and the
  # "h"-less form are found; the "h" is put back for the hash key.
  thread_link = %r|ttp://#{Regexp.escape(@uri.host)}/test/read\.cgi/[^/]+/\d+/|
  recent_posted_threads = {}
  (900..999).each do |i|
    line = self[i]
    next unless line
    line.body.scan(thread_link).each do |link|
      recent_posted_threads["h#{link}"] = i
    end
  end

  # Loop-invariant values of this thread.
  own_dat = self.dat
  own_subject = self.subject
  current_subject = canonicalize_subject(own_subject)
  current_thread_rev = current_subject.scan(/\d+/).map {|d| d.to_i }
  current = current_subject.scan(/./u)

  candidates = []
  NKF.nkf('-w', res.body).split(/\n/).each do |l|
    dat, rest = *l.split(/<>/)
    parsed = /(.+?) \((\d+)\)/.match(rest.to_s)
    next unless dat && parsed # malformed line

    dat = dat.sub(/\.dat$/, "")
    uri = "http://#{@uri.host}/test/read.cgi/#{@board}/#{dat}/"

    subject = parsed[1]
    canonical_subject = canonicalize_subject(subject)
    thread_rev = canonical_subject[/\d+/].to_i

    distance =
      if dat == own_dat
        Float::MAX
      elsif subject == own_subject
        0
      else
        levenshtein(canonical_subject.scan(/./u), current)
      end
    continuous_num = current_thread_rev.find {|rev| rev == thread_rev - 1 }
    appear_recent = recent_posted_threads[uri]

    score = distance
    score -= 10 if continuous_num
    score -= 10 if appear_recent
    score += 10 if dat.to_i < own_dat.to_i

    candidates << {
      :uri => uri,
      :dat => dat,
      :subject => subject,
      :distance => distance,
      :continuous_num => continuous_num,
      :appear_recent => appear_recent,
      :score => score.to_f
    }
  end

  candidates.sort_by {|o| o[:score] }
end
|
207
|
+
|
208
|
+
# Edit distance (insertions, deletions and substitutions, each costing 1)
# between two sequences. a and b only need to respond to empty?, length,
# == and []; callers in this file pass arrays of characters.
def levenshtein(a, b)
  return b.length if a.empty?
  return a.length if b.empty?
  return 0 if a == b

  # Only the previous row of the distance table is needed to build the next.
  previous = (0..b.length).to_a
  (1..a.length).each do |row|
    current = [row]
    (1..b.length).each do |col|
      substitution = previous[col - 1] + (a[row - 1] == b[col - 1] ? 0 : 1)
      deletion = previous[col] + 1
      insertion = current[col - 1] + 1
      current << [deletion, insertion, substitution].min
    end
    previous = current
  end

  previous.last
end
|
243
|
+
|
244
|
+
# Absolute URL of the raw dat file for this thread, built from the host,
# board and thread number.
def dat_url
  format("http://%s/%s/dat/%d.dat", host, board, num)
end
|
247
|
+
|
248
|
+
# Builds a one-line summary "[subject] body body ..." of the thread.
#
# The posts to include come from the range part of the URL (arg), reduced to
# its digits and dashes:
#   "N"    only post N
#   "N-M"  posts N through M
#   "N-"   post N through 1000
#   "-M"   posts 1 through M
# Without a usable range all posts are taken and the longest bodies come
# first. Runs of whitespace are collapsed into single spaces.
def summarize
  retrieve
  lead = "[%s] " % subject

  sort = false
  range =
    case (arg||"").scan(/[\d-]/).join
    when /^(\d+)$/ # exact id
      ($1.to_i .. $1.to_i)
    when /^(\d+)-(\d+)$/
      ($1.to_i .. $2.to_i)
    when /^(\d+)-$/
      ($1.to_i .. 1000)
    when /^-(\d+)$/
      # Only one group is captured here, so the upper bound is $1.
      (1 .. $1.to_i)
    else
      sort = true
      (1 .. 1000)
    end

  lines = range.map{|i| self[i]}.compact
  bodies = lines.map(&:body)
  bodies.sort!{|a,b| b.size <=> a.size} if sort
  lead + bodies.join(" ").gsub(/\s+/, ' ')
end
|
271
|
+
end
|
272
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'dsl_accessor'
|
2
|
+
require 'extlib'
|
3
|
+
|
4
|
+
module Engines
|
5
|
+
Mapping = []
|
6
|
+
|
7
|
+
class NotImplementedError < NotImplementedError; end
|
8
|
+
class Nop < NotImplementedError; end
|
9
|
+
|
10
|
+
# Instantiates the engine registered for url.
#
# Mapping holds [pattern, class] pairs; the first pair whose pattern matches
# the URL wins. Raises NotImplementedError when no pattern matches.
def self.create(url)
  entry = Mapping.find {|pattern, _klass| pattern =~ url }
  raise NotImplementedError, "[BUG] Not supported URL: %s" % url unless entry

  entry.last.new(url)
end
|
16
|
+
|
17
|
+
# Loads engines/<name>.rb from the directory of this file and puts the class
# it defines (the camelized name) in front of Mapping, keyed by the class's
# url pattern. Base is loaded but never added to Mapping.
def self.register(name)
  path = File.dirname(__FILE__) + "/engines/#{name}.rb"
  load path

  # NOTE(review): the class name is evaluated as Ruby code; this is only
  # safe because every name passed in is hard-coded in this file.
  klass = instance_eval(Extlib::Inflection.camelize(name))
  return if klass == Base

  Mapping.unshift [klass.url, klass]
end
|
23
|
+
|
24
|
+
# Order matters: "base" has to be loaded first because the other engines
# subclass it, and later registrations take precedence in Mapping.
%w[base none https ch2 twitter].each {|name| register(name) }
|
29
|
+
end
|
30
|
+
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
module Engines
|
5
|
+
class Base
|
6
|
+
dsl_accessor :url
|
7
|
+
|
8
|
+
MaxContentLength = 512 * 1024
|
9
|
+
|
10
|
+
# Remembers the URL that execute will fetch and summarize.
def initialize(url)
  @url = url
end
|
13
|
+
|
14
|
+
# Runs `curl --head --location` against url and returns whatever curl wrote
# to standard output, i.e. the raw response header text, for example:
#
#   HTTP/1.1 200 OK
#   Content-Type: text/html; charset=utf-8
#   Content-Length: 245091
#
# The URL is handed to curl as a separate argument, so no shell is involved.
def head(url)
  flags = ["--head", "--location", "--user-agent", "Mozilla"]
  command = (["curl"] + flags + [url]).flatten
  Open3.popen3(*command) {|_stdin, stdout, _stderr| stdout.read }
end
|
24
|
+
|
25
|
+
# True when the resource behind url is served with a text/* content type.
#
# Header names are matched case-insensitively because curl prints them in
# lower case for HTTP/2 responses. With --location the output contains one
# header block per redirect hop, so only the last Content-Type (the one of
# the final response) is considered.
def text?(url)
  content_types = head(url).to_s.scan(/^Content-Type:(.*)$/i).flatten
  final = content_types.last
  (final && final =~ %r{text/}) ? true : false
end
|
28
|
+
|
29
|
+
# Downloads url with curl (following redirects, accepting compressed
# transfers, passing MaxContentLength as curl's --max-filesize) and returns
# what curl wrote to standard output.
def fetch(url)
  flags = ["--location", "--compressed"]
  flags += ["--user-agent", "Mozilla"]
  flags += ["--max-filesize", format("%d", MaxContentLength)]

  command = (["curl"] + flags + [url]).flatten
  Open3.popen3(*command) {|_stdin, stdout, _stderr| stdout.read }
end
|
37
|
+
|
38
|
+
# Reduces an HTML document to plain text.
#
# Removes the head, script, style and noscript elements with their content,
# then comments and declarations, then every remaining tag; collapses
# whitespace, trims the result and decodes HTML entities.
# The argument itself is left untouched; a new string is returned.
def trim_tags(html)
  text = html.dup
  text.gsub!(%r{<head[^>]*>.*?</head>}mi, '')
  text.gsub!(%r{<script.*?>.*?</script>}mi, '')
  text.gsub!(%r{<style.*?>.*?</style>}mi, '')
  text.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
  # Comments and declarations have to go before the generic tag pattern:
  # otherwise a comment containing ">" is cut in the middle and its tail
  # ends up in the text.
  text.gsub!(%r{<\!--.*?-->}mi, '')
  text.gsub!(%r{<\!\w.*?>}mi, '')
  # /m so that tags spanning several lines are removed as well.
  text.gsub!(%r{</?.*?>}m, '')
  text.gsub!(%r{\s+}m, ' ')
  text.strip!
  CGI.unescapeHTML(text)
end
|
51
|
+
|
52
|
+
# Extracts the text of the first <title>...</title> element: nested tags are
# removed, whitespace runs collapse to one space and the result is passed
# through NKF ("-w -Z3 --numchar-input --no-cp932").
# Returns "" when the document has no title element.
def get_title(html)
  found = %r{<title>(.*?)</title>}mi.match(html)
  return "" unless found

  title = found[1].strip.gsub(%r{<.*?>}m, '').gsub(%r{\s+}m, ' ')
  NKF.nkf("-w -Z3 --numchar-input --no-cp932", title)
end
|
62
|
+
|
63
|
+
# Picks a one-line summary out of the <body> of an HTML document.
#
# Comments, declarations and head/script/style/noscript elements are
# dropped; inline markup (a, font, img, b, i, u, p, br, closing li) is
# removed without breaking the surrounding text; headings become spaces and
# list items " * ". The remaining markup splits the text into fragments and
# the longest fragment, passed through NKF, is returned ("" if none).
#
# Raises Nop when the document has no body element.
def get_body(html)
  raise Nop, "No Body Found" unless /<body.*?>(.*?)<\/body>/im =~ html

  body = $1.dup
  body.gsub!(%r{<!--.*?-->}im, '')
  body.gsub!(%r{<\!\w.*?>}mi, '')
  body.gsub!(%r{<head[^>]*>.*?<\/head>}mi, '')
  body.gsub!(%r{<script.*?>.*?<\/script>}mi, '')
  body.gsub!(%r{<style.*?>.*?<\/style>}mi, '')
  body.gsub!(%r{<noscript.*?>.*?</noscript>}mi, '')
  # The patterns below used to begin with "(:?", which is not a
  # non-capturing group but an optional literal colon, so a ":" in front of
  # a tag was removed together with the tag.
  # \b keeps <a ...> / <font ...> from matching tags such as <abbr>.
  body.gsub!(%r{<a\b.*?>|<\/a>}mi, '')
  body.gsub!(%r{<font\b.*?>|<\/font>}mi, '')
  body.gsub!(%r{<img.*?/?>}mi, '')
  body.gsub!(%r{<b>|<\/b>|<i>|<\/i>|<u>|<\/u>|<p>|<\/p>|<\/li>}mi, '')
  # Covers <br>, <br/> and <br />.
  body.gsub!(%r{<br\s*/?>}mi, '')
  body.gsub!(%r{<\/?h[1-6]>}mi, ' ')
  body.gsub!(%r{<li>}mi, ' * ')

  elements = body.split(/<.*?>/mi).map {|item| item.gsub(/\s+/, ' ').strip }
  elements.reject! {|item| item.empty? }
  summary = elements.max_by {|e| e.size }
  NKF.nkf("-w -Z3 --numchar-input --no-cp932", summary||"")
end
|
90
|
+
|
91
|
+
# Returns the pair [title, body summary] for an HTML document, as produced
# by get_title and get_body (in that order).
def parse(html)
  [get_title(html), get_body(html)]
end
|
96
|
+
|
97
|
+
# Summarizes the page behind @url as "[title] body".
#
# Raises Nop ("Not Text") when the resource is not served as text. The
# downloaded page is converted with NKF ("-w -Z1 --no-cp932") before it is
# parsed.
def execute
  raise Nop, "Not Text" unless text?(@url)

  converted = NKF.nkf("-w -Z1 --no-cp932", fetch(@url))
  title, body = parse(converted)
  "[#{title}] #{body}"
end
|
104
|
+
end
|
105
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Engines
|
2
|
+
class Twitter < Base
|
3
|
+
url %r{twitter\.com}
|
4
|
+
|
5
|
+
# Stores the URL through the parent constructor, then replaces it with the
# form produced by normalize_url.
def initialize(url)
  super(url)
  @url = normalize_url(@url)
end
|
9
|
+
|
10
|
+
# Rewrites a twitter.com URL for fetching: the first "#!/" fragment prefix
# is removed and the host (with or without a one-label subdomain such as
# "www.") is replaced by mobile.twitter.com.
#
# The dot in the host name is escaped so that only "twitter.com" itself is
# rewritten, not look-alikes such as "twitterxcom".
def normalize_url(url)
  url.sub(%r{#!/}, '').sub(%r{//(?:\w+\.)?(twitter\.com/)}, "//mobile.\\1")
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
|