drip 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +0 -2
- data/drip.gemspec +1 -1
- data/drip.txt +1068 -190
- data/lib/drip.rb +146 -9
- data/lib/drip/version.rb +1 -1
- data/lib/my_drip.rb +4 -0
- data/sample/copocopo.rb +1 -1
- data/sample/demo4book/crawl.rb +56 -0
- data/sample/demo4book/demo_ui.rb +71 -0
- data/sample/demo4book/demo_ui_webrick.rb +69 -0
- data/sample/demo4book/index.rb +96 -0
- data/sample/demo4book/query2.rb +47 -0
- data/sample/demo4book/query2_test.rb +18 -0
- data/test/basic.rb +131 -0
- metadata +21 -5
data/lib/drip.rb
CHANGED
@@ -7,6 +7,127 @@ class Drip
|
|
7
7
|
include DRbUndumped
|
8
8
|
def inspect; to_s; end
|
9
9
|
|
10
|
+
# Read-only, array-backed snapshot offering the Drip query methods
# (fetch/read/read_tag/head/older/newer) over sorted data.
#
# @pool is an array of [key, value] pairs sorted by key.
# @tag  is a sorted array of [[tag, key], pool_index] entries, where
# pool_index is the position of the matching pair inside @pool.
class ImmutableDrip
  # Accumulates entries in any order and builds a consistent ImmutableDrip.
  class Generator
    def initialize
      @pool = []
      @tag = []
      # Returns the first object seen for each equal tag, so equal tags
      # share a single instance.
      @shared = Hash.new {|h, k| h[k] = k}
    end

    # Registers +value+ under +key+ and indexes it under every distinct tag.
    def add(key, value, *tag)
      @pool << [key, value]
      idx = @pool.size - 1
      tag.uniq.each do |t|
        @tag << [[@shared[t], key], idx]
      end
    end

    # Builds the snapshot.
    #
    # The pool is ordered by key (insertion order breaks ties, so values are
    # never compared), and every tag entry is rewritten to point at the
    # entry's position in the *sorted* pool. Without that remapping, entries
    # added out of key order would resolve to the wrong value.
    def generate
      order = (0...@pool.size).sort_by {|i| [@pool[i][0], i]}
      position = Array.new(@pool.size)
      order.each_with_index {|old_idx, new_idx| position[old_idx] = new_idx}
      pool = order.collect {|i| @pool[i]}
      # Fresh arrays: the generator's own bookkeeping stays untouched.
      tag = @tag.collect {|(t, key), idx| [[t, key], position[idx]]}.sort
      ImmutableDrip.new(pool, tag)
    end
  end

  INF = 1.0/0

  # pool: [key, value] pairs sorted by key.
  # tag:  sorted [[tag, key], pool_index] entries.
  def initialize(pool=[], tag=[])
    @pool = pool
    @tag = tag
  end

  # Returns value.to_a for the entry stored exactly at +key+, or nil.
  def fetch(key)
    entry = @pool[lower_boundary(@pool, key)]
    entry && entry[0] == key ? entry[1].to_a : nil
  end

  # Returns up to +n+ entries with a key greater than +key+, oldest first,
  # each as [key, *value].
  def read(key, n=1)
    idx = lower_boundary(@pool, key + 1)
    @pool[idx, n].collect {|kv|
      [kv[0], *kv[1].to_a]
    }
  end

  # Like #read, but only considers entries carrying +tag+.
  def read_tag(key, tag, n=1)
    idx = lower_boundary(@tag, [tag, key + 1])
    @tag[idx, n].find_all {|kv| kv[0][0] == tag}.collect {|kv|
      [kv[0][1], *@pool[kv[1]][1].to_a]
    }
  end

  # Returns the newest +n+ entries carrying +tag+, oldest first.
  def head_tag(n, tag)
    lower = lower_boundary(@tag, [tag, 0])
    upper = upper_boundary(@tag, [tag, INF])
    lower = [lower, upper - n].max
    @tag[lower ... upper].collect {|kv|
      [kv[0][1], *@pool[kv[1]][1].to_a]
    }
  end

  # Returns the newest +n+ entries (restricted to +tag+ when given),
  # oldest first.
  def head(n=1, tag=nil)
    return head_tag(n, tag) if tag
    n = @pool.size < n ? @pool.size : n
    @pool[-n, n].collect {|kv|
      [kv[0], *kv[1].to_a]
    }
  end

  # Returns the newest entry carrying +tag+ whose key is below +key+, or nil.
  def older_tag(key, tag)
    idx = upper_boundary(@tag, [tag, key-1])
    # idx == 0 means nothing sorts at or below [tag, key-1]; indexing with
    # idx - 1 would wrap around to the last element.
    return nil if idx == 0
    k, v = @tag[idx - 1]
    k[0] == tag ? [k[1], *@pool[v][1].to_a] : nil
  end

  # Returns the newest entry whose key is below +key+ (restricted to +tag+
  # when given), or nil. A nil +key+ means "older than anything newer than
  # the newest entry", i.e. it yields the newest entry.
  def older(key, tag=nil)
    unless key
      return nil if @pool.empty?
      key = @pool[-1][0] + 1
    end
    return older_tag(key, tag) if tag
    idx = upper_boundary(@pool, key - 1)
    return nil if idx == 0
    k, v = @pool[idx - 1]
    k < key ? [k, *v.to_a] : nil
  end

  # Returns the first entry with a key above +key+ (restricted to +tag+ when
  # given), or nil.
  def newer(key, tag=nil)
    return read(key, 1)[0] unless tag
    read_tag(key, tag, 1)[0]
  end

  # Binary search: index of the first element whose sort key (element[0])
  # is >= +key+; ary.size when there is none.
  def lower_boundary(ary, key)
    lower = -1
    upper = ary.size
    while lower + 1 != upper
      mid = (lower + upper).div(2)
      if (ary[mid][0] <=> key) < 0
        lower = mid
      else
        upper = mid
      end
    end
    return upper
  end

  # Binary search: index just past the last element whose sort key
  # (element[0]) is <= +key+; 0 when there is none.
  def upper_boundary(ary, key)
    lower = -1
    upper = ary.size
    while lower + 1 != upper
      mid = (lower + upper).div(2)
      if (ary[mid][0] <=> key) <= 0
        lower = mid
      else
        upper = mid
      end
    end
    return lower + 1
  end
end
|
130
|
+
|
10
131
|
def initialize(dir, option={})
|
11
132
|
@pool = RBTree.new
|
12
133
|
@tag = RBTree.new
|
@@ -15,20 +136,20 @@ class Drip
|
|
15
136
|
prepare_store(dir, option)
|
16
137
|
end
|
17
138
|
|
18
|
-
def write(*
|
19
|
-
write_after(Time.now, *
|
139
|
+
def write(obj, *tags)
|
140
|
+
write_after(Time.now, obj, *tags)
|
20
141
|
end
|
21
142
|
|
22
143
|
def write_after(at, *value)
|
23
144
|
make_key(at) do |key|
|
24
|
-
do_write(key, value)
|
145
|
+
value = do_write(key, value)
|
25
146
|
@pool[key] = @store.write(key, value)
|
26
147
|
end
|
27
148
|
end
|
28
149
|
|
29
150
|
def write_at(at, *value)
|
30
151
|
make_key_at(at) do |key|
|
31
|
-
do_write(key, value)
|
152
|
+
value = do_write(key, value)
|
32
153
|
@pool[key] = @store.write(key, value)
|
33
154
|
end
|
34
155
|
end
|
@@ -197,15 +318,30 @@ class Drip
|
|
197
318
|
end
|
198
319
|
|
199
320
|
Dir.mkdir(dir) rescue nil
|
321
|
+
dump = Dir.glob(File.join(dir, '*.dump')).max_by do |fn|
|
322
|
+
File.basename(fn).to_i(36)
|
323
|
+
end
|
324
|
+
if dump
|
325
|
+
@pool, @tag, last = File.open(dump, 'rb') {|fp| Marshal.load(fp)}
|
326
|
+
@event.take([:last, nil])
|
327
|
+
@event.write([:last, last])
|
328
|
+
File.unlink(dump)
|
329
|
+
end
|
330
|
+
loaded = dump ? File.basename(dump).to_i(36) : 0
|
200
331
|
Dir.glob(File.join(dir, '*.log')) do |fn|
|
332
|
+
next if loaded > File.basename(fn).to_i(36)
|
201
333
|
begin
|
202
334
|
store = SimpleStore.reader(fn)
|
203
335
|
restore(store)
|
204
336
|
rescue
|
205
337
|
end
|
206
338
|
end
|
207
|
-
name = time_to_key(Time.now).to_s(36)
|
208
|
-
|
339
|
+
name = time_to_key(Time.now).to_s(36)
|
340
|
+
_, last = @event.read([:last, nil])
|
341
|
+
File.open(File.join(dir, name + '.dump'), 'wb') {|fp|
|
342
|
+
Marshal.dump([@pool, @tag, last], fp)
|
343
|
+
}
|
344
|
+
@store = SimpleStore.new(File.join(dir, name + '.log'))
|
209
345
|
end
|
210
346
|
|
211
347
|
def shared_text(str)
|
@@ -218,13 +354,14 @@ class Drip
|
|
218
354
|
end
|
219
355
|
|
220
356
|
def do_write(key, value)
|
221
|
-
|
222
|
-
|
357
|
+
obj, *tags = value
|
358
|
+
tags.uniq!
|
359
|
+
tags.each do |k|
|
223
360
|
next unless String === k
|
224
361
|
tag = shared_text(k)
|
225
362
|
@tag[[tag, key]] = key
|
226
363
|
end
|
227
|
-
@pool[key] =
|
364
|
+
@pool[key] = [obj] + tags
|
228
365
|
end
|
229
366
|
|
230
367
|
def restore(store)
|
data/lib/drip/version.rb
CHANGED
data/lib/my_drip.rb
CHANGED
data/sample/copocopo.rb
CHANGED
@@ -54,7 +54,7 @@ class CopoCopo
|
|
54
54
|
@last, event = @drip.read_tag(@last, 'DripDemo Event', 1)[0]
|
55
55
|
next if retweet?(event)
|
56
56
|
next if mention?(event)
|
57
|
-
next unless Time.now < created_at(event) +
|
57
|
+
next unless Time.now < created_at(event) + 6000
|
58
58
|
name = dig(event, 'user', 'screen_name')
|
59
59
|
next unless @friends.include?(name)
|
60
60
|
ary = extract(event['text'] || '')
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'my_drip'
|
3
|
+
require 'monitor'
|
4
|
+
|
5
|
+
# Scans a fixed directory tree for *.rb files and writes every new or
# modified file into Drip. All public operations run under the monitor.
class Crawler
  include MonitorMixin

  def initialize
    super()
    @root = File.expand_path('~/develop/git-repo/')
    @drip = MyDrip
    # Entries at or before the newest 'rbcrawl-begin' key are ignored.
    begin_key, = @drip.head(1, 'rbcrawl-begin')[0]
    @fence = begin_key || 0
  end

  # Returns the mtime recorded for +fname+ in its newest Drip entry past the
  # fence, or Time.at(1) when there is no such entry.
  def last_mtime(fname)
    key, doc, = @drip.head(1, 'rbcrawl-fname=' + fname)[0]
    return Time.at(1) unless doc && key > @fence
    doc[1]
  end

  # Writes every file whose mtime is newer than the recorded one, then writes
  # the list of written names as a footprint and returns that list.
  def do_crawl
    synchronize do
      Dir.chdir(@root)
      updated = Dir.glob('**/*.rb').select do |fname|
        mtime = File.mtime(fname)
        if last_mtime(fname) >= mtime
          false
        else
          @drip.write([fname, mtime, File.read(fname)],
                      'rbcrawl', 'rbcrawl-fname=' + fname)
          true
        end
      end
      @drip.write(updated, 'rbcrawl-footprint')
      updated
    end
  end

  # Exits the process while holding the monitor, i.e. after a running crawl
  # has released it.
  def quit
    synchronize { exit(0) }
  end
end
|
43
|
+
|
44
|
+
# Script entry point: crawl once a minute in the background until a line is
# read from standard input, then quit.
if __FILE__ == $0
  crawler = Crawler.new
  Thread.new {
    loop do
      pp crawler.do_crawl
      sleep 60
    end
  }

  gets
  crawler.quit
end
|
56
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'index'
|
2
|
+
require 'crawl'
|
3
|
+
require 'webrick/cgi'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
# HTML view for the search demo: a query form plus, when a word is given,
# the list of matching file names. All dynamic text is HTML-escaped via h.
class DemoListView
  include ERB::Util
  extend ERB::DefMethod

  template = ERB.new(<<EOS)
<html><head><title>Demo UI</title></head><body>
<form method="post"><input type="text" name="w" value="<%=h word %>" /></form>
<% if word %>
<p>search: <%=h word %></p>
<ul>
<% list.each do |fname| %>
<li><%=h fname%></li>
<% end %>
</ul>
<% end %>
</body></html>
EOS
  # Defines to_html(word, list), which renders the template above.
  def_erb_method('to_html(word, list)', template)
end
|
23
|
+
|
24
|
+
# CGI front end for the search demo: looks the query word up in the
# indexer's dictionary and renders the result with DemoListView.
class DemoUICGI < WEBrick::CGI
  def initialize(crawler, indexer, *args)
    super(*args)
    @crawler = crawler
    @indexer = indexer
    @list_view = DemoListView.new
  end

  # Returns the first value of query parameter +key+ tagged as UTF-8,
  # or nil when the parameter is absent.
  def req_query(req, key)
    first, = req.query[key]
    return nil unless first
    first.force_encoding('utf-8')
    first
  end

  # Handles GET and POST alike. A request to /quit additionally asks the
  # crawler to quit from a separate thread.
  def do_GET(req, res)
    Thread.new { @crawler.quit } if req.path_info == '/quit'
    word = req_query(req, 'w') || ''
    list = if word.empty?
             []
           else
             @indexer.dict.query(word)
           end
    res['content-type'] = 'text/html; charset=utf-8'
    res.body = @list_view.to_html(word, list)
  end

  alias do_POST do_GET
end
|
53
|
+
|
54
|
+
# Script entry point: run the crawler and the indexer in background threads
# and expose the CGI object over dRuby.
if __FILE__ == $0
  crawler = Crawler.new
  Thread.new {
    loop do
      pp crawler.do_crawl
      sleep 60
    end
  }

  indexer = Indexer.new
  Thread.new { indexer.update_dict }

  cgi = DemoUICGI.new(crawler, indexer)
  DRb.start_service('druby://localhost:50830', cgi)
  DRb.thread.join
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'index'
|
2
|
+
require 'crawl'
|
3
|
+
require 'webrick'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
# HTML view for the search demo: a query form plus, when a word is given,
# the list of matching file names. All dynamic text is HTML-escaped via h.
class DemoListView
  include ERB::Util
  extend ERB::DefMethod

  template = ERB.new(<<EOS)
<html><head><title>Demo UI</title></head><body>
<form method="post"><input type="text" name="w" value="<%=h word %>" /></form>
<% if word %>
<p>search: <%=h word %></p>
<ul>
<% list.each do |fname| %>
<li><%=h fname%></li>
<% end %>
</ul>
<% end %>
</body></html>
EOS
  # Defines to_html(word, list), which renders the template above.
  def_erb_method('to_html(word, list)', template)
end
|
23
|
+
|
24
|
+
# WEBrick servlet for the search demo: looks the query word up in the
# indexer's dictionary and renders the result with the given list view.
class DemoUIServlet < WEBrick::HTTPServlet::AbstractServlet
  def initialize(server, crawler, indexer, list_view)
    super(server)
    @crawler = crawler
    @indexer = indexer
    @list_view = list_view
  end

  # Returns the first value of query parameter +key+ tagged as UTF-8,
  # or nil when the parameter is absent.
  def req_query(req, key)
    first, = req.query[key]
    return nil unless first
    first.force_encoding('utf-8')
    first
  end

  # Handles GET and POST alike; an empty query yields an empty result list
  # without touching the dictionary.
  def do_GET(req, res)
    word = req_query(req, 'w') || ''
    list = if word.empty?
             []
           else
             @indexer.dict.query(word)
           end
    res['content-type'] = 'text/html; charset=utf-8'
    res.body = @list_view.to_html(word, list)
  end

  alias do_POST do_GET
end
|
48
|
+
|
49
|
+
# Script entry point: run the crawler and the indexer in background threads
# and serve the UI on 127.0.0.1:10080 until interrupted, then quit.
if __FILE__ == $0
  crawler = Crawler.new
  Thread.new {
    loop do
      pp crawler.do_crawl
      sleep 60
    end
  }

  indexer = Indexer.new
  Thread.new { indexer.update_dict }

  options = {:Port => 10080,
             :BindAddress => '127.0.0.1'}
  server = WEBrick::HTTPServer.new(options)
  server.mount('/', DemoUIServlet, crawler, indexer, DemoListView.new)
  trap('INT') { server.shutdown }
  server.start
  crawler.quit
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'nkf'
|
2
|
+
require 'rbtree'
|
3
|
+
require 'my_drip'
|
4
|
+
require 'monitor'
|
5
|
+
require 'pp'
|
6
|
+
|
7
|
+
|
8
|
+
# Follows the 'rbcrawl' entries in Drip and keeps a Dict in sync with them.
class Indexer
  attr_reader :dict

  def initialize(cursor=0)
    @drip = MyDrip
    @dict = Dict.new
    # Entries at or before the newest 'rbcrawl-begin' key are ignored.
    begin_key, = @drip.head(1, 'rbcrawl-begin')[0]
    @fence = begin_key || 0
    @cursor = [cursor, @fence].max
  end

  # Applies every crawled document to the dictionary, first removing the
  # words of its previous version when one exists. Does not return on its
  # own because each_document loops forever.
  def update_dict
    each_document do |cur, prev|
      @dict.delete(*prev) if prev
      @dict.push(*cur)
    end
  end

  # Reads 'rbcrawl' entries after the cursor in batches of up to 10 and
  # yields (document, previous_version_or_nil) for each, advancing the
  # cursor after the block returns.
  def each_document
    while true
      batch = @drip.read_tag(@cursor, 'rbcrawl', 10, 1)
      batch.each do |key, doc|
        yield(doc, prev_version(key, doc[0]))
        @cursor = key
      end
    end
  end

  # Returns the document stored for +fname+ just before +cursor+, provided
  # it lies past the fence; nil otherwise.
  def prev_version(cursor, fname)
    key, doc = @drip.older(cursor, 'rbcrawl-fname=' + fname)
    return nil unless doc && key > @fence
    doc
  end
end
|
41
|
+
|
42
|
+
# Monitor-protected word index. Keys of the tree are
# [word, mtime_as_integer, file_name] triples; values are always true.
class Dict
  include MonitorMixin

  def initialize
    super()
    @tree = RBTree.new
  end

  # Returns the file names of every key whose word equals +word+.
  def query(word)
    synchronize do
      from = [word, 0, '']
      to = [word + "\0", 0, '']
      @tree.bound(from, to).collect {|key, _| key[2]}
    end
  end

  # Removes the keys derived from the given document.
  def delete(fname, mtime, src)
    synchronize do
      each_tree_key(fname, mtime, src) {|key| @tree.delete(key)}
    end
  end

  # Adds the keys derived from the given document.
  def push(fname, mtime, src)
    synchronize do
      each_tree_key(fname, mtime, src) {|key| @tree[key] = true}
    end
  end

  # Returns the word object already stored in the tree when an equal word
  # exists, so equal words share one instance; otherwise returns +word+.
  def intern(word)
    found, = @tree.lower_bound([word, 0, ''])
    if found && found[0] == word
      found[0]
    else
      word
    end
  end

  # Yields one [word, mtime.to_i, fname] key per distinct \w+ token of the
  # source text after NKF conversion with '-w'.
  def each_tree_key(fname, mtime, src)
    words = NKF.nkf('-w', src).scan(/\w+/m).uniq
    words.each do |word|
      yield([intern(word), mtime.to_i, fname])
    end
  end
end
|
83
|
+
|
84
|
+
# Script entry point: index in the background and answer one query per
# line of standard input, printing the hits and their count.
if __FILE__ == $0
  indexer ||= Indexer.new(0)
  Thread.new { indexer.update_dict }

  while line = gets
    hits = indexer.dict.query(line.chomp)
    pp hits
    pp hits.size
  end
end
|
96
|
+
|