drip 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +0 -2
- data/drip.gemspec +1 -1
- data/drip.txt +1068 -190
- data/lib/drip.rb +146 -9
- data/lib/drip/version.rb +1 -1
- data/lib/my_drip.rb +4 -0
- data/sample/copocopo.rb +1 -1
- data/sample/demo4book/crawl.rb +56 -0
- data/sample/demo4book/demo_ui.rb +71 -0
- data/sample/demo4book/demo_ui_webrick.rb +69 -0
- data/sample/demo4book/index.rb +96 -0
- data/sample/demo4book/query2.rb +47 -0
- data/sample/demo4book/query2_test.rb +18 -0
- data/test/basic.rb +131 -0
- metadata +21 -5
data/lib/drip.rb
CHANGED
@@ -7,6 +7,127 @@ class Drip
|
|
7
7
|
include DRbUndumped
|
8
8
|
def inspect; to_s; end
|
9
9
|
|
10
|
+
# Read-only view over a sorted snapshot of drip entries.
#
# @pool is an Array of [key, value] pairs sorted by key. @tag is an Array of
# [[tag, key], pool_index] rows sorted by [tag, key], where pool_index is the
# position of the entry inside @pool. All lookups are binary searches over
# these two arrays.
class ImmutableDrip
  # Accumulates entries (in any key order) and builds an ImmutableDrip.
  class Generator
    def initialize
      @pool = []
      @tag = []
      # Maps equal tag objects onto the first instance seen, so every index
      # row for one tag refers to a single shared object.
      @shared = Hash.new {|h, k| h[k] = k; k}
    end

    # Registers value under key and indexes it by each distinct tag.
    def add(key, value, *tag)
      @pool << [key, value]
      idx = @pool.size - 1
      tag.uniq.each do |t|
        @tag << [[@shared[t], key], idx]
      end
    end

    # Returns an ImmutableDrip holding everything added so far.
    def generate
      # Sorting the pool moves entries around, so the insertion-time indexes
      # recorded by #add are translated to positions in the sorted pool.
      # (Previously the stale indexes were kept, which made tagged reads
      # return the wrong entry whenever keys were added out of order.)
      order = (0...@pool.size).sort_by {|i| [@pool[i][0], i]}
      position = []
      order.each_with_index {|old, new| position[old] = new}
      pool = order.collect {|i| @pool[i]}
      tag = @tag.collect {|k, idx| [k, position[idx]]}.sort
      # Make adjacent rows with equal tags share one tag object.
      tag.inject(nil) do |last, kv|
        k = kv[0]
        k[0] = last if k[0] == last
        k[0]
      end
      ImmutableDrip.new(pool, tag)
    end
  end

  # Upper sentinel used when searching for the end of a tag's key range.
  INF = 1.0/0

  # pool and tag must already be sorted as described on the class.
  def initialize(pool=[], tag=[])
    @pool = pool
    @tag = tag
  end

  # Returns the value stored under exactly key (as an Array), or nil.
  def fetch(key)
    k, v = @pool[lower_boundary(@pool, key)]
    k == key ? v.to_a : nil
  end

  # Returns up to n entries, as [key, *value], whose key is greater than key.
  def read(key, n=1)
    idx = lower_boundary(@pool, key + 1)
    @pool[idx, n].collect {|kv| pool_entry(kv)}
  end

  # Looks at the n index rows following [tag, key] and returns those that
  # belong to tag, as [key, *value].
  def read_tag(key, tag, n=1)
    idx = lower_boundary(@tag, [tag, key + 1])
    @tag[idx, n].find_all {|kv| kv[0][0] == tag}.collect {|kv| tag_entry(kv)}
  end

  # Returns the newest n entries carrying tag, oldest first.
  def head_tag(n, tag)
    lower = lower_boundary(@tag, [tag, 0])
    upper = upper_boundary(@tag, [tag, INF])
    lower = [lower, upper - n].max
    @tag[lower ... upper].collect {|kv| tag_entry(kv)}
  end

  # Returns the newest n entries (restricted to tag when given), oldest first.
  def head(n=1, tag=nil)
    return head_tag(n, tag) if tag
    n = @pool.size < n ? @pool.size : n
    @pool[-n, n].collect {|kv| pool_entry(kv)}
  end

  # Returns the newest entry carrying tag whose key is below key, or nil.
  def older_tag(key, tag)
    idx = upper_boundary(@tag, [tag, key-1])
    # idx - 1 would be -1 here, and Array#[] treats that as "last element":
    # without this guard a newer entry with the same tag could be returned.
    return nil if idx == 0
    k, v = @tag[idx - 1]
    k[0] == tag ? [k[1], *@pool[v][1].to_a] : nil
  end

  # Returns the newest entry whose key is below key (restricted to tag when
  # given), or nil. A nil key means "older than anything newer than the last
  # entry", i.e. the last entry itself.
  def older(key, tag=nil)
    unless key
      # Nothing stored, so there is no last key to start from.
      return nil if @pool.empty?
      key = @pool[-1][0] + 1
    end
    return older_tag(key, tag) if tag
    idx = upper_boundary(@pool, key - 1)
    return nil if idx == 0
    k, v = @pool[idx - 1]
    k < key ? [k, *v.to_a] : nil
  end

  # Returns the first entry whose key is above key (restricted to tag when
  # given), or nil.
  def newer(key, tag=nil)
    return read(key, 1)[0] unless tag
    read_tag(key, tag, 1)[0]
  end

  # Index of the first row of ary whose leading element is >= key
  # (ary.size when there is none).
  def lower_boundary(ary, key)
    lower = -1
    upper = ary.size
    while lower + 1 != upper
      mid = (lower + upper).div(2)
      if (ary[mid][0] <=> key) < 0
        lower = mid
      else
        upper = mid
      end
    end
    return upper
  end

  # Index of the first row of ary whose leading element is > key
  # (ary.size when there is none).
  def upper_boundary(ary, key)
    lower = -1
    upper = ary.size
    while lower + 1 != upper
      mid = (lower + upper).div(2)
      if (ary[mid][0] <=> key) <= 0
        lower = mid
      else
        upper = mid
      end
    end
    return lower + 1
  end

  private

  # Formats a @pool row as [key, *value].
  def pool_entry(kv)
    [kv[0], *kv[1].to_a]
  end

  # Formats a @tag row as [key, *value], resolving the value through @pool.
  def tag_entry(kv)
    [kv[0][1], *@pool[kv[1]][1].to_a]
  end
end
|
130
|
+
|
10
131
|
def initialize(dir, option={})
|
11
132
|
@pool = RBTree.new
|
12
133
|
@tag = RBTree.new
|
@@ -15,20 +136,20 @@ class Drip
|
|
15
136
|
prepare_store(dir, option)
|
16
137
|
end
|
17
138
|
|
18
|
-
def write(*
|
19
|
-
write_after(Time.now, *
|
139
|
+
# Stores obj together with its tags, stamped with the current time.
# Delegates to write_after(Time.now, obj, *tags) and returns its result.
def write(obj, *tags)
|
140
|
+
write_after(Time.now, obj, *tags)
|
20
141
|
end
|
21
142
|
|
22
143
|
# Stores value under a key obtained from make_key(at).
# Inside the make_key block the new revision first passes value through
# do_write (which indexes the tags and returns the array to persist), then
# hands that array to @store.write and records the store's return value in
# @pool[key]. The old revision ignored do_write's return value.
def write_after(at, *value)
|
23
144
|
make_key(at) do |key|
|
24
|
-
do_write(key, value)
|
145
|
+
value = do_write(key, value)
|
25
146
|
@pool[key] = @store.write(key, value)
|
26
147
|
end
|
27
148
|
end
|
28
149
|
|
29
150
|
# Same flow as write_after, but the key comes from make_key_at(at).
# value is normalised by do_write, persisted with @store.write, and the
# store's return value is recorded in @pool[key].
def write_at(at, *value)
|
30
151
|
make_key_at(at) do |key|
|
31
|
-
do_write(key, value)
|
152
|
+
value = do_write(key, value)
|
32
153
|
@pool[key] = @store.write(key, value)
|
33
154
|
end
|
34
155
|
end
|
@@ -197,15 +318,30 @@ class Drip
|
|
197
318
|
end
|
198
319
|
|
199
320
|
Dir.mkdir(dir) rescue nil
|
321
|
+
dump = Dir.glob(File.join(dir, '*.dump')).max_by do |fn|
|
322
|
+
File.basename(fn).to_i(36)
|
323
|
+
end
|
324
|
+
if dump
|
325
|
+
@pool, @tag, last = File.open(dump, 'rb') {|fp| Marshal.load(fp)}
|
326
|
+
@event.take([:last, nil])
|
327
|
+
@event.write([:last, last])
|
328
|
+
File.unlink(dump)
|
329
|
+
end
|
330
|
+
loaded = dump ? File.basename(dump).to_i(36) : 0
|
200
331
|
Dir.glob(File.join(dir, '*.log')) do |fn|
|
332
|
+
next if loaded > File.basename(fn).to_i(36)
|
201
333
|
begin
|
202
334
|
store = SimpleStore.reader(fn)
|
203
335
|
restore(store)
|
204
336
|
rescue
|
205
337
|
end
|
206
338
|
end
|
207
|
-
name = time_to_key(Time.now).to_s(36)
|
208
|
-
|
339
|
+
name = time_to_key(Time.now).to_s(36)
|
340
|
+
_, last = @event.read([:last, nil])
|
341
|
+
File.open(File.join(dir, name + '.dump'), 'wb') {|fp|
|
342
|
+
Marshal.dump([@pool, @tag, last], fp)
|
343
|
+
}
|
344
|
+
@store = SimpleStore.new(File.join(dir, name + '.log'))
|
209
345
|
end
|
210
346
|
|
211
347
|
def shared_text(str)
|
@@ -218,13 +354,14 @@ class Drip
|
|
218
354
|
end
|
219
355
|
|
220
356
|
# Registers value (an Array of [obj, *tags]) in the in-memory tables.
# Duplicate tags are dropped. Each String tag is passed through shared_text
# and indexed in @tag under [tag, key]; non-String tags are not indexed but
# are still kept in the stored array.
# Returns the array assigned to @pool[key], i.e. [obj] + tags.
def do_write(key, value)
|
221
|
-
|
222
|
-
|
357
|
+
obj, *tags = value
|
358
|
+
tags.uniq!
|
359
|
+
tags.each do |k|
|
223
360
|
next unless String === k
|
224
361
|
tag = shared_text(k)
|
225
362
|
@tag[[tag, key]] = key
|
226
363
|
end
|
227
|
-
@pool[key] =
|
364
|
+
@pool[key] = [obj] + tags
|
228
365
|
end
|
229
366
|
|
230
367
|
def restore(store)
|
data/lib/drip/version.rb
CHANGED
data/lib/my_drip.rb
CHANGED
data/sample/copocopo.rb
CHANGED
@@ -54,7 +54,7 @@ class CopoCopo
|
|
54
54
|
@last, event = @drip.read_tag(@last, 'DripDemo Event', 1)[0]
|
55
55
|
next if retweet?(event)
|
56
56
|
next if mention?(event)
|
57
|
-
next unless Time.now < created_at(event) +
|
57
|
+
next unless Time.now < created_at(event) + 6000
|
58
58
|
name = dig(event, 'user', 'screen_name')
|
59
59
|
next unless @friends.include?(name)
|
60
60
|
ary = extract(event['text'] || '')
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'my_drip'
|
3
|
+
require 'monitor'
|
4
|
+
|
5
|
+
# Scans a directory tree for Ruby files and writes every new or modified
# file into Drip. All public operations run under the object's monitor.
class Crawler
  include MonitorMixin

  def initialize
    super()
    @root = File.expand_path('~/develop/git-repo/')
    @drip = MyDrip
    # Key of the newest 'rbcrawl-begin' entry; anything at or below it is
    # treated as not recorded.
    begin_key, = @drip.head(1, 'rbcrawl-begin')[0]
    @fence = begin_key || 0
  end

  # mtime recorded for fname by the latest crawl after the fence, or
  # Time.at(1) when there is no such record.
  def last_mtime(fname)
    key, doc, = @drip.head(1, 'rbcrawl-fname=' + fname)[0]
    return Time.at(1) unless doc && key > @fence
    doc[1]
  end

  # Writes every *.rb file under @root that is newer than its last recorded
  # mtime, then writes the list of those files as a footprint and returns it.
  def do_crawl
    synchronize do
      Dir.chdir(@root)
      updated = Dir.glob('**/*.rb').select {|fname|
        mtime = File.mtime(fname)
        if last_mtime(fname) >= mtime
          false
        else
          @drip.write([fname, mtime, File.read(fname)],
                      'rbcrawl', 'rbcrawl-fname=' + fname)
          true
        end
      }
      @drip.write(updated, 'rbcrawl-footprint')
      updated
    end
  end

  # Exits the process once no crawl holds the monitor.
  def quit
    synchronize { exit(0) }
  end
end
|
43
|
+
|
44
|
+
# Script entry point: crawl once a minute in the background until a line is
# read from standard input, then shut down through the crawler.
if $0 == __FILE__
  crawler = Crawler.new
  Thread.new {
    while true
      pp(crawler.do_crawl)
      sleep(60)
    end
  }

  gets
  crawler.quit
end
|
56
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'index'
|
2
|
+
require 'crawl'
|
3
|
+
require 'webrick/cgi'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
# Renders the search form and, when a word is given, the list of matching
# file names. The template is compiled into #to_html(word, list).
class DemoListView
  include ERB::Util
  extend ERB::DefMethod

  page = ERB.new(<<EOS)
<html><head><title>Demo UI</title></head><body>
<form method="post"><input type="text" name="w" value="<%=h word %>" /></form>
<% if word %>
<p>search: <%=h word %></p>
<ul>
<% list.each do |fname| %>
<li><%=h fname%></li>
<% end %>
</ul>
<% end %>
</body></html>
EOS
  def_erb_method('to_html(word, list)', page)
end
|
23
|
+
|
24
|
+
# CGI front end for the demo: answers GET and POST with the search page and
# asks the crawler to quit when the request path is /quit.
class DemoUICGI < WEBrick::CGI
  def initialize(crawler, indexer, *args)
    super(*args)
    @crawler, @indexer = crawler, indexer
    @list_view = DemoListView.new
  end

  # First value of the query parameter key, tagged as UTF-8, or nil when the
  # parameter is absent.
  def req_query(req, key)
    first, = req.query[key]
    return nil unless first
    first.force_encoding('utf-8')
  end

  def do_GET(req, res)
    # Quit from a separate thread so this request can still be answered.
    Thread.new { @crawler.quit } if req.path_info == '/quit'

    word = req_query(req, 'w') || ''
    list = if word.empty?
             []
           else
             @indexer.dict.query(word)
           end
    res['content-type'] = 'text/html; charset=utf-8'
    res.body = @list_view.to_html(word, list)
  end

  alias_method :do_POST, :do_GET
end
|
53
|
+
|
54
|
+
# Script entry point: start the periodic crawler and the indexer in
# background threads, then publish the CGI object over dRuby and wait.
if $0 == __FILE__
  crawler = Crawler.new
  Thread.new {
    while true
      pp(crawler.do_crawl)
      sleep(60)
    end
  }

  indexer = Indexer.new
  Thread.new { indexer.update_dict }

  front = DemoUICGI.new(crawler, indexer)
  DRb.start_service('druby://localhost:50830', front)
  DRb.thread.join
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'index'
|
2
|
+
require 'crawl'
|
3
|
+
require 'webrick'
|
4
|
+
require 'erb'
|
5
|
+
|
6
|
+
# View object for the demo UI. The ERB template below becomes the instance
# method #to_html(word, list); word and each list item are HTML-escaped.
class DemoListView
  include ERB::Util
  extend ERB::DefMethod

  template = ERB.new(<<EOS)
<html><head><title>Demo UI</title></head><body>
<form method="post"><input type="text" name="w" value="<%=h word %>" /></form>
<% if word %>
<p>search: <%=h word %></p>
<ul>
<% list.each do |fname| %>
<li><%=h fname%></li>
<% end %>
</ul>
<% end %>
</body></html>
EOS
  def_erb_method('to_html(word, list)', template)
end
|
23
|
+
|
24
|
+
# WEBrick servlet for the demo: answers GET and POST with the search page
# rendered by the supplied list view.
class DemoUIServlet < WEBrick::HTTPServlet::AbstractServlet
  def initialize(server, crawler, indexer, list_view)
    super(server)
    @crawler, @indexer, @list_view = crawler, indexer, list_view
  end

  # First value of the query parameter key, tagged as UTF-8, or nil when the
  # parameter is absent.
  def req_query(req, key)
    first, = req.query[key]
    return nil unless first
    first.force_encoding('utf-8')
  end

  def do_GET(req, res)
    word = req_query(req, 'w') || ''
    list = if word.empty?
             []
           else
             @indexer.dict.query(word)
           end
    res['content-type'] = 'text/html; charset=utf-8'
    res.body = @list_view.to_html(word, list)
  end

  alias_method :do_POST, :do_GET
end
|
48
|
+
|
49
|
+
# Script entry point: start the periodic crawler and the indexer in
# background threads, serve the UI on 127.0.0.1:10080 until interrupted,
# then shut down through the crawler.
if $0 == __FILE__
  crawler = Crawler.new
  Thread.new {
    while true
      pp(crawler.do_crawl)
      sleep(60)
    end
  }

  indexer = Indexer.new
  Thread.new { indexer.update_dict }

  options = {:Port => 10080,
             :BindAddress => '127.0.0.1'}
  server = WEBrick::HTTPServer.new(options)
  server.mount('/', DemoUIServlet, crawler, indexer, DemoListView.new)
  trap('INT') { server.shutdown }
  server.start
  crawler.quit
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'nkf'
|
2
|
+
require 'rbtree'
|
3
|
+
require 'my_drip'
|
4
|
+
require 'monitor'
|
5
|
+
require 'pp'
|
6
|
+
|
7
|
+
|
8
|
+
# Follows the 'rbcrawl' entries in Drip and keeps a Dict in step with them.
class Indexer
  attr_reader :dict

  def initialize(cursor=0)
    @drip = MyDrip
    @dict = Dict.new
    # Key of the newest 'rbcrawl-begin' entry; documents at or below it are
    # never treated as a previous version.
    begin_key, = @drip.head(1, 'rbcrawl-begin')[0]
    @fence = begin_key || 0
    @cursor = [cursor, @fence].max
  end

  # Runs forever: replaces each document's previous version in the
  # dictionary with the current one.
  def update_dict
    each_document do |current, previous|
      @dict.delete(*previous) if previous
      @dict.push(*current)
    end
  end

  # Endlessly reads 'rbcrawl' entries after the cursor and yields each
  # document with its previous version (or nil), advancing the cursor after
  # every yield.
  def each_document
    while true
      @drip.read_tag(@cursor, 'rbcrawl', 10, 1).each do |key, doc|
        yield(doc, prev_version(key, doc[0]))
        @cursor = key
      end
    end
  end

  # Document recorded for fname just before cursor, or nil when there is
  # none after the fence.
  def prev_version(cursor, fname)
    key, doc = @drip.older(cursor, 'rbcrawl-fname=' + fname)
    return nil unless doc && key > @fence
    doc
  end
end
|
41
|
+
|
42
|
+
# Word index over crawled documents. Keys of the tree are
# [word, mtime as integer, file name]; query, delete and push run under the
# object's monitor.
class Dict
  include MonitorMixin

  def initialize
    super()
    @tree = RBTree.new
  end

  # File names of all index rows whose word equals word.
  def query(word)
    from = [word, 0, '']
    to = [word + "\0", 0, '']
    synchronize { @tree.bound(from, to).collect {|key, _| key[2]} }
  end

  # Removes every index row produced by this version of the document.
  def delete(fname, mtime, src)
    synchronize do
      each_tree_key(fname, mtime, src) {|key| @tree.delete(key)}
    end
  end

  # Adds an index row for every distinct word of the document.
  def push(fname, mtime, src)
    synchronize do
      each_tree_key(fname, mtime, src) {|key| @tree[key] = true}
    end
  end

  # Returns the word object already stored in the tree when an equal one
  # exists, otherwise word itself.
  def intern(word)
    found, = @tree.lower_bound([word, 0, ''])
    (found && found[0] == word) ? found[0] : word
  end

  # Yields one tree key per distinct word found in src (converted by NKF
  # with the '-w' option).
  def each_tree_key(fname, mtime, src)
    words = NKF.nkf('-w', src).scan(/\w+/m).uniq
    words.each {|word| yield([intern(word), mtime.to_i, fname])}
  end
end
|
83
|
+
|
84
|
+
# Script entry point: index in the background and answer one query per line
# read from standard input, printing the matches and their count.
if $0 == __FILE__
  indexer = Indexer.new(0)
  Thread.new { indexer.update_dict }

  while line = gets
    matches = indexer.dict.query(line.chomp)
    pp(matches)
    pp(matches.size)
  end
end
|
96
|
+
|