fairy 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (186) hide show
  1. data/LICENSE +674 -0
  2. data/Makefile +116 -0
  3. data/README +15 -0
  4. data/bin/fairy +582 -0
  5. data/bin/fairy-cat +74 -0
  6. data/bin/fairy-cp +128 -0
  7. data/bin/fairy-rm +122 -0
  8. data/bin/subcmd/controller +41 -0
  9. data/bin/subcmd/inspector +81 -0
  10. data/bin/subcmd/master +43 -0
  11. data/bin/subcmd/node +47 -0
  12. data/bin/subcmd/processor +54 -0
  13. data/doc/programming-interface.html +240 -0
  14. data/doc/programming-interface.rd +300 -0
  15. data/etc/fairy.conf.tmpl +118 -0
  16. data/ext/simple_hash/extconf.rb +4 -0
  17. data/ext/simple_hash/simple_hash.c +42 -0
  18. data/fairy.gemspec +60 -0
  19. data/lib/fairy/client/addins.rb +20 -0
  20. data/lib/fairy/client/barrier.rb +29 -0
  21. data/lib/fairy/client/basic-group-by.rb +52 -0
  22. data/lib/fairy/client/cat.rb +41 -0
  23. data/lib/fairy/client/direct-product.rb +51 -0
  24. data/lib/fairy/client/equijoin.rb +79 -0
  25. data/lib/fairy/client/exec.rb +54 -0
  26. data/lib/fairy/client/filter.rb +62 -0
  27. data/lib/fairy/client/find.rb +35 -0
  28. data/lib/fairy/client/group-by.rb +194 -0
  29. data/lib/fairy/client/here.rb +84 -0
  30. data/lib/fairy/client/inject.rb +70 -0
  31. data/lib/fairy/client/input-file.rb +53 -0
  32. data/lib/fairy/client/input-iota.rb +49 -0
  33. data/lib/fairy/client/input-local-file.rb +188 -0
  34. data/lib/fairy/client/input-varray.rb +30 -0
  35. data/lib/fairy/client/input.rb +42 -0
  36. data/lib/fairy/client/io-filter.rb +26 -0
  37. data/lib/fairy/client/junction.rb +31 -0
  38. data/lib/fairy/client/map.rb +34 -0
  39. data/lib/fairy/client/merge-group-by.rb +71 -0
  40. data/lib/fairy/client/output-file.rb +64 -0
  41. data/lib/fairy/client/output-local-file.rb +60 -0
  42. data/lib/fairy/client/output-null.rb +47 -0
  43. data/lib/fairy/client/output-varray.rb +50 -0
  44. data/lib/fairy/client/output.rb +29 -0
  45. data/lib/fairy/client/roma-put.rb +62 -0
  46. data/lib/fairy/client/roma.rb +156 -0
  47. data/lib/fairy/client/seg-join.rb +61 -0
  48. data/lib/fairy/client/seg-map.rb +78 -0
  49. data/lib/fairy/client/seg-shuffle.rb +35 -0
  50. data/lib/fairy/client/seg-split.rb +27 -0
  51. data/lib/fairy/client/seg-zip.rb +60 -0
  52. data/lib/fairy/client/select.rb +38 -0
  53. data/lib/fairy/client/sort.rb +48 -0
  54. data/lib/fairy/client/sort18.rb +56 -0
  55. data/lib/fairy/client/sort19.rb +61 -0
  56. data/lib/fairy/client/there.rb +47 -0
  57. data/lib/fairy/client/top_n_into_roma.rb +34 -0
  58. data/lib/fairy/client/wc.rb +92 -0
  59. data/lib/fairy/controller.rb +1103 -0
  60. data/lib/fairy/logger.rb +107 -0
  61. data/lib/fairy/master/addins.rb +20 -0
  62. data/lib/fairy/master/atom.rb +17 -0
  63. data/lib/fairy/master/c-barrier.rb +283 -0
  64. data/lib/fairy/master/c-basic-group-by.rb +250 -0
  65. data/lib/fairy/master/c-cat.rb +159 -0
  66. data/lib/fairy/master/c-direct-product.rb +203 -0
  67. data/lib/fairy/master/c-exec.rb +68 -0
  68. data/lib/fairy/master/c-filter.rb +422 -0
  69. data/lib/fairy/master/c-find.rb +138 -0
  70. data/lib/fairy/master/c-group-by.rb +64 -0
  71. data/lib/fairy/master/c-here.rb +80 -0
  72. data/lib/fairy/master/c-inject.rb +119 -0
  73. data/lib/fairy/master/c-input-file.rb +46 -0
  74. data/lib/fairy/master/c-input-iota.rb +66 -0
  75. data/lib/fairy/master/c-input-local-file.rb +117 -0
  76. data/lib/fairy/master/c-input-varray.rb +53 -0
  77. data/lib/fairy/master/c-input.rb +24 -0
  78. data/lib/fairy/master/c-inputtable.rb +31 -0
  79. data/lib/fairy/master/c-inputtable18.rb +36 -0
  80. data/lib/fairy/master/c-inputtable19.rb +35 -0
  81. data/lib/fairy/master/c-io-filter.rb +28 -0
  82. data/lib/fairy/master/c-junction.rb +54 -0
  83. data/lib/fairy/master/c-map.rb +27 -0
  84. data/lib/fairy/master/c-merge-group-by.rb +241 -0
  85. data/lib/fairy/master/c-output-file.rb +84 -0
  86. data/lib/fairy/master/c-output-local-file.rb +19 -0
  87. data/lib/fairy/master/c-output-null.rb +45 -0
  88. data/lib/fairy/master/c-output-varray.rb +57 -0
  89. data/lib/fairy/master/c-output.rb +20 -0
  90. data/lib/fairy/master/c-seg-join.rb +141 -0
  91. data/lib/fairy/master/c-seg-map.rb +26 -0
  92. data/lib/fairy/master/c-seg-shuffle.rb +87 -0
  93. data/lib/fairy/master/c-seg-split.rb +110 -0
  94. data/lib/fairy/master/c-seg-zip.rb +132 -0
  95. data/lib/fairy/master/c-select.rb +27 -0
  96. data/lib/fairy/master/c-sort.rb +108 -0
  97. data/lib/fairy/master/c-there.rb +57 -0
  98. data/lib/fairy/master/c-wc.rb +232 -0
  99. data/lib/fairy/master/job-interpriter.rb +19 -0
  100. data/lib/fairy/master/scheduler.rb +24 -0
  101. data/lib/fairy/master.rb +329 -0
  102. data/lib/fairy/node/addins.rb +19 -0
  103. data/lib/fairy/node/p-barrier.rb +95 -0
  104. data/lib/fairy/node/p-basic-group-by.rb +252 -0
  105. data/lib/fairy/node/p-direct-product.rb +153 -0
  106. data/lib/fairy/node/p-exec.rb +30 -0
  107. data/lib/fairy/node/p-filter.rb +363 -0
  108. data/lib/fairy/node/p-find.rb +111 -0
  109. data/lib/fairy/node/p-group-by.rb +1534 -0
  110. data/lib/fairy/node/p-here.rb +21 -0
  111. data/lib/fairy/node/p-identity.rb +24 -0
  112. data/lib/fairy/node/p-inject.rb +127 -0
  113. data/lib/fairy/node/p-input-file.rb +108 -0
  114. data/lib/fairy/node/p-input-iota.rb +39 -0
  115. data/lib/fairy/node/p-input-local-file.rb +61 -0
  116. data/lib/fairy/node/p-input-varray.rb +26 -0
  117. data/lib/fairy/node/p-io-filter.rb +28 -0
  118. data/lib/fairy/node/p-map.rb +40 -0
  119. data/lib/fairy/node/p-merger-group-by.rb +48 -0
  120. data/lib/fairy/node/p-output-file.rb +104 -0
  121. data/lib/fairy/node/p-output-local-file.rb +14 -0
  122. data/lib/fairy/node/p-output-null.rb +32 -0
  123. data/lib/fairy/node/p-output-varray.rb +41 -0
  124. data/lib/fairy/node/p-seg-join.rb +82 -0
  125. data/lib/fairy/node/p-seg-map.rb +34 -0
  126. data/lib/fairy/node/p-seg-split.rb +61 -0
  127. data/lib/fairy/node/p-seg-zip.rb +79 -0
  128. data/lib/fairy/node/p-select.rb +40 -0
  129. data/lib/fairy/node/p-single-exportable.rb +90 -0
  130. data/lib/fairy/node/p-sort.rb +195 -0
  131. data/lib/fairy/node/p-task.rb +113 -0
  132. data/lib/fairy/node/p-there.rb +44 -0
  133. data/lib/fairy/node/p-wc.rb +266 -0
  134. data/lib/fairy/node.rb +187 -0
  135. data/lib/fairy/processor.rb +510 -0
  136. data/lib/fairy/share/base-app.rb +114 -0
  137. data/lib/fairy/share/block-source.rb +234 -0
  138. data/lib/fairy/share/conf.rb +396 -0
  139. data/lib/fairy/share/debug.rb +21 -0
  140. data/lib/fairy/share/encoding.rb +17 -0
  141. data/lib/fairy/share/fast-tempfile.rb +93 -0
  142. data/lib/fairy/share/file-place.rb +176 -0
  143. data/lib/fairy/share/hash-1.rb +20 -0
  144. data/lib/fairy/share/hash-md5.rb +28 -0
  145. data/lib/fairy/share/hash-murmur.rb +69 -0
  146. data/lib/fairy/share/hash-rb18.rb +20 -0
  147. data/lib/fairy/share/hash-simple-hash.rb +28 -0
  148. data/lib/fairy/share/inspector.rb +16 -0
  149. data/lib/fairy/share/lc/exceptions.rb +82 -0
  150. data/lib/fairy/share/lc/ja/exceptions.rb +81 -0
  151. data/lib/fairy/share/locale.rb +17 -0
  152. data/lib/fairy/share/log.rb +215 -0
  153. data/lib/fairy/share/pool-dictionary.rb +53 -0
  154. data/lib/fairy/share/port-marshaled-queue.rb +347 -0
  155. data/lib/fairy/share/port.rb +1697 -0
  156. data/lib/fairy/share/reference.rb +45 -0
  157. data/lib/fairy/share/stdout.rb +56 -0
  158. data/lib/fairy/share/tr.rb +16 -0
  159. data/lib/fairy/share/varray.rb +147 -0
  160. data/lib/fairy/share/vfile.rb +183 -0
  161. data/lib/fairy/version.rb +8 -0
  162. data/lib/fairy.rb +206 -0
  163. data/sample/grep.rb +46 -0
  164. data/sample/ping.rb +19 -0
  165. data/sample/sort.rb +102 -0
  166. data/sample/wordcount.rb +61 -0
  167. data/spec/README +12 -0
  168. data/spec/fairy1_spec.rb +31 -0
  169. data/spec/fairy2_spec.rb +42 -0
  170. data/spec/fairy3_spec.rb +126 -0
  171. data/spec/fairy4_spec.rb +63 -0
  172. data/spec/fairy5_spec.rb +45 -0
  173. data/spec/fairy6_spec.rb +52 -0
  174. data/spec/fairy7_spec.rb +58 -0
  175. data/spec/fairy8_spec.rb +48 -0
  176. data/spec/mkdat.rb +148 -0
  177. data/spec/run_all.sh +65 -0
  178. data/test/testc.rb +7111 -0
  179. data/tools/cap_recipe/Capfile +144 -0
  180. data/tools/cap_recipe/cluster.yml.sample +14 -0
  181. data/tools/fairy_perf_graph.rb +444 -0
  182. data/tools/git-tag +44 -0
  183. data/tools/log-analysis.rb +62 -0
  184. data/tools/svn-ls-diff +38 -0
  185. data/tools/svn-tags +37 -0
  186. metadata +298 -0
@@ -0,0 +1,111 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # Copyright (C) 2007-2010 Rakuten, Inc.
4
+ #
5
+
6
+ require "fairy/node/p-io-filter"
7
+ require "fairy/node/p-single-exportable"
8
+
9
+ module Fairy
10
+ class PLocalFind<PSingleExportFilter
11
+ Processor.def_export self
12
+
13
+ def initialize(id, ntask, bjob, opts, block_source)
14
+ super
15
+ @block_source = block_source
16
+
17
+ @findp = false
18
+ @findp_mutex = Mutex.new
19
+ end
20
+
21
+ def basic_each(&block) # Yields the first element accepted by the find block; the rest of @input is read but not yielded.
22
+ @find_proc = BBlock.new(@block_source, @context, self)
23
+
24
+ @input.each do |e|
25
+ # Once a match has been found, keep reading the input but discard it.
26
+ @findp_mutex.synchronize do
27
+ next if @findp # a match was already recorded: skip this element
28
+ if !(@findp = @find_proc.yield(e)) # falsy result: not a match
29
+ next
30
+ elsif Import::CTLTOKEN_NULLVALUE === @findp # null-value control token: treated as no match
31
+ @findp = false
32
+ next
33
+ end
34
+ block.call e # match: hand it to the caller while still holding the mutex
35
+ end
36
+ end
37
+ end
38
+
39
+ # def start
40
+ # super do
41
+ # @import.each do |e|
42
+ # # 見つかっていたら空読み
43
+ # @find_mutex.synchronize do
44
+ # next if @find
45
+ # next unless find = @map_proc.yield(e)
46
+ # @export.push e
47
+ # end
48
+ # end
49
+ # end
50
+ # end
51
+
52
# Marks the search as finished: once @findp is true, #basic_each skips every
# remaining input element.
#
# The flag is guarded by @findp_mutex, the mutex created in #initialize and
# taken by #basic_each. The previous code locked @find_mutex, which is never
# assigned, so every call raised NoMethodError on nil.
def find_break
  @findp_mutex.synchronize do
    @findp = true
  end
end
57
+ end
58
+
59
+
60
+ class PFindResult<PIOFilter
61
+ Processor.def_export self
62
+
63
+ def initialize(*args)
64
+ super
65
+
66
+ @value = :__FAIRY_NO_VALUE__
67
+ @value_mutex = Mutex.new
68
+ @value_cv = ConditionVariable.new
69
+ end
70
+
71
+ def input=(input)
72
+ super
73
+
74
+ start do
75
+ self.start_find
76
+ end
77
+ end
78
+
79
+ alias super_each each
80
+
81
+ def each(&block)
82
+ block.call value
83
+ end
84
+
85
+ def value
86
+ @value_mutex.synchronize do
87
+ while @value == :__FAIRY_NO_VALUE__
88
+ @value_cv.wait(@value_mutex)
89
+ end
90
+ @value
91
+ end
92
+ end
93
+
94
# Consumes @input, publishes its first element as the result and notifies the
# job through @bjob.update_find. Every later element is read and dropped.
def start_find
  # A dedicated flag (instead of testing the element itself) so that a first
  # element of false or nil still counts as "found".
  found = false
  @input.each do |e|
    # Everything after the first element is read and discarded.
    next if found
    found = true

    # Publish under @value_mutex: #value tests @value and waits on @value_cv
    # while holding that mutex, so an unlocked write could land between the
    # test and the wait and the wakeup would be lost.
    @value_mutex.synchronize do
      @value = e
      @value_cv.broadcast
    end
    # @export.push find
    # This is a little worrying...
    # @export.push END_OF_STREAM

    @bjob.update_find
  end
end
110
+ end
111
+ end
@@ -0,0 +1,1534 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # Copyright (C) 2007-2010 Rakuten, Inc.
4
+ #
5
+
6
+ require "fairy/node/p-io-filter"
7
+ require "fairy/node/p-basic-group-by"
8
+
9
+ module Fairy
10
+ class PGroupBy<PBasicGroupBy
11
+
12
+ Processor.def_export self
13
+
14
+ def initialize(id, ntask, bjob, opts, block_source)
15
+ super
16
+
17
+ @exports = []
18
+ def @exports.each_pair(&block)
19
+ each_with_index do |item, idx|
20
+ block.call(idx, item) if item
21
+ unless item
22
+ Log::debug(self, "No assgined Export")
23
+ end
24
+ end
25
+ end
26
+
27
+ @mod = opts[:no_segment]
28
+ @mod ||= CONF.GROUP_BY_NO_SEGMENT
29
+ Log::debug(self, "NO_SEGMENT: #{@mod}")
30
+
31
+ mod = opts[:hash_module]
32
+ mod ||= CONF.GROUP_BY_HASH_MODULE
33
+ require mod
34
+ @hash_generator = Fairy::HValueGenerator.new(bjob.hash_seed)
35
+
36
+ @hash_optimize = CONF.GROUP_BY_GROUPING_OPTIMIZE
37
+ @hash_optimize = opts[:grouping_optimize] if opts.key?(:grouping_optimize)
38
+ end
39
+
40
# Maps an element to a segment number.
#
# The raw key comes from the superclass implementation. A null-value control
# token is passed through untouched; any other key is hashed by
# @hash_generator and reduced modulo @mod (the configured number of segments).
def hash_key(e)
  key = super
  return key if Import::CTLTOKEN_NULLVALUE === key

  # Reuse the key obtained above: invoking super a second time would
  # recompute it for every single element.
  @hash_generator.value(key) % @mod
end
46
+
47
+ class PPostFilter<PSingleExportFilter
48
+ Processor.def_export self
49
+
50
+ def initialize(id, ntask, bjob, opts, block_source)
51
+ super
52
+ @block_source = block_source
53
+
54
+ @buffering_policy = @opts[:buffering_policy]
55
+ @buffering_policy ||= CONF.GROUP_BY_BUFFERING_POLICY
56
+
57
+ unless CONF.BUG234
58
+ @hash_optimize = CONF.GROUP_BY_GROUPING_OPTIMIZE
59
+ @hash_optimize = opts[:grouping_optimize] if opts.key?(:grouping_optimize)
60
+ end
61
+ end
62
+
63
+ # def start
64
+ # super do
65
+ # @key_value_buffer =
66
+ # eval("#{@buffering_policy[:buffering_class]}").new(@buffering_policy)
67
+ # @hash_proc = BBlock.new(@block_source, @context, self)
68
+
69
+ # @import.each do |e|
70
+ # key = key(e)
71
+ # @key_value_buffer.push(key, e)
72
+ # end
73
+ # @key_value_buffer.each do |key, values|
74
+ # #Log::debug(self, key)
75
+ # @export.push [key, values]
76
+ # end
77
+ # @key_value_buffer = nil
78
+ # end
79
+ # end
80
+
81
+
82
+ def basic_each_0(&block)
83
+ # @key_value_buffer =
84
+ # eval("#{@buffering_policy[:buffering_class]}").new(@buffering_policy)
85
+
86
+ if @hash_optimize
87
+ @hash_proc = eval("proc{#{@block_source.source}}")
88
+ else
89
+ @hash_proc = BBlock.new(@block_source, @context, self)
90
+ end
91
+
92
+ @input.group_by{|e| e}.each{|k, v|
93
+ block.call [k, v]
94
+ }
95
+ end
96
+
97
+ def basic_each(&block)
98
+ @key_value_buffer =
99
+ eval("#{@buffering_policy[:buffering_class]}").new(self, @buffering_policy)
100
+ if @hash_optimize
101
+ @hash_proc = eval("proc{#{@block_source.source}}")
102
+ else
103
+ @hash_proc = BBlock.new(@block_source, @context, self)
104
+ end
105
+
106
+ @input.each do |e|
107
+ @key_value_buffer.push(e)
108
+ e = nil
109
+ end
110
+ @key_value_buffer.each do |kvs|
111
+ block.call kvs
112
+ end
113
+ @key_value_buffer = nil
114
+ end
115
+
116
+ def hash_key(e)
117
+ @hash_proc.yield(e)
118
+ end
119
+ end
120
+
121
# A keyed FIFO of values terminated by an EOS marker.
#
# Producers append with #push / #concat and finish with #push_eos; consumers
# drain it with #shift or #each. When the buffer is empty #shift calls
# Fiber.yield, so a consumer that may outrun its producer has to run inside a
# Fiber which the producer resumes after adding data.
class KeyValueStream
  include Enumerable

  # Sentinel appended by #push_eos to mark the end of the stream.
  EOS = :__KEY_VALUE_STREAM_EOS__

  # key       - the grouping key this stream belongs to.
  # generator - accepted for interface compatibility; it is not used.
  def initialize(key, generator)
    @key = key
    @buf = []
  end

  attr_reader :key

  # Appends a single element.
  def push(e)
    @buf.push e
  end
  alias enq push

  # Appends the end-of-stream marker.
  def push_eos
    push EOS
  end

  # Appends every element of +elements+.
  def concat(elements)
    @buf.concat elements
  end

  # Removes and returns the oldest element, yielding the current Fiber for as
  # long as the buffer is empty.
  def shift
    Fiber.yield while @buf.empty?
    @buf.shift
  end
  alias deq shift
  alias pop shift

  # Passes each value up to (not including) EOS to the block, consuming the
  # values as it goes. Without a block an Enumerator is returned instead of
  # failing on a nil block.
  def each(&block)
    return enum_for(:each) unless block

    while (v = shift) != EOS
      block.call v
    end
  end

  # Counts the remaining values. This drains the stream up to EOS.
  def size
    c = 0
    each{|_v| c += 1}
    c
  end

  #    def inspect
  #      "#{self.class}<#{super}>"
  #    end
end
171
+
172
+ class OnMemoryBuffer
173
+ def initialize(njob, policy)
174
+ @njob = njob
175
+ @policy = policy
176
+
177
+ @key_values = {}
178
+ @key_values_mutex = Mutex.new
179
+
180
+ @CHUNK_SIZE = CONF.GROUP_BY_CMSB_CHUNK_SIZE
181
+
182
+ @log_id = format("%s[%s]", self.class.name.sub(/Fairy::/, ''), @njob.id)
183
+ end
184
+
185
+ attr_accessor :log_id
186
+
187
+ def push(value)
188
+ key = @njob.hash_key(value)
189
+
190
+ @key_values_mutex.synchronize do
191
+ @key_values[key] = [[]] unless @key_values.key?(key)
192
+ if @CHUNK_SIZE < @key_values[key].last.size
193
+ @key_values[key].push []
194
+ end
195
+ @key_values[key].last.push value
196
+ end
197
+ end
198
+
199
+ def each(&block)
200
+ @key_values.each do |key, vv|
201
+ kvs = KeyValueStream.new(key, nil)
202
+ vv.each{|v| kvs.concat v}
203
+ kvs.push_eos
204
+ block.call(kvs)
205
+ end
206
+ end
207
+ end
208
+
209
+ class SimpleFileByKeyBuffer
210
+ def initialize(njob, policy)
211
+ require "tempfile"
212
+
213
+ @njob = njob
214
+ @policy = policy
215
+
216
+ @key_file = {}
217
+ @key_file_mutex = Mutex.new
218
+ @buffer_dir = policy[:buffer_dir]
219
+ @buffer_dir ||= CONF.TMP_DIR
220
+ end
221
+
222
+ def push(value) # Appends the marshaled value to the temp file that belongs to its hash key.
223
+ key = @njob.hash_key(value)
224
+
225
+ @key_file_mutex.synchronize do
226
+ unless @key_file.key?(key)
227
+ @key_file[key] = Tempfile.open("mod-group-by-buffer-#{@njob.no}-", @buffer_dir)
228
+ end
229
+
230
+ # Workaround for Ruby BUG#2390: dump to the underlying @tmpfile instead of the Tempfile wrapper.
231
+ # Marshal.dump(value, @key_file[key])
232
+ Marshal.dump(value, @key_file[key].instance_eval{@tmpfile})
233
+ end
234
+ end
235
+
236
+ def each(&block)
237
+ @key_file.each do |key, file|
238
+ values = KeyValueStream.new(key, nil)
239
+ file.rewind
240
+ while !file.eof?
241
+ values.push Marshal.load(file)
242
+ end
243
+ values.push_eos
244
+ # file.close
245
+
246
+ yield values
247
+ end
248
+ end
249
+ end
250
+
251
+ class SimpleCommandSortBuffer
252
+ def initialize(njob, policy)
253
+ require "fairy/share/fast-tempfile"
254
+
255
+ @njob = njob
256
+ @policy = policy
257
+
258
+ @buffer_dir = policy[:buffer_dir]
259
+ @buffer_dir ||= CONF.TMP_DIR
260
+ @buffer = FastTempfile.open("mod-group-by-buffer--#{@njob.no}", @buffer_dir)
261
+ @buffer_mutex = Mutex.new
262
+ end
263
+
264
+ def push(value)
265
+ key = @njob.hash_key(value)
266
+
267
+ @buffer_mutex.synchronize do
268
+ @buffer.io << [Marshal.dump(key)].pack("m").tr("\n", ":")
269
+ @buffer.io << " "
270
+ @buffer.io << [Marshal.dump(value)].pack("m").tr("\n", ":")
271
+ @buffer.io << "\n"
272
+ end
273
+ end
274
+
275
# Sorts the spooled "key value" lines with sort(1) and yields one
# KeyValueStream per run of equal keys. Each stream is complete (EOS pushed)
# before it is yielded.
def each(&block)
  buffile = @buffer.path
  @buffer.close
  # Argument-vector form: no shell is involved, so the path needs no quoting.
  # LC_ALL=C makes sort(1) compare raw bytes, so lines that share the
  # "key " prefix stay adjacent whatever the locale's collation rules are.
  IO.popen([{"LC_ALL" => "C"}, "sort", buffile]) do |io|
    key = nil
    values = nil
    io.each do |line|
      mk, mv = line.split(" ")
      k = Marshal.load(mk.tr(":", "\n").unpack("m").first)
      v = Marshal.load(mv.tr(":", "\n").unpack("m").first)
      # `values` tells whether a group is open. Comparing keys alone would
      # treat a first record whose key is nil as a continuation of a group
      # that does not exist yet.
      if values && key == k
        values.push v
      else
        if values
          values.push_eos
          yield values
        end
        values = KeyValueStream.new(k, self)
        key = k
        values.push v
      end
    end
    if values
      values.push_eos
      yield values
    end
  end
end
306
+ end
307
+
308
+ class CommandMergeSortBuffer<OnMemoryBuffer
309
+ def initialize(njob, policy)
310
+ super
311
+
312
+ @key_values_size = 0
313
+
314
+ @threshold = policy[:threshold]
315
+ @threshold ||= CONF.GROUP_BY_CMSB_THRESHOLD
316
+
317
+ @buffers = nil
318
+ end
319
+
320
+ def init_2ndmemory
321
+ require "fairy/share/fast-tempfile"
322
+
323
+ @buffer_dir = @policy[:buffer_dir]
324
+ @buffer_dir ||= CONF.TMP_DIR
325
+
326
+ @buffers = []
327
+ end
328
+
329
+ def open_buffer(&block)
330
+ unless @buffers
331
+ init_2ndmemory
332
+ end
333
+ buffer = FastTempfile.open("mod-group-by-buffer-#{@njob.no}-", @buffer_dir)
334
+ @buffers.push buffer
335
+ if block_given?
336
+ begin
337
+ # ruby BUG#2390の対応のため.
338
+ # yield buffer
339
+ yield buffer.io
340
+ ensure
341
+ buffer.close
342
+ end
343
+ else
344
+ buffer
345
+ end
346
+ end
347
+
348
# Buffers the value in memory through the superclass and, once more than
# @threshold values have accumulated, hands the whole in-memory table to
# #store_2ndmemory and starts a fresh one.
def push(value)
  super

  @key_values_mutex.synchronize do
    # The counter is shared state just like @key_values, so it is updated
    # under the same lock that guards the threshold test and the reset.
    @key_values_size += 1
    next unless @key_values_size > @threshold

    spilled = @key_values
    @key_values_size = 0
    @key_values = {}
    store_2ndmemory(spilled)
  end
end
364
+
365
+ def store_2ndmemory(key_values)
366
+ Log::info(self, "start store")
367
+ sorted = key_values.collect{|key, values|
368
+ [[Marshal.dump(key)].pack("m").tr("\n", ":"),
369
+ [Marshal.dump(values)].pack("m").tr("\n", ":")]}.sort_by{|e| e.first}
370
+
371
+ open_buffer do |io|
372
+ sorted.each do |k, v|
373
+ io.puts "#{k}\t#{v}"
374
+ end
375
+ end
376
+ sorted = nil
377
+ Log::info(self, "end store")
378
+ end
379
+
380
+ def each(&block)
381
+ if @buffers
382
+ each_2ndmemory &block
383
+ else
384
+ super
385
+ end
386
+ end
387
+
388
# Merges the spill files with `sort -m` and yields one KeyValueStream per
# distinct key. Whatever is still held in memory is spilled first so that
# every value takes part in the merge.
def each_2ndmemory(&block)
  store_2ndmemory(@key_values) unless @key_values.empty?

  paths = @buffers.collect{|b| b.path}
  Log::debug(self, paths.join(" "))

  # Argument-vector form: no shell, so paths need no quoting. LC_ALL=C is
  # required for a correct merge: the files were sorted by Ruby's bytewise
  # String comparison, and `sort -m` assumes its inputs are already ordered
  # under the collation it uses itself.
  IO.popen([{"LC_ALL" => "C"}, "sort", "-m", "-k1,1", *paths]) do |io|
    key = nil
    values = nil
    io.each do |line|
      mk, mv = line.split(/\s+/)
      k = Marshal.load(mk.tr(":", "\n").unpack("m").first)
      chunks = Marshal.load(mv.tr(":", "\n").unpack("m").first)
      # `values` tells whether a group is open; testing the key alone would
      # break on a first record whose key is nil.
      unless values && key == k
        if values
          values.push_eos
          yield values
        end
        key = k
        values = KeyValueStream.new(key, self)
      end
      # Each record stores the key's list of chunks (arrays of values).
      # Flatten it so the stream carries individual values, exactly like the
      # in-memory path does.
      chunks.each{|chunk| values.concat chunk}
    end
    if values
      values.push_eos
      yield values
    end
  end
end
420
+ end
421
+
422
+ class MergeSortBuffer<CommandMergeSortBuffer
423
+ class StSt
424
+ def initialize(buffers)
425
+ @buffers = buffers.collect{|buf|
426
+ buf.open
427
+ kv = read_line(buf.io)
428
+ [kv, buf]
429
+ }.select{|kv, buf| !kv.nil?}.sort_by{|kv, buf| kv[0]}
430
+
431
+ @fiber = nil
432
+ end
433
+
434
+ def each(&block)
435
+ key = @buffers.first.first.first
436
+ values = KeyValueStream.new(key, self)
437
+ @fiber = Fiber.new{yield values}
438
+ while buf_min = @buffers.shift
439
+ kv, buf = buf_min
440
+ if key == kv[0]
441
+ values.concat kv[1]
442
+ @fiber.resume
443
+ else
444
+ values.push_eos
445
+ @fiber.resume
446
+ key = kv[0]
447
+ values = KeyValueStream.new(key, self)
448
+ @fiber = Fiber.new{yield values}
449
+ values.concat kv[1]
450
+ @fiber.resume
451
+ end
452
+
453
+ unless line = read_line(buf.io)
454
+ buf.close!
455
+ next
456
+ end
457
+ idx = @buffers.rindex{|kv, b| kv[0] <= line[0]}
458
+ # idx ? @buffers.insert(idx+1, [line, buf]) : @buffers.unshift([line, buf])
459
+ buf_min[0] = line
460
+ idx ? @buffers.insert(idx+1, buf_min) : @buffers.unshift(buf_min)
461
+
462
+ end
463
+ values.push_eos
464
+ @fiber.resume
465
+ end
466
+
467
+ def read_line(io)
468
+ begin
469
+ k = Marshal.load(io)
470
+ v = Marshal.load(io)
471
+ rescue EOFError
472
+ return nil
473
+ rescue ArgumentError
474
+ Log::debug(self, "MARSHAL ERROR OCCURED!!")
475
+ io.seek(-1024, IO::SEEK_CUR)
476
+ buf = io.read(2048)
477
+ Log::debugf(self, "File Contents: %s", buf)
478
+
479
+ raise
480
+ end
481
+ [k, v]
482
+ end
483
+ end
484
+
485
+ def store_2ndmemory(key_values)
486
+ Log::debug(self, "START STORE")
487
+ sorted = key_values.sort_by{|e| e.first}
488
+
489
+ open_buffer do |io|
490
+ sorted.each do |key, vv|
491
+ dk = Marshal.dump(key)
492
+ vv.each do |values|
493
+ io.write dk
494
+ Marshal.dump(values, io)
495
+ end
496
+
497
+ end
498
+ end
499
+ sorted = nil
500
+ Log::debug(self, "FINISH STORE")
501
+ end
502
+
503
+ def each_2ndmemory(&block)
504
+ unless @key_values.empty?
505
+ store_2ndmemory(@key_values)
506
+ @key_values = nil
507
+ end
508
+ Log::info(self, "Merge Start: #{@buffers.size} files")
509
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
510
+
511
+ stst = StSt.new(@buffers)
512
+ @buffers = nil
513
+ stst.each(&block)
514
+ end
515
+ end
516
+
517
+ class ExtMergeSortBuffer<MergeSortBuffer
518
+
519
+ def each_2ndmemory(&block)
520
+ require "deep-connect/deep-fork"
521
+
522
+ unless @key_values.empty?
523
+ store_2ndmemory(@key_values)
524
+ end
525
+
526
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
527
+
528
+ df = DeepConnect::DeepFork.fork(@njob.processor.deepconnect){|dc, ds|
529
+ $0 = "fairy processor sorter"
530
+
531
+ dc.export("Sorter", self)
532
+
533
+ finish_wait
534
+ # ds.close
535
+ # dc.stop
536
+ sleep 1
537
+ }
538
+ sorter = df.peer_deep_space.import("Sorter", true)
539
+ sorter.sub_each {|key, values|
540
+ # sorter.sub_each {|bigstr|
541
+ # values = bigstr.split("\t").collect{|e|
542
+ # e.gsub(/(\\t|\\\\)/){|v| v == "\\t" ? "\t" : "\\"}
543
+ # }
544
+ # key = values.shift
545
+ block.call values
546
+ nil # referenceが戻らないようにしている
547
+ }
548
+ sorter.finish
549
+ # df.peer_deep_space.close
550
+ @buffers.each{|buf| buf.close!}
551
+ Process.waitpid(df.peer_pid)
552
+ end
553
+
554
# Merges the spill files in @buffers and yields (key, values) once per run of
# equal keys. Keys come out in ascending order provided each buffer is itself
# sorted by key.
#
# Every buffer is opened and read record by record through read_line; the
# working list is kept ordered by the key of each buffer's current record.
# Always returns nil so that no object reference travels back to the caller.
def sub_each(&block)
  bufs = @buffers.collect{|buf|
    buf.open
    kv = read_line(buf.io)
    [kv, buf]
  }.select{|kv, buf| !kv.nil?}.sort_by{|kv, buf| kv[0]}

  key = nil
  values = []
  while buf_min = bufs.shift
    kv, buf = buf_min

    if key == kv[0]
      values.concat kv[1]
    else
      yield key, values unless values.empty?
      key = kv[0]
      values = kv[1]
    end

    # Re-insert the buffer behind the last entry whose key is not greater
    # than the key of the record just read.
    next unless line = read_line(buf.io)
    idx = bufs.rindex{|other, _b| other[0] <= line[0]}
    idx ? bufs.insert(idx+1, [line, buf]) : bufs.unshift([line, buf])
  end
  # Flush the last group with the same (key, values) arity as every other
  # group; yielding `values` alone made a two-parameter block destructure
  # the values array into its key and values slots.
  yield key, values unless values.empty?
  #      values.unshift key
  #      bigstr = values.collect{|e|
  #        e.gsub(/[\\\t]/){|v| v == "\t" ? "\\t" : '\\\\'}
  #      }.join("\t")
  #      yield bigstr
  nil # make sure no reference is returned
end
588
+ # DeepConnect.def_method_spec(self, "REF sub_each(){DVAL, DVAL}")
589
+
590
+ def finish_wait
591
+ @mx = Mutex.new
592
+ @cv = ConditionVariable.new
593
+ @mx.synchronize do
594
+ @cv.wait(@mx)
595
+ end
596
+ end
597
+
598
+ def finish
599
+ @cv.signal
600
+ end
601
+
602
+ end
603
+
604
+ #
605
+ # using: Depq(http://depq.rubyforge.org/)
606
+ #
607
+ class DepqMergeSortBuffer<MergeSortBuffer
608
+ class StSt<MergeSortBuffer::StSt
609
+ def initialize(buffers)
610
+ require "depq"
611
+
612
+ @buffers = Depq.new
613
+ buffers.each{|buf|
614
+ buf.open
615
+ kv = read_line(buf.io)
616
+ next unless kv
617
+ @buffers.insert [kv, buf], kv.first
618
+ }
619
+
620
+ @fiber = nil
621
+ end
622
+
623
+ def each(&block)
624
+ key = @buffers.find_min.first.first
625
+ values = KeyValueStream.new(key, self)
626
+ @fiber = Fiber.new{yield values}
627
+ while buf_min = @buffers.delete_min
628
+ kv, buf = buf_min
629
+ if key == kv[0]
630
+ values.concat kv[1]
631
+ @fiber.resume
632
+ else
633
+ values.push_eos
634
+ @fiber.resume
635
+ key = kv[0]
636
+ values = KeyValueStream.new(key, self)
637
+ @fiber = Fiber.new{yield values}
638
+ values.concat kv[1]
639
+ @fiber.resume
640
+ end
641
+
642
+ unless line = read_line(buf.io)
643
+ buf.close!
644
+ next
645
+ end
646
+ @buffers.insert [line, buf], line[0]
647
+ end
648
+ values.push_eos
649
+ @fiber.resume
650
+ end
651
+ end
652
+
653
+ def each_2ndmemory(&block)
654
+ unless @key_values.empty?
655
+ store_2ndmemory(@key_values)
656
+ @key_values = nil
657
+ end
658
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
659
+
660
+ stst = StSt.new(@buffers)
661
+ @buffers = nil
662
+ stst.each(&block)
663
+ end
664
+ end
665
+
666
+ class DepqMergeSortBuffer2<DepqMergeSortBuffer
667
+ class StSt<DepqMergeSortBuffer::StSt
668
+ def each(&block)
669
+ key = @buffers.find_min.first.first
670
+ values = KeyValueStream.new(key, self)
671
+ @fiber = Fiber.new{yield values}
672
+ while buf_min = @buffers.find_min
673
+ kv, buf = buf_min
674
+ if key == kv[0]
675
+ values.concat kv[1]
676
+ @fiber.resume
677
+ else
678
+ values.push_eos
679
+ @fiber.resume
680
+ key = kv[0]
681
+ values = KeyValueStream.new(key, self)
682
+ @fiber = Fiber.new{yield values}
683
+ values.concat kv[1]
684
+ @fiber.resume
685
+ end
686
+
687
+ unless line = read_line(buf.io)
688
+ buf.close!
689
+ @buffers.delete_min
690
+ next
691
+ end
692
+ # @buffers.replace_min [line, buf], line[0]
693
+ buf_min[0] = line
694
+ loc = @buffers.find_min_locator
695
+ loc.update_priority line[0]
696
+ end
697
+ values.push_eos
698
+ @fiber.resume
699
+ end
700
+ end
701
+
702
+ def each_2ndmemory(&block)
703
+ unless @key_values.empty?
704
+ store_2ndmemory(@key_values)
705
+ @key_values = nil
706
+ end
707
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
708
+
709
+ stst = StSt.new(@buffers)
710
+ @buffers = nil
711
+ stst.each(&block)
712
+ end
713
+ end
714
+
715
+ #
716
+ # using: PriorityQueue(http://rubyforge.org/projects/priority-queue/)
717
+ #
718
+ class PQMergeSortBuffer<MergeSortBuffer
719
+ class StSt<MergeSortBuffer::StSt
720
+ class Pair
721
+ def initialize(kv, buf)
722
+ @key_values = kv
723
+ @buf = buf
724
+ end
725
+
726
+ attr_accessor :key_values
727
+ attr_accessor :buf
728
+
729
+ def key
730
+ @key_values.first
731
+ end
732
+
733
+ def values
734
+ @key_values.last
735
+ end
736
+ end
737
+
738
+ def initialize(buffers)
739
+ require "priority_queue"
740
+
741
+ @buffers = PriorityQueue.new
742
+ buffers.each{|buf|
743
+ buf.open
744
+ kv = read_line(buf.io)
745
+ next unless kv
746
+ @buffers.push Pair.new(kv, buf) , kv.first
747
+ }
748
+
749
+ @fiber = nil
750
+ end
751
+
752
+ def each(&block)
753
+ key = @buffers.min_key.key
754
+ values = KeyValueStream.new(key, self)
755
+ @fiber = Fiber.new{yield values}
756
+ while min_pair = @buffers.delete_min_return_key
757
+ # buf, kv = buf_min
758
+ if key == min_pair.key
759
+ values.concat min_pair.values
760
+ @fiber.resume
761
+ else
762
+ values.push_eos
763
+ @fiber.resume
764
+ key = min_pair.key
765
+ values = KeyValueStream.new(key, self)
766
+ @fiber = Fiber.new{yield values}
767
+ values.concat min_pair.values
768
+ @fiber.resume
769
+ end
770
+
771
+ unless line = read_line(min_pair.buf.io)
772
+ min_pair.buf.close!
773
+ next
774
+ end
775
+ min_pair.key_values = line
776
+ @buffers.push min_pair, line[0]
777
+ end
778
+ values.push_eos
779
+ @fiber.resume
780
+ end
781
+ end
782
+
783
+ def each_2ndmemory(&block)
784
+ unless @key_values.empty?
785
+ store_2ndmemory(@key_values)
786
+ @key_values = nil
787
+ end
788
+ Log::info(self, "Merge Start: #{@buffers.size} files")
789
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
790
+
791
+ stst = StSt.new(@buffers)
792
+ @buffers = nil
793
+ stst.each(&block)
794
+ end
795
+ end
796
+
797
+ class PQMergeSortBuffer2<MergeSortBuffer
798
+ class StSt<MergeSortBuffer::StSt
799
+ def initialize(buffers)
800
+ require "priority_queue"
801
+
802
+ @buffers = PriorityQueue.new
803
+ buffers.each{|buf|
804
+ buf.open
805
+ kv = read_line(buf.io)
806
+ next unless kv
807
+ @buffers.push [kv, buf], kv.first
808
+ }
809
+
810
+ @fiber = nil
811
+ end
812
+
813
+ def each(&block)
814
+ key = @buffers.min_key.first.first
815
+ values = KeyValueStream.new(key, self)
816
+ @fiber = Fiber.new{yield values}
817
+ while buf_min = @buffers.min_key
818
+ kv, buf = buf_min
819
+ if key == kv[0]
820
+ values.concat kv[1]
821
+ @fiber.resume
822
+ else
823
+ values.push_eos
824
+ @fiber.resume
825
+ key = kv[0]
826
+ values = KeyValueStream.new(key, self)
827
+ @fiber = Fiber.new{yield values}
828
+ values.concat kv[1]
829
+ @fiber.resume
830
+ end
831
+
832
+ unless line = read_line(buf.io)
833
+ buf.close!
834
+ @buffers.delete_min
835
+ next
836
+ end
837
+ buf_min[0] = line
838
+ @buffers.change_priority buf_min, line[0]
839
+ end
840
+ values.push_eos
841
+ @fiber.resume
842
+ end
843
+ end
844
+
845
+ def each_2ndmemory(&block)
846
+ unless @key_values.empty?
847
+ store_2ndmemory(@key_values)
848
+ @key_values = nil
849
+ end
850
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
851
+
852
+ stst = StSt.new(@buffers)
853
+ @buffers = nil
854
+ stst.each(&block)
855
+ end
856
+ end
857
+
858
+ class DirectOnMemoryBuffer
859
+
860
+ def initialize(njob, policy)
861
+ @njob = njob
862
+ @policy = policy
863
+
864
+ @key_values = []
865
+ @key_values_mutex = Mutex.new
866
+
867
+ @CHUNK_SIZE = policy[:chunk_size]
868
+ @CHUNK_SIZE ||= CONF.GROUP_BY_CMSB_CHUNK_SIZE
869
+
870
+ @log_id = format("%s[%s]", self.class.name.sub(/Fairy::/, ''), @njob.id)
871
+ end
872
+
873
+ attr_accessor :log_id
874
+
875
+ def push(value)
876
+ @key_values_mutex.synchronize do
877
+ @key_values.push value
878
+ end
879
+ end
880
+
881
+ def each(&block)
882
+ # @key_values = @key_values.collect{|e| [@njob.hash_key(e), e]}.group_by{|k, e| k}.sort_by{|k, e| k}
883
+ @key_values = @key_values.group_by{|e| @njob.hash_key(e)}.sort_by{|k, e| k}.collect{|k, values| kvs = KeyValueStream.new(k, nil); kvs.concat(values); kvs.push_eos; kvs}
884
+ @key_values.each &block
885
+ end
886
+ end
887
+
888
# Group-by buffer that spills to temporary files once more than @threshold
# values are held in memory, and merges the key-sorted files when iterated.
class DirectMergeSortBuffer<DirectOnMemoryBuffer
  # policy[:threshold] overrides CONF.GROUP_BY_CMSB_THRESHOLD.
  def initialize(njob, policy)
    super

    @threshold = policy[:threshold]
    @threshold ||= CONF.GROUP_BY_CMSB_THRESHOLD

    # Stays nil until the first spill; #each uses this to pick its path.
    @buffers = nil
  end

  # Lazily prepares the spill state: loads the tempfile support, resolves
  # the buffer directory (policy[:buffer_dir] or CONF.TMP_DIR) and creates
  # the list of spill files.
  def init_2ndmemory
    require "fairy/share/fast-tempfile"

    @buffer_dir = @policy[:buffer_dir]
    @buffer_dir ||= CONF.TMP_DIR

    @buffers = []
  end

  # Opens a new FastTempfile and registers it in @buffers.
  # With a block, yields the underlying IO and closes the buffer afterwards;
  # without a block, returns the buffer.
  def open_buffer(&block)
    unless @buffers
      init_2ndmemory
    end
    buffer = FastTempfile.open("mod-group-by-buffer-#{@njob.no}-", @buffer_dir)
    @buffers.push buffer
    if block_given?
      begin
        # Workaround for Ruby bug #2390.
        # yield buffer
        yield buffer.io
      ensure
        buffer.close
      end
    else
      buffer
    end
  end

  # Pushes the value, then spills the in-memory values to a new buffer file
  # when their count exceeds @threshold. The spill runs while
  # @key_values_mutex is held.
  def push(value)
    super

    key_values = nil
    @key_values_mutex.synchronize do
      if @key_values.size > @threshold
        key_values = @key_values
        @key_values = []
      end
      if key_values
        store_2ndmemory(key_values)
      end
    end
  end

  # Sorts the values by @njob.hash_key and writes them to a new buffer file
  # as a sequence of Marshal-dumped arrays of at most @CHUNK_SIZE elements.
  def store_2ndmemory(key_values)
    Log::debug(self, "START STORE")
    key_values = key_values.sort_by{|e| @njob.hash_key(e)}

    open_buffer do |io|
      key_values.each_slice(@CHUNK_SIZE) do |ary|
        Marshal.dump(ary, io)
      end
    end
    Log::debug(self, "FINISH STORE")
  end

  # Iterates via the spill files if anything was spilled, otherwise falls
  # back to the in-memory implementation of the superclass.
  def each(&block)
    if @buffers
      each_2ndmemory(&block)
    else
      super
    end
  end

  # Spills whatever is still in memory, then yields one KeyValueStream per
  # key by merging all buffer files.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = Merger.new(@njob, @buffers)
    m.each(&block)
  end

  # Merges several key-sorted buffers. @buffers is kept ordered by the
  # current key of each buffer, so the first entry always holds the
  # smallest key.
  class Merger
    # buffers::             spill files to merge.
    # cached_buffer_class:: wrapper class used to read each file.
    def initialize(njob, buffers, cached_buffer_class = CachedBuffer)
      @njob = njob
      @buffers = buffers.collect{|buf| cached_buffer_class.new(@njob, buf)}.select{|buf| !buf.eof?}.sort_by{|buf| buf.key}

      @key = nil
    end

    # Yields a KeyValueStream for the smallest remaining key until no buffer
    # is left. Buffers only advance when the yielded stream is consumed
    # (through each_by_key or get_buf).
    def each(&block)
      while !@buffers.empty?
        @key = @buffers.first.key
        values = KeyValueStream.new(@key, self)
        block.call values
      end
    end

    # Yields every value belonging to the current key (@key), taking them
    # from each buffer whose current key matches, and re-inserts buffers at
    # the position matching their next key.
    def each_by_key(&block)
      while buf_min = @buffers.shift
        vv_key = buf_min.key
        unless @key == vv_key
          @buffers.unshift buf_min
          return
        end

        buf_min.each_by_same_key(&block)

        if buf_min.eof?
          buf_min.close!
          next
        end

        if vv_key == buf_min.key
          @buffers.unshift(buf_min)
        else
          idx = @buffers.rindex{|buf| buf.key <= buf_min.key}
          idx ? @buffers.insert(idx+1, buf_min) : @buffers.unshift(buf_min)
        end
      end
    end

    # Refills +values+ with the next run of values for the current key, or
    # pushes the end-of-stream marker when no buffer holds that key any more.
    def get_buf(values)
      unless buf_min = @buffers.shift
        values.push_eos
        return
      end

      vv_key = buf_min.key
      unless @key == vv_key
        values.push_eos
        @buffers.unshift buf_min
        return
      end

      vv = buf_min.shift_values
      if vv
        values.concat vv
      end
      if buf_min.eof?
        buf_min.close!
        return
      end

      idx = @buffers.rindex{|buf| buf.key <= buf_min.key}
      idx ? @buffers.insert(idx+1, buf_min) : @buffers.unshift(buf_min)
    end
  end

  # Reads one spill file chunk by chunk (one Marshal-dumped array at a time)
  # and tracks the key of the next unread value.
  class CachedBuffer
    extend Forwardable

    def initialize(njob, io)
      @njob = njob
      @io = io
      io.open

      @cache = []
      @cache_pv = 0

      @eof = false
      @key = nil

      read_buffer
      # An empty file leaves the cache empty (eof? is then true); there is
      # no first value to derive a key from in that case.
      @key = @njob.hash_key(@cache.first) unless @cache.empty?
    end

    def_delegator :@io, :open
    def_delegator :@io, :close
    def_delegator :@io, :close!

    # True once a read hit the end of the file.
    def eof?
      @eof
    end

    # Key of the next unread value.
    def key
      @key
    end

    # Yields consecutive values whose key equals the current key, loading
    # further chunks as needed, then records the key of the next value.
    def each_by_same_key(&block)
      if @cache.size <= @cache_pv
        read_buffer
        return if @cache.empty?
      end

      while @njob.hash_key(@cache[@cache_pv]) == @key
        block.call @cache[@cache_pv]
        @cache_pv += 1

        if @cache.size <= @cache_pv
          read_buffer
          return if @cache.empty?
        end
      end
      @key = @njob.hash_key(@cache[@cache_pv])
    end

    # Removes and returns the leading values of the cached chunk that share
    # the current key (the whole chunk if all of them do). Returns nil when
    # the file is exhausted.
    def shift_values
      if @cache.empty?
        read_buffer
        return nil if @cache.empty?
      end

      idx = @cache.index{|v| @njob.hash_key(v) != @key}
      if idx
        vv = @cache.slice!(0, idx)
        @key = @njob.hash_key(@cache.first)
      else
        vv = @cache
        @cache = []
      end
      vv
    end

    # Loads the next chunk into @cache and resets the read position.
    # At end of file the cache becomes empty and eof? turns true.
    # Re-raises ArgumentError from Marshal.load after logging nearby bytes.
    def read_buffer
      io = @io.io
      begin
        @cache = Marshal.load(io)
      rescue EOFError
        @eof = true
        @cache = []
      rescue ArgumentError
        Log::debug(self, "MARSHAL ERROR OCCURED!!")
        # Dump the bytes around the failure. The offset is clamped to the
        # start of the file: seeking before offset 0 raises Errno::EINVAL,
        # which would hide the ArgumentError being reported here.
        io.seek([io.pos - 1024, 0].max, IO::SEEK_SET)
        buf = io.read(2048)
        Log::debugf(self, "File Contents: %s", buf)
        raise
      end
      # @key = @njob.hash_key(@cache.first)
      @cache_pv = 0
    end

  end

  # Stream of the values of one key, backed by a merger that supplies the
  # values on demand.
  class KeyValueStream
    include Enumerable

    # Marker pushed into the stream after the last value.
    EOS = :__KEY_VALUE_STREAM_EOS__

    def initialize(key, merger)
      @key = key
      @merger = merger

      @buf = []
    end
    attr_reader :key

    def push(e)
      @buf.push e
    end
    alias enq push

    def push_eos
      push EOS
    end

    def concat(elements)
      @buf.concat elements
    end

    # Returns the next value, asking the merger for more while the local
    # buffer is empty. Returns EOS once the key is exhausted.
    def shift
      while @buf.empty?
        @merger.get_buf(self)
      end
      @buf.shift
    end
    alias deq shift
    alias pop shift

    # Yields every value of this key straight from the merger.
    def each(&block)
      @merger.each_by_key(&block)
    end

    # Counts the values by iterating over them with #each.
    def size
      c = 0
      each{|v| c += 1}
      c
    end
  end
end
1171
+
1172
# Variant of DirectMergeSortBuffer whose buffer reader walks each spill
# file inside a Fiber.
class DirectFBMergeSortBuffer<DirectMergeSortBuffer
  # Same steps as the parent implementation. It is defined again here so
  # that the bare constant +Merger+ resolves to
  # DirectFBMergeSortBuffer::Merger.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = Merger.new(@njob, @buffers)
    m.each(&block)
  end

  # Merger that wraps every buffer in the fiber-based CachedBuffer below.
  class Merger<DirectMergeSortBuffer::Merger
    def initialize(njob, buffers)
      # The parent constructor performs the wrap/filter/sort; only the
      # reader class differs.
      super(njob, buffers, CachedBuffer)
    end
  end

  # Buffer reader whose traversal state lives in a Fiber: each call to
  # each_by_same_key resumes the fiber, which suspends again when it meets
  # a value with a different key. Chunks are loaded with the inherited
  # read_buffer.
  class CachedBuffer<DirectMergeSortBuffer::CachedBuffer
    def initialize(njob, io)
      super

      @each_fb = Fiber.new{|block| each_sub(block)}
    end

    #   def key
    #     if @cache.empty?
    #       read_buffer
    #     end
    #     @key
    #   end

    # Yields the values of the current key by resuming the traversal fiber
    # with the caller's block.
    def each_by_same_key(&block)
      @each_fb.resume(block)
    end

    # Fiber body: walks every chunk of the file. When the key changes it
    # records the new key and suspends; the block passed to the next resume
    # receives the following values.
    def each_sub(block)
      if @cache.empty?
        read_buffer
        return if @cache.empty?
      end

      while !@cache.empty?
        @cache.each do |e|
          unless @njob.hash_key(e) == @key
            @key = @njob.hash_key(e)
            block = Fiber.yield
          end
          block.call e
        end
        read_buffer
      end
    end
  end
end
1250
+
1251
# Variant of DirectMergeSortBuffer whose merger keeps the open buffers in a
# PriorityQueue (from the "priority_queue" library) ordered by their
# current key.
class DirectPQMergeSortBuffer<DirectMergeSortBuffer

  def initialize(njob, policy)
    # Loaded here so the library is only needed when this buffer is used.
    require "priority_queue"
    super
  end

  # Same steps as the parent implementation. It is defined again here so
  # that the bare constant +Merger+ resolves to
  # DirectPQMergeSortBuffer::Merger.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = Merger.new(@njob, @buffers)
    m.each(&block)
  end

  class Merger<DirectMergeSortBuffer::Merger

    # Wraps every buffer in a DirectMergeSortBuffer::CachedBuffer and queues
    # the non-empty ones with their current key as priority.
    def initialize(njob, buffers)
      @njob = njob
      @buffers = PriorityQueue.new
      buffers.each{|buf|
        cb = DirectMergeSortBuffer::CachedBuffer.new(@njob, buf)
        next if cb.eof?
        @buffers.push cb, cb.key
      }

      @key = nil
    end

    # Yields a KeyValueStream for the smallest remaining key until the
    # queue is empty.
    def each(&block)
      while !@buffers.empty?
        @key = @buffers.min_key.key
        values = DirectMergeSortBuffer::KeyValueStream.new(@key, self)
        block.call values
      end
    end

    # Yields every value belonging to the current key (@key) from all
    # queued buffers whose current key matches.
    def each_by_key(&block)
      while buf_min = @buffers.delete_min_return_key
        vv_key = buf_min.key
        unless @key == vv_key
          @buffers.push buf_min, buf_min.key
          return
        end

        buf_min.each_by_same_key(&block)

        if buf_min.eof?
          buf_min.close!
          # Keep going: other buffers may still hold values for this key.
          # Returning here would end the group early and make #each emit a
          # second stream for the same key.
          next
        end

        @buffers.push buf_min, buf_min.key
      end
    end


    # Refills +values+ with the next run of values for the current key, or
    # pushes the end-of-stream marker when no queued buffer holds that key.
    def get_buf(values)
      unless buf_min = @buffers.delete_min_return_key
        values.push_eos
        return
      end

      vv_key = buf_min.key
      unless @key == vv_key
        values.push_eos
        @buffers.push buf_min, buf_min.key
        return
      end

      vv = buf_min.shift_values
      if vv
        values.concat vv
      end
      if buf_min.eof?
        buf_min.close!
        return
      end

      @buffers.push buf_min, buf_min.key
    end
  end
end
1338
+
1339
# Merge-sort buffer that writes one Marshal record per value list and
# merges the spill files with DirectMergeSortBuffer::Merger.
class DirectKBMergeSortBuffer<CommandMergeSortBuffer

  # Sorts the entries by their first element and dumps every value list of
  # every entry as its own Marshal record into a new buffer file.
  # NOTE(review): entries appear to be [key, list-of-value-lists] pairs --
  # confirm against the code that fills @key_values.
  def store_2ndmemory(key_values)
    Log::debug(self, "START STORE")
    sorted = key_values.sort_by{|e| e.first}

    open_buffer do |io|
      sorted.each do |_key, vv|
        vv.each do |values|
          Marshal.dump(values, io)
        end
      end
    end
    Log::debug(self, "FINISH STORE")
  end

  # Spills whatever is still in memory, then merges all buffer files using
  # this class's CachedBuffer as the reader.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = DirectMergeSortBuffer::Merger.new(@njob, @buffers, CachedBuffer)
    m.each(&block)
  end

  # Reads a spill file one Marshal record at a time; the current key is
  # derived from the first element of the loaded record.
  class CachedBuffer
    extend Forwardable

    def initialize(njob, io)
      @njob = njob
      @io = io
      io.open

      @cache = []

      @eof = false
      @key = nil

      read_buffer
      # An empty file leaves the cache empty (eof? is then true); there is
      # no first value to derive a key from in that case.
      @key = @njob.hash_key(@cache.first) unless @cache.empty?
    end

    def_delegator :@io, :open
    def_delegator :@io, :close
    def_delegator :@io, :close!

    # True once a read hit the end of the file.
    def eof?
      @eof
    end

    # Key of the record currently held in the cache.
    def key
      @key
    end

    # Yields the values of the cached record and of every following record
    # with the same key. Stops at end of file or at the first record with a
    # different key, which stays cached and defines the new current key.
    def each_by_same_key(&block)
      loop do
        @cache.each(&block)
        read_buffer
        return if @cache.empty?
        unless @njob.hash_key(@cache.first) == @key
          @key = @njob.hash_key(@cache.first)
          return
        end
      end
    end

    # Loads the next record into @cache. At end of file the cache becomes
    # empty and eof? turns true. Re-raises ArgumentError from Marshal.load
    # after logging nearby bytes.
    def read_buffer
      io = @io.io
      begin
        @cache = Marshal.load(io)
      rescue EOFError
        @eof = true
        @cache = []
      rescue ArgumentError
        Log::debug(self, "MARSHAL ERROR OCCURED!!")
        # Dump the bytes around the failure. The offset is clamped to the
        # start of the file: seeking before offset 0 raises Errno::EINVAL,
        # which would hide the ArgumentError being reported here.
        io.seek([io.pos - 1024, 0].max, IO::SEEK_SET)
        buf = io.read(2048)
        Log::debugf(self, "File Contents: %s", buf)
        raise
      end
    end
  end
end
1425
+
1426
# Variant of DirectKBMergeSortBuffer that packs several value lists into
# one Marshal record instead of writing one record per list.
class DirectKB2MergeSortBuffer<DirectKBMergeSortBuffer
  # Sorts the entries by their first element and writes their value lists
  # to a new buffer file. Lists are collected into an array that is dumped
  # as one record once the lists gathered so far hold at least @CHUNK_SIZE
  # elements in total; a final partial array is dumped at the end.
  # NOTE(review): entries appear to be [key, list-of-value-lists] pairs --
  # confirm against the code that fills @key_values.
  def store_2ndmemory(key_values)
    Log::debug(self, "START STORE")
    sorted = key_values.sort_by{|e| e.first}

    open_buffer do |io|
      tmpary = []
      tmpary_sz = 0
      sorted.each do |_key, vv|
        vv.each do |values|
          if tmpary_sz >= @CHUNK_SIZE
            Marshal.dump(tmpary, io)
            tmpary = []
            tmpary_sz = 0
          end
          tmpary.push values
          tmpary_sz += values.size
        end
      end
      if tmpary_sz > 0
        Marshal.dump(tmpary, io)
      end
    end
    Log::debug(self, "FINISH STORE")
  end

  # Spills whatever is still in memory, then merges all buffer files using
  # this class's CachedBuffer as the reader. Defined again here so that the
  # bare constant +CachedBuffer+ resolves to
  # DirectKB2MergeSortBuffer::CachedBuffer.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = DirectMergeSortBuffer::Merger.new(@njob, @buffers, CachedBuffer)
    m.each(&block)
  end

  # Reads a spill file whose records are arrays of value lists; the current
  # key is derived from the first value of the first cached list.
  class CachedBuffer
    extend Forwardable

    def initialize(njob, io)
      @njob = njob
      @io = io
      io.open

      @cache = []

      @eof = false
      @key = nil

      read_buffer
      # An empty file leaves the cache empty (eof? is then true). Without
      # this guard @cache.first is nil and calling #first on it raises
      # NoMethodError before the merger can discard the buffer.
      @key = @njob.hash_key(@cache.first.first) unless @cache.empty?
    end

    def_delegator :@io, :open
    def_delegator :@io, :close
    def_delegator :@io, :close!

    # True once a read hit the end of the file.
    def eof?
      @eof
    end

    # Key of the next unread value list.
    def key
      @key
    end

    # Yields the values of every consecutive value list whose key equals
    # the current key, loading further records as needed. Stops at end of
    # file or at the first list with a different key, which stays cached
    # and defines the new current key.
    def each_by_same_key(&block)
      loop do
        while vv = @cache.shift
          unless @njob.hash_key(vv.first) == @key
            # Not ours: put it back for the next call.
            @cache.unshift vv
            @key = @njob.hash_key(vv.first)
            return
          end
          vv.each(&block)
        end
        read_buffer
        return if @cache.empty?
        unless @njob.hash_key(@cache.first.first) == @key
          @key = @njob.hash_key(@cache.first.first)
          return
        end
      end
    end

    # Loads the next record into @cache. At end of file the cache becomes
    # empty and eof? turns true. Re-raises ArgumentError from Marshal.load
    # after logging nearby bytes.
    def read_buffer
      io = @io.io
      begin
        @cache = Marshal.load(io)
      rescue EOFError
        @eof = true
        @cache = []
      rescue ArgumentError
        Log::debug(self, "MARSHAL ERROR OCCURED!!")
        # Dump the bytes around the failure. The offset is clamped to the
        # start of the file: seeking before offset 0 raises Errno::EINVAL,
        # which would hide the ArgumentError being reported here.
        io.seek([io.pos - 1024, 0].max, IO::SEEK_SET)
        buf = io.read(2048)
        Log::debugf(self, "File Contents: %s", buf)
        raise
      end
    end
  end
end
1530
+ end
1531
+ end
1532
+
1533
+
1534
+