fairy 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. data/LICENSE +674 -0
  2. data/Makefile +116 -0
  3. data/README +15 -0
  4. data/bin/fairy +582 -0
  5. data/bin/fairy-cat +74 -0
  6. data/bin/fairy-cp +128 -0
  7. data/bin/fairy-rm +122 -0
  8. data/bin/subcmd/controller +41 -0
  9. data/bin/subcmd/inspector +81 -0
  10. data/bin/subcmd/master +43 -0
  11. data/bin/subcmd/node +47 -0
  12. data/bin/subcmd/processor +54 -0
  13. data/doc/programming-interface.html +240 -0
  14. data/doc/programming-interface.rd +300 -0
  15. data/etc/fairy.conf.tmpl +118 -0
  16. data/ext/simple_hash/extconf.rb +4 -0
  17. data/ext/simple_hash/simple_hash.c +42 -0
  18. data/fairy.gemspec +60 -0
  19. data/lib/fairy/client/addins.rb +20 -0
  20. data/lib/fairy/client/barrier.rb +29 -0
  21. data/lib/fairy/client/basic-group-by.rb +52 -0
  22. data/lib/fairy/client/cat.rb +41 -0
  23. data/lib/fairy/client/direct-product.rb +51 -0
  24. data/lib/fairy/client/equijoin.rb +79 -0
  25. data/lib/fairy/client/exec.rb +54 -0
  26. data/lib/fairy/client/filter.rb +62 -0
  27. data/lib/fairy/client/find.rb +35 -0
  28. data/lib/fairy/client/group-by.rb +194 -0
  29. data/lib/fairy/client/here.rb +84 -0
  30. data/lib/fairy/client/inject.rb +70 -0
  31. data/lib/fairy/client/input-file.rb +53 -0
  32. data/lib/fairy/client/input-iota.rb +49 -0
  33. data/lib/fairy/client/input-local-file.rb +188 -0
  34. data/lib/fairy/client/input-varray.rb +30 -0
  35. data/lib/fairy/client/input.rb +42 -0
  36. data/lib/fairy/client/io-filter.rb +26 -0
  37. data/lib/fairy/client/junction.rb +31 -0
  38. data/lib/fairy/client/map.rb +34 -0
  39. data/lib/fairy/client/merge-group-by.rb +71 -0
  40. data/lib/fairy/client/output-file.rb +64 -0
  41. data/lib/fairy/client/output-local-file.rb +60 -0
  42. data/lib/fairy/client/output-null.rb +47 -0
  43. data/lib/fairy/client/output-varray.rb +50 -0
  44. data/lib/fairy/client/output.rb +29 -0
  45. data/lib/fairy/client/roma-put.rb +62 -0
  46. data/lib/fairy/client/roma.rb +156 -0
  47. data/lib/fairy/client/seg-join.rb +61 -0
  48. data/lib/fairy/client/seg-map.rb +78 -0
  49. data/lib/fairy/client/seg-shuffle.rb +35 -0
  50. data/lib/fairy/client/seg-split.rb +27 -0
  51. data/lib/fairy/client/seg-zip.rb +60 -0
  52. data/lib/fairy/client/select.rb +38 -0
  53. data/lib/fairy/client/sort.rb +48 -0
  54. data/lib/fairy/client/sort18.rb +56 -0
  55. data/lib/fairy/client/sort19.rb +61 -0
  56. data/lib/fairy/client/there.rb +47 -0
  57. data/lib/fairy/client/top_n_into_roma.rb +34 -0
  58. data/lib/fairy/client/wc.rb +92 -0
  59. data/lib/fairy/controller.rb +1103 -0
  60. data/lib/fairy/logger.rb +107 -0
  61. data/lib/fairy/master/addins.rb +20 -0
  62. data/lib/fairy/master/atom.rb +17 -0
  63. data/lib/fairy/master/c-barrier.rb +283 -0
  64. data/lib/fairy/master/c-basic-group-by.rb +250 -0
  65. data/lib/fairy/master/c-cat.rb +159 -0
  66. data/lib/fairy/master/c-direct-product.rb +203 -0
  67. data/lib/fairy/master/c-exec.rb +68 -0
  68. data/lib/fairy/master/c-filter.rb +422 -0
  69. data/lib/fairy/master/c-find.rb +138 -0
  70. data/lib/fairy/master/c-group-by.rb +64 -0
  71. data/lib/fairy/master/c-here.rb +80 -0
  72. data/lib/fairy/master/c-inject.rb +119 -0
  73. data/lib/fairy/master/c-input-file.rb +46 -0
  74. data/lib/fairy/master/c-input-iota.rb +66 -0
  75. data/lib/fairy/master/c-input-local-file.rb +117 -0
  76. data/lib/fairy/master/c-input-varray.rb +53 -0
  77. data/lib/fairy/master/c-input.rb +24 -0
  78. data/lib/fairy/master/c-inputtable.rb +31 -0
  79. data/lib/fairy/master/c-inputtable18.rb +36 -0
  80. data/lib/fairy/master/c-inputtable19.rb +35 -0
  81. data/lib/fairy/master/c-io-filter.rb +28 -0
  82. data/lib/fairy/master/c-junction.rb +54 -0
  83. data/lib/fairy/master/c-map.rb +27 -0
  84. data/lib/fairy/master/c-merge-group-by.rb +241 -0
  85. data/lib/fairy/master/c-output-file.rb +84 -0
  86. data/lib/fairy/master/c-output-local-file.rb +19 -0
  87. data/lib/fairy/master/c-output-null.rb +45 -0
  88. data/lib/fairy/master/c-output-varray.rb +57 -0
  89. data/lib/fairy/master/c-output.rb +20 -0
  90. data/lib/fairy/master/c-seg-join.rb +141 -0
  91. data/lib/fairy/master/c-seg-map.rb +26 -0
  92. data/lib/fairy/master/c-seg-shuffle.rb +87 -0
  93. data/lib/fairy/master/c-seg-split.rb +110 -0
  94. data/lib/fairy/master/c-seg-zip.rb +132 -0
  95. data/lib/fairy/master/c-select.rb +27 -0
  96. data/lib/fairy/master/c-sort.rb +108 -0
  97. data/lib/fairy/master/c-there.rb +57 -0
  98. data/lib/fairy/master/c-wc.rb +232 -0
  99. data/lib/fairy/master/job-interpriter.rb +19 -0
  100. data/lib/fairy/master/scheduler.rb +24 -0
  101. data/lib/fairy/master.rb +329 -0
  102. data/lib/fairy/node/addins.rb +19 -0
  103. data/lib/fairy/node/p-barrier.rb +95 -0
  104. data/lib/fairy/node/p-basic-group-by.rb +252 -0
  105. data/lib/fairy/node/p-direct-product.rb +153 -0
  106. data/lib/fairy/node/p-exec.rb +30 -0
  107. data/lib/fairy/node/p-filter.rb +363 -0
  108. data/lib/fairy/node/p-find.rb +111 -0
  109. data/lib/fairy/node/p-group-by.rb +1534 -0
  110. data/lib/fairy/node/p-here.rb +21 -0
  111. data/lib/fairy/node/p-identity.rb +24 -0
  112. data/lib/fairy/node/p-inject.rb +127 -0
  113. data/lib/fairy/node/p-input-file.rb +108 -0
  114. data/lib/fairy/node/p-input-iota.rb +39 -0
  115. data/lib/fairy/node/p-input-local-file.rb +61 -0
  116. data/lib/fairy/node/p-input-varray.rb +26 -0
  117. data/lib/fairy/node/p-io-filter.rb +28 -0
  118. data/lib/fairy/node/p-map.rb +40 -0
  119. data/lib/fairy/node/p-merger-group-by.rb +48 -0
  120. data/lib/fairy/node/p-output-file.rb +104 -0
  121. data/lib/fairy/node/p-output-local-file.rb +14 -0
  122. data/lib/fairy/node/p-output-null.rb +32 -0
  123. data/lib/fairy/node/p-output-varray.rb +41 -0
  124. data/lib/fairy/node/p-seg-join.rb +82 -0
  125. data/lib/fairy/node/p-seg-map.rb +34 -0
  126. data/lib/fairy/node/p-seg-split.rb +61 -0
  127. data/lib/fairy/node/p-seg-zip.rb +79 -0
  128. data/lib/fairy/node/p-select.rb +40 -0
  129. data/lib/fairy/node/p-single-exportable.rb +90 -0
  130. data/lib/fairy/node/p-sort.rb +195 -0
  131. data/lib/fairy/node/p-task.rb +113 -0
  132. data/lib/fairy/node/p-there.rb +44 -0
  133. data/lib/fairy/node/p-wc.rb +266 -0
  134. data/lib/fairy/node.rb +187 -0
  135. data/lib/fairy/processor.rb +510 -0
  136. data/lib/fairy/share/base-app.rb +114 -0
  137. data/lib/fairy/share/block-source.rb +234 -0
  138. data/lib/fairy/share/conf.rb +396 -0
  139. data/lib/fairy/share/debug.rb +21 -0
  140. data/lib/fairy/share/encoding.rb +17 -0
  141. data/lib/fairy/share/fast-tempfile.rb +93 -0
  142. data/lib/fairy/share/file-place.rb +176 -0
  143. data/lib/fairy/share/hash-1.rb +20 -0
  144. data/lib/fairy/share/hash-md5.rb +28 -0
  145. data/lib/fairy/share/hash-murmur.rb +69 -0
  146. data/lib/fairy/share/hash-rb18.rb +20 -0
  147. data/lib/fairy/share/hash-simple-hash.rb +28 -0
  148. data/lib/fairy/share/inspector.rb +16 -0
  149. data/lib/fairy/share/lc/exceptions.rb +82 -0
  150. data/lib/fairy/share/lc/ja/exceptions.rb +81 -0
  151. data/lib/fairy/share/locale.rb +17 -0
  152. data/lib/fairy/share/log.rb +215 -0
  153. data/lib/fairy/share/pool-dictionary.rb +53 -0
  154. data/lib/fairy/share/port-marshaled-queue.rb +347 -0
  155. data/lib/fairy/share/port.rb +1697 -0
  156. data/lib/fairy/share/reference.rb +45 -0
  157. data/lib/fairy/share/stdout.rb +56 -0
  158. data/lib/fairy/share/tr.rb +16 -0
  159. data/lib/fairy/share/varray.rb +147 -0
  160. data/lib/fairy/share/vfile.rb +183 -0
  161. data/lib/fairy/version.rb +8 -0
  162. data/lib/fairy.rb +206 -0
  163. data/sample/grep.rb +46 -0
  164. data/sample/ping.rb +19 -0
  165. data/sample/sort.rb +102 -0
  166. data/sample/wordcount.rb +61 -0
  167. data/spec/README +12 -0
  168. data/spec/fairy1_spec.rb +31 -0
  169. data/spec/fairy2_spec.rb +42 -0
  170. data/spec/fairy3_spec.rb +126 -0
  171. data/spec/fairy4_spec.rb +63 -0
  172. data/spec/fairy5_spec.rb +45 -0
  173. data/spec/fairy6_spec.rb +52 -0
  174. data/spec/fairy7_spec.rb +58 -0
  175. data/spec/fairy8_spec.rb +48 -0
  176. data/spec/mkdat.rb +148 -0
  177. data/spec/run_all.sh +65 -0
  178. data/test/testc.rb +7111 -0
  179. data/tools/cap_recipe/Capfile +144 -0
  180. data/tools/cap_recipe/cluster.yml.sample +14 -0
  181. data/tools/fairy_perf_graph.rb +444 -0
  182. data/tools/git-tag +44 -0
  183. data/tools/log-analysis.rb +62 -0
  184. data/tools/svn-ls-diff +38 -0
  185. data/tools/svn-tags +37 -0
  186. metadata +298 -0
@@ -0,0 +1,111 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # Copyright (C) 2007-2010 Rakuten, Inc.
4
+ #
5
+
6
+ require "fairy/node/p-io-filter"
7
+ require "fairy/node/p-single-exportable"
8
+
9
+ module Fairy
10
# Node-side filter for the local stage of a find: evaluates the user block
# against each input element and forwards the first element for which the
# block returns a usable truthy value. After that (or after #find_break) the
# remaining input is only read and discarded.
class PLocalFind<PSingleExportFilter
  Processor.def_export self

  # All arguments are handed to the superclass unchanged; block_source is
  # additionally kept so the user block can be built in #basic_each.
  def initialize(id, ntask, bjob, opts, block_source)
    super
    @block_source = block_source

    # @findp holds the last result of the find block (truthy once found).
    @findp = false
    @findp_mutex = Mutex.new
  end

  # Reads every element of @input and calls +block+ with the first element
  # accepted by the find block.
  def basic_each(&block)
    @find_proc = BBlock.new(@block_source, @context, self)

    @input.each do |e|
      # Once something has been found, just read (and discard) the rest.
      @findp_mutex.synchronize do
        next if @findp
        if !(@findp = @find_proc.yield(e))
          next
        elsif Import::CTLTOKEN_NULLVALUE === @findp
          # A null-value control token does not count as a match.
          @findp = false
          next
        end
        block.call e
      end
    end
  end

  #   def start
  #     super do
  #       @import.each do |e|
  #         # Once something has been found, just read (and discard) the rest.
  #         @find_mutex.synchronize do
  #           next if @find
  #           next unless find = @map_proc.yield(e)
  #           @export.push e
  #         end
  #       end
  #     end
  #   end

  # Marks the search as finished so #basic_each stops evaluating the find
  # block. Uses @findp_mutex, the mutex created in #initialize (the previous
  # code locked @find_mutex, which is never assigned and therefore nil).
  def find_break
    @findp_mutex.synchronize do
      @findp = true
    end
  end
end
58
+
59
+
60
# Collects the result of a find: takes the first element arriving on the
# input, publishes it through #value / #each, and notifies the job object.
class PFindResult<PIOFilter
  Processor.def_export self

  # Sentinel meaning "no value has been published yet".
  def initialize(*args)
    super

    @value = :__FAIRY_NO_VALUE__
    @value_mutex = Mutex.new
    @value_cv = ConditionVariable.new
  end

  # Setting the input immediately starts consuming it via #start_find.
  def input=(input)
    super

    start do
      self.start_find
    end
  end

  alias super_each each

  # Yields the single found value (blocks until it is available).
  def each(&block)
    block.call value
  end

  # Blocks until a value has been published, then returns it.
  def value
    @value_mutex.synchronize do
      while @value == :__FAIRY_NO_VALUE__
        @value_cv.wait(@value_mutex)
      end
      @value
    end
  end

  # Reads the whole input; the first element becomes the result and every
  # later element is read and discarded.
  def start_find
    find = false
    @input.each do |e|
      # Everything except the first element is read and discarded.
      next if find
      find = e

      # Publish under the same mutex #value waits on. Without the lock the
      # broadcast can fire between a waiter's check of @value and its call
      # to wait, leaving that waiter blocked forever.
      @value_mutex.synchronize do
        @value = find
        @value_cv.broadcast
      end
      #      @export.push find
      # This is a bit of a concern...
      #      @export.push END_OF_STREAM

      @bjob.update_find
    end
  end
end
111
+ end
@@ -0,0 +1,1534 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # Copyright (C) 2007-2010 Rakuten, Inc.
4
+ #
5
+
6
+ require "fairy/node/p-io-filter"
7
+ require "fairy/node/p-basic-group-by"
8
+
9
+ module Fairy
10
+ class PGroupBy<PBasicGroupBy
11
+
12
+ Processor.def_export self
13
+
14
# Sets up the export table, the segment modulus, the hash generator and the
# grouping-optimize flag. Values in +opts+ take precedence over CONF.
def initialize(id, ntask, bjob, opts, block_source)
  super

  @exports = []
  # Hash-like iteration over the export array: yields (index, export) for
  # every assigned slot and logs the unassigned ones.
  def @exports.each_pair(&block)
    each_with_index do |export, index|
      if export
        block.call(index, export)
      else
        Log::debug(self, "No assgined Export")
      end
    end
  end

  # @mod is the modulus applied to hash values in #hash_key.
  @mod = opts[:no_segment] || CONF.GROUP_BY_NO_SEGMENT
  Log::debug(self, "NO_SEGMENT: #{@mod}")

  hash_module = opts[:hash_module] || CONF.GROUP_BY_HASH_MODULE
  require hash_module
  @hash_generator = Fairy::HValueGenerator.new(bjob.hash_seed)

  default_optimize = CONF.GROUP_BY_GROUPING_OPTIMIZE
  @hash_optimize = opts.key?(:grouping_optimize) ? opts[:grouping_optimize] : default_optimize
end
39
+
40
# Maps element +e+ to its segment number: the key computed by the superclass
# is hashed and reduced modulo @mod. A null-value control token is returned
# untouched.
#
# The superclass key is computed once and reused; the previous code called
# +super+ a second time, evaluating the key computation twice per element.
def hash_key(e)
  key = super
  return key if Import::CTLTOKEN_NULLVALUE === key

  @hash_generator.value(key) % @mod
end
46
+
47
# Second stage of group-by: pushes every input element into a key/value
# buffer (class chosen by the buffering policy) and then emits one stream
# per key.
class PPostFilter<PSingleExportFilter
  Processor.def_export self

  # Keeps the block source and resolves the buffering policy; unless
  # CONF.BUG234 is set, also resolves the grouping-optimize flag
  # (opts override CONF).
  def initialize(id, ntask, bjob, opts, block_source)
    super
    @block_source = block_source

    @buffering_policy = @opts[:buffering_policy] || CONF.GROUP_BY_BUFFERING_POLICY

    unless CONF.BUG234
      @hash_optimize = CONF.GROUP_BY_GROUPING_OPTIMIZE
      @hash_optimize = opts[:grouping_optimize] if opts.key?(:grouping_optimize)
    end
  end

  #     def start
  #       super do
  # 	@key_value_buffer =
  # 	  eval("#{@buffering_policy[:buffering_class]}").new(@buffering_policy)
  # 	@hash_proc = BBlock.new(@block_source, @context, self)
  #
  # 	@import.each do |e|
  # 	  key = key(e)
  # 	  @key_value_buffer.push(key, e)
  # 	end
  # 	@key_value_buffer.each do |key, values|
  # 	  #Log::debug(self, key)
  # 	  @export.push [key, values]
  # 	end
  # 	@key_value_buffer = nil
  #       end
  #     end

  # Alternative implementation that groups the input by the element itself
  # and yields [key, values] pairs.
  def basic_each_0(&block)
    #       @key_value_buffer =
    # 	eval("#{@buffering_policy[:buffering_class]}").new(@buffering_policy)

    @hash_proc =
      if @hash_optimize
        eval("proc{#{@block_source.source}}")
      else
        BBlock.new(@block_source, @context, self)
      end

    grouped = @input.group_by{|e| e}
    grouped.each{|k, v| block.call [k, v]}
  end

  # Buffers the whole input, then calls +block+ once per buffered key
  # stream. The buffer is released afterwards.
  def basic_each(&block)
    buffer_class = eval("#{@buffering_policy[:buffering_class]}")
    @key_value_buffer = buffer_class.new(self, @buffering_policy)

    @hash_proc =
      if @hash_optimize
        eval("proc{#{@block_source.source}}")
      else
        BBlock.new(@block_source, @context, self)
      end

    @input.each do |element|
      @key_value_buffer.push(element)
      nil
    end
    @key_value_buffer.each do |stream|
      block.call stream
    end
    @key_value_buffer = nil
  end

  # Key of +e+ as computed by the user-supplied hash block.
  def hash_key(e)
    @hash_proc.yield(e)
  end
end
120
+
121
# A queue of values that all belong to one key. Producers append values and
# finally an end-of-stream marker; the consumer drains it with #each (or any
# Enumerable method). When the queue runs dry before the marker arrives the
# consumer gives up control with Fiber.yield, so draining a not-yet-complete
# stream has to happen inside a Fiber.
class KeyValueStream
  include Enumerable

  # Marker pushed after the last value of the stream.
  EOS = :__KEY_VALUE_STREAM_EOS__

  attr_reader :key

  # +generator+ is accepted for interface compatibility and not used here.
  def initialize(key, generator)
    @key = key
    @buf = []
  end

  # Appends a single value.
  def push(e)
    @buf.push e
  end
  alias enq push

  # Appends the end-of-stream marker.
  def push_eos
    push EOS
  end

  # Appends all +elements+ at once.
  def concat(elements)
    @buf.concat elements
  end

  # Removes and returns the oldest entry, yielding the current fiber for as
  # long as nothing is queued.
  def shift
    Fiber.yield while @buf.empty?
    @buf.shift
  end
  alias deq shift
  alias pop shift

  # Calls +block+ with each value up to (not including) the EOS marker.
  # Values are consumed: a second pass sees only what was queued later.
  def each(&block)
    until (item = shift) == EOS
      block.call item
    end
  end

  # Counts the remaining values by draining the stream.
  def size
    inject(0) {|count, _| count + 1}
  end

  #       def inspect
  # 	"#{self.class}<#{super}>"
  #       end
end
171
+
172
# Keeps every pushed value in memory, grouped by key. Per key the values are
# stored as a list of chunks; a new chunk is started once the current one
# holds more than @CHUNK_SIZE values.
class OnMemoryBuffer
  attr_accessor :log_id

  # +njob+ supplies #hash_key and #id; +policy+ is only stored here.
  def initialize(njob, policy)
    @njob = njob
    @policy = policy

    @key_values = {}
    @key_values_mutex = Mutex.new

    @CHUNK_SIZE = CONF.GROUP_BY_CMSB_CHUNK_SIZE

    @log_id = format("%s[%s]", self.class.name.sub(/Fairy::/, ''), @njob.id)
  end

  # Files +value+ under the key computed by the job's hash_key.
  def push(value)
    key = @njob.hash_key(value)

    @key_values_mutex.synchronize do
      chunks = @key_values.key?(key) ? @key_values[key] : (@key_values[key] = [[]])
      chunks.push [] if @CHUNK_SIZE < chunks.last.size
      chunks.last.push value
    end
  end

  # Calls +block+ with one complete KeyValueStream per key.
  def each(&block)
    @key_values.each do |key, chunks|
      stream = KeyValueStream.new(key, nil)
      chunks.each{|chunk| stream.concat chunk}
      stream.push_eos
      block.call(stream)
    end
  end
end
208
+
209
# Spills every pushed value to a temporary file, one file per key, and reads
# the files back as KeyValueStreams.
class SimpleFileByKeyBuffer
  # policy[:buffer_dir] selects the directory for the temporary files;
  # CONF.TMP_DIR is used when it is not given.
  def initialize(njob, policy)
    require "tempfile"

    @njob = njob
    @policy = policy

    @key_file = {}
    @key_file_mutex = Mutex.new
    @buffer_dir = policy[:buffer_dir]
    @buffer_dir ||= CONF.TMP_DIR
  end

  # Marshals +value+ onto the end of the file belonging to its key.
  def push(value)
    key = @njob.hash_key(value)

    @key_file_mutex.synchronize do
      unless @key_file.key?(key)
        @key_file[key] = Tempfile.open("mod-group-by-buffer-#{@njob.no}-", @buffer_dir)
      end

      # Workaround for Ruby BUG#2390: dump to the File wrapped by the
      # Tempfile instead of the Tempfile itself.
      #	  Marshal.dump(value, @key_file[key])
      # @tmpfile is a private detail of Tempfile; when a Ruby version does
      # not set it, fall back to the Tempfile so the dump still has an IO.
      file = @key_file[key]
      io = file.instance_variable_get(:@tmpfile) || file
      Marshal.dump(value, io)
    end
  end

  # Yields one KeyValueStream per key, loaded completely from its file.
  def each(&block)
    @key_file.each do |key, file|
      values = KeyValueStream.new(key, nil)
      file.rewind
      until file.eof?
        values.push Marshal.load(file)
      end
      values.push_eos
      #	  file.close

      yield values
    end
  end
end
250
+
251
# Writes every pushed value as one text line "<key> <value>" (both
# marshalled and base64-encoded, newlines replaced by ":") into a single
# temporary file and groups the lines with the external `sort` command.
class SimpleCommandSortBuffer
  # policy[:buffer_dir] selects the directory of the temporary file;
  # CONF.TMP_DIR is used when it is not given.
  def initialize(njob, policy)
    require "fairy/share/fast-tempfile"

    @njob = njob
    @policy = policy

    @buffer_dir = policy[:buffer_dir]
    @buffer_dir ||= CONF.TMP_DIR
    @buffer = FastTempfile.open("mod-group-by-buffer--#{@njob.no}", @buffer_dir)
    @buffer_mutex = Mutex.new
  end

  # Appends one encoded "<key> <value>" line for +value+.
  def push(value)
    key = @njob.hash_key(value)

    @buffer_mutex.synchronize do
      @buffer.io << [Marshal.dump(key)].pack("m").tr("\n", ":")
      @buffer.io << " "
      @buffer.io << [Marshal.dump(value)].pack("m").tr("\n", ":")
      @buffer.io << "\n"
    end
  end

  # Sorts the file and yields one KeyValueStream per run of equal keys.
  def each(&block)
    buffile = @buffer.path
    @buffer.close
    # Argument-array form: the path is handed to sort verbatim, without a
    # shell interpreting spaces or metacharacters in it.
    IO::popen(["sort", buffile]) do |io|
      key = nil
      values = nil
      io.each do |line|

        #Log::debug(self, line)

        mk, mv = line.split(" ")
        k = Marshal.load(mk.tr(":", "\n").unpack("m").first)
        v = Marshal.load(mv.tr(":", "\n").unpack("m").first)
        # `values` is checked first: for the very first line no stream
        # exists yet, and a first key of nil would otherwise compare equal
        # to the initial nil and push onto a missing stream.
        if values && key == k
          values.push v
        else
          if values
            values.push_eos
            yield values
          end
          values = KeyValueStream.new(k, self)
          key = k
          values.push v
        end
      end
      if values
        values.push_eos
        yield values
      end
    end
  end
end
307
+
308
# OnMemoryBuffer that spills to disk: once more than @threshold values have
# been pushed, the in-memory table is written to a key-sorted temporary file.
# On iteration the files are merged with the external `sort -m` command.
class CommandMergeSortBuffer<OnMemoryBuffer
  # policy[:threshold] is the number of pushed values that triggers a
  # spill; CONF.GROUP_BY_CMSB_THRESHOLD is used when it is not given.
  def initialize(njob, policy)
    super

    @key_values_size = 0

    @threshold = policy[:threshold]
    @threshold ||= CONF.GROUP_BY_CMSB_THRESHOLD

    # nil until the first spill happens.
    @buffers = nil
  end

  # Prepares spilling: loads the tempfile helper and resolves the
  # directory (policy[:buffer_dir], else CONF.TMP_DIR).
  def init_2ndmemory
    require "fairy/share/fast-tempfile"

    @buffer_dir = @policy[:buffer_dir]
    @buffer_dir ||= CONF.TMP_DIR

    @buffers = []
  end

  # Creates and registers a new spill file. With a block, yields the file's
  # IO and closes the file afterwards; without one, returns the file.
  def open_buffer(&block)
    unless @buffers
      init_2ndmemory
    end
    buffer = FastTempfile.open("mod-group-by-buffer-#{@njob.no}-", @buffer_dir)
    @buffers.push buffer
    if block_given?
      begin
        # Workaround for Ruby BUG#2390.
        #	    yield buffer
        yield buffer.io
      ensure
        buffer.close
      end
    else
      buffer
    end
  end

  # Stores +value+ in memory and spills the table when the threshold is
  # exceeded.
  def push(value)
    super

    key_values = nil
    @key_values_mutex.synchronize do
      # Counted under the mutex so concurrent pushes cannot lose updates.
      @key_values_size += 1
      if @key_values_size > @threshold
        key_values = @key_values
        @key_values_size = 0
        @key_values = {}
      end
      if key_values
        store_2ndmemory(key_values)
      end
    end
  end

  # Writes +key_values+ to a new spill file, one "<key>\t<chunks>" line per
  # key (both sides marshalled + base64, newlines replaced by ":"), sorted
  # by the encoded key.
  def store_2ndmemory(key_values)
    Log::info(self, "start store")
    sorted = key_values.collect{|key, values|
      [[Marshal.dump(key)].pack("m").tr("\n", ":"),
        [Marshal.dump(values)].pack("m").tr("\n", ":")]}.sort_by{|e| e.first}

    open_buffer do |io|
      sorted.each do |k, v|
        io.puts "#{k}\t#{v}"
      end
    end
    sorted = nil
    Log::info(self, "end store")
  end

  # Iterates from the spill files if any exist, otherwise from memory.
  def each(&block)
    if @buffers
      each_2ndmemory(&block)
    else
      super
    end
  end

  # Spills what is still in memory, merges all spill files and yields one
  # KeyValueStream per key.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
    end

    paths = @buffers.collect{|b| b.path}
    Log::debug(self, paths.join(" "))

    # Argument-array form: paths reach sort verbatim, no shell involved.
    IO::popen(["sort", "-m", "-k1,1"] + paths) do |io|
      key = nil
      values = nil
      io.each do |line|
        mk, mv = line.split(/\s+/)
        k = Marshal.load(mk.tr(":", "\n").unpack("m").first)
        # The value side is the list of chunks stored for the key.
        chunks = Marshal.load(mv.tr(":", "\n").unpack("m").first)
        # `values` is checked first so a first key of nil does not match
        # the initial nil while no stream exists yet.
        unless values && key == k
          if values
            values.push_eos
            yield values
          end
          key = k
          values = KeyValueStream.new(key, self)
        end
        # Flatten chunk by chunk so the stream carries plain values, the
        # same shape OnMemoryBuffer#each produces.
        chunks.each{|chunk| values.concat chunk}
      end
      if values
        values.push_eos
        yield values
      end
    end
  end
end
421
+
422
# Spilling buffer whose files hold raw Marshal records (key, chunk, key,
# chunk, ...) sorted by key; the files are merged in Ruby by StSt.
class MergeSortBuffer<CommandMergeSortBuffer
  # Merges several key-sorted spill files into per-key streams. Each stream
  # is handed to the consumer inside a Fiber so the consumer can drain it
  # while further records for the key are still being merged in.
  class StSt
    # Opens every buffer and reads its first record; buffers without any
    # record are dropped, the rest are ordered by their current key.
    def initialize(buffers)
      heads = buffers.collect{|buf|
        buf.open
        [read_line(buf.io), buf]
      }
      @buffers = heads.select{|kv, buf| !kv.nil?}.sort_by{|kv, buf| kv[0]}

      @fiber = nil
    end

    # Yields one KeyValueStream per key, in key order.
    def each(&block)
      key = @buffers.first.first.first
      values = KeyValueStream.new(key, self)
      @fiber = Fiber.new{yield values}
      while buf_min = @buffers.shift
        kv, buf = buf_min
        unless key == kv[0]
          # Key changed: finish the current stream, start the next one.
          values.push_eos
          @fiber.resume
          key = kv[0]
          values = KeyValueStream.new(key, self)
          @fiber = Fiber.new{yield values}
        end
        values.concat kv[1]
        @fiber.resume

        line = read_line(buf.io)
        unless line
          # This file is exhausted.
          buf.close!
          next
        end
        # Re-insert the entry behind the last one whose key is <= the new key.
        idx = @buffers.rindex{|other, b| other[0] <= line[0]}
        #	    idx ? @buffers.insert(idx+1, [line, buf]) : @buffers.unshift([line, buf])
        buf_min[0] = line
        if idx
          @buffers.insert(idx+1, buf_min)
        else
          @buffers.unshift(buf_min)
        end
      end
      values.push_eos
      @fiber.resume
    end

    # Reads the next [key, values] record from +io+; nil at end of file.
    # On a Marshal format error the surrounding bytes are logged and the
    # error is re-raised.
    def read_line(io)
      k = Marshal.load(io)
      v = Marshal.load(io)
      [k, v]
    rescue EOFError
      nil
    rescue ArgumentError
      Log::debug(self, "MARSHAL ERROR OCCURED!!")
      io.seek(-1024, IO::SEEK_CUR)
      contents = io.read(2048)
      Log::debugf(self, "File Contents: %s", contents)

      raise
    end
  end

  # Writes +key_values+ to a new spill file in key order; every chunk of a
  # key becomes its own (key, chunk) record.
  def store_2ndmemory(key_values)
    Log::debug(self, "START STORE")
    ordered = key_values.sort_by{|pair| pair.first}

    open_buffer do |io|
      ordered.each do |key, chunks|
        dumped_key = Marshal.dump(key)
        chunks.each do |chunk|
          io.write dumped_key
          Marshal.dump(chunk, io)
        end
      end
    end
    Log::debug(self, "FINISH STORE")
  end

  # Spills the remaining in-memory data and merges all spill files.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    merger = StSt.new(@buffers)
    @buffers = nil
    merger.each(&block)
  end
end
516
+
517
# MergeSortBuffer variant that performs the merge in a forked helper
# process reached through DeepConnect.
class ExtMergeSortBuffer<MergeSortBuffer

  # Spills the remaining in-memory data, forks the sorter process, and
  # calls +block+ with the values of every key the sorter reports.
  def each_2ndmemory(&block)
    require "deep-connect/deep-fork"

    unless @key_values.empty?
      store_2ndmemory(@key_values)
    end

    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    df = DeepConnect::DeepFork.fork(@njob.processor.deepconnect){|dc, ds|
      $0 = "fairy processor sorter"

      dc.export("Sorter", self)

      finish_wait
      #	  ds.close
      #	  dc.stop
      sleep 1
    }
    sorter = df.peer_deep_space.import("Sorter", true)
    sorter.sub_each {|key, values|
      block.call values
      nil # make sure a reference is not returned
    }
    sorter.finish
    #	df.peer_deep_space.close
    @buffers.each{|buf| buf.close!}
    Process.waitpid(df.peer_pid)
  end

  # Merges the spill files and yields (key, values) once per key, in key
  # order.
  def sub_each(&block)
    bufs = @buffers.collect{|buf|
      buf.open
      kv = read_line(buf.io)
      [kv, buf]
    }.select{|kv, buf| !kv.nil?}.sort_by{|kv, buf| kv[0]}

    key = nil
    values = []
    while buf_min = bufs.shift
      kv, buf = buf_min

      if key == kv[0]
        values.concat kv[1]
      else
        yield key, values unless values.empty?
        key = kv[0]
        values = kv[1]
      end

      next unless line = read_line(buf.io)
      idx = bufs.rindex{|other, b| other[0] <= line[0]}
      idx ? bufs.insert(idx+1, [line, buf]) : bufs.unshift([line, buf])
    end
    unless values.empty?
      # Same (key, values) shape as inside the loop. The previous code
      # yielded only `values`, which a |key, values| block destructures
      # into the first two elements of the array.
      yield key, values
    end
    nil # make sure a reference is not returned
  end
  #    DeepConnect.def_method_spec(self, "REF sub_each(){DVAL, DVAL}")

  # Reads the next [key, values] record from +io+, or nil at end of file.
  # sub_each needs this; in the visible source read_line exists only on the
  # nested StSt class, which is not in this class's ancestry.
  def read_line(io)
    k = Marshal.load(io)
    v = Marshal.load(io)
    [k, v]
  rescue EOFError
    nil
  end

  # Blocks the calling thread until #finish signals.
  def finish_wait
    @mx = Mutex.new
    @cv = ConditionVariable.new
    @mx.synchronize do
      @cv.wait(@mx)
    end
  end

  # Wakes up the thread blocked in #finish_wait.
  def finish
    @cv.signal
  end

end
603
+
604
+ #
605
+ # using: Depq(http://depq.rubyforge.org/)
606
+ #
607
# MergeSortBuffer whose merge step keeps the open spill files in a Depq
# priority queue ordered by each file's current key.
class DepqMergeSortBuffer<MergeSortBuffer
  class StSt<MergeSortBuffer::StSt
    # Opens every buffer and queues those that contain at least one record.
    def initialize(buffers)
      require "depq"

      @buffers = Depq.new
      buffers.each{|buf|
        buf.open
        head = read_line(buf.io)
        next unless head
        @buffers.insert [head, buf], head.first
      }

      @fiber = nil
    end

    # Yields one KeyValueStream per key; each stream is consumed in its own
    # Fiber while records are merged in.
    def each(&block)
      key = @buffers.find_min.first.first
      values = KeyValueStream.new(key, self)
      @fiber = Fiber.new{yield values}
      while buf_min = @buffers.delete_min
        kv, buf = buf_min
        unless key == kv[0]
          # Key changed: finish the current stream, start the next one.
          values.push_eos
          @fiber.resume
          key = kv[0]
          values = KeyValueStream.new(key, self)
          @fiber = Fiber.new{yield values}
        end
        values.concat kv[1]
        @fiber.resume

        line = read_line(buf.io)
        unless line
          buf.close!
          next
        end
        @buffers.insert [line, buf], line[0]
      end
      values.push_eos
      @fiber.resume
    end
  end

  # Spills the remaining in-memory data and merges all spill files.
  # Defined here so that +StSt+ refers to this class's nested StSt.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    merger = StSt.new(@buffers)
    @buffers = nil
    merger.each(&block)
  end
end
665
+
666
# Variant of DepqMergeSortBuffer that leaves the minimum entry in the queue
# and updates its priority in place instead of deleting and re-inserting it.
class DepqMergeSortBuffer2<DepqMergeSortBuffer
  class StSt<DepqMergeSortBuffer::StSt
    # Yields one KeyValueStream per key; each stream is consumed in its own
    # Fiber while records are merged in.
    def each(&block)
      key = @buffers.find_min.first.first
      values = KeyValueStream.new(key, self)
      @fiber = Fiber.new{yield values}
      while buf_min = @buffers.find_min
        kv, buf = buf_min
        unless key == kv[0]
          # Key changed: finish the current stream, start the next one.
          values.push_eos
          @fiber.resume
          key = kv[0]
          values = KeyValueStream.new(key, self)
          @fiber = Fiber.new{yield values}
        end
        values.concat kv[1]
        @fiber.resume

        line = read_line(buf.io)
        unless line
          # File exhausted: only now is its entry removed from the queue.
          buf.close!
          @buffers.delete_min
          next
        end
        #	    @buffers.replace_min [line, buf], line[0]
        buf_min[0] = line
        locator = @buffers.find_min_locator
        locator.update_priority line[0]
      end
      values.push_eos
      @fiber.resume
    end
  end

  # Spills the remaining in-memory data and merges all spill files.
  # Defined here so that +StSt+ refers to this class's nested StSt.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    merger = StSt.new(@buffers)
    @buffers = nil
    merger.each(&block)
  end
end
714
+
715
+ #
716
+ # using: PriorityQueue(http://rubyforge.org/projects/priority-queue/)
717
+ #
718
# MergeSortBuffer whose merge step keeps the open spill files in a
# PriorityQueue ordered by each file's current key.
class PQMergeSortBuffer<MergeSortBuffer
  class StSt<MergeSortBuffer::StSt
    # Queue entry: the current [key, values] record of a spill file
    # together with the file it was read from.
    class Pair
      attr_accessor :key_values
      attr_accessor :buf

      def initialize(kv, buf)
        @key_values = kv
        @buf = buf
      end

      # Key of the current record.
      def key
        @key_values.first
      end

      # Values of the current record.
      def values
        @key_values.last
      end
    end

    # Opens every buffer and queues those that contain at least one record.
    def initialize(buffers)
      require "priority_queue"

      @buffers = PriorityQueue.new
      buffers.each{|buf|
        buf.open
        head = read_line(buf.io)
        next unless head
        @buffers.push Pair.new(head, buf) , head.first
      }

      @fiber = nil
    end

    # Yields one KeyValueStream per key; each stream is consumed in its own
    # Fiber while records are merged in.
    def each(&block)
      key = @buffers.min_key.key
      values = KeyValueStream.new(key, self)
      @fiber = Fiber.new{yield values}
      while min_pair = @buffers.delete_min_return_key
        #	    buf, kv = buf_min
        unless key == min_pair.key
          # Key changed: finish the current stream, start the next one.
          values.push_eos
          @fiber.resume
          key = min_pair.key
          values = KeyValueStream.new(key, self)
          @fiber = Fiber.new{yield values}
        end
        values.concat min_pair.values
        @fiber.resume

        line = read_line(min_pair.buf.io)
        unless line
          min_pair.buf.close!
          next
        end
        min_pair.key_values = line
        @buffers.push min_pair, line[0]
      end
      values.push_eos
      @fiber.resume
    end
  end

  # Spills the remaining in-memory data and merges all spill files.
  # Defined here so that +StSt+ refers to this class's nested StSt.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    merger = StSt.new(@buffers)
    @buffers = nil
    merger.each(&block)
  end
end
796
+
797
# Variant of PQMergeSortBuffer that stores plain [record, file] arrays in the
# queue and changes the priority of the minimum entry in place.
class PQMergeSortBuffer2<MergeSortBuffer
  class StSt<MergeSortBuffer::StSt
    # Opens every buffer and queues those that contain at least one record.
    def initialize(buffers)
      require "priority_queue"

      @buffers = PriorityQueue.new
      buffers.each{|buf|
        buf.open
        head = read_line(buf.io)
        next unless head
        @buffers.push [head, buf], head.first
      }

      @fiber = nil
    end

    # Yields one KeyValueStream per key; each stream is consumed in its own
    # Fiber while records are merged in.
    def each(&block)
      key = @buffers.min_key.first.first
      values = KeyValueStream.new(key, self)
      @fiber = Fiber.new{yield values}
      while buf_min = @buffers.min_key
        kv, buf = buf_min
        unless key == kv[0]
          # Key changed: finish the current stream, start the next one.
          values.push_eos
          @fiber.resume
          key = kv[0]
          values = KeyValueStream.new(key, self)
          @fiber = Fiber.new{yield values}
        end
        values.concat kv[1]
        @fiber.resume

        line = read_line(buf.io)
        unless line
          # File exhausted: only now is its entry removed from the queue.
          buf.close!
          @buffers.delete_min
          next
        end
        buf_min[0] = line
        @buffers.change_priority buf_min, line[0]
      end
      values.push_eos
      @fiber.resume
    end
  end

  # Spills the remaining in-memory data and merges all spill files.
  # Defined here so that +StSt+ refers to this class's nested StSt.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    merger = StSt.new(@buffers)
    @buffers = nil
    merger.each(&block)
  end
end
857
+
858
# Collects pushed values in one flat array; keys are computed and the values
# grouped only when the buffer is iterated.
class DirectOnMemoryBuffer

  attr_accessor :log_id

  # policy[:chunk_size] sets @CHUNK_SIZE; CONF.GROUP_BY_CMSB_CHUNK_SIZE is
  # used when it is not given.
  def initialize(njob, policy)
    @njob = njob
    @policy = policy

    @key_values = []
    @key_values_mutex = Mutex.new

    @CHUNK_SIZE = policy[:chunk_size] || CONF.GROUP_BY_CMSB_CHUNK_SIZE

    @log_id = format("%s[%s]", self.class.name.sub(/Fairy::/, ''), @njob.id)
  end

  # Appends +value+ to the flat list.
  def push(value)
    @key_values_mutex.synchronize do
      @key_values.push value
    end
  end

  # Groups the collected values by key, orders the groups by key and calls
  # +block+ with one complete KeyValueStream per group. @key_values is
  # replaced by the list of streams.
  def each(&block)
    #	@key_values = @key_values.collect{|e| [@njob.hash_key(e), e]}.group_by{|k, e| k}.sort_by{|k, e| k}
    grouped = @key_values.group_by{|e| @njob.hash_key(e)}
    ordered = grouped.sort_by{|k, e| k}
    @key_values = ordered.collect{|k, values|
      stream = KeyValueStream.new(k, nil)
      stream.concat(values)
      stream.push_eos
      stream
    }
    @key_values.each(&block)
  end
end
887
+
888
+ class DirectMergeSortBuffer<DirectOnMemoryBuffer
889
# policy[:threshold] is the in-memory element count above which data is
# spilled; CONF.GROUP_BY_CMSB_THRESHOLD is used when it is not given.
def initialize(njob, policy)
  super

  @threshold = policy[:threshold] || CONF.GROUP_BY_CMSB_THRESHOLD

  # nil until the first spill happens.
  @buffers = nil
end
897
+
898
+ def init_2ndmemory
899
+ require "fairy/share/fast-tempfile"
900
+
901
+ @buffer_dir = @policy[:buffer_dir]
902
+ @buffer_dir ||= CONF.TMP_DIR
903
+
904
+ @buffers = []
905
+ end
906
+
907
+ def open_buffer(&block)
908
+ unless @buffers
909
+ init_2ndmemory
910
+ end
911
+ buffer = FastTempfile.open("mod-group-by-buffer-#{@njob.no}-", @buffer_dir)
912
+ @buffers.push buffer
913
+ if block_given?
914
+ begin
915
+ # ruby BUG#2390の対応のため.
916
+ # yield buffer
917
+ yield buffer.io
918
+ ensure
919
+ buffer.close
920
+ end
921
+ else
922
+ buffer
923
+ end
924
+ end
925
+
926
+ def push(value)
927
+ super
928
+
929
+ key_values = nil
930
+ @key_values_mutex.synchronize do
931
+ if @key_values.size > @threshold
932
+ key_values = @key_values
933
+ @key_values = []
934
+ end
935
+ if key_values
936
+ store_2ndmemory(key_values)
937
+ end
938
+ end
939
+ end
940
+
941
+ def store_2ndmemory(key_values)
942
+ Log::debug(self, "START STORE")
943
+ key_values = key_values.sort_by{|e| @njob.hash_key(e)}
944
+
945
+ open_buffer do |io|
946
+ key_values.each_slice(@CHUNK_SIZE) do |ary|
947
+ Marshal.dump(ary, io)
948
+ end
949
+ end
950
+ sorted = nil
951
+ Log::debug(self, "FINISH STORE")
952
+ end
953
+
954
+ def each(&block)
955
+ if @buffers
956
+ each_2ndmemory &block
957
+ else
958
+ super
959
+ end
960
+ end
961
+
962
+ def each_2ndmemory(&block)
963
+ unless @key_values.empty?
964
+ store_2ndmemory(@key_values)
965
+ @key_values = nil
966
+ end
967
+ Log::info(self, "Merge Start: #{@buffers.size} files")
968
+ Log::debug(self, @buffers.collect{|b| b.path}.join(" "))
969
+
970
+ m = Merger.new(@njob, @buffers)
971
+ m.each(&block)
972
+ end
973
+
974
+ class Merger
975
+ def initialize(njob, buffers, cached_buffer_class = CachedBuffer)
976
+ @njob = njob
977
+ @buffers = buffers.collect{|buf| cached_buffer_class.new(@njob, buf)}.select{|buf| !buf.eof?}.sort_by{|buf| buf.key}
978
+
979
+ @key = nil
980
+ end
981
+
982
+ def each(&block)
983
+ while !@buffers.empty?
984
+ @key = @buffers.first.key
985
+ values = KeyValueStream.new(@key, self)
986
+ block.call values
987
+ end
988
+ end
989
+
990
+ def each_by_key(&block)
991
+ while buf_min = @buffers.shift
992
+ vv_key = buf_min.key
993
+ unless @key == vv_key
994
+ @buffers.unshift buf_min
995
+ return
996
+ end
997
+
998
+ buf_min.each_by_same_key(&block)
999
+
1000
+ if buf_min.eof?
1001
+ buf_min.close!
1002
+ next
1003
+ end
1004
+
1005
+ if vv_key == buf_min.key
1006
+ @buffers.unshift(buf_min)
1007
+ else
1008
+ idx = @buffers.rindex{|buf| buf.key <= buf_min.key}
1009
+ idx ? @buffers.insert(idx+1, buf_min) : @buffers.unshift(buf_min)
1010
+ end
1011
+ end
1012
+ end
1013
+
1014
+ def get_buf(values)
1015
+ unless buf_min = @buffers.shift
1016
+ values.push_eos
1017
+ return
1018
+ end
1019
+
1020
+ vv_key = buf_min.key
1021
+ unless @key == vv_key
1022
+ values.push_eos
1023
+ @buffers.unshift buf_min
1024
+ return
1025
+ end
1026
+
1027
+ vv = buf_min.shift_values
1028
+ if vv
1029
+ values.concat vv
1030
+ end
1031
+ if buf_min.eof?
1032
+ buf_min.close!
1033
+ return
1034
+ end
1035
+
1036
+ idx = @buffers.rindex{|buf| buf.key <= buf_min.key}
1037
+ idx ? @buffers.insert(idx+1, buf_min) : @buffers.unshift(buf_min)
1038
+ end
1039
+ end
1040
+
1041
+ class CachedBuffer
1042
+ extend Forwardable
1043
+
1044
+ def initialize(njob, io)
1045
+ @njob = njob
1046
+ @io = io
1047
+ io.open
1048
+
1049
+ @cache = []
1050
+ @cache_pv = 0
1051
+
1052
+ @eof = false
1053
+
1054
+ read_buffer
1055
+ @key = @njob.hash_key(@cache.first)
1056
+ end
1057
+
1058
+ def_delegator :@io, :open
1059
+ def_delegator :@io, :close
1060
+ def_delegator :@io, :close!
1061
+
1062
+ def eof?
1063
+ @eof
1064
+ end
1065
+
1066
+ def key
1067
+ @key
1068
+ end
1069
+
1070
+ def each_by_same_key(&block)
1071
+ if @cache.size <= @cache_pv
1072
+ read_buffer
1073
+ return if @cache.empty?
1074
+ end
1075
+
1076
+ while @njob.hash_key(@cache[@cache_pv]) == @key
1077
+ block.call @cache[@cache_pv]
1078
+ @cache_pv += 1
1079
+
1080
+ if @cache.size <= @cache_pv
1081
+ read_buffer
1082
+ return if @cache.empty?
1083
+ end
1084
+ end
1085
+ @key = @njob.hash_key(@cache[@cache_pv])
1086
+ end
1087
+
1088
+ def shift_values
1089
+ if @cache.empty?
1090
+ read_buffer
1091
+ return nil if @cache.empty?
1092
+ end
1093
+
1094
+ idx = @cache.index{|v| @njob.hash_key(v) != @key}
1095
+ if idx
1096
+ vv = @cache.slice!(0, idx)
1097
+ @key = @njob.hash_key(@cache.first)
1098
+ else
1099
+ vv = @cache
1100
+ @cache = []
1101
+ end
1102
+ vv
1103
+ end
1104
+
1105
+ def read_buffer
1106
+ io = @io.io
1107
+ begin
1108
+ @cache = Marshal.load(io)
1109
+ rescue EOFError
1110
+ @eof = true
1111
+ @cache = []
1112
+ rescue ArgumentError
1113
+ Log::debug(self, "MARSHAL ERROR OCCURED!!")
1114
+ io.seek(-1024, IO::SEEK_CUR)
1115
+ buf = io.read(2048)
1116
+ Log::debugf(self, "File Contents: %s", buf)
1117
+ raise
1118
+ end
1119
+ # @key = @njob.hash_key(@cache.first)
1120
+ @cache_pv = 0
1121
+ end
1122
+
1123
+ end
1124
+
1125
+ class KeyValueStream
1126
+ include Enumerable
1127
+
1128
+ EOS = :__KEY_VALUE_STREAM_EOS__
1129
+
1130
+ def initialize(key, merger)
1131
+ @key = key
1132
+ @merger = merger
1133
+
1134
+ @buf = []
1135
+ end
1136
+ attr_reader :key
1137
+
1138
+ def push(e)
1139
+ @buf.push e
1140
+ end
1141
+ alias enq push
1142
+
1143
+ def push_eos
1144
+ push EOS
1145
+ end
1146
+
1147
+ def concat(elements)
1148
+ @buf.concat elements
1149
+ end
1150
+
1151
+ def shift
1152
+ while @buf.empty?
1153
+ @merger.get_buf(self)
1154
+ end
1155
+ @buf.shift
1156
+ end
1157
+ alias deq shift
1158
+ alias pop shift
1159
+
1160
+ def each(&block)
1161
+ @merger.each_by_key(&block)
1162
+ end
1163
+
1164
+ def size
1165
+ c = 0
1166
+ each{|v| c += 1}
1167
+ c
1168
+ end
1169
+ end
1170
+ end
1171
+
1172
# Variant of DirectMergeSortBuffer whose cached buffers walk their chunks
# inside a Fiber instead of tracking a position index.
class DirectFBMergeSortBuffer<DirectMergeSortBuffer
  # Spills what is left in memory, then merges all spill files with this
  # class's own Merger.
  def each_2ndmemory(&block)
    if !@key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    buffer_paths = @buffers.collect { |buffer| buffer.path }
    Log::debug(self, buffer_paths.join(" "))

    Merger.new(@njob, @buffers).each(&block)
  end

  # Same merge logic as the parent Merger; only the wrapper class used for
  # reading the buffers differs (this class's CachedBuffer).
  class Merger<DirectMergeSortBuffer::Merger
    def initialize(njob, buffers)
      @njob = njob

      wrapped = buffers.collect { |buf| CachedBuffer.new(@njob, buf) }
      pending = wrapped.reject { |cached| cached.eof? }
      @buffers = pending.sort_by { |cached| cached.key }

      @key = nil
    end
  end

  # Cached buffer whose per-key iteration is driven by a Fiber: the fiber
  # suspends each time the hash key of the next element changes.
  class CachedBuffer<DirectMergeSortBuffer::CachedBuffer
    extend Forwardable

    def initialize(njob, io)
      super

      @each_fb = Fiber.new { |receiver| each_sub(receiver) }
    end

    # Resumes the fiber, handing it the block that receives the values of
    # the current key.
    def each_by_same_key(&block)
      @each_fb.resume(block)
    end

    # Fiber body: passes every element to the current block. Before an
    # element with a different hash key is delivered, @key is updated and
    # the fiber suspends; the block passed to the next resume takes over.
    def each_sub(block)
      read_buffer if @cache.empty?

      until @cache.empty?
        @cache.each do |element|
          element_key = @njob.hash_key(element)
          unless element_key == @key
            @key = element_key
            block = Fiber.yield
          end
          block.call element
        end
        read_buffer
      end
    end

    # Loads the next Marshal-dumped chunk into @cache; at end of file the
    # cache becomes empty and @eof is set. A Marshal ArgumentError is logged
    # together with nearby file contents and then re-raised.
    def read_buffer
      stream = @io.io
      @cache =
        begin
          Marshal.load(stream)
        rescue EOFError
          @eof = true
          []
        rescue ArgumentError
          Log::debug(self, "MARSHAL ERROR OCCURED!!")
          stream.seek(-1024, IO::SEEK_CUR)
          contents = stream.read(2048)
          Log::debugf(self, "File Contents: %s", contents)
          raise
        end
    end
  end
end
1250
+
1251
# Variant of DirectMergeSortBuffer that keeps the spill buffers in a
# PriorityQueue (from the "priority_queue" library) ordered by head key.
class DirectPQMergeSortBuffer<DirectMergeSortBuffer

  def initialize(njob, policy)
    require "priority_queue"
    super
  end

  # Spills what is left in memory, then merges all spill files with this
  # class's own Merger.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = Merger.new(@njob, @buffers)
    m.each(&block)
  end

  # Merger whose pending buffers live in a priority queue; each buffer's
  # priority is the key at its head.
  class Merger<DirectMergeSortBuffer::Merger

    def initialize(njob, buffers)
      @njob = njob
      @buffers = PriorityQueue.new
      buffers.each{|buf|
        cb = DirectMergeSortBuffer::CachedBuffer.new(@njob, buf)
        next if cb.eof?
        @buffers.push cb, cb.key
      }

      @key = nil
    end

    # Yields a KeyValueStream for the smallest pending key until the queue
    # is empty. The queue only advances when the yielded stream is read.
    def each(&block)
      while !@buffers.empty?
        @key = @buffers.min_key.key
        values = DirectMergeSortBuffer::KeyValueStream.new(@key, self)
        block.call values
      end
    end

    # Yields every value belonging to the current key (@key), taking the
    # buffers from the queue in key order. Returns once the smallest pending
    # key differs from @key or the queue is empty.
    def each_by_key(&block)
      while buf_min = @buffers.delete_min_return_key
        vv_key = buf_min.key
        unless @key == vv_key
          @buffers.push buf_min, buf_min.key
          return
        end

        buf_min.each_by_same_key(&block)

        if buf_min.eof?
          buf_min.close!
          # Keep going: other buffers may still hold values for this key.
          # Returning here (as the code used to) ended the enumeration early
          # and the same key was emitted again as a separate group.
          next
        end

        @buffers.push buf_min, buf_min.key
      end
    end


    # Pull-style counterpart of #each_by_key used by KeyValueStream#shift:
    # appends the next run of values for the current key to +values+, or
    # pushes the end-of-stream marker when no buffer holds that key anymore.
    def get_buf(values)
      unless buf_min = @buffers.delete_min_return_key
        values.push_eos
        return
      end

      vv_key = buf_min.key
      unless @key == vv_key
        values.push_eos
        @buffers.push buf_min, buf_min.key
        return
      end

      vv = buf_min.shift_values
      if vv
        values.concat vv
      end
      if buf_min.eof?
        buf_min.close!
        return
      end

      @buffers.push buf_min, buf_min.key
    end
  end
end
1338
+
1339
# Merge-sort buffer whose spill files hold one Marshal record per value
# list, so every record read back contains values of a single key.
# NOTE(review): @key_values, @buffers and open_buffer come from
# CommandMergeSortBuffer, which is defined outside this section.
class DirectKBMergeSortBuffer<CommandMergeSortBuffer

  # Writes key_values to a new spill buffer ordered by key. key_values is
  # iterated as [key, lists] pairs; each element of +lists+ is dumped as its
  # own Marshal record.
  def store_2ndmemory(key_values)
    Log::debug(self, "START STORE")
    sorted = key_values.sort_by{|e| e.first}

    open_buffer do |io|
      sorted.each do |key, vv|
        vv.each do |values|
          Marshal.dump(values, io)
        end
      end
    end
    Log::debug(self, "FINISH STORE")
  end

  # Spills what is left in memory, then merges all spill files using
  # DirectMergeSortBuffer::Merger with this class's CachedBuffer.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = DirectMergeSortBuffer::Merger.new(@njob, @buffers, CachedBuffer)
    m.each(&block)
  end

  # Reads a spill buffer one Marshal record at a time; @cache holds the
  # current record and @key the hash key of its first value.
  class CachedBuffer
    extend Forwardable

    # io:: spill buffer object; it must respond to +open+, +io+, +close+
    #      and +close!+. It is opened here and the first record is loaded.
    def initialize(njob, io)
      @njob = njob
      @io = io
      io.open

      @cache = []

      @eof = false

      read_buffer
      @key = @njob.hash_key(@cache.first)
    end

    def_delegator :@io, :open
    def_delegator :@io, :close
    def_delegator :@io, :close!

    # True once a read attempt has hit the end of the file.
    def eof?
      @eof
    end

    # Hash key of the first value of the current record.
    def key
      @key
    end

    # Yields the values of the current record and of every following record
    # with the same key. Returns at EOF, or after advancing @key when the
    # next record starts with a different key.
    def each_by_same_key(&block)
      loop do
        @cache.each(&block)
        read_buffer
        return if @cache.empty?
        unless @njob.hash_key(@cache.first) == @key
          @key = @njob.hash_key(@cache.first)
          return
        end
      end
    end

    # Returns the current record and loads the next one, advancing @key to
    # its first value's key. Returns nil when nothing is left.
    #
    # DirectMergeSortBuffer::Merger#get_buf calls this method on its
    # buffers; without it KeyValueStream#shift raised NoMethodError for
    # buffers of this class.
    def shift_values
      return nil if @cache.empty?

      vv = @cache
      read_buffer
      @key = @njob.hash_key(@cache.first) unless @cache.empty?
      vv
    end

    # Loads the next Marshal record into @cache. At end of file the cache
    # becomes empty and @eof is set. On a Marshal ArgumentError the
    # surrounding file contents are logged and the error is re-raised.
    def read_buffer
      io = @io.io
      begin
        @cache = Marshal.load(io)
      rescue EOFError
        @eof = true
        @cache = []
      rescue ArgumentError
        Log::debug(self, "MARSHAL ERROR OCCURED!!")
        io.seek(-1024, IO::SEEK_CUR)
        buf = io.read(2048)
        Log::debugf(self, "File Contents: %s", buf)
        raise
      end
    end
  end
end
1425
+
1426
# Variant of DirectKBMergeSortBuffer that packs several value lists into
# one Marshal record, so each record read back is an array of value lists.
class DirectKB2MergeSortBuffer<DirectKBMergeSortBuffer
  # Writes key_values to a new spill buffer ordered by key. Value lists are
  # collected into a record until the summed list sizes reach @CHUNK_SIZE;
  # the record is then dumped and a new one is started.
  def store_2ndmemory(key_values)
    Log::debug(self, "START STORE")
    sorted = key_values.sort_by{|e| e.first}

    open_buffer do |io|
      tmpary = []
      tmpary_sz = 0
      sorted.each do |key, vv|
        vv.each do |values|
          # Flush before adding, so a record is closed by the first list
          # that arrives after the size limit has been reached.
          if tmpary_sz >= @CHUNK_SIZE
            Marshal.dump(tmpary, io)
            tmpary = []
            tmpary_sz = 0
          end
          tmpary.push values
          tmpary_sz += values.size
        end
      end
      if tmpary_sz > 0
        Marshal.dump(tmpary, io)
      end
    end
    Log::debug(self, "FINISH STORE")
  end

  # Spills what is left in memory, then merges all spill files using
  # DirectMergeSortBuffer::Merger with this class's CachedBuffer.
  def each_2ndmemory(&block)
    unless @key_values.empty?
      store_2ndmemory(@key_values)
      @key_values = nil
    end
    Log::info(self, "Merge Start: #{@buffers.size} files")
    Log::debug(self, @buffers.collect{|b| b.path}.join(" "))

    m = DirectMergeSortBuffer::Merger.new(@njob, @buffers, CachedBuffer)
    m.each(&block)
  end

  # Reads a spill buffer record by record; @cache holds the value lists of
  # the current record that are still pending, and @key the hash key of the
  # first value of the first pending list.
  class CachedBuffer
    extend Forwardable

    # io:: spill buffer object; it must respond to +open+, +io+, +close+
    #      and +close!+. It is opened here and the first record is loaded.
    def initialize(njob, io)
      @njob = njob
      @io = io
      io.open

      @cache = []

      @eof = false

      read_buffer
      @key = @njob.hash_key(@cache.first.first)
    end

    def_delegator :@io, :open
    def_delegator :@io, :close
    def_delegator :@io, :close!

    # True once a read attempt has hit the end of the file.
    def eof?
      @eof
    end

    # Hash key of the first value of the first pending list.
    def key
      @key
    end

    # Yields the values of every pending list whose key equals @key, loading
    # further records as needed. Returns at EOF, or after advancing @key
    # when a list with a different key is reached (that list stays pending).
    def each_by_same_key(&block)
      loop do
        while vv = @cache.shift
          unless @njob.hash_key(vv.first) == @key
            @cache.unshift vv
            @key = @njob.hash_key(vv.first)
            return
          end
          vv.each(&block)
        end
        read_buffer
        return if @cache.empty?
        unless @njob.hash_key(@cache.first.first) == @key
          @key = @njob.hash_key(@cache.first.first)
          return
        end
      end
    end

    # Removes and returns the first pending list, loading the next record
    # when the current one is used up, and advances @key to the key of the
    # new first pending list. Returns nil when nothing is left.
    #
    # DirectMergeSortBuffer::Merger#get_buf calls this method on its
    # buffers; without it KeyValueStream#shift raised NoMethodError for
    # buffers of this class.
    def shift_values
      vv = @cache.shift
      return nil unless vv

      read_buffer if @cache.empty?
      @key = @njob.hash_key(@cache.first.first) unless @cache.empty?
      vv
    end

    # Loads the next Marshal record into @cache. At end of file the cache
    # becomes empty and @eof is set. On a Marshal ArgumentError the
    # surrounding file contents are logged and the error is re-raised.
    def read_buffer
      io = @io.io
      begin
        @cache = Marshal.load(io)
      rescue EOFError
        @eof = true
        @cache = []
      rescue ArgumentError
        Log::debug(self, "MARSHAL ERROR OCCURED!!")
        io.seek(-1024, IO::SEEK_CUR)
        buf = io.read(2048)
        Log::debugf(self, "File Contents: %s", buf)
        raise
      end
    end
  end
end
1530
+ end
1531
+ end
1532
+
1533
+
1534
+