ruby-spark 1.0.0

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/lib/spark/command.rb
@@ -0,0 +1,86 @@
+ module Spark
+   ##
+   # Container which holds all commands and related data for a worker.
+   # Every RDD has its own copy of Command.
+   #
+   class Command
+
+     attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects
+
+     def initialize
+       @serializer = nil
+       @deserializer = nil
+       @commands = []
+       @libraries = []
+       @bound_objects = {}
+     end
+
+     def execute(iterator, split_index)
+       # Require necessary libraries
+       libraries.each{|lib| require lib}
+
+       # Prepare bound objects
+       @commands.each do |command|
+         command.__objects__ = bound_objects
+       end
+
+       # Prepare for running
+       @commands.each(&:prepare)
+
+       # Run all tasks
+       @commands.each do |command|
+         iterator = command.execute(iterator, split_index)
+       end
+
+       # Return the changed iterator. This is not necessary for tasks that
+       # change the iterator in place, but some tasks can return
+       # only one value (for example reduce).
+       iterator
+     end
+
+     def last
+       @commands.last
+     end
+
+     def bound_objects
+       # Objects from users,
+       # already initialized on the worker
+       return @bound_objects if @bound_objects
+
+       if @serialized_bound_objects
+         # Still serialized
+         @bound_objects = Marshal.load(@serialized_bound_objects)
+       else
+         # Nothing was bound
+         @bound_objects = {}
+       end
+     end
+
+     # Bound objects can depend on a library which is loaded during #execute.
+     # In that case the worker raises "undefined class/module".
+     def marshal_dump
+       [@serializer, @deserializer, @commands, @libraries, serialized_bound_objects]
+     end
+
+     def marshal_load(array)
+       @serializer = array.shift
+       @deserializer = array.shift
+       @commands = array.shift
+       @libraries = array.shift
+       @serialized_bound_objects = array.shift
+     end
+
+     private
+
+     def serialized_bound_objects
+       @serialized_bound_objects ||= Marshal.dump(@bound_objects)
+     end
+
+   end
+ end
+
+ require 'spark/command/base'
+ require 'spark/command/basic'
+ require 'spark/command/pair'
+ require 'spark/command/statistic'
+ require 'spark/command/sort'
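
For context on the `execute` flow above: each command receives the previous command's iterator and returns a new one, so the whole pipeline runs in a single pass per partition. A minimal, self-contained sketch of that contract (the `Upcase` and `NonEmpty` classes are hypothetical stand-ins, not part of the gem; real commands also go through `prepare` and `__objects__` binding):

    # Hypothetical commands following the same contract as Spark::Command#execute:
    # take an iterator plus split index, return a (possibly new) iterator.
    class Upcase
      attr_accessor :__objects__
      def prepare; end
      def execute(iterator, _split_index)
        iterator.map(&:upcase)
      end
    end

    class NonEmpty
      attr_accessor :__objects__
      def prepare; end
      def execute(iterator, _split_index)
        iterator.reject(&:empty?)
      end
    end

    commands = [Upcase.new, NonEmpty.new]
    commands.each(&:prepare)

    iterator = ['a', '', 'b']
    commands.each { |c| iterator = c.execute(iterator, 0) }
    p iterator  # => ["A", "B"]
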
data/lib/spark/command/base.rb
@@ -0,0 +1,154 @@
+ ##
+ # Spark::Command::Base
+ #
+ # Parent for all commands (Map, FlatMap, Sort, ...)
+ #
+ class Spark::Command::Base
+
+   DEFAULT_VARIABLE_OPTIONS = {
+     type: Hash,
+     function: true
+   }
+
+   def initialize(*args)
+     settings.variables.each do |name, options|
+       instance_variable_set("@#{name}", args.shift)
+     end
+   end
+
+   def self.error(message)
+     raise Spark::CommandError, message
+   end
+
+   def error(message)
+     self.class.error(message)
+   end
+
+   def log(message=nil)
+     $stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}}
+     $stdout.flush
+   end
+
+
+   # ===============================================================================================
+   # Methods called during class loading
+   # This is not the nicest way, but these methods set/get class variables for children
+
+   # Settings for a command (variables)
+   def self.settings
+     init_settings
+     class_variable_get(:@@settings)
+   end
+
+   def settings
+     self.class.settings
+   end
+
+   # Init empty settings
+   def self.init_settings
+     if !class_variable_defined?(:@@settings)
+       struct = Struct.new(:variables)
+
+       class_variable_set(:@@settings, struct.new)
+       settings.variables = {}
+     end
+   end
+
+   # New variable for a command
+   #
+   # == Example:
+   #
+   #   class Map < Spark::Command::Base
+   #     variable :map_function
+   #   end
+   #
+   #   command = Map.new(1)
+   #
+   #   command.instance_variables
+   #   # => [:@map_function]
+   #   command.instance_variable_get(:@map_function)
+   #   # => 1
+   #
+   def self.variable(name, options={})
+     if settings.variables.has_key?(name)
+       error "Function #{name} already exists."
+     end
+
+     settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options)
+   end
+
+
+   # ===============================================================================================
+   # Executing methods
+
+   # Execute the command for data and a split index
+   def execute(iterator, split_index)
+     # Implemented on Base but can be overridden
+     before_run
+
+     # Run has to be implemented on the child
+     if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run)
+       return lazy_run(iterator, split_index)
+     end
+
+     iterator = iterator.to_a
+     run(iterator, split_index)
+   end
+
+   def prepared?
+     !!@prepared
+   end
+
+   # This is called before execution. Execution will be stopped if
+   # some command contains an error (e.g. a badly serialized lambda).
+   #
+   # == What does it do?
+   # * evaluate lambda
+   # * evaluate method
+   # * make new lambda
+   #
+   def prepare
+     return if prepared?
+
+     to_function = settings.variables.select {|_, options| options[:function]}
+     to_function.each do |name, options|
+       name = "@#{name}"
+       data = instance_variable_get(name)
+
+       case data[:type]
+       when 'proc'
+         result = eval(data[:content])
+       when 'symbol'
+         result = lambda(&data[:content])
+       when 'method'
+         # Method must be added to the instance, not the class
+         instance_eval(data[:content])
+         # Method will be available as a Proc
+         result = lambda(&method(data[:name]))
+       end
+
+       instance_variable_set(name, result)
+     end
+
+     @prepared = true
+   end
+
+   # This method is called before every execution.
+   def before_run
+   end
+
+
+   # ===============================================================================================
+   # Bound objects
+
+   attr_accessor :__objects__
+
+   def method_missing(method, *args, &block)
+     if __objects__ && __objects__.has_key?(method)
+       return __objects__[method]
+     end
+
+     super
+   end
+
+ end
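
To make the `variable` DSL and `prepare` contract above concrete, here is a hedged sketch. The `Double` class is hypothetical; what is grounded in the code above is that a function variable travels as a hash (`type`/`content`) and `prepare` turns it into a callable before `execute` runs:

    class Double < Spark::Command::Base
      variable :map_function

      def run(iterator, *)
        iterator.map { |x| @map_function.call(x) }
      end
    end

    # The lambda is still a string here; prepare evaluates it into a Proc
    command = Double.new(type: 'proc', content: 'lambda{|x| x * 2}')
    command.prepare
    command.execute([1, 2, 3], 0)  # => [2, 4, 6]
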
data/lib/spark/command/basic.rb
@@ -0,0 +1,345 @@
+ _Base = Spark::Command::Base
+
+ # -------------------------------------------------------------------------------------------------
+ # Map
+
+ class Spark::Command::Map < _Base
+   variable :map_function
+
+   def run(iterator, *)
+     iterator.map! do |item|
+       @map_function.call(item)
+     end
+     iterator
+   end
+
+   def lazy_run(iterator, *)
+     iterator.map do |item|
+       @map_function.call(item)
+     end
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # FlatMap
+
+ class Spark::Command::FlatMap < Spark::Command::Map
+   def run(iterator, *)
+     iterator = super
+     iterator.flatten!(1)
+     iterator
+   end
+
+   def lazy_run(iterator, *)
+     iterator.flat_map do |item|
+       @map_function.call(item)
+     end
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # MapPartitionsWithIndex
+
+ class Spark::Command::MapPartitionsWithIndex < _Base
+   variable :partition_function
+
+   def run(iterator, index)
+     iterator = @partition_function.call(iterator, index)
+     iterator
+   end
+
+   # The user should control whether there is an Enumerator or not
+   # alias_method :lazy_run, :run
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # MapPartitions
+
+ class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex
+   def run(iterator, *)
+     # Do not use `super` because `@partition_function` can be a method with 1 argument
+     iterator = @partition_function.call(iterator)
+     iterator
+   end
+   # alias_method :lazy_run, :run
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Filter
+
+ class Spark::Command::Filter < _Base
+   variable :filter_function
+
+   def run(iterator, *)
+     iterator.select! do |item|
+       @filter_function.call(item)
+     end
+     iterator
+   end
+
+   def lazy_run(iterator, *)
+     iterator.select do |item|
+       @filter_function.call(item)
+     end
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Compact
+
+ class Spark::Command::Compact < _Base
+   def run(iterator, *)
+     iterator.compact!
+     iterator
+   end
+
+   def lazy_run(iterator, *)
+     iterator.select do |item|
+       !item.nil?
+     end
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Glom
+
+ class Spark::Command::Glom < _Base
+   def run(iterator, *)
+     [iterator]
+   end
+
+   def lazy_run(iterator, *)
+     run(iterator.to_a)
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Shuffle
+
+ class Spark::Command::Shuffle < _Base
+   variable :seed, function: false, type: Integer
+
+   def run(iterator, *)
+     iterator.shuffle!(random: rng)
+     iterator
+   end
+
+   def rng
+     Random.new(@seed)
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # PartitionBy
+
+ class Spark::Command::PartitionBy
+
+   class Base < Spark::Command::Base
+     include Spark::Helper::Serialize
+
+     def prepare
+       super
+
+       # Default. Keep it after super because Sorting has its own key_function.
+       @key_function ||= lambda{|x| x[0]}
+     end
+
+     def run(iterator, *)
+       iterator.map! do |item|
+         make_partition_item(item)
+       end
+       iterator.flatten!(1)
+       iterator
+     end
+
+     def lazy_run(iterator, *)
+       iterator.flat_map do |item|
+         make_partition_item(item)
+       end
+     end
+
+     private
+
+     def make_partition_item(item)
+       [
+         pack_long(@partition_func.call(@key_function[item])),
+         item
+       ]
+     end
+   end
+
+   class Basic < Base
+     variable :partition_func
+   end
+
+   class Sorting < Base
+     variable :key_function
+     variable :bounds, function: false, type: Array
+     variable :ascending, function: false, type: [TrueClass, FalseClass]
+     variable :num_partitions, function: false, type: Numeric
+
+     def prepare
+       super
+
+       # Find the partition index by the bisect algorithm
+       @partition_func ||= Proc.new do |key|
+         count = 0
+         @bounds.each{|i|
+           break if i >= key
+           count += 1
+         }
+
+         if @ascending
+           count
+         else
+           @num_partitions - 1 - count
+         end
+       end
+     end
+
+   end # Sorting
+ end # PartitionBy
+
+ # -------------------------------------------------------------------------------------------------
+ # Aggregate
+
+ class Spark::Command::Aggregate < _Base
+   variable :reduce_func
+   variable :zero_value, function: false, type: Object
+
+   def run(iterator, *)
+     [iterator.reduce(@zero_value, &@reduce_func)]
+   end
+
+   def lazy_run(iterator, *)
+     run(iterator)
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Reduce
+
+ class Spark::Command::Reduce < Spark::Command::Aggregate
+   def run(iterator, *)
+     [iterator.reduce(&@reduce_func)]
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Foreach
+
+ class Spark::Command::Foreach < _Base
+   variable :each_function
+
+   def run(iterator, *)
+     iterator.each do |item|
+       @each_function.call(item)
+     end
+     nil
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # ForeachPartition
+
+ class Spark::Command::ForeachPartition < _Base
+   variable :partition_function
+
+   def run(iterator, *)
+     @partition_function.call(iterator)
+     nil
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # KeyBy
+
+ class Spark::Command::KeyBy < _Base
+   variable :key_function
+
+   def run(iterator, *)
+     iterator.map! do |item|
+       [@key_function.call(item), item]
+     end
+     iterator
+   end
+
+   def lazy_run(iterator, *)
+     iterator.map do |item|
+       [@key_function.call(item), item]
+     end
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Take
+
+ class Spark::Command::Take < _Base
+   variable :total, function: false, type: Numeric
+   variable :last_part, function: false, type: Numeric
+
+   def run(iterator, index)
+     if index == @last_part && iterator.size > @total
+       return iterator.slice!(0, @total)
+     end
+
+     iterator
+   end
+ end
+
+ # -------------------------------------------------------------------------------------------------
+ # Pipe
+
+ class Spark::Command::Pipe < _Base
+   variable :cmds, function: false, type: Array
+
+   def before_run
+     require 'open3'
+
+     @in, @out, @threads = Open3.pipeline_rw(*@cmds)
+   end
+
+   def run(iterator, *)
+     create_writing_thread(iterator)
+
+     new_iterator = []
+
+     # Read the full output
+     begin
+       loop {
+         new_iterator << @out.readline.rstrip
+       }
+     rescue EOFError
+     end
+
+     new_iterator
+   end
+
+   def lazy_run(iterator, *)
+     create_writing_thread(iterator)
+
+     Enumerator::Lazy.new([nil]) do |yielder, _|
+       begin
+         loop {
+           yielder << @out.readline.rstrip
+         }
+       rescue EOFError
+       end
+     end
+   end
+
+   private
+
+   def create_writing_thread(iterator)
+     @writing_thread = Thread.new do
+       # Send the complete iterator to the pipe
+       iterator.each do |item|
+         @in.puts(item.to_s.rstrip)
+       end
+
+       # Input must be closed to raise EOFError
+       @in.close
+     end
+   end
+
+ end
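
A note on the `run`/`lazy_run` split used throughout this file: `Base#execute` dispatches to `lazy_run` whenever the incoming iterator is an `Enumerator::Lazy`, so the same command works on materialized arrays and on lazy streams. A hedged usage sketch (assuming the classes above are loaded; the serialized-lambda hash follows the `prepare` convention from base.rb):

    map = Spark::Command::Map.new(type: 'proc', content: 'lambda{|x| x + 1}')
    map.prepare

    # Eager path: the Array is mutated in place via map!
    map.execute([1, 2, 3], 0)  # => [2, 3, 4]

    # Lazy path: lazy_run is chosen and nothing is computed until forced
    lazy = map.execute((1..Float::INFINITY).lazy, 0)
    lazy.first(3)              # => [2, 3, 4]
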