ruby-spark 1.0.0 → 1.1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/command/base.rb CHANGED
@@ -16,6 +16,10 @@ class Spark::Command::Base
      end
    end

+   def to_s
+     self.class.name.split('::').last
+   end
+
    def self.error(message)
      raise Spark::CommandError, message
    end
data/lib/spark/command_builder.rb CHANGED
@@ -32,8 +32,8 @@ module Spark
    def deep_copy
      copy = self.dup
      copy.create_command
-     copy.serializer = self.serializer.dup
-     copy.deserializer = self.deserializer.dup
+     copy.serializer = self.serializer.deep_copy
+     copy.deserializer = self.deserializer.deep_copy
      copy.commands = self.commands.dup
      copy.libraries = self.libraries.dup
      copy.bound_objects = self.bound_objects.dup
data/lib/spark/config.rb CHANGED
@@ -9,7 +9,7 @@ module Spark

    TYPES = {
      'spark.shuffle.spill' => :boolean,
-     'spark.ruby.batch_size' => :integer
+     'spark.ruby.serializer.compress' => :boolean
    }

    # Initialize java SparkConf and load default configuration.
@@ -55,8 +55,8 @@ module Spark
      errors << 'A master URL must be set in your configuration.'
    end

-   if Spark::Serializer.get(get('spark.ruby.serializer')).nil?
-     errors << 'Default serializer must be set in your configuration.'
+   if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+     errors << 'Unknow serializer.'
    end

    scanned = get('spark.ruby.executor.command').scan('%s')
@@ -137,9 +137,9 @@ module Spark
    set_app_name('RubySpark')
    set_master('local[*]')
    set('spark.ruby.driver_home', Spark.home)
-   set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
    set('spark.ruby.serializer', default_serializer)
-   set('spark.ruby.batch_size', default_batch_size)
+   set('spark.ruby.serializer.compress', default_serializer_compress)
+   set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
    set('spark.ruby.executor.uri', default_executor_uri)
    set('spark.ruby.executor.command', default_executor_command)
    set('spark.ruby.executor.options', default_executor_options)
@@ -147,22 +147,16 @@ module Spark
      load_executor_envs
    end

-   # How to handle with data in method parallelize.
-   #
-   # == Possible options:
-   #   inplace:: data are changed directly to save memory
-   #   deep_copy:: data are cloned fist
-   #
-   def default_parallelize_strategy
-     ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
-   end
-
    def default_serializer
      ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
    end

-   def default_batch_size
-     ENV['SPARK_RUBY_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE.to_s
+   def default_serializer_compress
+     ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+   end
+
+   def default_serializer_batch_size
+     ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
    end

    # Ruby executor.
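
Note: the old batch-size and parallelize-strategy options are replaced by settings under the `spark.ruby.serializer.*` namespace. A minimal sketch of configuring them; the environment variable names and config keys come from this diff, while the `Spark.config do ... end` block and `Spark.start` call are assumed from the gem's README rather than shown here, and the values are illustrative only:

    require 'ruby-spark'

    # Either via environment variables read by the new defaults...
    ENV['SPARK_RUBY_SERIALIZER']            = 'marshal'
    ENV['SPARK_RUBY_SERIALIZER_COMPRESS']   = 'true'
    ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] = '2048'

    # ...or via the configuration DSL before the context is started.
    Spark.config do
      set 'spark.ruby.serializer', 'marshal'
      set 'spark.ruby.serializer.compress', true
      set 'spark.ruby.serializer.batch_size', 2048
    end

    Spark.start
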
data/lib/spark/context.rb CHANGED
@@ -2,6 +2,7 @@
  Spark.load_lib

  module Spark
+   ##
    # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
    # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
    #
@@ -57,10 +58,38 @@ module Spark
      sc.defaultParallelism
    end

-   def get_serializer(serializer, *args)
-     serializer = Spark::Serializer.get(serializer)
-     serializer ||= Spark::Serializer.get(config['spark.ruby.serializer'])
-     serializer.new(config['spark.ruby.batch_size']).set(*args)
+   # Default serializer
+   #
+   # Batch -> Compress -> Basic
+   #
+   def default_serializer
+     # Basic
+     serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+     # Compress
+     if config('spark.ruby.serializer.compress')
+       serializer = Spark::Serializer.compressed(serializer)
+     end
+
+     # Bactching
+     batch_size = default_batch_size
+     if batch_size == 'auto'
+       serializer = Spark::Serializer.auto_batched(serializer)
+     else
+       serializer = Spark::Serializer.batched(serializer, batch_size)
+     end
+
+     # Finally, "container" contains serializers
+     serializer
+   end
+
+   def default_batch_size
+     size = config('spark.ruby.serializer.batch_size').to_i
+     if size >= 1
+       size
+     else
+       'auto'
+     end
    end

    # Set a local property that affects jobs submitted from this thread, such as the
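
Note: the context now builds a serializer "container" rather than a single object; the configured basic serializer is optionally wrapped in a compressor and then in a batcher. A rough sketch of what default_serializer produces for serializer = marshal, compress = true, batch_size = 2048 (values illustrative; the helper methods shown are the ones appearing in this diff):

    basic      = Spark::Serializer.marshal
    compressed = Spark::Serializer.compressed(basic)
    batched    = Spark::Serializer.batched(compressed, 2048)
    # a non-positive batch size falls back to Spark::Serializer.auto_batched(compressed)
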
@@ -93,12 +122,11 @@ module Spark
    # be changed at runtime.
    #
    def config(key=nil)
-     # if key
-     #   Spark.config[key]
-     # else
-     #   Spark.config.get_all
-     # end
-     Spark.config
+     if key
+       Spark.config.get(key)
+     else
+       Spark.config
+     end
    end

    # Add a file to be downloaded with this Spark job on every node.
@@ -164,10 +192,7 @@ module Spark
    # == Parameters:
    # data:: Range or Array
    # num_slices:: number of slice
-   # options::
-   #   - use
-   #   - serializer
-   #   - batch_size
+   # serializer:: custom serializer (default: serializer based on configuration)
    #
    # == Examples:
    #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
@@ -176,33 +201,21 @@ module Spark
    #   $sc.parallelize(1..3).map(:to_s).collect
    #   #=> ["1", "2", "3"]
    #
-   def parallelize(data, num_slices=nil, options={})
+   def parallelize(data, num_slices=nil, serializer=nil)
      num_slices ||= default_parallelism
+     serializer ||= default_serializer

-     # use = jruby? ? (options[:use] || :direct) : :file
-     use = :file
-     serializer = get_serializer(options[:serializer], options[:batch_size])
-
-     if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
-       data = data.deep_copy
-     else
-       # For enumerator or range
-       data = data.to_a
-     end
+     serializer.check_each(data)

-     case use
-     when :direct
-       serializer.dump_to_java(data)
-       jrdd = jcontext.parallelize(data, num_slices)
-     when :file
-       file = Tempfile.new('to_parallelize', temp_dir)
-       serializer.dump(data, file)
-       file.close # not unlink
-       jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
-       file.unlink
-     end
+     # Through file
+     file = Tempfile.new('to_parallelize', temp_dir)
+     serializer.dump_to_io(data, file)
+     file.close # not unlink
+     jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)

      Spark::RDD.new(jrdd, self, serializer)
+   ensure
+     file && file.unlink
    end

    # Read a text file from HDFS, a local file system (available on all nodes), or any
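
Note: with the options hash gone, a custom serializer is now passed as the third positional argument. A hedged usage sketch; the build DSL names mirror those appearing elsewhere in this diff, and the batch size of 10 is illustrative:

    ser = Spark::Serializer.build { __batched__(__marshal__, 10) }

    $sc.parallelize(1..100, 4, ser)   # explicit serializer
    $sc.parallelize(1..100, 4)        # falls back to default_serializer
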
@@ -217,11 +230,12 @@ module Spark
    #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
    #   # => [1, 2]
    #
-   def text_file(path, min_partitions=nil, options={})
+   def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
      min_partitions ||= default_parallelism
-     serializer = get_serializer(options[:serializer], options[:batch_size])
+     serializer ||= default_serializer
+     deserializer = Spark::Serializer.build { __text__(encoding) }

-     Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, get_serializer('UTF8'))
+     Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
    end

    # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
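
Note: the removed UTF8 serializer is replaced by a Text serializer parameterised by encoding, so non-UTF-8 files can be read by passing an Encoding object. A small sketch based on the new signature (file names and encoding are illustrative):

    $sc.text_file('data.txt')                             # UTF-8 by default
    $sc.text_file('legacy.txt', 2, Encoding::ISO_8859_1)  # explicit partitions and encoding
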
@@ -240,10 +254,10 @@ module Spark
    #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
    #   # => ["1", "2", "3", "4"]
    #
-   def whole_text_files(path, min_partitions=nil, options={})
+   def whole_text_files(path, min_partitions=nil, serializer=nil)
      min_partitions ||= default_parallelism
-     serializer = get_serializer(options[:serializer], options[:batch_size])
-     deserializer = get_serializer('Pair', get_serializer('UTF8'), get_serializer('UTF8'))
+     serializer ||= default_serializer
+     deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }

      Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
    end
@@ -254,7 +268,7 @@ module Spark
    # If partitions is not specified, this will run over all partitions.
    #
    # == Example:
-   #   rdd = $sc.parallelize(0..10, 5, batch_size: 1)
+   #   rdd = $sc.parallelize(0..10, 5)
    #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
    #   # => ["[0, 1]", "[4, 5]"]
    #
@@ -282,9 +296,13 @@ module Spark
      # Rjb represent Fixnum as Integer but Jruby as Long
      partitions = to_java_array_list(convert_to_java_int(partitions))

+     # File for result
+     file = Tempfile.new('collect', temp_dir)
+
      mapped = rdd.new_rdd_from_command(command, *args)
-     iterator = PythonRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local)
-     mapped.collect_from_iterator(iterator)
+     RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+     mapped.collect_from_file(file)
    end

data/lib/spark/ext/io.rb CHANGED
@@ -12,6 +12,12 @@ module Spark
      unpack_int(read(4))
    end

+   def read_int_or_eof
+     bytes = read(4)
+     return Spark::Constant::DATA_EOF if bytes.nil?
+     unpack_int(bytes)
+   end
+
    def read_long
      unpack_long(read(8))
    end
@@ -35,8 +41,11 @@ module Spark
      write(pack_long(data))
    end

+   # Size and data can have different encoding
+   # Marshal: both ASCII
+   # Oj: ASCII and UTF-8
    def write_string(data)
-     write_int(data.size)
+     write_int(data.bytesize)
      write(data)
    end

@@ -55,3 +64,4 @@ module Spark
  end

  IO.__send__(:include, Spark::CoreExtension::IO)
+ StringIO.__send__(:include, Spark::CoreExtension::IO)
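
Note: the write_string fix matters for multi-byte data, since the length prefix must describe the bytes written to the stream, not the character count. A plain-Ruby illustration of the difference (the string is chosen arbitrarily):

    s = 'žluťoučký'
    s.size      # => 9   (characters)
    s.bytesize  # => 13  (bytes actually written by IO#write)
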
data/lib/spark/java_bridge/base.rb CHANGED
@@ -145,8 +145,8 @@ module Spark
    if class_name == 'JavaRDD'
      jrdd = RubyRDD.toRuby(object)

-     serializer = Spark.sc.get_serializer('marshal', nil)
-     deserializer = Spark.sc.get_serializer('marshal', 2) # is fully batched
+     serializer = Spark::Serializer.build { __batched__(__marshal__) }
+     serializer = Spark::Serializer.build { __batched__(__marshal__, 2) }

      return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
    end
data/lib/spark/rdd.rb CHANGED
@@ -34,6 +34,18 @@ module Spark
      @command = Spark::CommandBuilder.new(serializer, deserializer)
    end

+   def inspect
+     comms = @command.commands.join(' -> ')
+
+     result  = %{#<#{self.class.name}:0x#{object_id}}
+     result << %{ (#{comms})} unless comms.empty?
+     result << %{\n}
+     result << %{ Serializer: "#{serializer}"\n}
+     result << %{Deserializer: "#{deserializer}"}
+     result << %{>}
+     result
+   end
+

    # =============================================================================
    # Operators
@@ -159,7 +171,16 @@ module Spark
    end

    def to_java
-     rdd = self.reserialize('Marshal')
+     marshal = Spark::Serializer.marshal
+
+     if deserializer.batched?
+       ser = deserializer.deep_copy
+       ser.serializer = marshal
+     else
+       ser = Spark::Serializer.batched(marshal)
+     end
+
+     rdd = self.reserialize(ser)
      RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
    end

@@ -169,20 +190,32 @@ module Spark

    # Return an array that contains all of the elements in this RDD.
    # RJB raise an error if stage is killed.
-   def collect
-     collect_from_iterator(jrdd.collect.iterator)
+   def collect(as_enum=false)
+     file = Tempfile.new('collect', context.temp_dir)
+
+     RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
+
+     collect_from_file(file, as_enum)
    rescue => e
      raise Spark::RDDError, e.message
    end

-   def collect_from_iterator(iterator)
+   def collect_from_file(file, as_enum=false)
      if self.is_a?(PipelinedRDD)
        klass = @command.serializer
      else
        klass = @command.deserializer
      end

-     klass.load_from_iterator(iterator)
+     if as_enum
+       result = klass.load_from_file(file)
+     else
+       result = klass.load_from_io(file).to_a
+       file.close
+       file.unlink
+     end
+
+     result
    end

    # Convert an Array to Hash
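
Note: collect now goes through a temporary file instead of an RJB iterator and can optionally hand back the deserializer's enumerable result rather than materialising an Array. A hedged sketch; whether the result is a lazy enumerator depends on the serializer's load_from_file, which this diff does not show:

    rdd = $sc.parallelize(0...1000)

    rdd.collect        # => Array containing every element
    rdd.collect(true)  # => enumerable result; iterate without building the whole Array up front
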
@@ -198,7 +231,7 @@ module Spark
    # to satisfy the limit.
    #
    # == Example:
-   #   rdd = $sc.parallelize(0..100, 20, batch_size: 1)
+   #   rdd = $sc.parallelize(0..100, 20)
    #   rdd.take(5)
    #   # => [0, 1, 2, 3, 4]
    #
@@ -293,7 +326,7 @@ module Spark
    #   seq = lambda{|x,y| x+y}
    #   com = lambda{|x,y| x*y}
    #
-   #   rdd = $sc.parallelize(1..10, 2, batch_size: 1)
+   #   rdd = $sc.parallelize(1..10, 2)
    #   rdd.aggregate(1, seq, com)
    #   # => 656
    #
@@ -590,7 +623,7 @@ module Spark
    # of the original partition.
    #
    # == Example:
-   #   rdd = $sc.parallelize(0...4, 4, batch_size: 1)
+   #   rdd = $sc.parallelize(0...4, 4)
    #   rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
    #   # => [0, 1, 4, 9]
    #
@@ -623,7 +656,7 @@ module Spark
    # Return an RDD created by coalescing all elements within each partition into an array.
    #
    # == Example:
-   #   rdd = $sc.parallelize(0..10, 3, batch_size: 1)
+   #   rdd = $sc.parallelize(0..10, 3)
    #   rdd.glom.collect
    #   # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
    #
@@ -639,8 +672,14 @@ module Spark
    #   # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
    #
    def coalesce(num_partitions)
+     if self.is_a?(PipelinedRDD)
+       deser = @command.serializer
+     else
+       deser = @command.deserializer
+     end
+
      new_jrdd = jrdd.coalesce(num_partitions)
-     RDD.new(new_jrdd, context, @command.serializer, @command.deserializer)
+     RDD.new(new_jrdd, context, @command.serializer, deser)
    end

    # Return the Cartesian product of this RDD and another one, that is, the
@@ -655,7 +694,8 @@ module Spark
    #   # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
    #
    def cartesian(other)
-     _deserializer = Spark::Serializer::Cartesian.new.set(self.deserializer, other.deserializer)
+     _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
+
      new_jrdd = jrdd.cartesian(other.jrdd)
      RDD.new(new_jrdd, context, serializer, _deserializer)
    end
@@ -697,7 +737,7 @@ module Spark
    #
    def union(other)
      if self.serializer != other.serializer
-       other = other.reserialize(serializer.name, serializer.batch_size)
+       other = other.reserialize(serializer)
      end

      new_jrdd = jrdd.union(other.jrdd)
@@ -713,10 +753,7 @@ module Spark
    #   rdd.reserialize("oj").collect
    #   # => ["1", "2", "3"]
    #
-   def reserialize(new_serializer, new_batch_size=nil)
-     new_batch_size ||= deserializer.batch_size
-     new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
-
+   def reserialize(new_serializer)
      if serializer == new_serializer
        return self
      end
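
Note: reserialize now expects a serializer object rather than a name and batch size, even though the docstring example above still shows the old string form. A hedged sketch using the build DSL seen elsewhere in this diff; the __oj__ helper is assumed to exist for the Oj serializer, mirroring the confirmed __marshal__:

    rdd = $sc.parallelize(["1", "2", "3"])

    oj = Spark::Serializer.build { __batched__(__oj__) }
    rdd.reserialize(oj).collect
    # => ["1", "2", "3"]
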
@@ -906,7 +943,7 @@ module Spark
    #     x+y
    #   end
    #
-   #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2, batch_size: 1).map(lambda{|x| [x, 1]})
+   #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
    #   rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
    #   # => {"a"=>3, "b"=>2, "c"=>3}
    #
@@ -973,7 +1010,7 @@ module Spark
    #     x*y
    #   end
    #
-   #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2, batch_size: 1)
+   #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
    #   rdd.aggregate_by_key(1, method(:combine), method(:merge))
    #   # => [["b", 3], ["a", 16], ["c", 6]]
    #
@@ -1064,6 +1101,17 @@ module Spark
      self.sort_by('lambda{|(key, _)| key}')
    end

+   # Sort the RDD by value
+   #
+   # == Example:
+   #   rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
+   #   rdd.sort_by_value.collect
+   #   # => [["b", 1], ["c", 2], ["a", 3]]
+   #
+   def sort_by_value(ascending=true, num_partitions=nil)
+     self.sort_by('lambda{|(_, value)| value}')
+   end
+
    # Sorts this RDD by the given key_function
    #
    # This is a different implementation than spark. Sort by doesn't use
@@ -1190,6 +1238,7 @@ module Spark
    alias_method :defaultReducePartitions, :default_reduce_partitions
    alias_method :setName, :set_name
    alias_method :addLibrary, :add_library
+   alias_method :require, :add_library

    alias_method :flatMap, :flat_map
    alias_method :mapPartitions, :map_partitions
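
Note: the new alias lets worker-side dependencies be declared with a require-like call. A brief sketch; the library name is illustrative and the behaviour is exactly that of add_library:

    rdd = $sc.parallelize(1..10)
    rdd.require('matrix')   # same as rdd.add_library('matrix')
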