ruby-spark 1.0.0 → 1.1.0.1

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/command/base.rb CHANGED
@@ -16,6 +16,10 @@ class Spark::Command::Base
      end
    end
 
+ def to_s
+   self.class.name.split('::').last
+ end
+
  def self.error(message)
    raise Spark::CommandError, message
  end
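Usage note: to_s returns just the demodulized class name, which RDD#inspect (further down in this diff) joins into a readable pipeline. Plain Ruby illustration (the command class name is illustrative):

    'Spark::Command::Map'.split('::').last   # => "Map"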
data/lib/spark/command_builder.rb CHANGED
@@ -32,8 +32,8 @@ module Spark
  def deep_copy
    copy = self.dup
    copy.create_command
-   copy.serializer = self.serializer.dup
-   copy.deserializer = self.deserializer.dup
+   copy.serializer = self.serializer.deep_copy
+   copy.deserializer = self.deserializer.deep_copy
    copy.commands = self.commands.dup
    copy.libraries = self.libraries.dup
    copy.bound_objects = self.bound_objects.dup
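Usage note: a shallow dup stopped being enough once serializers became nested containers; duplicating only the outer object would leave both copies sharing the inner serializer. Simplified illustration with stand-in structs (not the gem's classes):

    Inner = Struct.new(:batch_size)
    Outer = Struct.new(:inner)

    a = Outer.new(Inner.new(1024))
    b = a.dup                # shallow copy: b.inner is the same object as a.inner
    b.inner.batch_size = 1
    a.inner.batch_size       # => 1, the change leaked into the original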
data/lib/spark/config.rb CHANGED
@@ -9,7 +9,7 @@ module Spark
 
  TYPES = {
    'spark.shuffle.spill' => :boolean,
-   'spark.ruby.batch_size' => :integer
+   'spark.ruby.serializer.compress' => :boolean
  }
 
  # Initialize java SparkConf and load default configuration.
@@ -55,8 +55,8 @@ module Spark
    errors << 'A master URL must be set in your configuration.'
  end
 
- if Spark::Serializer.get(get('spark.ruby.serializer')).nil?
-   errors << 'Default serializer must be set in your configuration.'
+ if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+   errors << 'Unknow serializer.'
  end
 
  scanned = get('spark.ruby.executor.command').scan('%s')
@@ -137,9 +137,9 @@ module Spark
  set_app_name('RubySpark')
  set_master('local[*]')
  set('spark.ruby.driver_home', Spark.home)
- set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
  set('spark.ruby.serializer', default_serializer)
- set('spark.ruby.batch_size', default_batch_size)
+ set('spark.ruby.serializer.compress', default_serializer_compress)
+ set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
  set('spark.ruby.executor.uri', default_executor_uri)
  set('spark.ruby.executor.command', default_executor_command)
  set('spark.ruby.executor.options', default_executor_options)
@@ -147,22 +147,16 @@ module Spark
    load_executor_envs
  end
 
- # How to handle with data in method parallelize.
- #
- # == Possible options:
- # inplace:: data are changed directly to save memory
- # deep_copy:: data are cloned fist
- #
- def default_parallelize_strategy
-   ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
- end
-
  def default_serializer
    ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
  end
 
- def default_batch_size
-   ENV['SPARK_RUBY_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE.to_s
+ def default_serializer_compress
+   ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+ end
+
+ def default_serializer_batch_size
+   ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
  end
 
  # Ruby executor.
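Usage note: the batch-size setting moves under the serializer namespace and gains a compression switch. A rough sketch of supplying these settings through the environment before the context starts (values are illustrative; Spark.start/Spark.sc follow the gem's README entry points):

    require 'spark'

    ENV['SPARK_RUBY_SERIALIZER']            = 'oj'    # basic serializer name
    ENV['SPARK_RUBY_SERIALIZER_COMPRESS']   = 'true'  # wrap the serializer in Compressed
    ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] = '2048'  # anything below 1 falls back to 'auto'

    Spark.start
    $sc = Spark.sc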
data/lib/spark/context.rb CHANGED
@@ -2,6 +2,7 @@
  Spark.load_lib
 
  module Spark
+   ##
    # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
    # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
    #
@@ -57,10 +58,38 @@ module Spark
    sc.defaultParallelism
  end
 
- def get_serializer(serializer, *args)
-   serializer = Spark::Serializer.get(serializer)
-   serializer ||= Spark::Serializer.get(config['spark.ruby.serializer'])
-   serializer.new(config['spark.ruby.batch_size']).set(*args)
+ # Default serializer
+ #
+ # Batch -> Compress -> Basic
+ #
+ def default_serializer
+   # Basic
+   serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+   # Compress
+   if config('spark.ruby.serializer.compress')
+     serializer = Spark::Serializer.compressed(serializer)
+   end
+
+   # Bactching
+   batch_size = default_batch_size
+   if batch_size == 'auto'
+     serializer = Spark::Serializer.auto_batched(serializer)
+   else
+     serializer = Spark::Serializer.batched(serializer, batch_size)
+   end
+
+   # Finally, "container" contains serializers
+   serializer
+ end
+
+ def default_batch_size
+   size = config('spark.ruby.serializer.batch_size').to_i
+   if size >= 1
+     size
+   else
+     'auto'
+   end
  end
 
  # Set a local property that affects jobs submitted from this thread, such as the
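Usage note: the serializer handed to every new RDD is now a small chain of wrappers rather than a single batched object. A sketch of building the same chain by hand with the factory methods used above (the 'marshal' registry name is assumed; compression and the fixed batch size of 1024 are illustrative):

    basic      = Spark::Serializer.find!('marshal').new
    compressed = Spark::Serializer.compressed(basic)           # only when compression is enabled
    batched    = Spark::Serializer.batched(compressed, 1024)   # or Spark::Serializer.auto_batched(compressed)

    # batched is the "container": Batched -> Compressed -> Marshal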
@@ -93,12 +122,11 @@ module Spark
  # be changed at runtime.
  #
  def config(key=nil)
-   # if key
-   #   Spark.config[key]
-   # else
-   #   Spark.config.get_all
-   # end
-   Spark.config
+   if key
+     Spark.config.get(key)
+   else
+     Spark.config
+   end
  end
 
  # Add a file to be downloaded with this Spark job on every node.
@@ -164,10 +192,7 @@ module Spark
  # == Parameters:
  # data:: Range or Array
  # num_slices:: number of slice
- # options::
- #   - use
- #   - serializer
- #   - batch_size
+ # serializer:: custom serializer (default: serializer based on configuration)
  #
  # == Examples:
  # $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
@@ -176,33 +201,21 @@ module Spark
  # $sc.parallelize(1..3).map(:to_s).collect
  # #=> ["1", "2", "3"]
  #
- def parallelize(data, num_slices=nil, options={})
+ def parallelize(data, num_slices=nil, serializer=nil)
    num_slices ||= default_parallelism
+   serializer ||= default_serializer
 
-   # use = jruby? ? (options[:use] || :direct) : :file
-   use = :file
-   serializer = get_serializer(options[:serializer], options[:batch_size])
-
-   if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
-     data = data.deep_copy
-   else
-     # For enumerator or range
-     data = data.to_a
-   end
+   serializer.check_each(data)
 
-   case use
-   when :direct
-     serializer.dump_to_java(data)
-     jrdd = jcontext.parallelize(data, num_slices)
-   when :file
-     file = Tempfile.new('to_parallelize', temp_dir)
-     serializer.dump(data, file)
-     file.close # not unlink
-     jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
-     file.unlink
-   end
+   # Through file
+   file = Tempfile.new('to_parallelize', temp_dir)
+   serializer.dump_to_io(data, file)
+   file.close # not unlink
+   jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
 
    Spark::RDD.new(jrdd, self, serializer)
+ ensure
+   file && file.unlink
  end
 
  # Read a text file from HDFS, a local file system (available on all nodes), or any
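Usage note: parallelize now takes a serializer object instead of an options hash. A hedged example using the Serializer.build DSL that appears later in this diff (the __batched__/__marshal__ helper names are taken from the java_bridge hunk below; batch size 2 is illustrative):

    ser = Spark::Serializer.build { __batched__(__marshal__, 2) }

    rdd = $sc.parallelize(1..10, 2, ser)
    rdd.map(lambda{|x| x * 2}).collect
    # => [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]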
@@ -217,11 +230,12 @@ module Spark
  # $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
  # # => [1, 2]
  #
- def text_file(path, min_partitions=nil, options={})
+ def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
    min_partitions ||= default_parallelism
-   serializer = get_serializer(options[:serializer], options[:batch_size])
+   serializer ||= default_serializer
+   deserializer = Spark::Serializer.build { __text__(encoding) }
 
-   Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, get_serializer('UTF8'))
+   Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
  end
 
  # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
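Usage note: the hard-coded 'UTF8' deserializer is gone; lines are decoded by a Text serializer with a caller-supplied encoding. A small sketch (the file path is illustrative):

    rdd = $sc.text_file('data/access.log', 2, Encoding::ISO_8859_1)
    rdd.map(lambda{|line| line.upcase}).collect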
@@ -240,10 +254,10 @@ module Spark
  # $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
  # # => ["1", "2", "3", "4"]
  #
- def whole_text_files(path, min_partitions=nil, options={})
+ def whole_text_files(path, min_partitions=nil, serializer=nil)
    min_partitions ||= default_parallelism
-   serializer = get_serializer(options[:serializer], options[:batch_size])
-   deserializer = get_serializer('Pair', get_serializer('UTF8'), get_serializer('UTF8'))
+   serializer ||= default_serializer
+   deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
 
    Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
  end
@@ -254,7 +268,7 @@ module Spark
  # If partitions is not specified, this will run over all partitions.
  #
  # == Example:
- # rdd = $sc.parallelize(0..10, 5, batch_size: 1)
+ # rdd = $sc.parallelize(0..10, 5)
  # $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
  # # => ["[0, 1]", "[4, 5]"]
  #
@@ -282,9 +296,13 @@ module Spark
    # Rjb represent Fixnum as Integer but Jruby as Long
    partitions = to_java_array_list(convert_to_java_int(partitions))
 
+   # File for result
+   file = Tempfile.new('collect', temp_dir)
+
    mapped = rdd.new_rdd_from_command(command, *args)
-   iterator = PythonRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local)
-   mapped.collect_from_iterator(iterator)
+   RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+   mapped.collect_from_file(file)
  end
 
data/lib/spark/ext/io.rb CHANGED
@@ -12,6 +12,12 @@ module Spark
    unpack_int(read(4))
  end
 
+ def read_int_or_eof
+   bytes = read(4)
+   return Spark::Constant::DATA_EOF if bytes.nil?
+   unpack_int(bytes)
+ end
+
  def read_long
    unpack_long(read(8))
  end
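Usage note: read_int_or_eof lets a reader loop until the stream ends instead of failing on a nil read. An illustrative consuming pattern (the loop and the handle method are sketches, not the gem's worker code):

    # io is any IO/StringIO extended by Spark::CoreExtension::IO
    loop do
      size = io.read_int_or_eof
      break if size == Spark::Constant::DATA_EOF

      handle(io.read(size))   # hypothetical handler for one length-prefixed chunk
    end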
@@ -35,8 +41,11 @@ module Spark
    write(pack_long(data))
  end
 
+ # Size and data can have different encoding
+ # Marshal: both ASCII
+ # Oj: ASCII and UTF-8
  def write_string(data)
-   write_int(data.size)
+   write_int(data.bytesize)
    write(data)
  end
 
@@ -55,3 +64,4 @@ module Spark
  end
 
  IO.__send__(:include, Spark::CoreExtension::IO)
+ StringIO.__send__(:include, Spark::CoreExtension::IO)
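Usage note: write_string now prefixes the payload with its byte length rather than its character length, which matters as soon as multi-byte UTF-8 strings (e.g. from the Oj serializer) are written. Plain Ruby illustration:

    s = "žluťoučký"   # 9 characters, 13 bytes in UTF-8

    s.size       # => 9
    s.bytesize   # => 13

    # A size-based prefix would make the reader consume too few bytes;
    # bytesize matches what IO#write actually emits.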
data/lib/spark/java_bridge/base.rb CHANGED
@@ -145,8 +145,8 @@ module Spark
  if class_name == 'JavaRDD'
    jrdd = RubyRDD.toRuby(object)
 
-   serializer = Spark.sc.get_serializer('marshal', nil)
-   deserializer = Spark.sc.get_serializer('marshal', 2) # is fully batched
+   serializer = Spark::Serializer.build { __batched__(__marshal__) }
+   serializer = Spark::Serializer.build { __batched__(__marshal__, 2) }
 
    return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
  end
data/lib/spark/rdd.rb CHANGED
@@ -34,6 +34,18 @@ module Spark
    @command = Spark::CommandBuilder.new(serializer, deserializer)
  end
 
+ def inspect
+   comms = @command.commands.join(' -> ')
+
+   result = %{#<#{self.class.name}:0x#{object_id}}
+   result << %{ (#{comms})} unless comms.empty?
+   result << %{\n}
+   result << %{ Serializer: "#{serializer}"\n}
+   result << %{Deserializer: "#{deserializer}"}
+   result << %{>}
+   result
+ end
+
 
  # =============================================================================
  # Operators
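Usage note: together with Command::Base#to_s above, inspect now prints the chain of applied commands plus both serializer descriptions. Roughly (shape inferred from the format strings; exact values are illustrative):

    rdd = $sc.parallelize(0..5).map(lambda{|x| x * 2})
    puts rdd.inspect
    # #<Spark::PipelinedRDD:0x... (Map)
    #   Serializer: "..."
    #  Deserializer: "...">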
@@ -159,7 +171,16 @@ module Spark
  end
 
  def to_java
-   rdd = self.reserialize('Marshal')
+   marshal = Spark::Serializer.marshal
+
+   if deserializer.batched?
+     ser = deserializer.deep_copy
+     ser.serializer = marshal
+   else
+     ser = Spark::Serializer.batched(marshal)
+   end
+
+   rdd = self.reserialize(ser)
    RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
  end
 
@@ -169,20 +190,32 @@ module Spark
 
  # Return an array that contains all of the elements in this RDD.
  # RJB raise an error if stage is killed.
- def collect
-   collect_from_iterator(jrdd.collect.iterator)
+ def collect(as_enum=false)
+   file = Tempfile.new('collect', context.temp_dir)
+
+   RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
+
+   collect_from_file(file, as_enum)
  rescue => e
    raise Spark::RDDError, e.message
  end
 
- def collect_from_iterator(iterator)
+ def collect_from_file(file, as_enum=false)
    if self.is_a?(PipelinedRDD)
      klass = @command.serializer
    else
      klass = @command.deserializer
    end
 
-   klass.load_from_iterator(iterator)
+   if as_enum
+     result = klass.load_from_file(file)
+   else
+     result = klass.load_from_io(file).to_a
+     file.close
+     file.unlink
+   end
+
+   result
  end
 
  # Convert an Array to Hash
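Usage note: collect no longer iterates over an RJB iterator; the JVM side writes the result to a temp file which is then deserialized in Ruby. Passing true appears to return the lazily loaded result of load_from_file instead of a fully materialized Array (the lazy behaviour is an assumption based on the branch above):

    rdd = $sc.parallelize(0..100_000)

    rdd.collect         # => Array, everything loaded at once
    rdd.collect(true)   # => result of load_from_file, read from the temp file on demand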
@@ -198,7 +231,7 @@ module Spark
  # to satisfy the limit.
  #
  # == Example:
- # rdd = $sc.parallelize(0..100, 20, batch_size: 1)
+ # rdd = $sc.parallelize(0..100, 20)
  # rdd.take(5)
  # # => [0, 1, 2, 3, 4]
  #
@@ -293,7 +326,7 @@ module Spark
  # seq = lambda{|x,y| x+y}
  # com = lambda{|x,y| x*y}
  #
- # rdd = $sc.parallelize(1..10, 2, batch_size: 1)
+ # rdd = $sc.parallelize(1..10, 2)
  # rdd.aggregate(1, seq, com)
  # # => 656
  #
@@ -590,7 +623,7 @@ module Spark
  # of the original partition.
  #
  # == Example:
- # rdd = $sc.parallelize(0...4, 4, batch_size: 1)
+ # rdd = $sc.parallelize(0...4, 4)
  # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
  # # => [0, 1, 4, 9]
  #
@@ -623,7 +656,7 @@ module Spark
  # Return an RDD created by coalescing all elements within each partition into an array.
  #
  # == Example:
- # rdd = $sc.parallelize(0..10, 3, batch_size: 1)
+ # rdd = $sc.parallelize(0..10, 3)
  # rdd.glom.collect
  # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
  #
@@ -639,8 +672,14 @@ module Spark
  # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
  #
  def coalesce(num_partitions)
+   if self.is_a?(PipelinedRDD)
+     deser = @command.serializer
+   else
+     deser = @command.deserializer
+   end
+
    new_jrdd = jrdd.coalesce(num_partitions)
-   RDD.new(new_jrdd, context, @command.serializer, @command.deserializer)
+   RDD.new(new_jrdd, context, @command.serializer, deser)
  end
 
  # Return the Cartesian product of this RDD and another one, that is, the
@@ -655,7 +694,8 @@ module Spark
  # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
  #
  def cartesian(other)
-   _deserializer = Spark::Serializer::Cartesian.new.set(self.deserializer, other.deserializer)
+   _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
+
    new_jrdd = jrdd.cartesian(other.jrdd)
    RDD.new(new_jrdd, context, serializer, _deserializer)
  end
@@ -697,7 +737,7 @@ module Spark
  #
  def union(other)
    if self.serializer != other.serializer
-     other = other.reserialize(serializer.name, serializer.batch_size)
+     other = other.reserialize(serializer)
    end
 
    new_jrdd = jrdd.union(other.jrdd)
@@ -713,10 +753,7 @@ module Spark
  # rdd.reserialize("oj").collect
  # # => ["1", "2", "3"]
  #
- def reserialize(new_serializer, new_batch_size=nil)
-   new_batch_size ||= deserializer.batch_size
-   new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
-
+ def reserialize(new_serializer)
    if serializer == new_serializer
      return self
    end
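Usage note: reserialize now expects a serializer object rather than a name and batch size (the "oj" string in the doc comment above reflects the old form). A hedged sketch with the Serializer.build DSL; the __oj__ helper name follows the __marshal__/__text__ pattern and is an assumption:

    rdd = $sc.parallelize([1, 2, 3])

    new_ser = Spark::Serializer.build { __batched__(__oj__, 2) }
    rdd.reserialize(new_ser).collect
    # => [1, 2, 3]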
@@ -906,7 +943,7 @@ module Spark
  # x+y
  # end
  #
- # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2, batch_size: 1).map(lambda{|x| [x, 1]})
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
  # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
  # # => {"a"=>3, "b"=>2, "c"=>3}
  #
@@ -973,7 +1010,7 @@ module Spark
  # x*y
  # end
  #
- # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2, batch_size: 1)
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
  # rdd.aggregate_by_key(1, method(:combine), method(:merge))
  # # => [["b", 3], ["a", 16], ["c", 6]]
  #
@@ -1064,6 +1101,17 @@ module Spark
    self.sort_by('lambda{|(key, _)| key}')
  end
 
+ # Sort the RDD by value
+ #
+ # == Example:
+ # rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
+ # rdd.sort_by_value.collect
+ # # => [["b", 1], ["c", 2], ["a", 3]]
+ #
+ def sort_by_value(ascending=true, num_partitions=nil)
+   self.sort_by('lambda{|(_, value)| value}')
+ end
+
  # Sorts this RDD by the given key_function
  #
  # This is a different implementation than spark. Sort by doesn't use
@@ -1190,6 +1238,7 @@ module Spark
  alias_method :defaultReducePartitions, :default_reduce_partitions
  alias_method :setName, :set_name
  alias_method :addLibrary, :add_library
+ alias_method :require, :add_library
 
  alias_method :flatMap, :flat_map
  alias_method :mapPartitions, :map_partitions