ruby-spark 1.0.0 → 1.1.0.1
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/command/base.rb
CHANGED
@@ -32,8 +32,8 @@ module Spark
     def deep_copy
       copy = self.dup
       copy.create_command
-      copy.serializer = self.serializer.
-      copy.deserializer = self.deserializer.
+      copy.serializer = self.serializer.deep_copy
+      copy.deserializer = self.deserializer.deep_copy
       copy.commands = self.commands.dup
       copy.libraries = self.libraries.dup
       copy.bound_objects = self.bound_objects.dup
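The point of this change: `dup` alone is a shallow copy, so a duplicated command would keep sharing its (possibly nested) serializer objects with the original. A minimal stand-alone sketch of the difference, using OpenStruct as a stand-in for the gem's serializer objects (this is not ruby-spark API):

    require 'ostruct'

    # A batched serializer wrapping an inner serializer, modelled with OpenStruct.
    batched = OpenStruct.new(serializer: OpenStruct.new(name: 'marshal'), batch_size: 1024)

    shallow = batched.dup             # the inner OpenStruct is still shared
    shallow.serializer.name = 'oj'
    batched.serializer.name           # => "oj" -- the original was mutated too

    deep = Marshal.load(Marshal.dump(batched))   # one way to get an independent copy
    deep.serializer.name = 'msgpack'
    batched.serializer.name                      # => still "oj"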
data/lib/spark/config.rb
CHANGED
@@ -9,7 +9,7 @@ module Spark
 
     TYPES = {
       'spark.shuffle.spill' => :boolean,
-      'spark.ruby.
+      'spark.ruby.serializer.compress' => :boolean
     }
 
     # Initialize java SparkConf and load default configuration.
@@ -55,8 +55,8 @@ module Spark
         errors << 'A master URL must be set in your configuration.'
       end
 
-      if Spark::Serializer.
-        errors << '
+      if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+        errors << 'Unknow serializer.'
       end
 
       scanned = get('spark.ruby.executor.command').scan('%s')
@@ -137,9 +137,9 @@ module Spark
       set_app_name('RubySpark')
       set_master('local[*]')
       set('spark.ruby.driver_home', Spark.home)
-      set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
       set('spark.ruby.serializer', default_serializer)
-      set('spark.ruby.
+      set('spark.ruby.serializer.compress', default_serializer_compress)
+      set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
       set('spark.ruby.executor.uri', default_executor_uri)
       set('spark.ruby.executor.command', default_executor_command)
       set('spark.ruby.executor.options', default_executor_options)
@@ -147,22 +147,16 @@ module Spark
       load_executor_envs
     end
 
-    # How to handle with data in method parallelize.
-    #
-    # == Possible options:
-    # inplace:: data are changed directly to save memory
-    # deep_copy:: data are cloned fist
-    #
-    def default_parallelize_strategy
-      ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
-    end
-
     def default_serializer
       ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
     end
 
-    def
-      ENV['
+    def default_serializer_compress
+      ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+    end
+
+    def default_serializer_batch_size
+      ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
     end
 
     # Ruby executor.
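Putting the new keys and defaults together, serializer behaviour can be driven either through the SPARK_RUBY_SERIALIZER* environment variables above or by setting the keys before the context starts. A hedged sketch (the `Spark.config` block and `Spark.start` follow the gem's README usage rather than this diff; the values are illustrative):

    require 'ruby-spark'

    Spark.config do
      set_app_name 'RubySpark'
      set 'spark.ruby.serializer', 'marshal'
      set 'spark.ruby.serializer.compress', false
      set 'spark.ruby.serializer.batch_size', 2048
    end

    Spark.start
    $sc = Spark.sc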
data/lib/spark/context.rb
CHANGED
@@ -2,6 +2,7 @@
 Spark.load_lib
 
 module Spark
+  ##
   # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
   # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
   #
@@ -57,10 +58,38 @@ module Spark
       sc.defaultParallelism
     end
 
-
-
-
-
+    # Default serializer
+    #
+    # Batch -> Compress -> Basic
+    #
+    def default_serializer
+      # Basic
+      serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+      # Compress
+      if config('spark.ruby.serializer.compress')
+        serializer = Spark::Serializer.compressed(serializer)
+      end
+
+      # Bactching
+      batch_size = default_batch_size
+      if batch_size == 'auto'
+        serializer = Spark::Serializer.auto_batched(serializer)
+      else
+        serializer = Spark::Serializer.batched(serializer, batch_size)
+      end
+
+      # Finally, "container" contains serializers
+      serializer
+    end
+
+    def default_batch_size
+      size = config('spark.ruby.serializer.batch_size').to_i
+      if size >= 1
+        size
+      else
+        'auto'
+      end
     end
 
     # Set a local property that affects jobs submitted from this thread, such as the
@@ -93,12 +122,11 @@ module Spark
     # be changed at runtime.
     #
     def config(key=nil)
-
-
-
-
-
-      Spark.config
+      if key
+        Spark.config.get(key)
+      else
+        Spark.config
+      end
     end
 
     # Add a file to be downloaded with this Spark job on every node.
@@ -164,10 +192,7 @@ module Spark
     # == Parameters:
     # data:: Range or Array
     # num_slices:: number of slice
-    #
-    # - use
-    # - serializer
-    # - batch_size
+    # serializer:: custom serializer (default: serializer based on configuration)
     #
     # == Examples:
     # $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
@@ -176,33 +201,21 @@ module Spark
     # $sc.parallelize(1..3).map(:to_s).collect
     # #=> ["1", "2", "3"]
     #
-    def parallelize(data, num_slices=nil,
+    def parallelize(data, num_slices=nil, serializer=nil)
       num_slices ||= default_parallelism
+      serializer ||= default_serializer
 
-
-      use = :file
-      serializer = get_serializer(options[:serializer], options[:batch_size])
-
-      if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
-        data = data.deep_copy
-      else
-        # For enumerator or range
-        data = data.to_a
-      end
+      serializer.check_each(data)
 
-
-
-
-
-
-      file = Tempfile.new('to_parallelize', temp_dir)
-      serializer.dump(data, file)
-      file.close # not unlink
-      jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
-      file.unlink
-      end
+      # Through file
+      file = Tempfile.new('to_parallelize', temp_dir)
+      serializer.dump_to_io(data, file)
+      file.close # not unlink
+      jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
 
       Spark::RDD.new(jrdd, self, serializer)
+    ensure
+      file && file.unlink
     end
 
     # Read a text file from HDFS, a local file system (available on all nodes), or any
@@ -217,11 +230,12 @@ module Spark
     # $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
     # # => [1, 2]
     #
-    def text_file(path, min_partitions=nil,
+    def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
       min_partitions ||= default_parallelism
-      serializer
+      serializer ||= default_serializer
+      deserializer = Spark::Serializer.build { __text__(encoding) }
 
-      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer,
+      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
     end
 
     # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
@@ -240,10 +254,10 @@ module Spark
     # $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
     # # => ["1", "2", "3", "4"]
     #
-    def whole_text_files(path, min_partitions=nil,
+    def whole_text_files(path, min_partitions=nil, serializer=nil)
       min_partitions ||= default_parallelism
-      serializer
-      deserializer
+      serializer ||= default_serializer
+      deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
 
       Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
     end
@@ -254,7 +268,7 @@ module Spark
     # If partitions is not specified, this will run over all partitions.
     #
     # == Example:
-    # rdd = $sc.parallelize(0..10, 5
+    # rdd = $sc.parallelize(0..10, 5)
     # $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
     # # => ["[0, 1]", "[4, 5]"]
     #
@@ -282,9 +296,13 @@ module Spark
       # Rjb represent Fixnum as Integer but Jruby as Long
       partitions = to_java_array_list(convert_to_java_int(partitions))
 
+      # File for result
+      file = Tempfile.new('collect', temp_dir)
+
       mapped = rdd.new_rdd_from_command(command, *args)
-
-
+      RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+      mapped.collect_from_file(file)
     end
 
 
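The new `default_serializer` composes three layers (Batched or AutoBatched on the outside, optional Compressed in the middle, the configured basic serializer inside), and `parallelize`, `text_file` and `whole_text_files` now take a serializer object directly instead of an options hash. A hedged sketch built only from calls that appear in this diff (batch size and file path are illustrative):

    # Per-RDD serializer instead of the configured default:
    ser = Spark::Serializer.build { __batched__(__marshal__, 512) }

    rdd = $sc.parallelize(1..1_000, 4, ser)      # serializer is now the third argument
    rdd.map(lambda{|x| x * 2}).collect.last      # => 2000

    # text_file gained an explicit encoding; lines are read back with __text__:
    lines = $sc.text_file('data.txt', nil, Encoding::UTF_8)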
data/lib/spark/ext/io.rb
CHANGED
@@ -12,6 +12,12 @@ module Spark
         unpack_int(read(4))
       end
 
+      def read_int_or_eof
+        bytes = read(4)
+        return Spark::Constant::DATA_EOF if bytes.nil?
+        unpack_int(bytes)
+      end
+
       def read_long
         unpack_long(read(8))
       end
@@ -35,8 +41,11 @@ module Spark
         write(pack_long(data))
       end
 
+      # Size and data can have different encoding
+      # Marshal: both ASCII
+      # Oj: ASCII and UTF-8
       def write_string(data)
-        write_int(data.
+        write_int(data.bytesize)
         write(data)
       end
 
@@ -55,3 +64,4 @@ module Spark
 end
 
 IO.__send__(:include, Spark::CoreExtension::IO)
+StringIO.__send__(:include, Spark::CoreExtension::IO)
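The `write_string` change matters for multibyte data: the 4-byte length prefix has to describe bytes, not characters, and the two differ as soon as the string is not plain ASCII:

    s = 'žlutý'
    s.length     # => 5 characters
    s.bytesize   # => 7 bytes in UTF-8 -- the value the length prefix must carry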
data/lib/spark/java_bridge/base.rb
CHANGED
@@ -145,8 +145,8 @@ module Spark
       if class_name == 'JavaRDD'
         jrdd = RubyRDD.toRuby(object)
 
-        serializer
-
+        serializer   = Spark::Serializer.build { __batched__(__marshal__) }
+        deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }
 
         return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
       end
data/lib/spark/rdd.rb
CHANGED
@@ -34,6 +34,18 @@ module Spark
       @command = Spark::CommandBuilder.new(serializer, deserializer)
     end
 
+    def inspect
+      comms = @command.commands.join(' -> ')
+
+      result = %{#<#{self.class.name}:0x#{object_id}}
+      result << %{ (#{comms})} unless comms.empty?
+      result << %{\n}
+      result << %{ Serializer: "#{serializer}"\n}
+      result << %{Deserializer: "#{deserializer}"}
+      result << %{>}
+      result
+    end
+
 
     # =============================================================================
     # Operators
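Given the format string above, inspecting an RDD prints its command chain and both serializers. The output below is only illustrative (object id, command names and serializer strings depend on the actual configuration):

    rdd = $sc.parallelize(1..5).map(lambda{|x| x * 2})
    puts rdd.inspect
    # e.g.
    # #<Spark::PipelinedRDD:0x70251418987560 (Map)
    #  Serializer: "Batched(1024) -> Marshal"
    # Deserializer: "Batched(1024) -> Marshal">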
@@ -159,7 +171,16 @@ module Spark
     end
 
     def to_java
-
+      marshal = Spark::Serializer.marshal
+
+      if deserializer.batched?
+        ser = deserializer.deep_copy
+        ser.serializer = marshal
+      else
+        ser = Spark::Serializer.batched(marshal)
+      end
+
+      rdd = self.reserialize(ser)
       RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
     end
 
@@ -169,20 +190,32 @@ module Spark
 
     # Return an array that contains all of the elements in this RDD.
     # RJB raise an error if stage is killed.
-    def collect
-
+    def collect(as_enum=false)
+      file = Tempfile.new('collect', context.temp_dir)
+
+      RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
+
+      collect_from_file(file, as_enum)
     rescue => e
       raise Spark::RDDError, e.message
     end
 
-    def
+    def collect_from_file(file, as_enum=false)
       if self.is_a?(PipelinedRDD)
         klass = @command.serializer
       else
        klass = @command.deserializer
       end
 
-
+      if as_enum
+        result = klass.load_from_file(file)
+      else
+        result = klass.load_from_io(file).to_a
+        file.close
+        file.unlink
+      end
+
+      result
     end
 
     # Convert an Array to Hash
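`collect` now spills the result through a temp file, and the new `as_enum` flag decides whether `collect_from_file` materializes an Array (and removes the file) or hands back the result of `load_from_file`, which is presumably a lazy enumeration over the file. A hedged sketch:

    rdd = $sc.parallelize(1..100_000, 8)

    rdd.collect.size          # whole result as an Array, temp file already removed

    rdd.collect(true).each_slice(1_000) do |chunk|   # assumes an Enumerator is returned
      # process chunk by chunk without holding the entire result in memory
    end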
@@ -198,7 +231,7 @@ module Spark
     # to satisfy the limit.
     #
     # == Example:
-    # rdd = $sc.parallelize(0..100, 20
+    # rdd = $sc.parallelize(0..100, 20)
     # rdd.take(5)
     # # => [0, 1, 2, 3, 4]
     #
@@ -293,7 +326,7 @@ module Spark
     # seq = lambda{|x,y| x+y}
     # com = lambda{|x,y| x*y}
     #
-    # rdd = $sc.parallelize(1..10, 2
+    # rdd = $sc.parallelize(1..10, 2)
     # rdd.aggregate(1, seq, com)
     # # => 656
     #
@@ -590,7 +623,7 @@ module Spark
     # of the original partition.
     #
     # == Example:
-    # rdd = $sc.parallelize(0...4, 4
+    # rdd = $sc.parallelize(0...4, 4)
     # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
     # # => [0, 1, 4, 9]
     #
@@ -623,7 +656,7 @@ module Spark
     # Return an RDD created by coalescing all elements within each partition into an array.
     #
     # == Example:
-    # rdd = $sc.parallelize(0..10, 3
+    # rdd = $sc.parallelize(0..10, 3)
     # rdd.glom.collect
     # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
     #
@@ -639,8 +672,14 @@ module Spark
     # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
     #
     def coalesce(num_partitions)
+      if self.is_a?(PipelinedRDD)
+        deser = @command.serializer
+      else
+        deser = @command.deserializer
+      end
+
       new_jrdd = jrdd.coalesce(num_partitions)
-      RDD.new(new_jrdd, context, @command.serializer,
+      RDD.new(new_jrdd, context, @command.serializer, deser)
     end
 
     # Return the Cartesian product of this RDD and another one, that is, the
@@ -655,7 +694,8 @@ module Spark
     # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
     #
     def cartesian(other)
-      _deserializer = Spark::Serializer::Cartesian.new
+      _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
+
       new_jrdd = jrdd.cartesian(other.jrdd)
       RDD.new(new_jrdd, context, serializer, _deserializer)
     end
@@ -697,7 +737,7 @@ module Spark
     #
     def union(other)
       if self.serializer != other.serializer
-        other = other.reserialize(serializer
+        other = other.reserialize(serializer)
       end
 
       new_jrdd = jrdd.union(other.jrdd)
@@ -713,10 +753,7 @@ module Spark
     # rdd.reserialize("oj").collect
     # # => ["1", "2", "3"]
     #
-    def reserialize(new_serializer
-      new_batch_size ||= deserializer.batch_size
-      new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
-
+    def reserialize(new_serializer)
       if serializer == new_serializer
         return self
       end
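Since `reserialize` now expects a serializer object rather than a name plus batch size, switching serialization for an existing RDD would presumably be written with the build DSL. The `__oj__` shorthand below is an assumption based on the serializer/oj.rb file listed above, and the batch size is illustrative:

    oj_batched = Spark::Serializer.build { __batched__(__oj__, 100) }   # __oj__ assumed

    rdd = $sc.parallelize(['1', '2', '3'])
    rdd.reserialize(oj_batched).collect   # => ["1", "2", "3"]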
@@ -906,7 +943,7 @@ module Spark
     # x+y
     # end
     #
-    # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2
+    # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
     # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
     # # => {"a"=>3, "b"=>2, "c"=>3}
     #
@@ -973,7 +1010,7 @@ module Spark
     # x*y
     # end
     #
-    # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2
+    # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
     # rdd.aggregate_by_key(1, method(:combine), method(:merge))
     # # => [["b", 3], ["a", 16], ["c", 6]]
     #
@@ -1064,6 +1101,17 @@ module Spark
       self.sort_by('lambda{|(key, _)| key}')
     end
 
+    # Sort the RDD by value
+    #
+    # == Example:
+    # rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
+    # rdd.sort_by_value.collect
+    # # => [["b", 1], ["c", 2], ["a", 3]]
+    #
+    def sort_by_value(ascending=true, num_partitions=nil)
+      self.sort_by('lambda{|(_, value)| value}')
+    end
+
     # Sorts this RDD by the given key_function
     #
     # This is a different implementation than spark. Sort by doesn't use
@@ -1190,6 +1238,7 @@ module Spark
     alias_method :defaultReducePartitions, :default_reduce_partitions
     alias_method :setName, :set_name
     alias_method :addLibrary, :add_library
+    alias_method :require, :add_library
 
     alias_method :flatMap, :flat_map
     alias_method :mapPartitions, :map_partitions
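With the alias added above, attaching a Ruby library to the worker processes can be spelled either way; the library names below are only illustrative:

    rdd = $sc.parallelize(1..10)
    rdd.add_library('bigdecimal')
    rdd.require('matrix')          # same behaviour as add_library, via the new alias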