ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/command/base.rb
CHANGED
@@ -32,8 +32,8 @@ module Spark
     def deep_copy
       copy = self.dup
       copy.create_command
-      copy.serializer = self.serializer.
-      copy.deserializer = self.deserializer.
+      copy.serializer = self.serializer.deep_copy
+      copy.deserializer = self.deserializer.deep_copy
       copy.commands = self.commands.dup
       copy.libraries = self.libraries.dup
       copy.bound_objects = self.bound_objects.dup
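The change makes the copied command deep-copy its serializers instead of sharing them. A minimal illustration of the behaviour this guards against, assuming a hypothetical `command` instance:

```ruby
# Illustrative only: after this change the copy no longer shares serializer
# instances with the original, so mutating one cannot leak into the other.
copy = command.deep_copy
copy.serializer.equal?(command.serializer)      # => false
copy.deserializer.equal?(command.deserializer)  # => false
```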
data/lib/spark/config.rb
CHANGED
@@ -9,7 +9,7 @@ module Spark

    TYPES = {
      'spark.shuffle.spill' => :boolean,
-     'spark.ruby.
+     'spark.ruby.serializer.compress' => :boolean
    }

    # Initialize java SparkConf and load default configuration.
@@ -55,8 +55,8 @@ module Spark
      errors << 'A master URL must be set in your configuration.'
    end

-   if Spark::Serializer.
-     errors << '
+   if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+     errors << 'Unknow serializer.'
    end

    scanned = get('spark.ruby.executor.command').scan('%s')
@@ -137,9 +137,9 @@ module Spark
      set_app_name('RubySpark')
      set_master('local[*]')
      set('spark.ruby.driver_home', Spark.home)
-     set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
      set('spark.ruby.serializer', default_serializer)
-     set('spark.ruby.
+     set('spark.ruby.serializer.compress', default_serializer_compress)
+     set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
      set('spark.ruby.executor.uri', default_executor_uri)
      set('spark.ruby.executor.command', default_executor_command)
      set('spark.ruby.executor.options', default_executor_options)
@@ -147,22 +147,16 @@ module Spark
      load_executor_envs
    end

-   # How to handle with data in method parallelize.
-   #
-   # == Possible options:
-   # inplace:: data are changed directly to save memory
-   # deep_copy:: data are cloned fist
-   #
-   def default_parallelize_strategy
-     ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
-   end
-
    def default_serializer
      ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
    end

-   def
-     ENV['
+   def default_serializer_compress
+     ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+   end
+
+   def default_serializer_batch_size
+     ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
    end

    # Ruby executor.
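The serializer-related defaults above read from the environment, so they can be overridden before the context is started; an illustrative sketch (the variable names come from the hunk above, the values are placeholders):

```ruby
# Set these before the Spark context is created so the defaults pick them up.
ENV['SPARK_RUBY_SERIALIZER']            = 'marshal'  # assumes 'marshal' is a registered serializer name
ENV['SPARK_RUBY_SERIALIZER_COMPRESS']   = 'true'
ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] = '2048'
```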
data/lib/spark/context.rb
CHANGED
@@ -2,6 +2,7 @@
  Spark.load_lib

  module Spark
+   ##
    # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
    # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
    #
@@ -57,10 +58,38 @@ module Spark
      sc.defaultParallelism
    end

-
-
-
-
+   # Default serializer
+   #
+   # Batch -> Compress -> Basic
+   #
+   def default_serializer
+     # Basic
+     serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+     # Compress
+     if config('spark.ruby.serializer.compress')
+       serializer = Spark::Serializer.compressed(serializer)
+     end
+
+     # Bactching
+     batch_size = default_batch_size
+     if batch_size == 'auto'
+       serializer = Spark::Serializer.auto_batched(serializer)
+     else
+       serializer = Spark::Serializer.batched(serializer, batch_size)
+     end
+
+     # Finally, "container" contains serializers
+     serializer
+   end
+
+   def default_batch_size
+     size = config('spark.ruby.serializer.batch_size').to_i
+     if size >= 1
+       size
+     else
+       'auto'
+     end
    end

    # Set a local property that affects jobs submitted from this thread, such as the
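So the default serializer is assembled inside-out: a basic serializer, optionally wrapped in compression, then wrapped in (auto-)batching. A rough hand-built equivalent, assuming the Marshal serializer with compression enabled and a fixed batch size of 1024:

```ruby
# Illustrative equivalent of default_serializer for one concrete configuration.
basic      = Spark::Serializer.find!('marshal').new       # spark.ruby.serializer
compressed = Spark::Serializer.compressed(basic)          # spark.ruby.serializer.compress
serializer = Spark::Serializer.batched(compressed, 1024)  # spark.ruby.serializer.batch_size
```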
@@ -93,12 +122,11 @@ module Spark
    # be changed at runtime.
    #
    def config(key=nil)
-
-
-
-
-
-     Spark.config
+     if key
+       Spark.config.get(key)
+     else
+       Spark.config
+     end
    end

    # Add a file to be downloaded with this Spark job on every node.
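With the rewritten method, passing a key returns that single value while no argument returns the whole configuration object; for example (return values depend on your setup):

```ruby
$sc.config                            # => the Spark::Config object
$sc.config('spark.ruby.serializer')   # => the configured serializer name, e.g. "marshal"
```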
@@ -164,10 +192,7 @@ module Spark
    # == Parameters:
    # data:: Range or Array
    # num_slices:: number of slice
-   #
-   # - use
-   # - serializer
-   # - batch_size
+   # serializer:: custom serializer (default: serializer based on configuration)
    #
    # == Examples:
    # $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
@@ -176,33 +201,21 @@ module Spark
    # $sc.parallelize(1..3).map(:to_s).collect
    # #=> ["1", "2", "3"]
    #
-   def parallelize(data, num_slices=nil,
+   def parallelize(data, num_slices=nil, serializer=nil)
      num_slices ||= default_parallelism
+     serializer ||= default_serializer

-
-     use = :file
-     serializer = get_serializer(options[:serializer], options[:batch_size])
-
-     if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
-       data = data.deep_copy
-     else
-       # For enumerator or range
-       data = data.to_a
-     end
+     serializer.check_each(data)

-
-
-
-
-
-     file = Tempfile.new('to_parallelize', temp_dir)
-     serializer.dump(data, file)
-     file.close # not unlink
-     jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
-     file.unlink
-   end
+     # Through file
+     file = Tempfile.new('to_parallelize', temp_dir)
+     serializer.dump_to_io(data, file)
+     file.close # not unlink
+     jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)

      Spark::RDD.new(jrdd, self, serializer)
+   ensure
+     file && file.unlink
    end

    # Read a text file from HDFS, a local file system (available on all nodes), or any
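Note that `parallelize` now takes an optional serializer object as its third argument instead of an options hash; a hedged usage sketch built only from builder calls that appear elsewhere in this diff:

```ruby
# Custom serializer for a single RDD (batch size of 2 is illustrative).
ser = Spark::Serializer.build { __batched__(__marshal__, 2) }
rdd = $sc.parallelize(0..5, 2, ser)
rdd.map(lambda{|x| x * 10}).collect
# => [0, 10, 20, 30, 40, 50]
```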
@@ -217,11 +230,12 @@ module Spark
    # $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
    # # => [1, 2]
    #
-   def text_file(path, min_partitions=nil,
+   def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
      min_partitions ||= default_parallelism
-     serializer
+     serializer ||= default_serializer
+     deserializer = Spark::Serializer.build { __text__(encoding) }

-     Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer,
+     Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
    end

    # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
@@ -240,10 +254,10 @@ module Spark
    # $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
    # # => ["1", "2", "3", "4"]
    #
-   def whole_text_files(path, min_partitions=nil,
+   def whole_text_files(path, min_partitions=nil, serializer=nil)
      min_partitions ||= default_parallelism
-     serializer
-     deserializer
+     serializer ||= default_serializer
+     deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }

      Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
    end
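Both readers now build their text deserializer through the `Spark::Serializer.build` DSL, and `text_file` additionally accepts an encoding; a hedged usage sketch (the file path is hypothetical):

```ruby
# Lines are decoded with the given encoding before reaching the Ruby workers.
rdd = $sc.text_file('/tmp/numbers.txt', nil, Encoding::ISO_8859_1)
rdd.map(lambda{|line| line.to_i}).collect
```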
@@ -254,7 +268,7 @@ module Spark
    # If partitions is not specified, this will run over all partitions.
    #
    # == Example:
-   # rdd = $sc.parallelize(0..10, 5
+   # rdd = $sc.parallelize(0..10, 5)
    # $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
    # # => ["[0, 1]", "[4, 5]"]
    #
@@ -282,9 +296,13 @@ module Spark
    # Rjb represent Fixnum as Integer but Jruby as Long
    partitions = to_java_array_list(convert_to_java_int(partitions))

+   # File for result
+   file = Tempfile.new('collect', temp_dir)
+
    mapped = rdd.new_rdd_from_command(command, *args)
-
-
+   RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+   mapped.collect_from_file(file)
  end
data/lib/spark/ext/io.rb
CHANGED
@@ -12,6 +12,12 @@ module Spark
      unpack_int(read(4))
    end

+   def read_int_or_eof
+     bytes = read(4)
+     return Spark::Constant::DATA_EOF if bytes.nil?
+     unpack_int(bytes)
+   end
+
    def read_long
      unpack_long(read(8))
    end
@@ -35,8 +41,11 @@ module Spark
      write(pack_long(data))
    end

+   # Size and data can have different encoding
+   # Marshal: both ASCII
+   # Oj: ASCII and UTF-8
    def write_string(data)
-     write_int(data.
+     write_int(data.bytesize)
      write(data)
    end

@@ -55,3 +64,4 @@ module Spark
  end

  IO.__send__(:include, Spark::CoreExtension::IO)
+ StringIO.__send__(:include, Spark::CoreExtension::IO)
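The switch to `bytesize` in `write_string` matters for multibyte strings, where character count and byte count differ; a small plain-Ruby illustration:

```ruby
# The length prefix must count bytes, because bytes are what the reader consumes.
s = "héllo"
s.length    # => 5 characters
s.bytesize  # => 6 bytes in UTF-8
```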
data/lib/spark/java_bridge/base.rb
CHANGED
@@ -145,8 +145,8 @@ module Spark
      if class_name == 'JavaRDD'
        jrdd = RubyRDD.toRuby(object)

-       serializer
-
+       serializer = Spark::Serializer.build { __batched__(__marshal__) }
+       deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }

        return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
      end
data/lib/spark/rdd.rb
CHANGED
@@ -34,6 +34,18 @@ module Spark
      @command = Spark::CommandBuilder.new(serializer, deserializer)
    end

+   def inspect
+     comms = @command.commands.join(' -> ')
+
+     result = %{#<#{self.class.name}:0x#{object_id}}
+     result << %{ (#{comms})} unless comms.empty?
+     result << %{\n}
+     result << %{ Serializer: "#{serializer}"\n}
+     result << %{Deserializer: "#{deserializer}"}
+     result << %{>}
+     result
+   end
+

    # =============================================================================
    # Operators
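Judging from the string the new `inspect` builds, output looks roughly like the following (class name, command list and serializer descriptions all depend on the actual RDD and configuration):

```ruby
rdd = $sc.parallelize(0..3).map(lambda{|x| x + 1})
rdd.inspect
# Produces something along the lines of:
#   #<Spark::PipelinedRDD:0x... (<commands joined with ' -> '>)
#     Serializer: "<serializer description>"
#   Deserializer: "<deserializer description>">
```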
@@ -159,7 +171,16 @@ module Spark
    end

    def to_java
-
+     marshal = Spark::Serializer.marshal
+
+     if deserializer.batched?
+       ser = deserializer.deep_copy
+       ser.serializer = marshal
+     else
+       ser = Spark::Serializer.batched(marshal)
+     end
+
+     rdd = self.reserialize(ser)
      RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
    end
@@ -169,20 +190,32 @@ module Spark

    # Return an array that contains all of the elements in this RDD.
    # RJB raise an error if stage is killed.
-   def collect
-
+   def collect(as_enum=false)
+     file = Tempfile.new('collect', context.temp_dir)
+
+     RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
+
+     collect_from_file(file, as_enum)
    rescue => e
      raise Spark::RDDError, e.message
    end

-   def
+   def collect_from_file(file, as_enum=false)
      if self.is_a?(PipelinedRDD)
        klass = @command.serializer
      else
        klass = @command.deserializer
      end

-
+     if as_enum
+       result = klass.load_from_file(file)
+     else
+       result = klass.load_from_io(file).to_a
+       file.close
+       file.unlink
+     end
+
+     result
    end

    # Convert an Array to Hash
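`collect` now always goes through a temporary file, and the new flag switches between materializing an Array and returning a lazily deserialized enumerator; a hedged sketch:

```ruby
rdd = $sc.parallelize(1..5)
rdd.collect        # => [1, 2, 3, 4, 5]
rdd.collect(true)  # => an Enumerator read lazily from the temp file
                   #    (the caller is responsible for consuming it)
```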
@@ -198,7 +231,7 @@ module Spark
    # to satisfy the limit.
    #
    # == Example:
-   # rdd = $sc.parallelize(0..100, 20
+   # rdd = $sc.parallelize(0..100, 20)
    # rdd.take(5)
    # # => [0, 1, 2, 3, 4]
    #
@@ -293,7 +326,7 @@ module Spark
    # seq = lambda{|x,y| x+y}
    # com = lambda{|x,y| x*y}
    #
-   # rdd = $sc.parallelize(1..10, 2
+   # rdd = $sc.parallelize(1..10, 2)
    # rdd.aggregate(1, seq, com)
    # # => 656
    #
@@ -590,7 +623,7 @@ module Spark
    # of the original partition.
    #
    # == Example:
-   # rdd = $sc.parallelize(0...4, 4
+   # rdd = $sc.parallelize(0...4, 4)
    # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
    # # => [0, 1, 4, 9]
    #
@@ -623,7 +656,7 @@ module Spark
    # Return an RDD created by coalescing all elements within each partition into an array.
    #
    # == Example:
-   # rdd = $sc.parallelize(0..10, 3
+   # rdd = $sc.parallelize(0..10, 3)
    # rdd.glom.collect
    # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
    #
@@ -639,8 +672,14 @@ module Spark
    # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
    #
    def coalesce(num_partitions)
+     if self.is_a?(PipelinedRDD)
+       deser = @command.serializer
+     else
+       deser = @command.deserializer
+     end
+
      new_jrdd = jrdd.coalesce(num_partitions)
-     RDD.new(new_jrdd, context, @command.serializer,
+     RDD.new(new_jrdd, context, @command.serializer, deser)
    end

    # Return the Cartesian product of this RDD and another one, that is, the
@@ -655,7 +694,8 @@ module Spark
    # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
    #
    def cartesian(other)
-     _deserializer = Spark::Serializer::Cartesian.new
+     _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
+
      new_jrdd = jrdd.cartesian(other.jrdd)
      RDD.new(new_jrdd, context, serializer, _deserializer)
    end
@@ -697,7 +737,7 @@ module Spark
    #
    def union(other)
      if self.serializer != other.serializer
-       other = other.reserialize(serializer
+       other = other.reserialize(serializer)
      end

      new_jrdd = jrdd.union(other.jrdd)
@@ -713,10 +753,7 @@ module Spark
    # rdd.reserialize("oj").collect
    # # => ["1", "2", "3"]
    #
-   def reserialize(new_serializer
-     new_batch_size ||= deserializer.batch_size
-     new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
-
+   def reserialize(new_serializer)
      if serializer == new_serializer
        return self
      end
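After this change `reserialize` expects a ready-made serializer object rather than a name plus batch size; a hedged sketch built from builder calls shown elsewhere in this diff, for an existing `rdd`:

```ruby
# Re-encode an existing RDD with an explicitly built serializer (batch size illustrative).
new_ser = Spark::Serializer.build { __batched__(__marshal__, 100) }
rdd.reserialize(new_ser).collect
```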
@@ -906,7 +943,7 @@ module Spark
    # x+y
    # end
    #
-   # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2
+   # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
    # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
    # # => {"a"=>3, "b"=>2, "c"=>3}
    #
@@ -973,7 +1010,7 @@ module Spark
    # x*y
    # end
    #
-   # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2
+   # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
    # rdd.aggregate_by_key(1, method(:combine), method(:merge))
    # # => [["b", 3], ["a", 16], ["c", 6]]
    #
@@ -1064,6 +1101,17 @@ module Spark
      self.sort_by('lambda{|(key, _)| key}')
    end

+   # Sort the RDD by value
+   #
+   # == Example:
+   # rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
+   # rdd.sort_by_value.collect
+   # # => [["b", 1], ["c", 2], ["a", 3]]
+   #
+   def sort_by_value(ascending=true, num_partitions=nil)
+     self.sort_by('lambda{|(_, value)| value}')
+   end
+
    # Sorts this RDD by the given key_function
    #
    # This is a different implementation than spark. Sort by doesn't use
@@ -1190,6 +1238,7 @@ module Spark
    alias_method :defaultReducePartitions, :default_reduce_partitions
    alias_method :setName, :set_name
    alias_method :addLibrary, :add_library
+   alias_method :require, :add_library

    alias_method :flatMap, :flat_map
    alias_method :mapPartitions, :map_partitions
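The added alias lets a library be loaded on the workers with `require` directly on the RDD; an illustrative one-liner (the gem name is a placeholder):

```ruby
rdd.require('matrix')   # same as rdd.add_library('matrix')
```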