ruby-spark 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/lib/spark/mllib/ruby_matrix/vector_adapter.rb
ADDED
@@ -0,0 +1,57 @@
require 'matrix'

# Based on ruby 2.1

class Vector
  def self.elements(array, copy=true)
    DenseVector.new(convert_to_array(array, copy))
  end
end

module Spark
  module Mllib
    class VectorAdapter < ::Vector

      def self.new(*args)
        object = self.allocate
        object.__send__(:initialize, *args)
        object
      end

      def initialize(*args)
        case args.shift
        when :dense
          values = args.shift.dup
        when :sparse
          values = [0.0] * args.shift.to_i
        else
          raise Spark::MllibError, 'Unknow vector type.'
        end

        super(values)
      end

      def []=(index, value)
        @elements[index] = value
      end

      def dot(other)
        if other.is_a?(Spark::Mllib::MatrixBase)
          other * self
        else
          inner_product(other)
        end
      end

      def squared_distance(other)
        diff = self - other
        diff.dot(diff)
      end

      def values
        @values || to_a
      end

    end
  end
end
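For orientation, a minimal usage sketch of the adapter above. It is not part of the package diff and assumes the gem's MLlib classes (including DenseVector from vector.rb further down) are already loaded; the values are placeholders.

# Hypothetical example; the class names come from this diff, the loading step does not.
dense  = Spark::Mllib::VectorAdapter.new(:dense, [1.0, 2.0, 3.0])  # :dense copies the given array
sparse = Spark::Mllib::VectorAdapter.new(:sparse, 3)               # :sparse starts as [0.0, 0.0, 0.0]
sparse[1] = 5.5

dense.dot(dense)               # => 14.0 (falls through to ::Vector#inner_product)
dense.squared_distance(sparse) # => (1.0 - 0.0)**2 + (2.0 - 5.5)**2 + (3.0 - 0.0)**2 = 22.25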
data/lib/spark/mllib/stat/distribution.rb
ADDED
@@ -0,0 +1,12 @@
##
# MultivariateGaussian
#
# This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
# the event that the covariance matrix is singular, the density will be computed in a
# reduced dimensional subspace under which the distribution is supported.
#
# == Arguments:
# mu:: The mean vector of the distribution
# sigma:: The covariance matrix of the distribution
#
Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
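A small illustration of the Struct defined above, not taken from the package: the mean and covariance are placeholder values, and `Vectors.dense` comes from vector.rb in the next hunk.

mu    = Spark::Mllib::Vectors.dense([0.0, 0.0])
sigma = [[1.0, 0.0], [0.0, 1.0]]   # placeholder covariance; the Struct stores whatever it is given

gaussian = Spark::Mllib::MultivariateGaussian.new(mu, sigma)
gaussian.mu     # => the mean vector that was passed in
gaussian.sigma  # => the covariance that was passed in (no validation happens here)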
data/lib/spark/mllib/vector.rb
ADDED
@@ -0,0 +1,185 @@
module Spark
  module Mllib
    module Vectors

      def self.dense(*args)
        DenseVector.new(*args)
      end

      def self.sparse(*args)
        SparseVector.new(*args)
      end

      def self.parse(data)
        if data.start_with?('[') && data.end_with?(']')
          DenseVector.parse(data)
        elsif data.start_with?('(') && data.end_with?(')')
          SparseVector.parse(data)
        else
          raise ArgumentError, 'Unknow vector.'
        end
      end

      def self.to_vector(data)
        if data.is_a?(SparseVector) || data.is_a?(DenseVector)
          data
        elsif data.is_a?(Array)
          DenseVector.new(data)
        end
      end

    end
  end
end

module Spark
  module Mllib
    # @abstract Parent for all type of vectors
    class VectorBase < VectorAdapter
    end
  end
end

module Spark
  module Mllib
    ##
    # A dense vector represented by a value array.
    #
    # Dense vector is a vector in which most of the elements are non-zero.
    #
    # == Example:
    #   DenseVector.new([1,2,3,4,5]).values
    #   # => [1, 2, 3, 4, 5]
    #
    #   DenseVector.new(1..5).values
    #   # => [1, 2, 3, 4, 5]
    #
    class DenseVector < VectorBase

      def initialize(values)
        super(:dense, values.to_a)
      end

      # Covert string to vector
      #
      #   DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
      #
      def self.parse(data)
        unless data =~ /\[[0-9., ]+\]/
          raise ArgumentError, 'Unknow format for DenseVector.'
        end

        data.sub!('[', '')
        data.sub!(']', '')

        data = data.split(',')
        data.map!(&:to_f)

        DenseVector.new(data)
      end

      # Convert vector to string
      #
      #   DenseVector.new([1,2,3,4,5]).to_s
      #   # => "[1.0,2.0,3.0,4.0,5.0]"
      #
      def to_s
        "[#{values.join(',')}]"
      end

      def to_java
        JDenseVector.new(values)
      end

      def self.from_java(object)
        DenseVector.new(object.values)
      end

      def marshal_dump
        values
      end

      def marshal_load(array)
        initialize(array)
      end

    end
  end
end

module Spark
  module Mllib
    ##
    # A sparse vector represented by an index array and an value array.
    #
    # Sparse vector is a vector in which most of the elements are zero.
    #
    # == Example:
    #   SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
    #   # => [0, 1.0, 0, 5.5]
    #
    #   SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
    #   # => [0, 1.0, 0, 5.5]
    #
    #   SparseVector.new(4, [1, 3], [1.0, 5.5]).values
    #   # => [0, 1.0, 0, 5.5]
    #
    class SparseVector < VectorBase

      attr_reader :indices

      def initialize(arg1, arg2=nil, arg3=nil)
        super(:sparse, arg1)

        if arg2.is_a?(Hash)
          @indices = arg2.keys
          @values = arg2.values
        else
          @indices = arg2
          @values = arg3
        end

        @indices.zip(@values).each do |(index, value)|
          self[index] = value
        end
      end

      # Covert string to vector
      #
      #   SparseVector.parse("(5,[1,4],[3.0,5.0])")
      #
      def self.parse(data)
        data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
        if data
          size = data[1].to_i
          indices = data[2].split(',')
          indices.map!(&:to_i)
          values = data[3].split(',')
          values.map!(&:to_f)

          SparseVector.new(size, indices, values)
        else
          raise ArgumentError, 'Unknow format for SparseVector.'
        end
      end

      # Convert vector to string
      #
      #   SparseVector.new(5, {1 => 3, 4 => 5}).to_s
      #   # => "(5,[1,4],[3.0,5.0])"
      #
      def to_s
        "(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
      end

      def marshal_dump
        [size, indices, values]
      end

      def marshal_load(array)
        initialize(array[0], array[1], array[2])
      end

    end
  end
end
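A short parse/to_s round trip based on the examples embedded in the comments above. Illustrative only; it assumes the gem's MLlib classes are loaded.

dense = Spark::Mllib::Vectors.parse('[1.0,2.0,3.0]')         # => DenseVector
dense.values                                                 # => [1.0, 2.0, 3.0]
dense.to_s                                                   # => "[1.0,2.0,3.0]"

sparse = Spark::Mllib::Vectors.parse('(5,[1,4],[3.0,5.0])')  # => SparseVector
sparse.indices                                               # => [1, 4]
sparse.to_s                                                  # => "(5,[1,4],[3.0,5.0])"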
data/lib/spark/rdd.rb
ADDED
@@ -0,0 +1,1328 @@
module Spark
  ##
  # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
  # partitioned collection of elements that can be operated on in parallel. This class contains the
  # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
  #
  class RDD

    extend Forwardable

    attr_reader :jrdd, :context, :command

    include Spark::Helper::Logger
    include Spark::Helper::Parser
    include Spark::Helper::Statistic

    def_delegators :@command, :serializer, :deserializer, :libraries, :files

    # Initializing RDD, this method is root of all Pipelined RDD - its unique
    # If you call some operations on this class it will be computed in Java
    #
    # == Parameters:
    # jrdd:: org.apache.spark.api.java.JavaRDD
    # context:: {Spark::Context}
    # serializer:: {Spark::Serializer}
    #
    def initialize(jrdd, context, serializer, deserializer=nil)
      @jrdd = jrdd
      @context = context

      @cached = false
      @checkpointed = false

      @command = Spark::CommandBuilder.new(serializer, deserializer)
    end


    # =============================================================================
    # Operators

    def +(other)
      self.union(other)
    end


    # =============================================================================
    # Commad and serializer

    def add_command(klass, *args)
      @command.deep_copy.add_command(klass, *args)
    end

    # Add ruby library
    # Libraries will be included before computing
    #
    # == Example:
    #   rdd.add_library('pry').add_library('nio4r', 'distribution')
    #
    def add_library(*libraries)
      @command.add_library(*libraries)
      self
    end

    # Bind object to RDD
    #
    # == Example:
    #   text = "test"
    #
    #   rdd = $sc.parallelize(0..5)
    #   rdd = rdd.map(lambda{|x| x.to_s + " " + text})
    #   rdd = rdd.bind(text: text)
    #
    #   rdd.collect
    #   # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
    #
    def bind(objects)
      unless objects.is_a?(Hash)
        raise ArgumentError, 'Argument must be a Hash.'
      end

      @command.bind(objects)
      self
    end

    def new_rdd_from_command(klass, *args)
      comm = add_command(klass, *args)
      PipelinedRDD.new(self, comm)
    end


    # =============================================================================
    # Variables and non-computing functions

    def config
      @context.config
    end

    def default_reduce_partitions
      config['spark.default.parallelism'] || partitions_size
    end

    # Count of ParallelCollectionPartition
    def partitions_size
      jrdd.rdd.partitions.size
    end

    # A unique ID for this RDD (within its SparkContext).
    def id
      jrdd.id
    end

    # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
    def cache
      persist('memory_only_ser')
    end

    # Set this RDD's storage level to persist its values across operations after the first time
    # it is computed. This can only be used to assign a new storage level if the RDD does not
    # have a storage level set yet.
    #
    # See StorageLevel for type of new_level
    #
    def persist(new_level)
      @cached = true
      jrdd.persist(Spark::StorageLevel.java_get(new_level))
      self
    end

    # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
    #
    # == Parameters:
    # blocking:: whether to block until all blocks are deleted.
    #
    def unpersist(blocking=true)
      @cached = false
      jrdd.unpersist(blocking)
      self
    end

    def cached?
      @cached
    end

    def checkpointed?
      @checkpointed
    end

    # Return the name of this RDD.
    #
    def name
      _name = jrdd.name
      _name && _name.encode(Encoding::UTF_8)
    end

    # Assign a name to this RDD.
    #
    def set_name(name)
      jrdd.setName(name)
    end

    def to_java
      rdd = self.reserialize('Marshal')
      RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
    end


    # =============================================================================
    # Actions which return value

    # Return an array that contains all of the elements in this RDD.
    # RJB raise an error if stage is killed.
    def collect
      collect_from_iterator(jrdd.collect.iterator)
    rescue => e
      raise Spark::RDDError, e.message
    end

    def collect_from_iterator(iterator)
      if self.is_a?(PipelinedRDD)
        klass = @command.serializer
      else
        klass = @command.deserializer
      end

      klass.load_from_iterator(iterator)
    end

    # Convert an Array to Hash
    #
    def collect_as_hash
      Hash[collect]
    end

    # Take the first num elements of the RDD.
    #
    # It works by first scanning one partition, and use the results from
    # that partition to estimate the number of additional partitions needed
    # to satisfy the limit.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..100, 20, batch_size: 1)
    #   rdd.take(5)
    #   # => [0, 1, 2, 3, 4]
    #
    def take(count)
      buffer = []

      parts_count = self.partitions_size
      # No parts was scanned, yet
      last_scanned = -1

      while buffer.empty?
        last_scanned += 1
        buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
      end

      # Assumption. Depend on batch_size and how Spark divided data.
      items_per_part = buffer.size
      left = count - buffer.size

      while left > 0 && last_scanned < parts_count
        parts_to_take = (left.to_f/items_per_part).ceil
        parts_for_scanned = Array.new(parts_to_take) do
          last_scanned += 1
        end

        # We cannot take exact number of items because workers are isolated from each other.
        # => once you take e.g. 50% from last part and left is still > 0 then its very
        # difficult merge new items
        items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
        buffer += items

        left = count - buffer.size
        # Average size of all parts
        items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
      end

      buffer.slice!(0, count)
    end

    # Return the first element in this RDD.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..100)
    #   rdd.first
    #   # => 0
    #
    def first
      self.take(1)[0]
    end

    # Reduces the elements of this RDD using the specified lambda or method.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.reduce(lambda{|sum, x| sum+x})
    #   # => 55
    #
    def reduce(f)
      _reduce(Spark::Command::Reduce, f, f)
    end

    # Aggregate the elements of each partition, and then the results for all the partitions, using a
    # given associative function and a neutral "zero value".
    #
    # The function f(x, y) is allowed to modify x and return it as its result value to avoid
    # object allocation; however, it should not modify y.
    #
    # Be careful, zero_values is applied to all stages. See example.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10, 2)
    #   rdd.fold(1, lambda{|sum, x| sum+x})
    #   # => 58
    #
    def fold(zero_value, f)
      self.aggregate(zero_value, f, f)
    end

    # Aggregate the elements of each partition, and then the results for all the partitions, using
    # given combine functions and a neutral "zero value".
    #
    # This function can return a different result type. We need one operation for merging.
    #
    # Result must be an Array otherwise Serializer Array's zero value will be send
    # as multiple values and not just one.
    #
    # == Example:
    #   # 1 2 3 4 5  => 15 + 1 = 16
    #   # 6 7 8 9 10 => 40 + 1 = 41
    #   # 16 * 41 = 656
    #
    #   seq = lambda{|x,y| x+y}
    #   com = lambda{|x,y| x*y}
    #
    #   rdd = $sc.parallelize(1..10, 2, batch_size: 1)
    #   rdd.aggregate(1, seq, com)
    #   # => 656
    #
    def aggregate(zero_value, seq_op, comb_op)
      _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
    end

    # Return the max of this RDD
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.max
    #   # => 10
    #
    def max
      self.reduce('lambda{|memo, item| memo > item ? memo : item }')
    end

    # Return the min of this RDD
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.min
    #   # => 0
    #
    def min
      self.reduce('lambda{|memo, item| memo < item ? memo : item }')
    end

    # Return the sum of this RDD
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.sum
    #   # => 55
    #
    def sum
      self.reduce('lambda{|sum, item| sum + item}')
    end

    # Return the number of values in this RDD
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.count
    #   # => 11
    #
    def count
      # nil is for seq_op => it means the all result go directly to one worker for combine
      @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
                     .aggregate(0, nil, 'lambda{|sum, item| sum + item }')
    end

    # Return a {Spark::StatCounter} object that captures the mean, variance
    # and count of the RDD's elements in one operation.
    def stats
      @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
    end

    # Compute the mean of this RDD's elements.
    #
    # == Example:
    #   $sc.parallelize([1, 2, 3]).mean
    #   # => 2.0
    #
    def mean
      stats.mean
    end

    # Compute the variance of this RDD's elements.
    #
    # == Example:
    #   $sc.parallelize([1, 2, 3]).variance
    #   # => 0.666...
    #
    def variance
      stats.variance
    end

    # Compute the standard deviation of this RDD's elements.
    #
    # == Example:
    #   $sc.parallelize([1, 2, 3]).stdev
    #   # => 0.816...
    #
    def stdev
      stats.stdev
    end

    # Compute the sample standard deviation of this RDD's elements (which
    # corrects for bias in estimating the standard deviation by dividing by
    # N-1 instead of N).
    #
    # == Example:
    #   $sc.parallelize([1, 2, 3]).sample_stdev
    #   # => 1.0
    #
    def sample_stdev
      stats.sample_stdev
    end

    # Compute the sample variance of this RDD's elements (which corrects
    # for bias in estimating the variance by dividing by N-1 instead of N).
    #
    # == Example:
    #   $sc.parallelize([1, 2, 3]).sample_variance
    #   # => 1.0
    #
    def sample_variance
      stats.sample_variance
    end

    # Compute a histogram using the provided buckets. The buckets
    # are all open to the right except for the last which is closed.
    # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
    # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
    # and 50 we would have a histogram of 1,0,1.
    #
    # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
    # this can be switched from an O(log n) inseration to O(1) per
    # element(where n = # buckets).
    #
    # Buckets must be sorted and not contain any duplicates, must be
    # at least two elements.
    #
    # == Examples:
    #   rdd = $sc.parallelize(0..50)
    #
    #   rdd.histogram(2)
    #   # => [[0.0, 25.0, 50], [25, 26]]
    #
    #   rdd.histogram([0, 5, 25, 50])
    #   # => [[0, 5, 25, 50], [5, 20, 26]]
    #
    #   rdd.histogram([0, 15, 30, 45, 60])
    #   # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
    #
    def histogram(buckets)

      # -----------------------------------------------------------------------
      # Integer
      #
      if buckets.is_a?(Integer)

        # Validation
        if buckets < 1
          raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted."
        end

        # Filter invalid values
        # Nil and NaN
        func = 'lambda{|x|
          if x.nil? || (x.is_a?(Float) && x.nan?)
            false
          else
            true
          end
        }'
        filtered = self.filter(func)

        # Compute the minimum and the maximum
        func = 'lambda{|memo, item|
          [memo[0] < item[0] ? memo[0] : item[0],
           memo[1] > item[1] ? memo[1] : item[1]]
        }'
        min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)

        # Min, max must be valid numbers
        if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
          raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
        end

        # Already finished
        if min == max || buckets == 1
          return [min, max], [filtered.count]
        end

        # Custom range
        begin
          span = max - min # increment
          buckets = (0...buckets).map do |x|
            min + (x * span) / buckets.to_f
          end
          buckets << max
        rescue NoMethodError
          raise Spark::RDDError, 'Can not generate buckets with non-number in RDD'
        end

        even = true

      # -----------------------------------------------------------------------
      # Array
      #
      elsif buckets.is_a?(Array)

        if buckets.size < 2
          raise ArgumentError, 'Buckets should have more than one value.'
        end

        if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
          raise ArgumentError, 'Can not have nil or nan numbers in buckets.'
        end

        if buckets.detect{|x| buckets.count(x) > 1}
          raise ArgumentError, 'Buckets should not contain duplicated values.'
        end

        if buckets.sort != buckets
          raise ArgumentError, 'Buckets must be sorted.'
        end

        even = false

      # -----------------------------------------------------------------------
      # Other
      #
      else
        raise Spark::RDDError, 'Buckets should be number or array.'
      end

      reduce_func = 'lambda{|memo, item|
        memo.size.times do |i|
          memo[i] += item[i]
        end
        memo
      }'

      return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
    end

    # Applies a function f to all elements of this RDD.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..5)
    #   rdd.foreach(lambda{|x| puts x})
    #   # => nil
    #
    def foreach(f, options={})
      new_rdd_from_command(Spark::Command::Foreach, f).collect
      nil
    end

    # Applies a function f to each partition of this RDD.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..5)
    #   rdd.foreachPartition(lambda{|x| puts x.to_s})
    #   # => nil
    #
    def foreach_partition(f, options={})
      new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
      nil
    end


    # =============================================================================
    # Transformations of RDD

    # Return a new RDD by applying a function to all elements of this RDD.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..5)
    #   rdd.map(lambda {|x| x*2}).collect
    #   # => [0, 2, 4, 6, 8, 10]
    #
    def map(f)
      new_rdd_from_command(Spark::Command::Map, f)
    end

    # Return a new RDD by first applying a function to all elements of this
    # RDD, and then flattening the results.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..5)
    #   rdd.flat_map(lambda {|x| [x, 1]}).collect
    #   # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
    #
    def flat_map(f)
      new_rdd_from_command(Spark::Command::FlatMap, f)
    end

    # Return a new RDD by applying a function to each partition of this RDD.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10, 2)
    #   rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
    #   # => [15, 40]
    #
    def map_partitions(f)
      new_rdd_from_command(Spark::Command::MapPartitions, f)
    end

    # Return a new RDD by applying a function to each partition of this RDD, while tracking the index
    # of the original partition.
    #
    # == Example:
    #   rdd = $sc.parallelize(0...4, 4, batch_size: 1)
    #   rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
    #   # => [0, 1, 4, 9]
    #
    def map_partitions_with_index(f, options={})
      new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
    end

    # Return a new RDD containing only the elements that satisfy a predicate.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.filter(lambda{|x| x.even?}).collect
    #   # => [0, 2, 4, 6, 8, 10]
    #
    def filter(f)
      new_rdd_from_command(Spark::Command::Filter, f)
    end

    # Return a new RDD containing non-nil elements.
    #
    # == Example:
    #   rdd = $sc.parallelize([1, nil, 2, nil, 3])
    #   rdd.compact.collect
    #   # => [1, 2, 3]
    #
    def compact
      new_rdd_from_command(Spark::Command::Compact)
    end

    # Return an RDD created by coalescing all elements within each partition into an array.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10, 3, batch_size: 1)
    #   rdd.glom.collect
    #   # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
    #
    def glom
      new_rdd_from_command(Spark::Command::Glom)
    end

    # Return a new RDD that is reduced into num_partitions partitions.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10, 3)
    #   rdd.coalesce(2).glom.collect
    #   # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
    #
    def coalesce(num_partitions)
      new_jrdd = jrdd.coalesce(num_partitions)
      RDD.new(new_jrdd, context, @command.serializer, @command.deserializer)
    end

    # Return the Cartesian product of this RDD and another one, that is, the
    # RDD of all pairs of elements `(a, b)` where `a` is in `self` and
    # `b` is in `other`.
    #
    # == Example:
    #   rdd1 = $sc.parallelize([1,2,3])
    #   rdd2 = $sc.parallelize([4,5,6])
    #
    #   rdd1.cartesian(rdd2).collect
    #   # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
    #
    def cartesian(other)
      _deserializer = Spark::Serializer::Cartesian.new.set(self.deserializer, other.deserializer)
      new_jrdd = jrdd.cartesian(other.jrdd)
      RDD.new(new_jrdd, context, serializer, _deserializer)
    end

    # Return a new RDD containing the distinct elements in this RDD.
    # Ordering is not preserved because of reducing
    #
    # == Example:
    #   rdd = $sc.parallelize([1,1,1,2,3])
    #   rdd.distinct.collect
    #   # => [1, 2, 3]
    #
    def distinct
      self.map('lambda{|x| [x, nil]}')
          .reduce_by_key('lambda{|x,_| x}')
          .map('lambda{|x| x[0]}')
    end

    # Return a shuffled RDD.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10)
    #   rdd.shuffle.collect
    #   # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
    #
    def shuffle(seed=nil)
      seed ||= Random.new_seed

      new_rdd_from_command(Spark::Command::Shuffle, seed)
    end

    # Return the union of this RDD and another one. Any identical elements will appear multiple
    # times (use .distinct to eliminate them).
    #
    # == Example:
    #   rdd = $sc.parallelize([1, 2, 3])
    #   rdd.union(rdd).collect
    #   # => [1, 2, 3, 1, 2, 3]
    #
    def union(other)
      if self.serializer != other.serializer
        other = other.reserialize(serializer.name, serializer.batch_size)
      end

      new_jrdd = jrdd.union(other.jrdd)
      RDD.new(new_jrdd, context, serializer, deserializer)
    end

    # Return a new RDD with different serializer. This method is useful during union
    # and join operations.
    #
    # == Example:
    #   rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
    #   rdd = rdd.map(lambda{|x| x.to_s})
    #   rdd.reserialize("oj").collect
    #   # => ["1", "2", "3"]
    #
    def reserialize(new_serializer, new_batch_size=nil)
      new_batch_size ||= deserializer.batch_size
      new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)

      if serializer == new_serializer
        return self
      end

      new_command = @command.deep_copy
      new_command.serializer = new_serializer

      PipelinedRDD.new(self, new_command)
    end

    # Return the intersection of this RDD and another one. The output will not contain
    # any duplicate elements, even if the input RDDs did.
    #
    # == Example:
    #   rdd1 = $sc.parallelize([1,2,3,4,5])
    #   rdd2 = $sc.parallelize([1,4,5,6,7])
    #   rdd1.intersection(rdd2).collect
    #   # => [1, 4, 5]
    #
    def intersection(other)
      mapping_function = 'lambda{|item| [item, nil]}'
      filter_function = 'lambda{|(key, values)| values.size > 1}'

      self.map(mapping_function)
          .cogroup(other.map(mapping_function))
          .filter(filter_function)
          .keys
    end

    # Return a copy of the RDD partitioned using the specified partitioner.
    #
    # == Example:
    #   rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
    #   rdd.partitionBy(2).glom.collect
    #   # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
    #
    def partition_by(num_partitions, partition_func=nil)
      num_partitions ||= default_reduce_partitions
      partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'

      _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
    end

    # Return a sampled subset of this RDD. Operations are base on Poisson and Uniform
    # distributions.
    # TODO: Replace Unfirom for Bernoulli
    #
    # == Examples:
    #   rdd = $sc.parallelize(0..100)
    #
    #   rdd.sample(true, 10).collect
    #   # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
    #
    #   rdd.sample(false, 0.1).collect
    #   # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
    #
    def sample(with_replacement, fraction, seed=nil)
      new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
    end

    # Return a fixed-size sampled subset of this RDD in an array
    #
    # == Examples:
    #   rdd = $sc.parallelize(0..100)
    #
    #   rdd.take_sample(true, 10)
    #   # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
    #
    #   rdd.take_sample(false, 10)
    #   # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
    #
    def take_sample(with_replacement, num, seed=nil)

      if num < 0
        raise Spark::RDDError, 'Size have to be greater than 0'
      elsif num == 0
        return []
      end

      # Taken from scala
      num_st_dev = 10.0

      # Number of items
      initial_count = self.count
      return [] if initial_count == 0

      # Create new generator
      seed ||= Random.new_seed
      rng = Random.new(seed)

      # Shuffle elements if requested num if greater than array size
      if !with_replacement && num >= initial_count
        return self.shuffle(seed).collect
      end

      # Max num
      max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
      if num > max_sample_size
        raise Spark::RDDError, "Size can not be greate than #{max_sample_size}"
      end

      # Approximate fraction with tolerance
      fraction = compute_fraction(num, initial_count, with_replacement)

      # Compute first samled subset
      samples = self.sample(with_replacement, fraction, seed).collect

      # If the first sample didn't turn out large enough, keep trying to take samples;
      # this shouldn't happen often because we use a big multiplier for their initial size.
      index = 0
      while samples.size < num
        log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
        samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
        index += 1
      end

      samples.shuffle!(random: rng)
      samples[0, num]
    end

    # Return an RDD created by piping elements to a forked external process.
    #
    # == Cmds:
    #   cmd = [env,] command... [,options]
    #
    #   env: hash
    #     name => val : set the environment variable
    #     name => nil : unset the environment variable
    #   command...:
    #     commandline                 : command line string which is passed to the standard shell
    #     cmdname, arg1, ...          : command name and one or more arguments (This form does
    #                                   not use the shell. See below for caveats.)
    #     [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
    #   options: hash
    #
    #   See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
    #
    # == Examples:
    #   $sc.parallelize(0..5).pipe('cat').collect
    #   # => ["0", "1", "2", "3", "4", "5"]
    #
    #   rdd = $sc.parallelize(0..5)
    #   rdd = rdd.pipe('cat', "awk '{print $1*10}'")
    #   rdd = rdd.map(lambda{|x| x.to_i + 1})
    #   rdd.collect
    #   # => [1, 11, 21, 31, 41, 51]
    #
    def pipe(*cmds)
      new_rdd_from_command(Spark::Command::Pipe, cmds)
    end


    # =============================================================================
    # Pair functions

    # Merge the values for each key using an associative reduce function. This will also perform
    # the merging locally on each mapper before sending results to a reducer, similarly to a
    # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
    # parallelism level.
    #
    # == Example:
    #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
    #   rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
    #   # => {"a"=>3, "b"=>2, "c"=>3}
    #
    def reduce_by_key(f, num_partitions=nil)
      combine_by_key('lambda {|x| x}', f, f, num_partitions)
    end

    # Generic function to combine the elements for each key using a custom set of aggregation
    # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
    # "combined type" C * Note that V and C can be different -- for example, one might group an
    # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
    # functions:
    #
    # == Parameters:
    # create_combiner:: which turns a V into a C (e.g., creates a one-element list)
    # merge_value:: to merge a V into a C (e.g., adds it to the end of a list)
    # merge_combiners:: to combine two C's into a single one.
    #
    # == Example:
    #   def combiner(x)
    #     x
    #   end
    #
    #   def merge(x,y)
    #     x+y
    #   end
    #
    #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2, batch_size: 1).map(lambda{|x| [x, 1]})
    #   rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
    #   # => {"a"=>3, "b"=>2, "c"=>3}
    #
    def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
      _combine_by_key(
        [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
        [Spark::Command::CombineByKey::Merge, merge_combiners],
        num_partitions
      )
    end

    # Return an RDD of grouped items.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..5)
    #   rdd.group_by(lambda{|x| x%2}).collect
    #   # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
    #
    def group_by(f, num_partitions=nil)
      self.key_by(f).group_by_key(num_partitions)
    end

    # Group the values for each key in the RDD into a single sequence. Allows controlling the
    # partitioning of the resulting key-value pair RDD by passing a Partitioner.
    #
    # Note: If you are grouping in order to perform an aggregation (such as a sum or average)
    # over each key, using reduce_by_key or combine_by_key will provide much better performance.
    #
    # == Example:
    #   rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
    #   rdd.group_by_key.collect
    #   # => [["a", [1, 2]], ["b", [3]]]
    #
    def group_by_key(num_partitions=nil)
      create_combiner = 'lambda{|item| [item]}'
      merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
      merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'

      combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
    end

    # Merge the values for each key using an associative function f
    # and a neutral `zero_value` which may be added to the result an
    # arbitrary number of times, and must not change the result
    # (e.g., 0 for addition, or 1 for multiplication.).
    #
    # == Example:
    #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
    #   rdd.fold_by_key(1, lambda{|x,y| x+y})
    #   # => [["a", 9], ["c", 6], ["b", 3]]
    #
    def fold_by_key(zero_value, f, num_partitions=nil)
      self.aggregate_by_key(zero_value, f, f, num_partitions)
    end

    # Aggregate the values of each key, using given combine functions and a neutral zero value.
    #
    # == Example:
    #   def combine(x,y)
    #     x+y
    #   end
    #
    #   def merge(x,y)
    #     x*y
    #   end
    #
    #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2, batch_size: 1)
    #   rdd.aggregate_by_key(1, method(:combine), method(:merge))
    #   # => [["b", 3], ["a", 16], ["c", 6]]
    #
    def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
      _combine_by_key(
        [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
        [Spark::Command::CombineByKey::Merge, comb_func],
        num_partitions
      )
    end

    # The same functionality as cogroup but this can grouped only 2 rdd's and you
    # can change num_partitions.
    #
    # == Example:
    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
    #   rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
    #   rdd1.group_with(rdd2).collect
    #   # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
    #
    def group_with(other, num_partitions=nil)
      self.union(other).group_by_key(num_partitions)
    end

    # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
    # list of values for that key in `this` as well as `other`.
    #
    # == Example:
    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
    #   rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
    #   rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
    #   rdd1.cogroup(rdd2, rdd3).collect
    #   # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
    #
    def cogroup(*others)
      unioned = self
      others.each do |other|
        unioned = unioned.union(other)
      end

      unioned.group_by_key
    end

    # Return each (key, value) pair in self RDD that has no pair with matching
    # key in other RDD.
    #
    # == Example:
    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
    #   rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
    #   rdd1.subtract_by_key(rdd2).collect
    #   # => [["a", 1], ["a", 2]]
    #
    def subtract_by_key(other, num_partitions=nil)
      create_combiner = 'lambda{|item| [[item]]}'
      merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
      merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'

      self.union(other)
          .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
          .filter('lambda{|(key,values)| values.size == 1}')
          .flat_map_values('lambda{|item| item.first}')
    end

    # Return an RDD with the elements from self that are not in other.
    #
    # == Example:
    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
    #   rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
    #   rdd1.subtract(rdd2).collect
    #   # => [["a", 1], ["b", 3], ["c", 4]]
    #
    def subtract(other, num_partitions=nil)
      mapping_function = 'lambda{|x| [x,nil]}'

      self.map(mapping_function)
          .subtract_by_key(other.map(mapping_function), num_partitions)
          .keys
    end

    # Sort the RDD by key
    #
    # == Example:
    #   rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
    #   rdd.sort_by_key.collect
    #   # => [["a", 3], ["b", 2], ["c", 1]]
    #
    def sort_by_key(ascending=true, num_partitions=nil)
      self.sort_by('lambda{|(key, _)| key}')
    end

    # Sorts this RDD by the given key_function
    #
    # This is a different implementation than spark. Sort by doesn't use
    # key_by method first. It can be slower but take less memory and
    # you can always use map.sort_by_key
    #
    # == Example:
    #   rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
    #
    #   rdd.sort_by.collect
    #   # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
    #
    #   rdd.sort_by(lambda{|x| x.size}).collect
    #   # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
    #
    def sort_by(key_function=nil, ascending=true, num_partitions=nil)
      key_function ||= 'lambda{|x| x}'
      num_partitions ||= default_reduce_partitions

      command_klass = Spark::Command::SortByKey

      # Allow spill data to disk due to memory limit
      # spilling = config['spark.shuffle.spill'] || false
      spilling = false
      memory = ''

      # Set spilling to false if worker has unlimited memory
      if memory.empty?
        spilling = false
        memory = nil
      else
        memory = to_memory_size(memory)
      end

      # Sorting should do one worker
      if num_partitions == 1
        rdd = self
        rdd = rdd.coalesce(1) if partitions_size > 1
        return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
      end

      # Compute boundary of collection
      # Collection should be evenly distributed
      # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
      count = self.count
      sample_size = num_partitions * 20.0
      fraction = [sample_size / [count, 1].max, 1.0].min
      samples = self.sample(false, fraction, 1).map(key_function).collect
      samples.sort!
      # Reverse is much faster than reverse sort_by
      samples.reverse! if !ascending

      # Determine part bounds
      bounds = determine_bounds(samples, num_partitions)

      shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
      shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
    end

    # Creates array of the elements in this RDD by applying function f.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..5)
    #   rdd.key_by(lambda{|x| x%2}).collect
    #   # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
    #
    def key_by(f)
      new_rdd_from_command(Spark::Command::KeyBy, f)
    end

    # Pass each value in the key-value pair RDD through a map function without changing
    # the keys. This also retains the original RDD's partitioning.
    #
    # == Example:
    #   rdd = $sc.parallelize(["ruby", "scala", "java"])
    #   rdd = rdd.map(lambda{|x| [x, x]})
    #   rdd = rdd.map_values(lambda{|x| x.upcase})
    #   rdd.collect
    #   # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
    #
    def map_values(f)
      new_rdd_from_command(Spark::Command::MapValues, f)
    end

    # Pass each value in the key-value pair RDD through a flat_map function
    # without changing the keys; this also retains the original RDD's
    # partitioning.
    #
    # == Example:
    #   rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
    #   rdd = rdd.flat_map_values(lambda{|x| x*2})
    #   rdd.collect
    #   # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
    #
    def flat_map_values(f)
      new_rdd_from_command(Spark::Command::FlatMapValues, f)
    end

    # Return an RDD with the first element of PairRDD
    #
    # == Example:
    #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
    #   rdd.keys.collect
    #   # => [1, 3, 5]
    #
    def keys
      self.map('lambda{|(key, _)| key}')
    end

    # Return an RDD with the second element of PairRDD
    #
    # == Example:
    #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
    #   rdd.keys.collect
    #   # => [2, 4, 6]
    #
    def values
      self.map('lambda{|(_, value)| value}')
    end


    # Aliases
    alias_method :partitionsSize, :partitions_size
    alias_method :defaultReducePartitions, :default_reduce_partitions
    alias_method :setName, :set_name
    alias_method :addLibrary, :add_library

    alias_method :flatMap, :flat_map
    alias_method :mapPartitions, :map_partitions
    alias_method :mapPartitionsWithIndex, :map_partitions_with_index
    alias_method :reduceByKey, :reduce_by_key
    alias_method :combineByKey, :combine_by_key
    alias_method :groupByKey, :group_by_key
    alias_method :groupWith, :group_with
    alias_method :partitionBy, :partition_by
    alias_method :defaultReducePartitions, :default_reduce_partitions
    alias_method :foreachPartition, :foreach_partition
    alias_method :mapValues, :map_values
    alias_method :takeSample, :take_sample
    alias_method :sortBy, :sort_by
    alias_method :sortByKey, :sort_by_key
    alias_method :keyBy, :key_by
    alias_method :groupBy, :group_by
    alias_method :foldByKey, :fold_by_key
    alias_method :aggregateByKey, :aggregate_by_key
    alias_method :subtractByKey, :subtract_by_key
    alias_method :sampleStdev, :sample_stdev
    alias_method :sampleVariance, :sample_variance

    private

    # This is base method for reduce operation. Is used by reduce, fold and aggregation.
    # Only difference is that fold has zero value.
    #
    def _reduce(klass, seq_op, comb_op, zero_value=nil)
      if seq_op.nil?
        # Partitions are already reduced
        rdd = self
      else
        rdd = new_rdd_from_command(klass, seq_op, zero_value)
      end

      # Send all results to one worker and combine results
      rdd = rdd.coalesce(1).compact

      # Add the same function to new RDD
      comm = rdd.add_command(klass, comb_op, zero_value)
      comm.deserializer = @command.serializer

      # Value is returned in array
      PipelinedRDD.new(rdd, comm).collect[0]
    end

    def _partition_by(num_partitions, klass, *args)
      # RDD is transform from [key, value] to [hash, [key, value]]
      keyed = new_rdd_from_command(klass, *args)
      keyed.serializer.unbatch!

      # PairwiseRDD and PythonPartitioner are borrowed from Python
      # but works great on ruby too
      pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
      partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
      new_jrdd = pairwise_rdd.partitionBy(partitioner).values

      # Reset deserializer
      RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
    end

    # For using a different combine_by_key
    #
    # == Used for:
    # * combine_by_key
    # * fold_by_key (with zero value)
    #
    def _combine_by_key(combine, merge, num_partitions)
      num_partitions ||= default_reduce_partitions

      # Combine key
      combined = new_rdd_from_command(combine.shift, *combine)

      # Merge items
      shuffled = combined.partition_by(num_partitions)
      merge_comm = shuffled.add_command(merge.shift, *merge)

      PipelinedRDD.new(shuffled, merge_comm)
    end

  end

  # Pipelined Resilient Distributed Dataset, operations are pipelined and sended to worker
  #
  # RDD
  # `-- map
  #     `-- map
  #         `-- map
  #
  # Code is executed from top to bottom
  #
  class PipelinedRDD < RDD

    attr_reader :prev_jrdd, :command

    def initialize(prev, command)

      if prev.is_a?(PipelinedRDD) && prev.pipelinable?
        # Second, ... stages
        @prev_jrdd = prev.prev_jrdd
      else
        # First stage
        @prev_jrdd = prev.jrdd
      end

      @cached = false
      @checkpointed = false

      @context = prev.context
      @command = command
    end

    def pipelinable?
      !(cached? || checkpointed?)
    end

    # Serialization necessary things and sent it to RubyRDD (scala extension)
    def jrdd
      @jrdd ||= _jrdd
    end

    private

    def _jrdd
      command = @command.build

      broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
      broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))

      ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
      ruby_rdd.asJavaRDD
    end

  end
end
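A condensed usage sketch stitched together from the == Example: blocks above. It assumes a running context bound to $sc, exactly as those comments do; starting the context itself is handled elsewhere in the gem and is not shown in this hunk.

rdd = $sc.parallelize(0..10)

rdd.map(lambda{|x| x * 2}).collect    # => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
rdd.filter(lambda{|x| x.even?}).sum   # => 30
rdd.take(3)                           # => [0, 1, 2]

# Pair-RDD path: reduce_by_key builds on combine_by_key and partition_by internally.
pairs = $sc.parallelize(["a", "b", "c", "a", "b", "c", "a", "c"]).map(lambda{|x| [x, 1]})
pairs.reduce_by_key(lambda{|x, y| x + y}).collect_as_hash
# => {"a"=>3, "b"=>2, "c"=>3}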