ruby-spark 1.0.0
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/lib/spark/mllib/ruby_matrix/vector_adapter.rb
ADDED
@@ -0,0 +1,57 @@
+require 'matrix'
+
+# Based on ruby 2.1
+
+class Vector
+  def self.elements(array, copy=true)
+    DenseVector.new(convert_to_array(array, copy))
+  end
+end
+
+module Spark
+  module Mllib
+    class VectorAdapter < ::Vector
+
+      def self.new(*args)
+        object = self.allocate
+        object.__send__(:initialize, *args)
+        object
+      end
+
+      def initialize(*args)
+        case args.shift
+        when :dense
+          values = args.shift.dup
+        when :sparse
+          values = [0.0] * args.shift.to_i
+        else
+          raise Spark::MllibError, 'Unknow vector type.'
+        end
+
+        super(values)
+      end
+
+      def []=(index, value)
+        @elements[index] = value
+      end
+
+      def dot(other)
+        if other.is_a?(Spark::Mllib::MatrixBase)
+          other * self
+        else
+          inner_product(other)
+        end
+      end
+
+      def squared_distance(other)
+        diff = self - other
+        diff.dot(diff)
+      end
+
+      def values
+        @values || to_a
+      end
+
+    end
+  end
+end
data/lib/spark/mllib/stat/distribution.rb
ADDED
@@ -0,0 +1,12 @@
+##
+# MultivariateGaussian
+#
+# This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
+# the event that the covariance matrix is singular, the density will be computed in a
+# reduced dimensional subspace under which the distribution is supported.
+#
+# == Arguments:
+# mu:: The mean vector of the distribution
+# sigma:: The covariance matrix of the distribution
+#
+Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
data/lib/spark/mllib/vector.rb
ADDED
@@ -0,0 +1,185 @@
+module Spark
+  module Mllib
+    module Vectors
+
+      def self.dense(*args)
+        DenseVector.new(*args)
+      end
+
+      def self.sparse(*args)
+        SparseVector.new(*args)
+      end
+
+      def self.parse(data)
+        if data.start_with?('[') && data.end_with?(']')
+          DenseVector.parse(data)
+        elsif data.start_with?('(') && data.end_with?(')')
+          SparseVector.parse(data)
+        else
+          raise ArgumentError, 'Unknow vector.'
+        end
+      end
+
+      def self.to_vector(data)
+        if data.is_a?(SparseVector) || data.is_a?(DenseVector)
+          data
+        elsif data.is_a?(Array)
+          DenseVector.new(data)
+        end
+      end
+
+    end
+  end
+end
+
+module Spark
+  module Mllib
+    # @abstract Parent for all type of vectors
+    class VectorBase < VectorAdapter
+    end
+  end
+end
+
+module Spark
+  module Mllib
+    ##
+    # A dense vector represented by a value array.
+    #
+    # Dense vector is a vector in which most of the elements are non-zero.
+    #
+    # == Example:
+    #   DenseVector.new([1,2,3,4,5]).values
+    #   # => [1, 2, 3, 4, 5]
+    #
+    #   DenseVector.new(1..5).values
+    #   # => [1, 2, 3, 4, 5]
+    #
+    class DenseVector < VectorBase
+
+      def initialize(values)
+        super(:dense, values.to_a)
+      end
+
+      # Covert string to vector
+      #
+      #   DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
+      #
+      def self.parse(data)
+        unless data =~ /\[[0-9., ]+\]/
+          raise ArgumentError, 'Unknow format for DenseVector.'
+        end
+
+        data.sub!('[', '')
+        data.sub!(']', '')
+
+        data = data.split(',')
+        data.map!(&:to_f)
+
+        DenseVector.new(data)
+      end
+
+      # Convert vector to string
+      #
+      #   DenseVector.new([1,2,3,4,5]).to_s
+      #   # => "[1.0,2.0,3.0,4.0,5.0]"
+      #
+      def to_s
+        "[#{values.join(',')}]"
+      end
+
+      def to_java
+        JDenseVector.new(values)
+      end
+
+      def self.from_java(object)
+        DenseVector.new(object.values)
+      end
+
+      def marshal_dump
+        values
+      end
+
+      def marshal_load(array)
+        initialize(array)
+      end
+
+    end
+  end
+end
+
+module Spark
+  module Mllib
+    ##
+    # A sparse vector represented by an index array and an value array.
+    #
+    # Sparse vector is a vector in which most of the elements are zero.
+    #
+    # == Example:
+    #   SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
+    #   # => [0, 1.0, 0, 5.5]
+    #
+    #   SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
+    #   # => [0, 1.0, 0, 5.5]
+    #
+    #   SparseVector.new(4, [1, 3], [1.0, 5.5]).values
+    #   # => [0, 1.0, 0, 5.5]
+    #
+    class SparseVector < VectorBase
+
+      attr_reader :indices
+
+      def initialize(arg1, arg2=nil, arg3=nil)
+        super(:sparse, arg1)
+
+        if arg2.is_a?(Hash)
+          @indices = arg2.keys
+          @values = arg2.values
+        else
+          @indices = arg2
+          @values = arg3
+        end
+
+        @indices.zip(@values).each do |(index, value)|
+          self[index] = value
+        end
+      end
+
+      # Covert string to vector
+      #
+      #   SparseVector.parse("(5,[1,4],[3.0,5.0])")
+      #
+      def self.parse(data)
+        data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
+        if data
+          size = data[1].to_i
+          indices = data[2].split(',')
+          indices.map!(&:to_i)
+          values = data[3].split(',')
+          values.map!(&:to_f)
+
+          SparseVector.new(size, indices, values)
+        else
+          raise ArgumentError, 'Unknow format for SparseVector.'
+        end
+      end
+
+      # Convert vector to string
+      #
+      #   SparseVector.new(5, {1 => 3, 4 => 5}).to_s
+      #   # => "(5,[1,4],[3.0,5.0])"
+      #
+      def to_s
+        "(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
+      end
+
+      def marshal_dump
+        [size, indices, values]
+      end
+
+      def marshal_load(array)
+        initialize(array[0], array[1], array[2])
+      end
+
+    end
+  end
+end
data/lib/spark/rdd.rb
ADDED
@@ -0,0 +1,1328 @@
+module Spark
+  ##
+  # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
+  # partitioned collection of elements that can be operated on in parallel. This class contains the
+  # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
+  #
+  class RDD
+
+    extend Forwardable
+
+    attr_reader :jrdd, :context, :command
+
+    include Spark::Helper::Logger
+    include Spark::Helper::Parser
+    include Spark::Helper::Statistic
+
+    def_delegators :@command, :serializer, :deserializer, :libraries, :files
+
+    # Initializing RDD, this method is root of all Pipelined RDD - its unique
+    # If you call some operations on this class it will be computed in Java
+    #
+    # == Parameters:
+    # jrdd:: org.apache.spark.api.java.JavaRDD
+    # context:: {Spark::Context}
+    # serializer:: {Spark::Serializer}
+    #
+    def initialize(jrdd, context, serializer, deserializer=nil)
+      @jrdd = jrdd
+      @context = context
+
+      @cached = false
+      @checkpointed = false
+
+      @command = Spark::CommandBuilder.new(serializer, deserializer)
+    end
+
+
+    # =============================================================================
+    # Operators
+
+    def +(other)
+      self.union(other)
+    end
+
+
+    # =============================================================================
+    # Commad and serializer
+
+    def add_command(klass, *args)
+      @command.deep_copy.add_command(klass, *args)
+    end
+
+    # Add ruby library
+    # Libraries will be included before computing
+    #
+    # == Example:
+    #   rdd.add_library('pry').add_library('nio4r', 'distribution')
+    #
+    def add_library(*libraries)
+      @command.add_library(*libraries)
+      self
+    end
+
+    # Bind object to RDD
+    #
+    # == Example:
+    #   text = "test"
+    #
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd = rdd.map(lambda{|x| x.to_s + " " + text})
+    #   rdd = rdd.bind(text: text)
+    #
+    #   rdd.collect
+    #   # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
+    #
+    def bind(objects)
+      unless objects.is_a?(Hash)
+        raise ArgumentError, 'Argument must be a Hash.'
+      end
+
+      @command.bind(objects)
+      self
+    end
+
+    def new_rdd_from_command(klass, *args)
+      comm = add_command(klass, *args)
+      PipelinedRDD.new(self, comm)
+    end
+
+
+    # =============================================================================
+    # Variables and non-computing functions
+
+    def config
+      @context.config
+    end
+
+    def default_reduce_partitions
+      config['spark.default.parallelism'] || partitions_size
+    end
+
+    # Count of ParallelCollectionPartition
+    def partitions_size
+      jrdd.rdd.partitions.size
+    end
+
+    # A unique ID for this RDD (within its SparkContext).
+    def id
+      jrdd.id
+    end
+
+    # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
+    def cache
+      persist('memory_only_ser')
+    end
+
+    # Set this RDD's storage level to persist its values across operations after the first time
+    # it is computed. This can only be used to assign a new storage level if the RDD does not
+    # have a storage level set yet.
+    #
+    # See StorageLevel for type of new_level
+    #
+    def persist(new_level)
+      @cached = true
+      jrdd.persist(Spark::StorageLevel.java_get(new_level))
+      self
+    end
+
+    # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
+    #
+    # == Parameters:
+    # blocking:: whether to block until all blocks are deleted.
+    #
+    def unpersist(blocking=true)
+      @cached = false
+      jrdd.unpersist(blocking)
+      self
+    end
+
+    def cached?
+      @cached
+    end
+
+    def checkpointed?
+      @checkpointed
+    end
+
+    # Return the name of this RDD.
+    #
+    def name
+      _name = jrdd.name
+      _name && _name.encode(Encoding::UTF_8)
+    end
+
+    # Assign a name to this RDD.
+    #
+    def set_name(name)
+      jrdd.setName(name)
+    end
+
+    def to_java
+      rdd = self.reserialize('Marshal')
+      RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
+    end
+
+
+    # =============================================================================
+    # Actions which return value
+
+    # Return an array that contains all of the elements in this RDD.
+    # RJB raise an error if stage is killed.
+    def collect
+      collect_from_iterator(jrdd.collect.iterator)
+    rescue => e
+      raise Spark::RDDError, e.message
+    end
+
+    def collect_from_iterator(iterator)
+      if self.is_a?(PipelinedRDD)
+        klass = @command.serializer
+      else
+        klass = @command.deserializer
+      end
+
+      klass.load_from_iterator(iterator)
+    end
+
+    # Convert an Array to Hash
+    #
+    def collect_as_hash
+      Hash[collect]
+    end
+
+    # Take the first num elements of the RDD.
+    #
+    # It works by first scanning one partition, and use the results from
+    # that partition to estimate the number of additional partitions needed
+    # to satisfy the limit.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..100, 20, batch_size: 1)
+    #   rdd.take(5)
+    #   # => [0, 1, 2, 3, 4]
+    #
+    def take(count)
+      buffer = []
+
+      parts_count = self.partitions_size
+      # No parts was scanned, yet
+      last_scanned = -1
+
+      while buffer.empty?
+        last_scanned += 1
+        buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
+      end
+
+      # Assumption. Depend on batch_size and how Spark divided data.
+      items_per_part = buffer.size
+      left = count - buffer.size
+
+      while left > 0 && last_scanned < parts_count
+        parts_to_take = (left.to_f/items_per_part).ceil
+        parts_for_scanned = Array.new(parts_to_take) do
+          last_scanned += 1
+        end
+
+        # We cannot take exact number of items because workers are isolated from each other.
+        # => once you take e.g. 50% from last part and left is still > 0 then its very
+        # difficult merge new items
+        items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
+        buffer += items
+
+        left = count - buffer.size
+        # Average size of all parts
+        items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
+      end
+
+      buffer.slice!(0, count)
+    end
+
+    # Return the first element in this RDD.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..100)
+    #   rdd.first
+    #   # => 0
+    #
+    def first
+      self.take(1)[0]
+    end
+
+    # Reduces the elements of this RDD using the specified lambda or method.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.reduce(lambda{|sum, x| sum+x})
+    #   # => 55
+    #
+    def reduce(f)
+      _reduce(Spark::Command::Reduce, f, f)
+    end
+
+    # Aggregate the elements of each partition, and then the results for all the partitions, using a
+    # given associative function and a neutral "zero value".
+    #
+    # The function f(x, y) is allowed to modify x and return it as its result value to avoid
+    # object allocation; however, it should not modify y.
+    #
+    # Be careful, zero_values is applied to all stages. See example.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10, 2)
+    #   rdd.fold(1, lambda{|sum, x| sum+x})
+    #   # => 58
+    #
+    def fold(zero_value, f)
+      self.aggregate(zero_value, f, f)
+    end
+
+    # Aggregate the elements of each partition, and then the results for all the partitions, using
+    # given combine functions and a neutral "zero value".
+    #
+    # This function can return a different result type. We need one operation for merging.
+    #
+    # Result must be an Array otherwise Serializer Array's zero value will be send
+    # as multiple values and not just one.
+    #
+    # == Example:
+    #   # 1 2 3 4 5  => 15 + 1 = 16
+    #   # 6 7 8 9 10 => 40 + 1 = 41
+    #   # 16 * 41 = 656
+    #
+    #   seq = lambda{|x,y| x+y}
+    #   com = lambda{|x,y| x*y}
+    #
+    #   rdd = $sc.parallelize(1..10, 2, batch_size: 1)
+    #   rdd.aggregate(1, seq, com)
+    #   # => 656
+    #
+    def aggregate(zero_value, seq_op, comb_op)
+      _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
+    end
+
+    # Return the max of this RDD
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.max
+    #   # => 10
+    #
+    def max
+      self.reduce('lambda{|memo, item| memo > item ? memo : item }')
+    end
+
+    # Return the min of this RDD
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.min
+    #   # => 0
+    #
+    def min
+      self.reduce('lambda{|memo, item| memo < item ? memo : item }')
+    end
+
+    # Return the sum of this RDD
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.sum
+    #   # => 55
+    #
+    def sum
+      self.reduce('lambda{|sum, item| sum + item}')
+    end
+
+    # Return the number of values in this RDD
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.count
+    #   # => 11
+    #
+    def count
+      # nil is for seq_op => it means the all result go directly to one worker for combine
+      @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
+                     .aggregate(0, nil, 'lambda{|sum, item| sum + item }')
+    end
+
+    # Return a {Spark::StatCounter} object that captures the mean, variance
+    # and count of the RDD's elements in one operation.
+    def stats
+      @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
+    end
+
+    # Compute the mean of this RDD's elements.
+    #
+    # == Example:
+    #   $sc.parallelize([1, 2, 3]).mean
+    #   # => 2.0
+    #
+    def mean
+      stats.mean
+    end
+
+    # Compute the variance of this RDD's elements.
+    #
+    # == Example:
+    #   $sc.parallelize([1, 2, 3]).variance
+    #   # => 0.666...
+    #
+    def variance
+      stats.variance
+    end
+
+    # Compute the standard deviation of this RDD's elements.
+    #
+    # == Example:
+    #   $sc.parallelize([1, 2, 3]).stdev
+    #   # => 0.816...
+    #
+    def stdev
+      stats.stdev
+    end
+
+    # Compute the sample standard deviation of this RDD's elements (which
+    # corrects for bias in estimating the standard deviation by dividing by
+    # N-1 instead of N).
+    #
+    # == Example:
+    #   $sc.parallelize([1, 2, 3]).sample_stdev
+    #   # => 1.0
+    #
+    def sample_stdev
+      stats.sample_stdev
+    end
+
+    # Compute the sample variance of this RDD's elements (which corrects
+    # for bias in estimating the variance by dividing by N-1 instead of N).
+    #
+    # == Example:
+    #   $sc.parallelize([1, 2, 3]).sample_variance
+    #   # => 1.0
+    #
+    def sample_variance
+      stats.sample_variance
+    end
+
+    # Compute a histogram using the provided buckets. The buckets
+    # are all open to the right except for the last which is closed.
+    # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
+    # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
+    # and 50 we would have a histogram of 1,0,1.
+    #
+    # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
+    # this can be switched from an O(log n) inseration to O(1) per
+    # element(where n = # buckets).
+    #
+    # Buckets must be sorted and not contain any duplicates, must be
+    # at least two elements.
+    #
+    # == Examples:
+    #   rdd = $sc.parallelize(0..50)
+    #
+    #   rdd.histogram(2)
+    #   # => [[0.0, 25.0, 50], [25, 26]]
+    #
+    #   rdd.histogram([0, 5, 25, 50])
+    #   # => [[0, 5, 25, 50], [5, 20, 26]]
+    #
+    #   rdd.histogram([0, 15, 30, 45, 60])
+    #   # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
+    #
+    def histogram(buckets)
+
+      # -----------------------------------------------------------------------
+      # Integer
+      #
+      if buckets.is_a?(Integer)
+
+        # Validation
+        if buckets < 1
+          raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted."
+        end
+
+        # Filter invalid values
+        # Nil and NaN
+        func = 'lambda{|x|
+          if x.nil? || (x.is_a?(Float) && x.nan?)
+            false
+          else
+            true
+          end
+        }'
+        filtered = self.filter(func)
+
+        # Compute the minimum and the maximum
+        func = 'lambda{|memo, item|
+          [memo[0] < item[0] ? memo[0] : item[0],
+           memo[1] > item[1] ? memo[1] : item[1]]
+        }'
+        min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)
+
+        # Min, max must be valid numbers
+        if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
+          raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
+        end
+
+        # Already finished
+        if min == max || buckets == 1
+          return [min, max], [filtered.count]
+        end
+
+        # Custom range
+        begin
+          span = max - min # increment
+          buckets = (0...buckets).map do |x|
+            min + (x * span) / buckets.to_f
+          end
+          buckets << max
+        rescue NoMethodError
+          raise Spark::RDDError, 'Can not generate buckets with non-number in RDD'
+        end
+
+        even = true
+
+      # -----------------------------------------------------------------------
+      # Array
+      #
+      elsif buckets.is_a?(Array)
+
+        if buckets.size < 2
+          raise ArgumentError, 'Buckets should have more than one value.'
+        end
+
+        if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
+          raise ArgumentError, 'Can not have nil or nan numbers in buckets.'
+        end
+
+        if buckets.detect{|x| buckets.count(x) > 1}
+          raise ArgumentError, 'Buckets should not contain duplicated values.'
+        end
+
+        if buckets.sort != buckets
+          raise ArgumentError, 'Buckets must be sorted.'
+        end
+
+        even = false
+
+      # -----------------------------------------------------------------------
+      # Other
+      #
+      else
+        raise Spark::RDDError, 'Buckets should be number or array.'
+      end
+
+      reduce_func = 'lambda{|memo, item|
+        memo.size.times do |i|
+          memo[i] += item[i]
+        end
+        memo
+      }'
+
+      return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
+    end
+
+    # Applies a function f to all elements of this RDD.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd.foreach(lambda{|x| puts x})
+    #   # => nil
+    #
+    def foreach(f, options={})
+      new_rdd_from_command(Spark::Command::Foreach, f).collect
+      nil
+    end
+
+    # Applies a function f to each partition of this RDD.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd.foreachPartition(lambda{|x| puts x.to_s})
+    #   # => nil
+    #
+    def foreach_partition(f, options={})
+      new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
+      nil
+    end
+
+
+    # =============================================================================
+    # Transformations of RDD
+
+    # Return a new RDD by applying a function to all elements of this RDD.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd.map(lambda {|x| x*2}).collect
+    #   # => [0, 2, 4, 6, 8, 10]
+    #
+    def map(f)
+      new_rdd_from_command(Spark::Command::Map, f)
+    end
+
+    # Return a new RDD by first applying a function to all elements of this
+    # RDD, and then flattening the results.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd.flat_map(lambda {|x| [x, 1]}).collect
+    #   # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
+    #
+    def flat_map(f)
+      new_rdd_from_command(Spark::Command::FlatMap, f)
+    end
+
+    # Return a new RDD by applying a function to each partition of this RDD.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10, 2)
+    #   rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
+    #   # => [15, 40]
+    #
+    def map_partitions(f)
+      new_rdd_from_command(Spark::Command::MapPartitions, f)
+    end
+
+    # Return a new RDD by applying a function to each partition of this RDD, while tracking the index
+    # of the original partition.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0...4, 4, batch_size: 1)
+    #   rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
+    #   # => [0, 1, 4, 9]
+    #
+    def map_partitions_with_index(f, options={})
+      new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
+    end
+
+    # Return a new RDD containing only the elements that satisfy a predicate.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.filter(lambda{|x| x.even?}).collect
+    #   # => [0, 2, 4, 6, 8, 10]
+    #
+    def filter(f)
+      new_rdd_from_command(Spark::Command::Filter, f)
+    end
+
+    # Return a new RDD containing non-nil elements.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([1, nil, 2, nil, 3])
+    #   rdd.compact.collect
+    #   # => [1, 2, 3]
+    #
+    def compact
+      new_rdd_from_command(Spark::Command::Compact)
+    end
+
+    # Return an RDD created by coalescing all elements within each partition into an array.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10, 3, batch_size: 1)
+    #   rdd.glom.collect
+    #   # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
+    #
+    def glom
+      new_rdd_from_command(Spark::Command::Glom)
+    end
+
+    # Return a new RDD that is reduced into num_partitions partitions.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10, 3)
+    #   rdd.coalesce(2).glom.collect
+    #   # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
+    #
+    def coalesce(num_partitions)
+      new_jrdd = jrdd.coalesce(num_partitions)
+      RDD.new(new_jrdd, context, @command.serializer, @command.deserializer)
+    end
+
+    # Return the Cartesian product of this RDD and another one, that is, the
+    # RDD of all pairs of elements `(a, b)` where `a` is in `self` and
+    # `b` is in `other`.
+    #
+    # == Example:
+    #   rdd1 = $sc.parallelize([1,2,3])
+    #   rdd2 = $sc.parallelize([4,5,6])
+    #
+    #   rdd1.cartesian(rdd2).collect
+    #   # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
+    #
+    def cartesian(other)
+      _deserializer = Spark::Serializer::Cartesian.new.set(self.deserializer, other.deserializer)
+      new_jrdd = jrdd.cartesian(other.jrdd)
+      RDD.new(new_jrdd, context, serializer, _deserializer)
+    end
+
+    # Return a new RDD containing the distinct elements in this RDD.
+    # Ordering is not preserved because of reducing
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([1,1,1,2,3])
+    #   rdd.distinct.collect
+    #   # => [1, 2, 3]
+    #
+    def distinct
+      self.map('lambda{|x| [x, nil]}')
+          .reduce_by_key('lambda{|x,_| x}')
+          .map('lambda{|x| x[0]}')
+    end
+
+    # Return a shuffled RDD.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd.shuffle.collect
+    #   # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
+    #
+    def shuffle(seed=nil)
+      seed ||= Random.new_seed
+
+      new_rdd_from_command(Spark::Command::Shuffle, seed)
+    end
+
+    # Return the union of this RDD and another one. Any identical elements will appear multiple
+    # times (use .distinct to eliminate them).
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([1, 2, 3])
+    #   rdd.union(rdd).collect
+    #   # => [1, 2, 3, 1, 2, 3]
+    #
+    def union(other)
+      if self.serializer != other.serializer
+        other = other.reserialize(serializer.name, serializer.batch_size)
+      end
+
+      new_jrdd = jrdd.union(other.jrdd)
+      RDD.new(new_jrdd, context, serializer, deserializer)
+    end
+
+    # Return a new RDD with different serializer. This method is useful during union
+    # and join operations.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
+    #   rdd = rdd.map(lambda{|x| x.to_s})
+    #   rdd.reserialize("oj").collect
+    #   # => ["1", "2", "3"]
+    #
+    def reserialize(new_serializer, new_batch_size=nil)
+      new_batch_size ||= deserializer.batch_size
+      new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
+
+      if serializer == new_serializer
+        return self
+      end
+
+      new_command = @command.deep_copy
+      new_command.serializer = new_serializer
+
+      PipelinedRDD.new(self, new_command)
+    end
+
+    # Return the intersection of this RDD and another one. The output will not contain
+    # any duplicate elements, even if the input RDDs did.
+    #
+    # == Example:
+    #   rdd1 = $sc.parallelize([1,2,3,4,5])
+    #   rdd2 = $sc.parallelize([1,4,5,6,7])
+    #   rdd1.intersection(rdd2).collect
+    #   # => [1, 4, 5]
+    #
+    def intersection(other)
+      mapping_function = 'lambda{|item| [item, nil]}'
+      filter_function = 'lambda{|(key, values)| values.size > 1}'
+
+      self.map(mapping_function)
+          .cogroup(other.map(mapping_function))
+          .filter(filter_function)
+          .keys
+    end
+
+    # Return a copy of the RDD partitioned using the specified partitioner.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
+    #   rdd.partitionBy(2).glom.collect
+    #   # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
+    #
+    def partition_by(num_partitions, partition_func=nil)
+      num_partitions ||= default_reduce_partitions
+      partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'
+
+      _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
+    end
+
+    # Return a sampled subset of this RDD. Operations are base on Poisson and Uniform
+    # distributions.
+    # TODO: Replace Unfirom for Bernoulli
+    #
+    # == Examples:
+    #   rdd = $sc.parallelize(0..100)
+    #
+    #   rdd.sample(true, 10).collect
+    #   # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
+    #
+    #   rdd.sample(false, 0.1).collect
+    #   # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
+    #
+    def sample(with_replacement, fraction, seed=nil)
+      new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
+    end
+
+    # Return a fixed-size sampled subset of this RDD in an array
+    #
+    # == Examples:
+    #   rdd = $sc.parallelize(0..100)
+    #
+    #   rdd.take_sample(true, 10)
+    #   # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
+    #
+    #   rdd.take_sample(false, 10)
+    #   # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
+    #
+    def take_sample(with_replacement, num, seed=nil)
+
+      if num < 0
+        raise Spark::RDDError, 'Size have to be greater than 0'
+      elsif num == 0
+        return []
+      end
+
+      # Taken from scala
+      num_st_dev = 10.0
+
+      # Number of items
+      initial_count = self.count
+      return [] if initial_count == 0
+
+      # Create new generator
+      seed ||= Random.new_seed
+      rng = Random.new(seed)
+
+      # Shuffle elements if requested num if greater than array size
+      if !with_replacement && num >= initial_count
+        return self.shuffle(seed).collect
+      end
+
+      # Max num
+      max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
+      if num > max_sample_size
+        raise Spark::RDDError, "Size can not be greate than #{max_sample_size}"
+      end
+
+      # Approximate fraction with tolerance
+      fraction = compute_fraction(num, initial_count, with_replacement)
+
+      # Compute first samled subset
+      samples = self.sample(with_replacement, fraction, seed).collect
+
+      # If the first sample didn't turn out large enough, keep trying to take samples;
+      # this shouldn't happen often because we use a big multiplier for their initial size.
+      index = 0
+      while samples.size < num
+        log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
+        samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
+        index += 1
+      end
+
+      samples.shuffle!(random: rng)
+      samples[0, num]
+    end
+
+    # Return an RDD created by piping elements to a forked external process.
+    #
+    # == Cmds:
+    #   cmd = [env,] command... [,options]
+    #
+    #   env: hash
+    #     name => val : set the environment variable
+    #     name => nil : unset the environment variable
+    #   command...:
+    #     commandline                 : command line string which is passed to the standard shell
+    #     cmdname, arg1, ...          : command name and one or more arguments (This form does
+    #                                   not use the shell. See below for caveats.)
+    #     [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
+    #   options: hash
+    #
+    #   See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
+    #
+    # == Examples:
+    #   $sc.parallelize(0..5).pipe('cat').collect
+    #   # => ["0", "1", "2", "3", "4", "5"]
+    #
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd = rdd.pipe('cat', "awk '{print $1*10}'")
+    #   rdd = rdd.map(lambda{|x| x.to_i + 1})
+    #   rdd.collect
+    #   # => [1, 11, 21, 31, 41, 51]
+    #
+    def pipe(*cmds)
+      new_rdd_from_command(Spark::Command::Pipe, cmds)
+    end
+
+
+    # =============================================================================
+    # Pair functions
+
+    # Merge the values for each key using an associative reduce function. This will also perform
+    # the merging locally on each mapper before sending results to a reducer, similarly to a
+    # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
+    # parallelism level.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
+    #   rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
+    #   # => {"a"=>3, "b"=>2, "c"=>3}
+    #
+    def reduce_by_key(f, num_partitions=nil)
+      combine_by_key('lambda {|x| x}', f, f, num_partitions)
+    end
+
+    # Generic function to combine the elements for each key using a custom set of aggregation
+    # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
+    # "combined type" C * Note that V and C can be different -- for example, one might group an
+    # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
+    # functions:
+    #
+    # == Parameters:
+    # create_combiner:: which turns a V into a C (e.g., creates a one-element list)
+    # merge_value:: to merge a V into a C (e.g., adds it to the end of a list)
+    # merge_combiners:: to combine two C's into a single one.
+    #
+    # == Example:
+    #   def combiner(x)
+    #     x
+    #   end
+    #
+    #   def merge(x,y)
+    #     x+y
+    #   end
+    #
+    #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2, batch_size: 1).map(lambda{|x| [x, 1]})
+    #   rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
+    #   # => {"a"=>3, "b"=>2, "c"=>3}
+    #
+    def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
+      _combine_by_key(
+        [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
+        [Spark::Command::CombineByKey::Merge, merge_combiners],
+        num_partitions
+      )
+    end
+
+    # Return an RDD of grouped items.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd.group_by(lambda{|x| x%2}).collect
+    #   # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
+    #
+    def group_by(f, num_partitions=nil)
+      self.key_by(f).group_by_key(num_partitions)
+    end
+
+    # Group the values for each key in the RDD into a single sequence. Allows controlling the
+    # partitioning of the resulting key-value pair RDD by passing a Partitioner.
+    #
+    # Note: If you are grouping in order to perform an aggregation (such as a sum or average)
+    # over each key, using reduce_by_key or combine_by_key will provide much better performance.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
+    #   rdd.group_by_key.collect
+    #   # => [["a", [1, 2]], ["b", [3]]]
+    #
+    def group_by_key(num_partitions=nil)
+      create_combiner = 'lambda{|item| [item]}'
+      merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
+      merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
+
+      combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
+    end
+
+    # Merge the values for each key using an associative function f
+    # and a neutral `zero_value` which may be added to the result an
+    # arbitrary number of times, and must not change the result
+    # (e.g., 0 for addition, or 1 for multiplication.).
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
+    #   rdd.fold_by_key(1, lambda{|x,y| x+y})
+    #   # => [["a", 9], ["c", 6], ["b", 3]]
+    #
+    def fold_by_key(zero_value, f, num_partitions=nil)
+      self.aggregate_by_key(zero_value, f, f, num_partitions)
+    end
+
+    # Aggregate the values of each key, using given combine functions and a neutral zero value.
+    #
+    # == Example:
+    #   def combine(x,y)
+    #     x+y
+    #   end
+    #
+    #   def merge(x,y)
+    #     x*y
+    #   end
+    #
+    #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2, batch_size: 1)
+    #   rdd.aggregate_by_key(1, method(:combine), method(:merge))
+    #   # => [["b", 3], ["a", 16], ["c", 6]]
+    #
+    def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
+      _combine_by_key(
+        [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
+        [Spark::Command::CombineByKey::Merge, comb_func],
+        num_partitions
+      )
+    end
+
+    # The same functionality as cogroup but this can grouped only 2 rdd's and you
+    # can change num_partitions.
+    #
+    # == Example:
+    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
+    #   rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
+    #   rdd1.group_with(rdd2).collect
+    #   # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
+    #
+    def group_with(other, num_partitions=nil)
+      self.union(other).group_by_key(num_partitions)
+    end
+
+    # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
+    # list of values for that key in `this` as well as `other`.
+    #
+    # == Example:
+    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
+    #   rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
+    #   rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
+    #   rdd1.cogroup(rdd2, rdd3).collect
+    #   # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
+    #
+    def cogroup(*others)
+      unioned = self
+      others.each do |other|
+        unioned = unioned.union(other)
+      end
+
+      unioned.group_by_key
+    end
+
+    # Return each (key, value) pair in self RDD that has no pair with matching
+    # key in other RDD.
+    #
+    # == Example:
+    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
+    #   rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
+    #   rdd1.subtract_by_key(rdd2).collect
+    #   # => [["a", 1], ["a", 2]]
+    #
+    def subtract_by_key(other, num_partitions=nil)
+      create_combiner = 'lambda{|item| [[item]]}'
+      merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
+      merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
+
+      self.union(other)
+          .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
+          .filter('lambda{|(key,values)| values.size == 1}')
+          .flat_map_values('lambda{|item| item.first}')
+    end
+
+    # Return an RDD with the elements from self that are not in other.
+    #
+    # == Example:
+    #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
+    #   rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
+    #   rdd1.subtract(rdd2).collect
+    #   # => [["a", 1], ["b", 3], ["c", 4]]
+    #
+    def subtract(other, num_partitions=nil)
+      mapping_function = 'lambda{|x| [x,nil]}'
+
+      self.map(mapping_function)
+          .subtract_by_key(other.map(mapping_function), num_partitions)
+          .keys
+    end
+
+    # Sort the RDD by key
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
+    #   rdd.sort_by_key.collect
+    #   # => [["a", 3], ["b", 2], ["c", 1]]
+    #
+    def sort_by_key(ascending=true, num_partitions=nil)
+      self.sort_by('lambda{|(key, _)| key}')
+    end
+
+    # Sorts this RDD by the given key_function
+    #
+    # This is a different implementation than spark. Sort by doesn't use
+    # key_by method first. It can be slower but take less memory and
+    # you can always use map.sort_by_key
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
+    #
+    #   rdd.sort_by.collect
+    #   # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
+    #
+    #   rdd.sort_by(lambda{|x| x.size}).collect
+    #   # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
+    #
+    def sort_by(key_function=nil, ascending=true, num_partitions=nil)
+      key_function ||= 'lambda{|x| x}'
+      num_partitions ||= default_reduce_partitions
+
+      command_klass = Spark::Command::SortByKey
+
+      # Allow spill data to disk due to memory limit
+      # spilling = config['spark.shuffle.spill'] || false
+      spilling = false
+      memory = ''
+
+      # Set spilling to false if worker has unlimited memory
+      if memory.empty?
+        spilling = false
+        memory = nil
+      else
+        memory = to_memory_size(memory)
+      end
+
+      # Sorting should do one worker
+      if num_partitions == 1
+        rdd = self
+        rdd = rdd.coalesce(1) if partitions_size > 1
+        return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
+      end
+
+      # Compute boundary of collection
+      # Collection should be evenly distributed
+      # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
+      count = self.count
+      sample_size = num_partitions * 20.0
+      fraction = [sample_size / [count, 1].max, 1.0].min
+      samples = self.sample(false, fraction, 1).map(key_function).collect
+      samples.sort!
+      # Reverse is much faster than reverse sort_by
+      samples.reverse! if !ascending
+
+      # Determine part bounds
+      bounds = determine_bounds(samples, num_partitions)
+
+      shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
+      shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
+    end
+
+    # Creates array of the elements in this RDD by applying function f.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..5)
+    #   rdd.key_by(lambda{|x| x%2}).collect
+    #   # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
+    #
+    def key_by(f)
+      new_rdd_from_command(Spark::Command::KeyBy, f)
+    end
+
+    # Pass each value in the key-value pair RDD through a map function without changing
+    # the keys. This also retains the original RDD's partitioning.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(["ruby", "scala", "java"])
+    #   rdd = rdd.map(lambda{|x| [x, x]})
+    #   rdd = rdd.map_values(lambda{|x| x.upcase})
+    #   rdd.collect
+    #   # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
+    #
+    def map_values(f)
+      new_rdd_from_command(Spark::Command::MapValues, f)
+    end
+
+    # Pass each value in the key-value pair RDD through a flat_map function
+    # without changing the keys; this also retains the original RDD's
+    # partitioning.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
+    #   rdd = rdd.flat_map_values(lambda{|x| x*2})
+    #   rdd.collect
+    #   # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
+    #
+    def flat_map_values(f)
+      new_rdd_from_command(Spark::Command::FlatMapValues, f)
+    end
+
+    # Return an RDD with the first element of PairRDD
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
+    #   rdd.keys.collect
+    #   # => [1, 3, 5]
+    #
+    def keys
+      self.map('lambda{|(key, _)| key}')
+    end
+
+    # Return an RDD with the second element of PairRDD
+    #
+    # == Example:
+    #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
+    #   rdd.keys.collect
+    #   # => [2, 4, 6]
+    #
+    def values
+      self.map('lambda{|(_, value)| value}')
+    end
+
+
+    # Aliases
+    alias_method :partitionsSize, :partitions_size
+    alias_method :defaultReducePartitions, :default_reduce_partitions
+    alias_method :setName, :set_name
+    alias_method :addLibrary, :add_library
+
+    alias_method :flatMap, :flat_map
+    alias_method :mapPartitions, :map_partitions
+    alias_method :mapPartitionsWithIndex, :map_partitions_with_index
+    alias_method :reduceByKey, :reduce_by_key
+    alias_method :combineByKey, :combine_by_key
+    alias_method :groupByKey, :group_by_key
+    alias_method :groupWith, :group_with
+    alias_method :partitionBy, :partition_by
+    alias_method :defaultReducePartitions, :default_reduce_partitions
+    alias_method :foreachPartition, :foreach_partition
+    alias_method :mapValues, :map_values
+    alias_method :takeSample, :take_sample
+    alias_method :sortBy, :sort_by
+    alias_method :sortByKey, :sort_by_key
+    alias_method :keyBy, :key_by
+    alias_method :groupBy, :group_by
+    alias_method :foldByKey, :fold_by_key
+    alias_method :aggregateByKey, :aggregate_by_key
+    alias_method :subtractByKey, :subtract_by_key
+    alias_method :sampleStdev, :sample_stdev
+    alias_method :sampleVariance, :sample_variance
+
+    private
+
+      # This is base method for reduce operation. Is used by reduce, fold and aggregation.
+      # Only difference is that fold has zero value.
+      #
+      def _reduce(klass, seq_op, comb_op, zero_value=nil)
+        if seq_op.nil?
+          # Partitions are already reduced
+          rdd = self
+        else
+          rdd = new_rdd_from_command(klass, seq_op, zero_value)
+        end
+
+        # Send all results to one worker and combine results
+        rdd = rdd.coalesce(1).compact
+
+        # Add the same function to new RDD
+        comm = rdd.add_command(klass, comb_op, zero_value)
+        comm.deserializer = @command.serializer
+
+        # Value is returned in array
+        PipelinedRDD.new(rdd, comm).collect[0]
+      end
+
+      def _partition_by(num_partitions, klass, *args)
+        # RDD is transform from [key, value] to [hash, [key, value]]
+        keyed = new_rdd_from_command(klass, *args)
+        keyed.serializer.unbatch!
+
+        # PairwiseRDD and PythonPartitioner are borrowed from Python
+        # but works great on ruby too
+        pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
+        partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
+        new_jrdd = pairwise_rdd.partitionBy(partitioner).values
+
+        # Reset deserializer
+        RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
+      end
+
+      # For using a different combine_by_key
+      #
+      # == Used for:
+      # * combine_by_key
+      # * fold_by_key (with zero value)
+      #
+      def _combine_by_key(combine, merge, num_partitions)
+        num_partitions ||= default_reduce_partitions
+
+        # Combine key
+        combined = new_rdd_from_command(combine.shift, *combine)
+
+        # Merge items
+        shuffled = combined.partition_by(num_partitions)
+        merge_comm = shuffled.add_command(merge.shift, *merge)
+
+        PipelinedRDD.new(shuffled, merge_comm)
+      end
+
+  end
+
+  # Pipelined Resilient Distributed Dataset, operations are pipelined and sended to worker
+  #
+  # RDD
+  # `-- map
+  #     `-- map
+  #         `-- map
+  #
+  # Code is executed from top to bottom
+  #
+  class PipelinedRDD < RDD
+
+    attr_reader :prev_jrdd, :command
+
+    def initialize(prev, command)
+
+      if prev.is_a?(PipelinedRDD) && prev.pipelinable?
+        # Second, ... stages
+        @prev_jrdd = prev.prev_jrdd
+      else
+        # First stage
+        @prev_jrdd = prev.jrdd
+      end
+
+      @cached = false
+      @checkpointed = false
+
+      @context = prev.context
+      @command = command
+    end
+
+    def pipelinable?
+      !(cached? || checkpointed?)
+    end
+
+    # Serialization necessary things and sent it to RubyRDD (scala extension)
+    def jrdd
+      @jrdd ||= _jrdd
+    end
+
+    private
+
+      def _jrdd
+        command = @command.build
+
+        broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
+        broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))
+
+        ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
+        ruby_rdd.asJavaRDD
+      end
+
+  end
+end