ruby-spark 1.0.0

Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/lib/spark/mllib/ruby_matrix/vector_adapter.rb ADDED
@@ -0,0 +1,57 @@
+ require 'matrix'
+
+ # Based on ruby 2.1
+
+ class Vector
+   def self.elements(array, copy=true)
+     DenseVector.new(convert_to_array(array, copy))
+   end
+ end
+
+ module Spark
+   module Mllib
+     class VectorAdapter < ::Vector
+
+       def self.new(*args)
+         object = self.allocate
+         object.__send__(:initialize, *args)
+         object
+       end
+
+       def initialize(*args)
+         case args.shift
+         when :dense
+           values = args.shift.dup
+         when :sparse
+           values = [0.0] * args.shift.to_i
+         else
+           raise Spark::MllibError, 'Unknown vector type.'
+         end
+
+         super(values)
+       end
+
+       def []=(index, value)
+         @elements[index] = value
+       end
+
+       def dot(other)
+         if other.is_a?(Spark::Mllib::MatrixBase)
+           other * self
+         else
+           inner_product(other)
+         end
+       end
+
+       def squared_distance(other)
+         diff = self - other
+         diff.dot(diff)
+       end
+
+       def values
+         @values || to_a
+       end
+
+     end
+   end
+ end
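
A brief, hypothetical usage sketch of the adapter above (assuming the gem is installed and loaded via `require 'ruby-spark'`, and that DenseVector from data/lib/spark/mllib/vector.rb below is available). The `self.new` override re-exposes construction that the stdlib Vector class keeps private, so the MLlib subclasses can be instantiated directly:

  require 'ruby-spark'    # assumption: loads Spark::Mllib and its vector classes

  a = Spark::Mllib::DenseVector.new([1.0, 2.0, 3.0])
  b = Spark::Mllib::DenseVector.new([4.0, 5.0, 6.0])

  a.dot(b)                # => 32.0, falls back to Vector#inner_product
  a.squared_distance(b)   # => 27.0, i.e. (1-4)**2 + (2-5)**2 + (3-6)**2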
data/lib/spark/mllib/stat/distribution.rb ADDED
@@ -0,0 +1,12 @@
+ ##
+ # MultivariateGaussian
+ #
+ # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
+ # the event that the covariance matrix is singular, the density will be computed in a
+ # reduced dimensional subspace under which the distribution is supported.
+ #
+ # == Arguments:
+ # mu::    The mean vector of the distribution
+ # sigma:: The covariance matrix of the distribution
+ #
+ Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
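
MultivariateGaussian is only a parameter carrier, so constructing one is just a matter of handing the Struct its two members; a minimal sketch (the covariance value here is a plain nested Array standing in for whatever matrix object the caller uses):

  mu    = Spark::Mllib::DenseVector.new([0.0, 0.0])
  sigma = [[1.0, 0.0], [0.0, 1.0]]   # stand-in for a covariance matrix object

  dist = Spark::Mllib::MultivariateGaussian.new(mu, sigma)
  dist.mu      # => the mean vector
  dist.sigma   # => the covariance matrix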
data/lib/spark/mllib/vector.rb ADDED
@@ -0,0 +1,185 @@
+ module Spark
+   module Mllib
+     module Vectors
+
+       def self.dense(*args)
+         DenseVector.new(*args)
+       end
+
+       def self.sparse(*args)
+         SparseVector.new(*args)
+       end
+
+       def self.parse(data)
+         if data.start_with?('[') && data.end_with?(']')
+           DenseVector.parse(data)
+         elsif data.start_with?('(') && data.end_with?(')')
+           SparseVector.parse(data)
+         else
+           raise ArgumentError, 'Unknown vector.'
+         end
+       end
+
+       def self.to_vector(data)
+         if data.is_a?(SparseVector) || data.is_a?(DenseVector)
+           data
+         elsif data.is_a?(Array)
+           DenseVector.new(data)
+         end
+       end
+
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     # @abstract Parent for all types of vectors
+     class VectorBase < VectorAdapter
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     ##
+     # A dense vector represented by a value array.
+     #
+     # A dense vector is a vector in which most of the elements are non-zero.
+     #
+     # == Example:
+     #   DenseVector.new([1,2,3,4,5]).values
+     #   # => [1, 2, 3, 4, 5]
+     #
+     #   DenseVector.new(1..5).values
+     #   # => [1, 2, 3, 4, 5]
+     #
+     class DenseVector < VectorBase
+
+       def initialize(values)
+         super(:dense, values.to_a)
+       end
+
+       # Convert string to vector
+       #
+       #   DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
+       #
+       def self.parse(data)
+         unless data =~ /\[[0-9., ]+\]/
+           raise ArgumentError, 'Unknown format for DenseVector.'
+         end
+
+         data.sub!('[', '')
+         data.sub!(']', '')
+
+         data = data.split(',')
+         data.map!(&:to_f)
+
+         DenseVector.new(data)
+       end
+
+       # Convert vector to string
+       #
+       #   DenseVector.new([1,2,3,4,5]).to_s
+       #   # => "[1.0,2.0,3.0,4.0,5.0]"
+       #
+       def to_s
+         "[#{values.join(',')}]"
+       end
+
+       def to_java
+         JDenseVector.new(values)
+       end
+
+       def self.from_java(object)
+         DenseVector.new(object.values)
+       end
+
+       def marshal_dump
+         values
+       end
+
+       def marshal_load(array)
+         initialize(array)
+       end
+
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     ##
+     # A sparse vector represented by an index array and a value array.
+     #
+     # A sparse vector is a vector in which most of the elements are zero.
+     #
+     # == Example:
+     #   SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     #   SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     #   SparseVector.new(4, [1, 3], [1.0, 5.5]).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     class SparseVector < VectorBase
+
+       attr_reader :indices
+
+       def initialize(arg1, arg2=nil, arg3=nil)
+         super(:sparse, arg1)
+
+         if arg2.is_a?(Hash)
+           @indices = arg2.keys
+           @values = arg2.values
+         else
+           @indices = arg2
+           @values = arg3
+         end
+
+         @indices.zip(@values).each do |(index, value)|
+           self[index] = value
+         end
+       end
+
+       # Convert string to vector
+       #
+       #   SparseVector.parse("(5,[1,4],[3.0,5.0])")
+       #
+       def self.parse(data)
+         data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
+         if data
+           size = data[1].to_i
+           indices = data[2].split(',')
+           indices.map!(&:to_i)
+           values = data[3].split(',')
+           values.map!(&:to_f)
+
+           SparseVector.new(size, indices, values)
+         else
+           raise ArgumentError, 'Unknown format for SparseVector.'
+         end
+       end
+
+       # Convert vector to string
+       #
+       #   SparseVector.new(5, {1 => 3, 4 => 5}).to_s
+       #   # => "(5,[1,4],[3.0,5.0])"
+       #
+       def to_s
+         "(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
+       end
+
+       def marshal_dump
+         [size, indices, values]
+       end
+
+       def marshal_load(array)
+         initialize(array[0], array[1], array[2])
+       end
+
+     end
+   end
+ end
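
The parse/to_s pairs above are meant to round-trip; a short hypothetical session (assuming the gem is loaded):

  dense  = Spark::Mllib::Vectors.parse('[1.0,2.0,3.0]')        # => DenseVector
  sparse = Spark::Mllib::Vectors.parse('(5,[1,4],[3.0,5.0])')  # => SparseVector

  dense.to_s    # => "[1.0,2.0,3.0]"
  sparse.to_s   # => "(5,[1,4],[3.0,5.0])"

  # Vectors.to_vector passes vectors through unchanged and promotes plain arrays
  Spark::Mllib::Vectors.to_vector([1, 2, 3])   # => DenseVector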
data/lib/spark/rdd.rb ADDED
@@ -0,0 +1,1328 @@
1
+ module Spark
2
+ ##
3
+ # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
4
+ # partitioned collection of elements that can be operated on in parallel. This class contains the
5
+ # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
6
+ #
7
+ class RDD
8
+
9
+ extend Forwardable
10
+
11
+ attr_reader :jrdd, :context, :command
12
+
13
+ include Spark::Helper::Logger
14
+ include Spark::Helper::Parser
15
+ include Spark::Helper::Statistic
16
+
17
+ def_delegators :@command, :serializer, :deserializer, :libraries, :files
18
+
19
+ # Initializing RDD. This method is the root of all pipelined RDDs - it is unique.
21
+ # If you call some operations on this class, they will be computed in Java.
21
+ #
22
+ # == Parameters:
23
+ # jrdd:: org.apache.spark.api.java.JavaRDD
24
+ # context:: {Spark::Context}
25
+ # serializer:: {Spark::Serializer}
26
+ #
27
+ def initialize(jrdd, context, serializer, deserializer=nil)
28
+ @jrdd = jrdd
29
+ @context = context
30
+
31
+ @cached = false
32
+ @checkpointed = false
33
+
34
+ @command = Spark::CommandBuilder.new(serializer, deserializer)
35
+ end
36
+
37
+
38
+ # =============================================================================
39
+ # Operators
40
+
41
+ def +(other)
42
+ self.union(other)
43
+ end
44
+
45
+
46
+ # =============================================================================
47
+ # Command and serializer
48
+
49
+ def add_command(klass, *args)
50
+ @command.deep_copy.add_command(klass, *args)
51
+ end
52
+
53
+ # Add ruby library
54
+ # Libraries will be included before computing
55
+ #
56
+ # == Example:
57
+ # rdd.add_library('pry').add_library('nio4r', 'distribution')
58
+ #
59
+ def add_library(*libraries)
60
+ @command.add_library(*libraries)
61
+ self
62
+ end
63
+
64
+ # Bind object to RDD
65
+ #
66
+ # == Example:
67
+ # text = "test"
68
+ #
69
+ # rdd = $sc.parallelize(0..5)
70
+ # rdd = rdd.map(lambda{|x| x.to_s + " " + text})
71
+ # rdd = rdd.bind(text: text)
72
+ #
73
+ # rdd.collect
74
+ # # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
75
+ #
76
+ def bind(objects)
77
+ unless objects.is_a?(Hash)
78
+ raise ArgumentError, 'Argument must be a Hash.'
79
+ end
80
+
81
+ @command.bind(objects)
82
+ self
83
+ end
84
+
85
+ def new_rdd_from_command(klass, *args)
86
+ comm = add_command(klass, *args)
87
+ PipelinedRDD.new(self, comm)
88
+ end
89
+
90
+
91
+ # =============================================================================
92
+ # Variables and non-computing functions
93
+
94
+ def config
95
+ @context.config
96
+ end
97
+
98
+ def default_reduce_partitions
99
+ config['spark.default.parallelism'] || partitions_size
100
+ end
101
+
102
+ # Count of ParallelCollectionPartition
103
+ def partitions_size
104
+ jrdd.rdd.partitions.size
105
+ end
106
+
107
+ # A unique ID for this RDD (within its SparkContext).
108
+ def id
109
+ jrdd.id
110
+ end
111
+
112
+ # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
113
+ def cache
114
+ persist('memory_only_ser')
115
+ end
116
+
117
+ # Set this RDD's storage level to persist its values across operations after the first time
118
+ # it is computed. This can only be used to assign a new storage level if the RDD does not
119
+ # have a storage level set yet.
120
+ #
121
+ # See StorageLevel for type of new_level
122
+ #
123
+ def persist(new_level)
124
+ @cached = true
125
+ jrdd.persist(Spark::StorageLevel.java_get(new_level))
126
+ self
127
+ end
128
+
129
+ # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
130
+ #
131
+ # == Parameters:
132
+ # blocking:: whether to block until all blocks are deleted.
133
+ #
134
+ def unpersist(blocking=true)
135
+ @cached = false
136
+ jrdd.unpersist(blocking)
137
+ self
138
+ end
139
+
140
+ def cached?
141
+ @cached
142
+ end
143
+
144
+ def checkpointed?
145
+ @checkpointed
146
+ end
147
+
148
+ # Return the name of this RDD.
149
+ #
150
+ def name
151
+ _name = jrdd.name
152
+ _name && _name.encode(Encoding::UTF_8)
153
+ end
154
+
155
+ # Assign a name to this RDD.
156
+ #
157
+ def set_name(name)
158
+ jrdd.setName(name)
159
+ end
160
+
161
+ def to_java
162
+ rdd = self.reserialize('Marshal')
163
+ RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
164
+ end
165
+
166
+
167
+ # =============================================================================
168
+ # Actions which return value
169
+
170
+ # Return an array that contains all of the elements in this RDD.
171
+ # RJB raises an error if the stage is killed.
172
+ def collect
173
+ collect_from_iterator(jrdd.collect.iterator)
174
+ rescue => e
175
+ raise Spark::RDDError, e.message
176
+ end
177
+
178
+ def collect_from_iterator(iterator)
179
+ if self.is_a?(PipelinedRDD)
180
+ klass = @command.serializer
181
+ else
182
+ klass = @command.deserializer
183
+ end
184
+
185
+ klass.load_from_iterator(iterator)
186
+ end
187
+
188
+ # Convert an Array to Hash
189
+ #
190
+ def collect_as_hash
191
+ Hash[collect]
192
+ end
193
+
194
+ # Take the first num elements of the RDD.
195
+ #
196
+ # It works by first scanning one partition, and using the results from
197
+ # that partition to estimate the number of additional partitions needed
198
+ # to satisfy the limit.
199
+ #
200
+ # == Example:
201
+ # rdd = $sc.parallelize(0..100, 20, batch_size: 1)
202
+ # rdd.take(5)
203
+ # # => [0, 1, 2, 3, 4]
204
+ #
205
+ def take(count)
206
+ buffer = []
207
+
208
+ parts_count = self.partitions_size
209
+ # No parts have been scanned yet
210
+ last_scanned = -1
211
+
212
+ while buffer.empty?
213
+ last_scanned += 1
214
+ buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
215
+ end
216
+
217
+ # Assumption: depends on batch_size and on how Spark divided the data.
218
+ items_per_part = buffer.size
219
+ left = count - buffer.size
220
+
221
+ while left > 0 && last_scanned < parts_count
222
+ parts_to_take = (left.to_f/items_per_part).ceil
223
+ parts_for_scanned = Array.new(parts_to_take) do
224
+ last_scanned += 1
225
+ end
226
+
227
+ # We cannot take the exact number of items because workers are isolated from each other.
229
+ # => once you take e.g. 50% from the last part and left is still > 0, it is very
230
+ # difficult to merge new items
230
+ items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
231
+ buffer += items
232
+
233
+ left = count - buffer.size
234
+ # Average size of all parts
235
+ items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
236
+ end
237
+
238
+ buffer.slice!(0, count)
239
+ end
240
+
241
+ # Return the first element in this RDD.
242
+ #
243
+ # == Example:
244
+ # rdd = $sc.parallelize(0..100)
245
+ # rdd.first
246
+ # # => 0
247
+ #
248
+ def first
249
+ self.take(1)[0]
250
+ end
251
+
252
+ # Reduces the elements of this RDD using the specified lambda or method.
253
+ #
254
+ # == Example:
255
+ # rdd = $sc.parallelize(0..10)
256
+ # rdd.reduce(lambda{|sum, x| sum+x})
257
+ # # => 55
258
+ #
259
+ def reduce(f)
260
+ _reduce(Spark::Command::Reduce, f, f)
261
+ end
262
+
263
+ # Aggregate the elements of each partition, and then the results for all the partitions, using a
264
+ # given associative function and a neutral "zero value".
265
+ #
266
+ # The function f(x, y) is allowed to modify x and return it as its result value to avoid
267
+ # object allocation; however, it should not modify y.
268
+ #
269
+ # Be careful, zero_values is applied to all stages. See example.
270
+ #
271
+ # == Example:
272
+ # rdd = $sc.parallelize(0..10, 2)
273
+ # rdd.fold(1, lambda{|sum, x| sum+x})
274
+ # # => 58
275
+ #
276
+ def fold(zero_value, f)
277
+ self.aggregate(zero_value, f, f)
278
+ end
279
+
280
+ # Aggregate the elements of each partition, and then the results for all the partitions, using
281
+ # given combine functions and a neutral "zero value".
282
+ #
283
+ # This function can return a different result type. We need one operation for merging.
284
+ #
285
+ # Result must be an Array, otherwise the Serializer Array's zero value will be sent
287
+ # as multiple values and not just one.
287
+ #
288
+ # == Example:
289
+ # # 1 2 3 4 5 => 15 + 1 = 16
290
+ # # 6 7 8 9 10 => 40 + 1 = 41
291
+ # # 16 * 41 = 656
292
+ #
293
+ # seq = lambda{|x,y| x+y}
294
+ # com = lambda{|x,y| x*y}
295
+ #
296
+ # rdd = $sc.parallelize(1..10, 2, batch_size: 1)
297
+ # rdd.aggregate(1, seq, com)
298
+ # # => 656
299
+ #
300
+ def aggregate(zero_value, seq_op, comb_op)
301
+ _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
302
+ end
303
+
304
+ # Return the max of this RDD
305
+ #
306
+ # == Example:
307
+ # rdd = $sc.parallelize(0..10)
308
+ # rdd.max
309
+ # # => 10
310
+ #
311
+ def max
312
+ self.reduce('lambda{|memo, item| memo > item ? memo : item }')
313
+ end
314
+
315
+ # Return the min of this RDD
316
+ #
317
+ # == Example:
318
+ # rdd = $sc.parallelize(0..10)
319
+ # rdd.min
320
+ # # => 0
321
+ #
322
+ def min
323
+ self.reduce('lambda{|memo, item| memo < item ? memo : item }')
324
+ end
325
+
326
+ # Return the sum of this RDD
327
+ #
328
+ # == Example:
329
+ # rdd = $sc.parallelize(0..10)
330
+ # rdd.sum
331
+ # # => 55
332
+ #
333
+ def sum
334
+ self.reduce('lambda{|sum, item| sum + item}')
335
+ end
336
+
337
+ # Return the number of values in this RDD
338
+ #
339
+ # == Example:
340
+ # rdd = $sc.parallelize(0..10)
341
+ # rdd.count
342
+ # # => 11
343
+ #
344
+ def count
345
+ # nil for seq_op => it means all results go directly to one worker for the combine step
346
+ @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
347
+ .aggregate(0, nil, 'lambda{|sum, item| sum + item }')
348
+ end
349
+
350
+ # Return a {Spark::StatCounter} object that captures the mean, variance
351
+ # and count of the RDD's elements in one operation.
352
+ def stats
353
+ @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
354
+ end
355
+
356
+ # Compute the mean of this RDD's elements.
357
+ #
358
+ # == Example:
359
+ # $sc.parallelize([1, 2, 3]).mean
360
+ # # => 2.0
361
+ #
362
+ def mean
363
+ stats.mean
364
+ end
365
+
366
+ # Compute the variance of this RDD's elements.
367
+ #
368
+ # == Example:
369
+ # $sc.parallelize([1, 2, 3]).variance
370
+ # # => 0.666...
371
+ #
372
+ def variance
373
+ stats.variance
374
+ end
375
+
376
+ # Compute the standard deviation of this RDD's elements.
377
+ #
378
+ # == Example:
379
+ # $sc.parallelize([1, 2, 3]).stdev
380
+ # # => 0.816...
381
+ #
382
+ def stdev
383
+ stats.stdev
384
+ end
385
+
386
+ # Compute the sample standard deviation of this RDD's elements (which
387
+ # corrects for bias in estimating the standard deviation by dividing by
388
+ # N-1 instead of N).
389
+ #
390
+ # == Example:
391
+ # $sc.parallelize([1, 2, 3]).sample_stdev
392
+ # # => 1.0
393
+ #
394
+ def sample_stdev
395
+ stats.sample_stdev
396
+ end
397
+
398
+ # Compute the sample variance of this RDD's elements (which corrects
399
+ # for bias in estimating the variance by dividing by N-1 instead of N).
400
+ #
401
+ # == Example:
402
+ # $sc.parallelize([1, 2, 3]).sample_variance
403
+ # # => 1.0
404
+ #
405
+ def sample_variance
406
+ stats.sample_variance
407
+ end
408
+
409
+ # Compute a histogram using the provided buckets. The buckets
410
+ # are all open to the right except for the last which is closed.
411
+ # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
412
+ # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
413
+ # and 50 we would have a histogram of 1,0,1.
414
+ #
415
+ # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
416
+ # this can be switched from an O(log n) insertion to O(1) per
417
+ # element(where n = # buckets).
418
+ #
419
+ # Buckets must be sorted, must not contain any duplicates, and must have
420
+ # at least two elements.
421
+ #
422
+ # == Examples:
423
+ # rdd = $sc.parallelize(0..50)
424
+ #
425
+ # rdd.histogram(2)
426
+ # # => [[0.0, 25.0, 50], [25, 26]]
427
+ #
428
+ # rdd.histogram([0, 5, 25, 50])
429
+ # # => [[0, 5, 25, 50], [5, 20, 26]]
430
+ #
431
+ # rdd.histogram([0, 15, 30, 45, 60])
432
+ # # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
433
+ #
434
+ def histogram(buckets)
435
+
436
+ # -----------------------------------------------------------------------
437
+ # Integer
438
+ #
439
+ if buckets.is_a?(Integer)
440
+
441
+ # Validation
442
+ if buckets < 1
443
+ raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted."
444
+ end
445
+
446
+ # Filter invalid values
447
+ # Nil and NaN
448
+ func = 'lambda{|x|
449
+ if x.nil? || (x.is_a?(Float) && x.nan?)
450
+ false
451
+ else
452
+ true
453
+ end
454
+ }'
455
+ filtered = self.filter(func)
456
+
457
+ # Compute the minimum and the maximum
458
+ func = 'lambda{|memo, item|
459
+ [memo[0] < item[0] ? memo[0] : item[0],
460
+ memo[1] > item[1] ? memo[1] : item[1]]
461
+ }'
462
+ min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)
463
+
464
+ # Min, max must be valid numbers
465
+ if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
466
+ raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
467
+ end
468
+
469
+ # Already finished
470
+ if min == max || buckets == 1
471
+ return [min, max], [filtered.count]
472
+ end
473
+
474
+ # Custom range
475
+ begin
476
+ span = max - min # increment
477
+ buckets = (0...buckets).map do |x|
478
+ min + (x * span) / buckets.to_f
479
+ end
480
+ buckets << max
481
+ rescue NoMethodError
482
+ raise Spark::RDDError, 'Cannot generate buckets with non-numbers in RDD'
483
+ end
484
+
485
+ even = true
486
+
487
+ # -----------------------------------------------------------------------
488
+ # Array
489
+ #
490
+ elsif buckets.is_a?(Array)
491
+
492
+ if buckets.size < 2
493
+ raise ArgumentError, 'Buckets should have more than one value.'
494
+ end
495
+
496
+ if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
497
+ raise ArgumentError, 'Can not have nil or nan numbers in buckets.'
498
+ end
499
+
500
+ if buckets.detect{|x| buckets.count(x) > 1}
501
+ raise ArgumentError, 'Buckets should not contain duplicated values.'
502
+ end
503
+
504
+ if buckets.sort != buckets
505
+ raise ArgumentError, 'Buckets must be sorted.'
506
+ end
507
+
508
+ even = false
509
+
510
+ # -----------------------------------------------------------------------
511
+ # Other
512
+ #
513
+ else
514
+ raise Spark::RDDError, 'Buckets should be number or array.'
515
+ end
516
+
517
+ reduce_func = 'lambda{|memo, item|
518
+ memo.size.times do |i|
519
+ memo[i] += item[i]
520
+ end
521
+ memo
522
+ }'
523
+
524
+ return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
525
+ end
526
+
527
+ # Applies a function f to all elements of this RDD.
528
+ #
529
+ # == Example:
530
+ # rdd = $sc.parallelize(0..5)
531
+ # rdd.foreach(lambda{|x| puts x})
532
+ # # => nil
533
+ #
534
+ def foreach(f, options={})
535
+ new_rdd_from_command(Spark::Command::Foreach, f).collect
536
+ nil
537
+ end
538
+
539
+ # Applies a function f to each partition of this RDD.
540
+ #
541
+ # == Example:
542
+ # rdd = $sc.parallelize(0..5)
543
+ # rdd.foreachPartition(lambda{|x| puts x.to_s})
544
+ # # => nil
545
+ #
546
+ def foreach_partition(f, options={})
547
+ new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
548
+ nil
549
+ end
550
+
551
+
552
+ # =============================================================================
553
+ # Transformations of RDD
554
+
555
+ # Return a new RDD by applying a function to all elements of this RDD.
556
+ #
557
+ # == Example:
558
+ # rdd = $sc.parallelize(0..5)
559
+ # rdd.map(lambda {|x| x*2}).collect
560
+ # # => [0, 2, 4, 6, 8, 10]
561
+ #
562
+ def map(f)
563
+ new_rdd_from_command(Spark::Command::Map, f)
564
+ end
565
+
566
+ # Return a new RDD by first applying a function to all elements of this
567
+ # RDD, and then flattening the results.
568
+ #
569
+ # == Example:
570
+ # rdd = $sc.parallelize(0..5)
571
+ # rdd.flat_map(lambda {|x| [x, 1]}).collect
572
+ # # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
573
+ #
574
+ def flat_map(f)
575
+ new_rdd_from_command(Spark::Command::FlatMap, f)
576
+ end
577
+
578
+ # Return a new RDD by applying a function to each partition of this RDD.
579
+ #
580
+ # == Example:
581
+ # rdd = $sc.parallelize(0..10, 2)
582
+ # rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
583
+ # # => [15, 40]
584
+ #
585
+ def map_partitions(f)
586
+ new_rdd_from_command(Spark::Command::MapPartitions, f)
587
+ end
588
+
589
+ # Return a new RDD by applying a function to each partition of this RDD, while tracking the index
590
+ # of the original partition.
591
+ #
592
+ # == Example:
593
+ # rdd = $sc.parallelize(0...4, 4, batch_size: 1)
594
+ # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
595
+ # # => [0, 1, 4, 9]
596
+ #
597
+ def map_partitions_with_index(f, options={})
598
+ new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
599
+ end
600
+
601
+ # Return a new RDD containing only the elements that satisfy a predicate.
602
+ #
603
+ # == Example:
604
+ # rdd = $sc.parallelize(0..10)
605
+ # rdd.filter(lambda{|x| x.even?}).collect
606
+ # # => [0, 2, 4, 6, 8, 10]
607
+ #
608
+ def filter(f)
609
+ new_rdd_from_command(Spark::Command::Filter, f)
610
+ end
611
+
612
+ # Return a new RDD containing non-nil elements.
613
+ #
614
+ # == Example:
615
+ # rdd = $sc.parallelize([1, nil, 2, nil, 3])
616
+ # rdd.compact.collect
617
+ # # => [1, 2, 3]
618
+ #
619
+ def compact
620
+ new_rdd_from_command(Spark::Command::Compact)
621
+ end
622
+
623
+ # Return an RDD created by coalescing all elements within each partition into an array.
624
+ #
625
+ # == Example:
626
+ # rdd = $sc.parallelize(0..10, 3, batch_size: 1)
627
+ # rdd.glom.collect
628
+ # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
629
+ #
630
+ def glom
631
+ new_rdd_from_command(Spark::Command::Glom)
632
+ end
633
+
634
+ # Return a new RDD that is reduced into num_partitions partitions.
635
+ #
636
+ # == Example:
637
+ # rdd = $sc.parallelize(0..10, 3)
638
+ # rdd.coalesce(2).glom.collect
639
+ # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
640
+ #
641
+ def coalesce(num_partitions)
642
+ new_jrdd = jrdd.coalesce(num_partitions)
643
+ RDD.new(new_jrdd, context, @command.serializer, @command.deserializer)
644
+ end
645
+
646
+ # Return the Cartesian product of this RDD and another one, that is, the
647
+ # RDD of all pairs of elements `(a, b)` where `a` is in `self` and
648
+ # `b` is in `other`.
649
+ #
650
+ # == Example:
651
+ # rdd1 = $sc.parallelize([1,2,3])
652
+ # rdd2 = $sc.parallelize([4,5,6])
653
+ #
654
+ # rdd1.cartesian(rdd2).collect
655
+ # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
656
+ #
657
+ def cartesian(other)
658
+ _deserializer = Spark::Serializer::Cartesian.new.set(self.deserializer, other.deserializer)
659
+ new_jrdd = jrdd.cartesian(other.jrdd)
660
+ RDD.new(new_jrdd, context, serializer, _deserializer)
661
+ end
662
+
663
+ # Return a new RDD containing the distinct elements in this RDD.
664
+ # Ordering is not preserved because of reducing
665
+ #
666
+ # == Example:
667
+ # rdd = $sc.parallelize([1,1,1,2,3])
668
+ # rdd.distinct.collect
669
+ # # => [1, 2, 3]
670
+ #
671
+ def distinct
672
+ self.map('lambda{|x| [x, nil]}')
673
+ .reduce_by_key('lambda{|x,_| x}')
674
+ .map('lambda{|x| x[0]}')
675
+ end
676
+
677
+ # Return a shuffled RDD.
678
+ #
679
+ # == Example:
680
+ # rdd = $sc.parallelize(0..10)
681
+ # rdd.shuffle.collect
682
+ # # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
683
+ #
684
+ def shuffle(seed=nil)
685
+ seed ||= Random.new_seed
686
+
687
+ new_rdd_from_command(Spark::Command::Shuffle, seed)
688
+ end
689
+
690
+ # Return the union of this RDD and another one. Any identical elements will appear multiple
691
+ # times (use .distinct to eliminate them).
692
+ #
693
+ # == Example:
694
+ # rdd = $sc.parallelize([1, 2, 3])
695
+ # rdd.union(rdd).collect
696
+ # # => [1, 2, 3, 1, 2, 3]
697
+ #
698
+ def union(other)
699
+ if self.serializer != other.serializer
700
+ other = other.reserialize(serializer.name, serializer.batch_size)
701
+ end
702
+
703
+ new_jrdd = jrdd.union(other.jrdd)
704
+ RDD.new(new_jrdd, context, serializer, deserializer)
705
+ end
706
+
707
+ # Return a new RDD with a different serializer. This method is useful during union
708
+ # and join operations.
709
+ #
710
+ # == Example:
711
+ # rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
712
+ # rdd = rdd.map(lambda{|x| x.to_s})
713
+ # rdd.reserialize("oj").collect
714
+ # # => ["1", "2", "3"]
715
+ #
716
+ def reserialize(new_serializer, new_batch_size=nil)
717
+ new_batch_size ||= deserializer.batch_size
718
+ new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
719
+
720
+ if serializer == new_serializer
721
+ return self
722
+ end
723
+
724
+ new_command = @command.deep_copy
725
+ new_command.serializer = new_serializer
726
+
727
+ PipelinedRDD.new(self, new_command)
728
+ end
729
+
730
+ # Return the intersection of this RDD and another one. The output will not contain
731
+ # any duplicate elements, even if the input RDDs did.
732
+ #
733
+ # == Example:
734
+ # rdd1 = $sc.parallelize([1,2,3,4,5])
735
+ # rdd2 = $sc.parallelize([1,4,5,6,7])
736
+ # rdd1.intersection(rdd2).collect
737
+ # # => [1, 4, 5]
738
+ #
739
+ def intersection(other)
740
+ mapping_function = 'lambda{|item| [item, nil]}'
741
+ filter_function = 'lambda{|(key, values)| values.size > 1}'
742
+
743
+ self.map(mapping_function)
744
+ .cogroup(other.map(mapping_function))
745
+ .filter(filter_function)
746
+ .keys
747
+ end
748
+
749
+ # Return a copy of the RDD partitioned using the specified partitioner.
750
+ #
751
+ # == Example:
752
+ # rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
753
+ # rdd.partitionBy(2).glom.collect
754
+ # # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
755
+ #
756
+ def partition_by(num_partitions, partition_func=nil)
757
+ num_partitions ||= default_reduce_partitions
758
+ partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'
759
+
760
+ _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
761
+ end
762
+
763
+ # Return a sampled subset of this RDD. Operations are based on the Poisson and Uniform
764
+ # distributions.
765
+ # TODO: Replace Uniform with Bernoulli
766
+ #
767
+ # == Examples:
768
+ # rdd = $sc.parallelize(0..100)
769
+ #
770
+ # rdd.sample(true, 10).collect
771
+ # # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
772
+ #
773
+ # rdd.sample(false, 0.1).collect
774
+ # # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
775
+ #
776
+ def sample(with_replacement, fraction, seed=nil)
777
+ new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
778
+ end
779
+
780
+ # Return a fixed-size sampled subset of this RDD in an array
781
+ #
782
+ # == Examples:
783
+ # rdd = $sc.parallelize(0..100)
784
+ #
785
+ # rdd.take_sample(true, 10)
786
+ # # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
787
+ #
788
+ # rdd.take_sample(false, 10)
789
+ # # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
790
+ #
791
+ def take_sample(with_replacement, num, seed=nil)
792
+
793
+ if num < 0
794
+ raise Spark::RDDError, 'Size has to be greater than 0'
795
+ elsif num == 0
796
+ return []
797
+ end
798
+
799
+ # Taken from scala
800
+ num_st_dev = 10.0
801
+
802
+ # Number of items
803
+ initial_count = self.count
804
+ return [] if initial_count == 0
805
+
806
+ # Create new generator
807
+ seed ||= Random.new_seed
808
+ rng = Random.new(seed)
809
+
810
+ # Shuffle elements if the requested num is greater than the array size
811
+ if !with_replacement && num >= initial_count
812
+ return self.shuffle(seed).collect
813
+ end
814
+
815
+ # Max num
816
+ max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
817
+ if num > max_sample_size
818
+ raise Spark::RDDError, "Size can not be greate than #{max_sample_size}"
819
+ end
820
+
821
+ # Approximate fraction with tolerance
822
+ fraction = compute_fraction(num, initial_count, with_replacement)
823
+
824
+ # Compute the first sampled subset
825
+ samples = self.sample(with_replacement, fraction, seed).collect
826
+
827
+ # If the first sample didn't turn out large enough, keep trying to take samples;
828
+ # this shouldn't happen often because we use a big multiplier for their initial size.
829
+ index = 0
830
+ while samples.size < num
831
+ log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
832
+ samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
833
+ index += 1
834
+ end
835
+
836
+ samples.shuffle!(random: rng)
837
+ samples[0, num]
838
+ end
839
+
840
+ # Return an RDD created by piping elements to a forked external process.
841
+ #
842
+ # == Cmds:
843
+ # cmd = [env,] command... [,options]
844
+ #
845
+ # env: hash
846
+ # name => val : set the environment variable
847
+ # name => nil : unset the environment variable
848
+ # command...:
849
+ # commandline : command line string which is passed to the standard shell
850
+ # cmdname, arg1, ... : command name and one or more arguments (This form does
851
+ # not use the shell. See below for caveats.)
852
+ # [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
853
+ # options: hash
854
+ #
855
+ # See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
856
+ #
857
+ # == Examples:
858
+ # $sc.parallelize(0..5).pipe('cat').collect
859
+ # # => ["0", "1", "2", "3", "4", "5"]
860
+ #
861
+ # rdd = $sc.parallelize(0..5)
862
+ # rdd = rdd.pipe('cat', "awk '{print $1*10}'")
863
+ # rdd = rdd.map(lambda{|x| x.to_i + 1})
864
+ # rdd.collect
865
+ # # => [1, 11, 21, 31, 41, 51]
866
+ #
867
+ def pipe(*cmds)
868
+ new_rdd_from_command(Spark::Command::Pipe, cmds)
869
+ end
870
+
871
+
872
+ # =============================================================================
873
+ # Pair functions
874
+
875
+ # Merge the values for each key using an associative reduce function. This will also perform
876
+ # the merging locally on each mapper before sending results to a reducer, similarly to a
877
+ # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
878
+ # parallelism level.
879
+ #
880
+ # == Example:
881
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
882
+ # rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
883
+ # # => {"a"=>3, "b"=>2, "c"=>3}
884
+ #
885
+ def reduce_by_key(f, num_partitions=nil)
886
+ combine_by_key('lambda {|x| x}', f, f, num_partitions)
887
+ end
888
+
889
+ # Generic function to combine the elements for each key using a custom set of aggregation
890
+ # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
891
+ # "combined type" C * Note that V and C can be different -- for example, one might group an
892
+ # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
893
+ # functions:
894
+ #
895
+ # == Parameters:
896
+ # create_combiner:: which turns a V into a C (e.g., creates a one-element list)
897
+ # merge_value:: to merge a V into a C (e.g., adds it to the end of a list)
898
+ # merge_combiners:: to combine two C's into a single one.
899
+ #
900
+ # == Example:
901
+ # def combiner(x)
902
+ # x
903
+ # end
904
+ #
905
+ # def merge(x,y)
906
+ # x+y
907
+ # end
908
+ #
909
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2, batch_size: 1).map(lambda{|x| [x, 1]})
910
+ # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
911
+ # # => {"a"=>3, "b"=>2, "c"=>3}
912
+ #
913
+ def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
914
+ _combine_by_key(
915
+ [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
916
+ [Spark::Command::CombineByKey::Merge, merge_combiners],
917
+ num_partitions
918
+ )
919
+ end
920
+
921
+ # Return an RDD of grouped items.
922
+ #
923
+ # == Example:
924
+ # rdd = $sc.parallelize(0..5)
925
+ # rdd.group_by(lambda{|x| x%2}).collect
926
+ # # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
927
+ #
928
+ def group_by(f, num_partitions=nil)
929
+ self.key_by(f).group_by_key(num_partitions)
930
+ end
931
+
932
+ # Group the values for each key in the RDD into a single sequence. Allows controlling the
933
+ # partitioning of the resulting key-value pair RDD by passing a Partitioner.
934
+ #
935
+ # Note: If you are grouping in order to perform an aggregation (such as a sum or average)
936
+ # over each key, using reduce_by_key or combine_by_key will provide much better performance.
937
+ #
938
+ # == Example:
939
+ # rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
940
+ # rdd.group_by_key.collect
941
+ # # => [["a", [1, 2]], ["b", [3]]]
942
+ #
943
+ def group_by_key(num_partitions=nil)
944
+ create_combiner = 'lambda{|item| [item]}'
945
+ merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
946
+ merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
947
+
948
+ combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
949
+ end
950
+
951
+ # Merge the values for each key using an associative function f
952
+ # and a neutral `zero_value` which may be added to the result an
953
+ # arbitrary number of times, and must not change the result
954
+ # (e.g., 0 for addition, or 1 for multiplication.).
955
+ #
956
+ # == Example:
957
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
958
+ # rdd.fold_by_key(1, lambda{|x,y| x+y})
959
+ # # => [["a", 9], ["c", 6], ["b", 3]]
960
+ #
961
+ def fold_by_key(zero_value, f, num_partitions=nil)
962
+ self.aggregate_by_key(zero_value, f, f, num_partitions)
963
+ end
964
+
965
+ # Aggregate the values of each key, using given combine functions and a neutral zero value.
966
+ #
967
+ # == Example:
968
+ # def combine(x,y)
969
+ # x+y
970
+ # end
971
+ #
972
+ # def merge(x,y)
973
+ # x*y
974
+ # end
975
+ #
976
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2, batch_size: 1)
977
+ # rdd.aggregate_by_key(1, method(:combine), method(:merge))
978
+ # # => [["b", 3], ["a", 16], ["c", 6]]
979
+ #
980
+ def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
981
+ _combine_by_key(
982
+ [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
983
+ [Spark::Command::CombineByKey::Merge, comb_func],
984
+ num_partitions
985
+ )
986
+ end
987
+
988
+ # The same functionality as cogroup, but this can group only 2 RDDs and you
989
+ # can change num_partitions.
990
+ #
991
+ # == Example:
992
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
993
+ # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
994
+ # rdd1.group_with(rdd2).collect
995
+ # # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
996
+ #
997
+ def group_with(other, num_partitions=nil)
998
+ self.union(other).group_by_key(num_partitions)
999
+ end
1000
+
1001
+ # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
1002
+ # list of values for that key in `this` as well as `other`.
1003
+ #
1004
+ # == Example:
1005
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
1006
+ # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
1007
+ # rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
1008
+ # rdd1.cogroup(rdd2, rdd3).collect
1009
+ # # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
1010
+ #
1011
+ def cogroup(*others)
1012
+ unioned = self
1013
+ others.each do |other|
1014
+ unioned = unioned.union(other)
1015
+ end
1016
+
1017
+ unioned.group_by_key
1018
+ end
1019
+
1020
+ # Return each (key, value) pair in self RDD that has no pair with matching
1021
+ # key in other RDD.
1022
+ #
1023
+ # == Example:
1024
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
1025
+ # rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
1026
+ # rdd1.subtract_by_key(rdd2).collect
1027
+ # # => [["a", 1], ["a", 2]]
1028
+ #
1029
+ def subtract_by_key(other, num_partitions=nil)
1030
+ create_combiner = 'lambda{|item| [[item]]}'
1031
+ merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
1032
+ merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
1033
+
1034
+ self.union(other)
1035
+ .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
1036
+ .filter('lambda{|(key,values)| values.size == 1}')
1037
+ .flat_map_values('lambda{|item| item.first}')
1038
+ end
1039
+
1040
+ # Return an RDD with the elements from self that are not in other.
1041
+ #
1042
+ # == Example:
1043
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
1044
+ # rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
1045
+ # rdd1.subtract(rdd2).collect
1046
+ # # => [["a", 1], ["b", 3], ["c", 4]]
1047
+ #
1048
+ def subtract(other, num_partitions=nil)
1049
+ mapping_function = 'lambda{|x| [x,nil]}'
1050
+
1051
+ self.map(mapping_function)
1052
+ .subtract_by_key(other.map(mapping_function), num_partitions)
1053
+ .keys
1054
+ end
1055
+
1056
+ # Sort the RDD by key
1057
+ #
1058
+ # == Example:
1059
+ # rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
1060
+ # rdd.sort_by_key.collect
1061
+ # # => [["a", 3], ["b", 2], ["c", 1]]
1062
+ #
1063
+ def sort_by_key(ascending=true, num_partitions=nil)
1064
+ self.sort_by('lambda{|(key, _)| key}')
1065
+ end
1066
+
1067
+ # Sorts this RDD by the given key_function
1068
+ #
1069
+ # This is a different implementation than Spark's. sort_by doesn't use the
1070
+ # key_by method first. It can be slower but takes less memory, and
1071
+ # you can always use map.sort_by_key
1072
+ #
1073
+ # == Example:
1074
+ # rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
1075
+ #
1076
+ # rdd.sort_by.collect
1077
+ # # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
1078
+ #
1079
+ # rdd.sort_by(lambda{|x| x.size}).collect
1080
+ # # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
1081
+ #
1082
+ def sort_by(key_function=nil, ascending=true, num_partitions=nil)
1083
+ key_function ||= 'lambda{|x| x}'
1084
+ num_partitions ||= default_reduce_partitions
1085
+
1086
+ command_klass = Spark::Command::SortByKey
1087
+
1088
+ # Allow spill data to disk due to memory limit
1089
+ # spilling = config['spark.shuffle.spill'] || false
1090
+ spilling = false
1091
+ memory = ''
1092
+
1093
+ # Set spilling to false if worker has unlimited memory
1094
+ if memory.empty?
1095
+ spilling = false
1096
+ memory = nil
1097
+ else
1098
+ memory = to_memory_size(memory)
1099
+ end
1100
+
1101
+ # Sorting should be done by one worker
1102
+ if num_partitions == 1
1103
+ rdd = self
1104
+ rdd = rdd.coalesce(1) if partitions_size > 1
1105
+ return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
1106
+ end
1107
+
1108
+ # Compute boundary of collection
1109
+ # Collection should be evenly distributed
1110
+ # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
1111
+ count = self.count
1112
+ sample_size = num_partitions * 20.0
1113
+ fraction = [sample_size / [count, 1].max, 1.0].min
1114
+ samples = self.sample(false, fraction, 1).map(key_function).collect
1115
+ samples.sort!
1116
+ # Reverse is much faster than reverse sort_by
1117
+ samples.reverse! if !ascending
1118
+
1119
+ # Determine part bounds
1120
+ bounds = determine_bounds(samples, num_partitions)
1121
+
1122
+ shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
1123
+ shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
1124
+ end
1125
+
1126
+ # Creates array of the elements in this RDD by applying function f.
1127
+ #
1128
+ # == Example:
1129
+ # rdd = $sc.parallelize(0..5)
1130
+ # rdd.key_by(lambda{|x| x%2}).collect
1131
+ # # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
1132
+ #
1133
+ def key_by(f)
1134
+ new_rdd_from_command(Spark::Command::KeyBy, f)
1135
+ end
1136
+
1137
+ # Pass each value in the key-value pair RDD through a map function without changing
1138
+ # the keys. This also retains the original RDD's partitioning.
1139
+ #
1140
+ # == Example:
1141
+ # rdd = $sc.parallelize(["ruby", "scala", "java"])
1142
+ # rdd = rdd.map(lambda{|x| [x, x]})
1143
+ # rdd = rdd.map_values(lambda{|x| x.upcase})
1144
+ # rdd.collect
1145
+ # # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
1146
+ #
1147
+ def map_values(f)
1148
+ new_rdd_from_command(Spark::Command::MapValues, f)
1149
+ end
1150
+
1151
+ # Pass each value in the key-value pair RDD through a flat_map function
1152
+ # without changing the keys; this also retains the original RDD's
1153
+ # partitioning.
1154
+ #
1155
+ # == Example:
1156
+ # rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
1157
+ # rdd = rdd.flat_map_values(lambda{|x| x*2})
1158
+ # rdd.collect
1159
+ # # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
1160
+ #
1161
+ def flat_map_values(f)
1162
+ new_rdd_from_command(Spark::Command::FlatMapValues, f)
1163
+ end
1164
+
1165
+ # Return an RDD with the first element of PairRDD
1166
+ #
1167
+ # == Example:
1168
+ # rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
1169
+ # rdd.keys.collect
1170
+ # # => [1, 3, 5]
1171
+ #
1172
+ def keys
1173
+ self.map('lambda{|(key, _)| key}')
1174
+ end
1175
+
1176
+ # Return an RDD with the second element of PairRDD
1177
+ #
1178
+ # == Example:
1179
+ # rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
1180
+ # rdd.values.collect
1181
+ # # => [2, 4, 6]
1182
+ #
1183
+ def values
1184
+ self.map('lambda{|(_, value)| value}')
1185
+ end
1186
+
1187
+
1188
+ # Aliases
1189
+ alias_method :partitionsSize, :partitions_size
1190
+ alias_method :defaultReducePartitions, :default_reduce_partitions
1191
+ alias_method :setName, :set_name
1192
+ alias_method :addLibrary, :add_library
1193
+
1194
+ alias_method :flatMap, :flat_map
1195
+ alias_method :mapPartitions, :map_partitions
1196
+ alias_method :mapPartitionsWithIndex, :map_partitions_with_index
1197
+ alias_method :reduceByKey, :reduce_by_key
1198
+ alias_method :combineByKey, :combine_by_key
1199
+ alias_method :groupByKey, :group_by_key
1200
+ alias_method :groupWith, :group_with
1201
+ alias_method :partitionBy, :partition_by
1202
+ alias_method :defaultReducePartitions, :default_reduce_partitions
1203
+ alias_method :foreachPartition, :foreach_partition
1204
+ alias_method :mapValues, :map_values
1205
+ alias_method :takeSample, :take_sample
1206
+ alias_method :sortBy, :sort_by
1207
+ alias_method :sortByKey, :sort_by_key
1208
+ alias_method :keyBy, :key_by
1209
+ alias_method :groupBy, :group_by
1210
+ alias_method :foldByKey, :fold_by_key
1211
+ alias_method :aggregateByKey, :aggregate_by_key
1212
+ alias_method :subtractByKey, :subtract_by_key
1213
+ alias_method :sampleStdev, :sample_stdev
1214
+ alias_method :sampleVariance, :sample_variance
1215
+
1216
+ private
1217
+
1218
+ # This is the base method for reduce operations. It is used by reduce, fold and aggregate.
1220
+ # The only difference is that fold has a zero value.
1220
+ #
1221
+ def _reduce(klass, seq_op, comb_op, zero_value=nil)
1222
+ if seq_op.nil?
1223
+ # Partitions are already reduced
1224
+ rdd = self
1225
+ else
1226
+ rdd = new_rdd_from_command(klass, seq_op, zero_value)
1227
+ end
1228
+
1229
+ # Send all results to one worker and combine results
1230
+ rdd = rdd.coalesce(1).compact
1231
+
1232
+ # Add the same function to new RDD
1233
+ comm = rdd.add_command(klass, comb_op, zero_value)
1234
+ comm.deserializer = @command.serializer
1235
+
1236
+ # Value is returned in array
1237
+ PipelinedRDD.new(rdd, comm).collect[0]
1238
+ end
1239
+
1240
+ def _partition_by(num_partitions, klass, *args)
1241
+ # RDD is transformed from [key, value] to [hash, [key, value]]
1242
+ keyed = new_rdd_from_command(klass, *args)
1243
+ keyed.serializer.unbatch!
1244
+
1245
+ # PairwiseRDD and PythonPartitioner are borrowed from Python
1246
+ # but work great on Ruby too
1247
+ pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
1248
+ partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
1249
+ new_jrdd = pairwise_rdd.partitionBy(partitioner).values
1250
+
1251
+ # Reset deserializer
1252
+ RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
1253
+ end
1254
+
1255
+ # For using a different combine_by_key
1256
+ #
1257
+ # == Used for:
1258
+ # * combine_by_key
1259
+ # * fold_by_key (with zero value)
1260
+ #
1261
+ def _combine_by_key(combine, merge, num_partitions)
1262
+ num_partitions ||= default_reduce_partitions
1263
+
1264
+ # Combine key
1265
+ combined = new_rdd_from_command(combine.shift, *combine)
1266
+
1267
+ # Merge items
1268
+ shuffled = combined.partition_by(num_partitions)
1269
+ merge_comm = shuffled.add_command(merge.shift, *merge)
1270
+
1271
+ PipelinedRDD.new(shuffled, merge_comm)
1272
+ end
1273
+
1274
+ end
1275
+
1276
+ # Pipelined Resilient Distributed Dataset: operations are pipelined and sent to the worker
1277
+ #
1278
+ # RDD
1279
+ # `-- map
1280
+ # `-- map
1281
+ # `-- map
1282
+ #
1283
+ # Code is executed from top to bottom
1284
+ #
1285
+ class PipelinedRDD < RDD
1286
+
1287
+ attr_reader :prev_jrdd, :command
1288
+
1289
+ def initialize(prev, command)
1290
+
1291
+ if prev.is_a?(PipelinedRDD) && prev.pipelinable?
1292
+ # Second, ... stages
1293
+ @prev_jrdd = prev.prev_jrdd
1294
+ else
1295
+ # First stage
1296
+ @prev_jrdd = prev.jrdd
1297
+ end
1298
+
1299
+ @cached = false
1300
+ @checkpointed = false
1301
+
1302
+ @context = prev.context
1303
+ @command = command
1304
+ end
1305
+
1306
+ def pipelinable?
1307
+ !(cached? || checkpointed?)
1308
+ end
1309
+
1310
+ # Serializes necessary things and sends them to RubyRDD (Scala extension)
1311
+ def jrdd
1312
+ @jrdd ||= _jrdd
1313
+ end
1314
+
1315
+ private
1316
+
1317
+ def _jrdd
1318
+ command = @command.build
1319
+
1320
+ broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
1321
+ broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))
1322
+
1323
+ ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
1324
+ ruby_rdd.asJavaRDD
1325
+ end
1326
+
1327
+ end
1328
+ end
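
As the PipelinedRDD comment above describes, chained transformations are stacked into a single command and shipped to the workers only when an action runs; a rough sketch from the user's side (assuming a running Spark::Context in $sc, as in the examples above):

  rdd = $sc.parallelize(0..10)
  rdd = rdd.map(lambda{|x| x * 2})       # PipelinedRDD, nothing computed yet
  rdd = rdd.filter(lambda{|x| x > 5})    # same pipeline, another command stacked on top

  rdd.collect                            # action: the command is built and sent to RubyRDD
  # => [6, 8, 10, 12, 14, 16, 18, 20]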