ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/lib/spark/mllib/ruby_matrix/vector_adapter.rb ADDED
@@ -0,0 +1,57 @@
1
+ require 'matrix'
2
+
3
+ # Based on ruby 2.1
4
+
5
+ class Vector
6
+ def self.elements(array, copy=true)
7
+ DenseVector.new(convert_to_array(array, copy))
8
+ end
9
+ end
10
+
11
+ module Spark
12
+ module Mllib
13
+ class VectorAdapter < ::Vector
14
+
15
+ def self.new(*args)
16
+ object = self.allocate
17
+ object.__send__(:initialize, *args)
18
+ object
19
+ end
20
+
21
+ def initialize(*args)
22
+ case args.shift
23
+ when :dense
24
+ values = args.shift.dup
25
+ when :sparse
26
+ values = [0.0] * args.shift.to_i
27
+ else
28
+ raise Spark::MllibError, 'Unknown vector type.'
29
+ end
30
+
31
+ super(values)
32
+ end
33
+
34
+ def []=(index, value)
35
+ @elements[index] = value
36
+ end
37
+
38
+ def dot(other)
39
+ if other.is_a?(Spark::Mllib::MatrixBase)
40
+ other * self
41
+ else
42
+ inner_product(other)
43
+ end
44
+ end
45
+
46
+ def squared_distance(other)
47
+ diff = self - other
48
+ diff.dot(diff)
49
+ end
50
+
51
+ def values
52
+ @values || to_a
53
+ end
54
+
55
+ end
56
+ end
57
+ end
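The adapter above backs both vector types defined in data/lib/spark/mllib/vector.rb. A minimal usage sketch, assuming the gem and its MLlib classes are already loaded (the require line is an assumption; the class and method names come from the diff itself):

    require 'ruby-spark'  # assumed entry point for the gem

    # Dense adapter: copies the given values
    dense = Spark::Mllib::VectorAdapter.new(:dense, [1.0, 2.0, 3.0])

    # Sparse adapter: allocates a zero-filled vector of the given size
    sparse = Spark::Mllib::VectorAdapter.new(:sparse, 3)
    sparse[1] = 5.0

    dense.values                    # => [1.0, 2.0, 3.0]
    dense.dot(dense)                # => 14.0 (inner product, since the argument is not a matrix)
    dense.squared_distance(sparse)  # => 19.0 ((1-0)**2 + (2-5)**2 + (3-0)**2)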
data/lib/spark/mllib/stat/distribution.rb ADDED
@@ -0,0 +1,12 @@
1
+ ##
2
+ # MultivariateGaussian
3
+ #
4
+ # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
5
+ # the event that the covariance matrix is singular, the density will be computed in a
6
+ # reduced dimensional subspace under which the distribution is supported.
7
+ #
8
+ # == Arguments:
9
+ # mu:: The mean vector of the distribution
10
+ # sigma:: The covariance matrix of the distribution
11
+ #
12
+ Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
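The struct only stores its parameters, so constructing one is trivial. A small sketch; DenseVector comes from the vector file in this release, while using a plain ::Matrix for sigma is an assumption (the struct itself performs no validation):

    require 'matrix'

    mu    = Spark::Mllib::DenseVector.new([0.0, 0.0])
    sigma = Matrix[[1.0, 0.0], [0.0, 1.0]]  # any matrix-like object can be stored here

    gaussian = Spark::Mllib::MultivariateGaussian.new(mu, sigma)
    gaussian.mu     # => the mean vector
    gaussian.sigma  # => the covariance matrix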
data/lib/spark/mllib/vector.rb ADDED
@@ -0,0 +1,185 @@
1
+ module Spark
2
+ module Mllib
3
+ module Vectors
4
+
5
+ def self.dense(*args)
6
+ DenseVector.new(*args)
7
+ end
8
+
9
+ def self.sparse(*args)
10
+ SparseVector.new(*args)
11
+ end
12
+
13
+ def self.parse(data)
14
+ if data.start_with?('[') && data.end_with?(']')
15
+ DenseVector.parse(data)
16
+ elsif data.start_with?('(') && data.end_with?(')')
17
+ SparseVector.parse(data)
18
+ else
19
+ raise ArgumentError, 'Unknown vector.'
20
+ end
21
+ end
22
+
23
+ def self.to_vector(data)
24
+ if data.is_a?(SparseVector) || data.is_a?(DenseVector)
25
+ data
26
+ elsif data.is_a?(Array)
27
+ DenseVector.new(data)
28
+ end
29
+ end
30
+
31
+ end
32
+ end
33
+ end
34
+
35
+ module Spark
36
+ module Mllib
37
+ # @abstract Parent for all type of vectors
38
+ class VectorBase < VectorAdapter
39
+ end
40
+ end
41
+ end
42
+
43
+ module Spark
44
+ module Mllib
45
+ ##
46
+ # A dense vector represented by a value array.
47
+ #
48
+ # Dense vector is a vector in which most of the elements are non-zero.
49
+ #
50
+ # == Example:
51
+ # DenseVector.new([1,2,3,4,5]).values
52
+ # # => [1, 2, 3, 4, 5]
53
+ #
54
+ # DenseVector.new(1..5).values
55
+ # # => [1, 2, 3, 4, 5]
56
+ #
57
+ class DenseVector < VectorBase
58
+
59
+ def initialize(values)
60
+ super(:dense, values.to_a)
61
+ end
62
+
63
+ # Convert string to vector
64
+ #
65
+ # DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
66
+ #
67
+ def self.parse(data)
68
+ unless data =~ /\[[0-9., ]+\]/
69
+ raise ArgumentError, 'Unknown format for DenseVector.'
70
+ end
71
+
72
+ data.sub!('[', '')
73
+ data.sub!(']', '')
74
+
75
+ data = data.split(',')
76
+ data.map!(&:to_f)
77
+
78
+ DenseVector.new(data)
79
+ end
80
+
81
+ # Convert vector to string
82
+ #
83
+ # DenseVector.new([1,2,3,4,5]).to_s
84
+ # # => "[1.0,2.0,3.0,4.0,5.0]"
85
+ #
86
+ def to_s
87
+ "[#{values.join(',')}]"
88
+ end
89
+
90
+ def to_java
91
+ JDenseVector.new(values)
92
+ end
93
+
94
+ def self.from_java(object)
95
+ DenseVector.new(object.values)
96
+ end
97
+
98
+ def marshal_dump
99
+ values
100
+ end
101
+
102
+ def marshal_load(array)
103
+ initialize(array)
104
+ end
105
+
106
+ end
107
+ end
108
+ end
109
+
110
+ module Spark
111
+ module Mllib
112
+ ##
113
+ # A sparse vector represented by an index array and a value array.
114
+ #
115
+ # Sparse vector is a vector in which most of the elements are zero.
116
+ #
117
+ # == Example:
118
+ # SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
119
+ # # => [0, 1.0, 0, 5.5]
120
+ #
121
+ # SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
122
+ # # => [0, 1.0, 0, 5.5]
123
+ #
124
+ # SparseVector.new(4, [1, 3], [1.0, 5.5]).values
125
+ # # => [0, 1.0, 0, 5.5]
126
+ #
127
+ class SparseVector < VectorBase
128
+
129
+ attr_reader :indices
130
+
131
+ def initialize(arg1, arg2=nil, arg3=nil)
132
+ super(:sparse, arg1)
133
+
134
+ if arg2.is_a?(Hash)
135
+ @indices = arg2.keys
136
+ @values = arg2.values
137
+ else
138
+ @indices = arg2
139
+ @values = arg3
140
+ end
141
+
142
+ @indices.zip(@values).each do |(index, value)|
143
+ self[index] = value
144
+ end
145
+ end
146
+
147
+ # Convert string to vector
148
+ #
149
+ # SparseVector.parse("(5,[1,4],[3.0,5.0])")
150
+ #
151
+ def self.parse(data)
152
+ data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
153
+ if data
154
+ size = data[1].to_i
155
+ indices = data[2].split(',')
156
+ indices.map!(&:to_i)
157
+ values = data[3].split(',')
158
+ values.map!(&:to_f)
159
+
160
+ SparseVector.new(size, indices, values)
161
+ else
162
+ raise ArgumentError, 'Unknown format for SparseVector.'
163
+ end
164
+ end
165
+
166
+ # Convert vector to string
167
+ #
168
+ # SparseVector.new(5, {1 => 3, 4 => 5}).to_s
169
+ # # => "(5,[1,4],[3.0,5.0])"
170
+ #
171
+ def to_s
172
+ "(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
173
+ end
174
+
175
+ def marshal_dump
176
+ [size, indices, values]
177
+ end
178
+
179
+ def marshal_load(array)
180
+ initialize(array[0], array[1], array[2])
181
+ end
182
+
183
+ end
184
+ end
185
+ end
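A short round trip through the vector API defined above, assuming the MLlib classes are loaded; the helper methods and string formats are the ones shown in this file:

    dense  = Spark::Mllib::Vectors.dense([1.0, 2.0, 3.0])
    sparse = Spark::Mllib::Vectors.sparse(5, {1 => 3.0, 4 => 5.0})

    dense.to_s    # => "[1.0,2.0,3.0]"
    sparse.to_s   # => "(5,[1,4],[3.0,5.0])"

    Spark::Mllib::Vectors.parse("[1.0,2.0,3.0]")        # => DenseVector
    Spark::Mllib::Vectors.parse("(5,[1,4],[3.0,5.0])")  # => SparseVector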
data/lib/spark/rdd.rb ADDED
@@ -0,0 +1,1328 @@
1
+ module Spark
2
+ ##
3
+ # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
4
+ # partitioned collection of elements that can be operated on in parallel. This class contains the
5
+ # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
6
+ #
7
+ class RDD
8
+
9
+ extend Forwardable
10
+
11
+ attr_reader :jrdd, :context, :command
12
+
13
+ include Spark::Helper::Logger
14
+ include Spark::Helper::Parser
15
+ include Spark::Helper::Statistic
16
+
17
+ def_delegators :@command, :serializer, :deserializer, :libraries, :files
18
+
19
+ # Initializes the RDD. This method is the root of all pipelined RDDs - it is unique.
20
+ # If you call some operations on this class, they will be computed in Java
21
+ #
22
+ # == Parameters:
23
+ # jrdd:: org.apache.spark.api.java.JavaRDD
24
+ # context:: {Spark::Context}
25
+ # serializer:: {Spark::Serializer}
26
+ #
27
+ def initialize(jrdd, context, serializer, deserializer=nil)
28
+ @jrdd = jrdd
29
+ @context = context
30
+
31
+ @cached = false
32
+ @checkpointed = false
33
+
34
+ @command = Spark::CommandBuilder.new(serializer, deserializer)
35
+ end
36
+
37
+
38
+ # =============================================================================
39
+ # Operators
40
+
41
+ def +(other)
42
+ self.union(other)
43
+ end
44
+
45
+
46
+ # =============================================================================
47
+ # Command and serializer
48
+
49
+ def add_command(klass, *args)
50
+ @command.deep_copy.add_command(klass, *args)
51
+ end
52
+
53
+ # Add ruby library
54
+ # Libraries will be included before computing
55
+ #
56
+ # == Example:
57
+ # rdd.add_library('pry').add_library('nio4r', 'distribution')
58
+ #
59
+ def add_library(*libraries)
60
+ @command.add_library(*libraries)
61
+ self
62
+ end
63
+
64
+ # Bind object to RDD
65
+ #
66
+ # == Example:
67
+ # text = "test"
68
+ #
69
+ # rdd = $sc.parallelize(0..5)
70
+ # rdd = rdd.map(lambda{|x| x.to_s + " " + text})
71
+ # rdd = rdd.bind(text: text)
72
+ #
73
+ # rdd.collect
74
+ # # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
75
+ #
76
+ def bind(objects)
77
+ unless objects.is_a?(Hash)
78
+ raise ArgumentError, 'Argument must be a Hash.'
79
+ end
80
+
81
+ @command.bind(objects)
82
+ self
83
+ end
84
+
85
+ def new_rdd_from_command(klass, *args)
86
+ comm = add_command(klass, *args)
87
+ PipelinedRDD.new(self, comm)
88
+ end
89
+
90
+
91
+ # =============================================================================
92
+ # Variables and non-computing functions
93
+
94
+ def config
95
+ @context.config
96
+ end
97
+
98
+ def default_reduce_partitions
99
+ config['spark.default.parallelism'] || partitions_size
100
+ end
101
+
102
+ # Count of ParallelCollectionPartition
103
+ def partitions_size
104
+ jrdd.rdd.partitions.size
105
+ end
106
+
107
+ # A unique ID for this RDD (within its SparkContext).
108
+ def id
109
+ jrdd.id
110
+ end
111
+
112
+ # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
113
+ def cache
114
+ persist('memory_only_ser')
115
+ end
116
+
117
+ # Set this RDD's storage level to persist its values across operations after the first time
118
+ # it is computed. This can only be used to assign a new storage level if the RDD does not
119
+ # have a storage level set yet.
120
+ #
121
+ # See StorageLevel for type of new_level
122
+ #
123
+ def persist(new_level)
124
+ @cached = true
125
+ jrdd.persist(Spark::StorageLevel.java_get(new_level))
126
+ self
127
+ end
128
+
129
+ # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
130
+ #
131
+ # == Parameters:
132
+ # blocking:: whether to block until all blocks are deleted.
133
+ #
134
+ def unpersist(blocking=true)
135
+ @cached = false
136
+ jrdd.unpersist(blocking)
137
+ self
138
+ end
139
+
140
+ def cached?
141
+ @cached
142
+ end
143
+
144
+ def checkpointed?
145
+ @checkpointed
146
+ end
147
+
148
+ # Return the name of this RDD.
149
+ #
150
+ def name
151
+ _name = jrdd.name
152
+ _name && _name.encode(Encoding::UTF_8)
153
+ end
154
+
155
+ # Assign a name to this RDD.
156
+ #
157
+ def set_name(name)
158
+ jrdd.setName(name)
159
+ end
160
+
161
+ def to_java
162
+ rdd = self.reserialize('Marshal')
163
+ RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
164
+ end
165
+
166
+
167
+ # =============================================================================
168
+ # Actions which return value
169
+
170
+ # Return an array that contains all of the elements in this RDD.
171
+ # RJB raises an error if the stage is killed.
172
+ def collect
173
+ collect_from_iterator(jrdd.collect.iterator)
174
+ rescue => e
175
+ raise Spark::RDDError, e.message
176
+ end
177
+
178
+ def collect_from_iterator(iterator)
179
+ if self.is_a?(PipelinedRDD)
180
+ klass = @command.serializer
181
+ else
182
+ klass = @command.deserializer
183
+ end
184
+
185
+ klass.load_from_iterator(iterator)
186
+ end
187
+
188
+ # Convert an Array to Hash
189
+ #
190
+ def collect_as_hash
191
+ Hash[collect]
192
+ end
193
+
194
+ # Take the first num elements of the RDD.
195
+ #
196
+ # It works by first scanning one partition, and use the results from
197
+ # that partition to estimate the number of additional partitions needed
198
+ # to satisfy the limit.
199
+ #
200
+ # == Example:
201
+ # rdd = $sc.parallelize(0..100, 20, batch_size: 1)
202
+ # rdd.take(5)
203
+ # # => [0, 1, 2, 3, 4]
204
+ #
205
+ def take(count)
206
+ buffer = []
207
+
208
+ parts_count = self.partitions_size
209
+ # No parts have been scanned yet
210
+ last_scanned = -1
211
+
212
+ while buffer.empty?
213
+ last_scanned += 1
214
+ buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
215
+ end
216
+
217
+ # Assumption. Depends on batch_size and how Spark divided the data.
218
+ items_per_part = buffer.size
219
+ left = count - buffer.size
220
+
221
+ while left > 0 && last_scanned < parts_count
222
+ parts_to_take = (left.to_f/items_per_part).ceil
223
+ parts_for_scanned = Array.new(parts_to_take) do
224
+ last_scanned += 1
225
+ end
226
+
227
+ # We cannot take an exact number of items because workers are isolated from each other.
228
+ # => once you take e.g. 50% from the last part and left is still > 0, it is very
229
+ # difficult to merge new items
230
+ items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
231
+ buffer += items
232
+
233
+ left = count - buffer.size
234
+ # Average size of all parts
235
+ items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
236
+ end
237
+
238
+ buffer.slice!(0, count)
239
+ end
240
+
241
+ # Return the first element in this RDD.
242
+ #
243
+ # == Example:
244
+ # rdd = $sc.parallelize(0..100)
245
+ # rdd.first
246
+ # # => 0
247
+ #
248
+ def first
249
+ self.take(1)[0]
250
+ end
251
+
252
+ # Reduces the elements of this RDD using the specified lambda or method.
253
+ #
254
+ # == Example:
255
+ # rdd = $sc.parallelize(0..10)
256
+ # rdd.reduce(lambda{|sum, x| sum+x})
257
+ # # => 55
258
+ #
259
+ def reduce(f)
260
+ _reduce(Spark::Command::Reduce, f, f)
261
+ end
262
+
263
+ # Aggregate the elements of each partition, and then the results for all the partitions, using a
264
+ # given associative function and a neutral "zero value".
265
+ #
266
+ # The function f(x, y) is allowed to modify x and return it as its result value to avoid
267
+ # object allocation; however, it should not modify y.
268
+ #
269
+ # Be careful, zero_values is applied to all stages. See example.
270
+ #
271
+ # == Example:
272
+ # rdd = $sc.parallelize(0..10, 2)
273
+ # rdd.fold(1, lambda{|sum, x| sum+x})
274
+ # # => 58
275
+ #
276
+ def fold(zero_value, f)
277
+ self.aggregate(zero_value, f, f)
278
+ end
279
+
280
+ # Aggregate the elements of each partition, and then the results for all the partitions, using
281
+ # given combine functions and a neutral "zero value".
282
+ #
283
+ # This function can return a different result type. We need one operation for merging.
284
+ #
285
+ # Result must be an Array, otherwise Serializer Array's zero value will be sent
286
+ # as multiple values and not just one.
287
+ #
288
+ # == Example:
289
+ # # 1 2 3 4 5 => 15 + 1 = 16
290
+ # # 6 7 8 9 10 => 40 + 1 = 41
291
+ # # 16 * 41 = 656
292
+ #
293
+ # seq = lambda{|x,y| x+y}
294
+ # com = lambda{|x,y| x*y}
295
+ #
296
+ # rdd = $sc.parallelize(1..10, 2, batch_size: 1)
297
+ # rdd.aggregate(1, seq, com)
298
+ # # => 656
299
+ #
300
+ def aggregate(zero_value, seq_op, comb_op)
301
+ _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
302
+ end
303
+
304
+ # Return the max of this RDD
305
+ #
306
+ # == Example:
307
+ # rdd = $sc.parallelize(0..10)
308
+ # rdd.max
309
+ # # => 10
310
+ #
311
+ def max
312
+ self.reduce('lambda{|memo, item| memo > item ? memo : item }')
313
+ end
314
+
315
+ # Return the min of this RDD
316
+ #
317
+ # == Example:
318
+ # rdd = $sc.parallelize(0..10)
319
+ # rdd.min
320
+ # # => 0
321
+ #
322
+ def min
323
+ self.reduce('lambda{|memo, item| memo < item ? memo : item }')
324
+ end
325
+
326
+ # Return the sum of this RDD
327
+ #
328
+ # == Example:
329
+ # rdd = $sc.parallelize(0..10)
330
+ # rdd.sum
331
+ # # => 55
332
+ #
333
+ def sum
334
+ self.reduce('lambda{|sum, item| sum + item}')
335
+ end
336
+
337
+ # Return the number of values in this RDD
338
+ #
339
+ # == Example:
340
+ # rdd = $sc.parallelize(0..10)
341
+ # rdd.count
342
+ # # => 11
343
+ #
344
+ def count
345
+ # nil is for seq_op => it means all results go directly to one worker for combining
346
+ @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
347
+ .aggregate(0, nil, 'lambda{|sum, item| sum + item }')
348
+ end
349
+
350
+ # Return a {Spark::StatCounter} object that captures the mean, variance
351
+ # and count of the RDD's elements in one operation.
352
+ def stats
353
+ @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
354
+ end
355
+
356
+ # Compute the mean of this RDD's elements.
357
+ #
358
+ # == Example:
359
+ # $sc.parallelize([1, 2, 3]).mean
360
+ # # => 2.0
361
+ #
362
+ def mean
363
+ stats.mean
364
+ end
365
+
366
+ # Compute the variance of this RDD's elements.
367
+ #
368
+ # == Example:
369
+ # $sc.parallelize([1, 2, 3]).variance
370
+ # # => 0.666...
371
+ #
372
+ def variance
373
+ stats.variance
374
+ end
375
+
376
+ # Compute the standard deviation of this RDD's elements.
377
+ #
378
+ # == Example:
379
+ # $sc.parallelize([1, 2, 3]).stdev
380
+ # # => 0.816...
381
+ #
382
+ def stdev
383
+ stats.stdev
384
+ end
385
+
386
+ # Compute the sample standard deviation of this RDD's elements (which
387
+ # corrects for bias in estimating the standard deviation by dividing by
388
+ # N-1 instead of N).
389
+ #
390
+ # == Example:
391
+ # $sc.parallelize([1, 2, 3]).sample_stdev
392
+ # # => 1.0
393
+ #
394
+ def sample_stdev
395
+ stats.sample_stdev
396
+ end
397
+
398
+ # Compute the sample variance of this RDD's elements (which corrects
399
+ # for bias in estimating the variance by dividing by N-1 instead of N).
400
+ #
401
+ # == Example:
402
+ # $sc.parallelize([1, 2, 3]).sample_variance
403
+ # # => 1.0
404
+ #
405
+ def sample_variance
406
+ stats.sample_variance
407
+ end
408
+
409
+ # Compute a histogram using the provided buckets. The buckets
410
+ # are all open to the right except for the last which is closed.
411
+ # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
412
+ # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
413
+ # and 50 we would have a histogram of 1,0,1.
414
+ #
415
+ # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
416
+ # this can be switched from an O(log n) insertion to O(1) per
417
+ # element (where n = # buckets).
418
+ #
419
+ # Buckets must be sorted, must not contain any duplicates, and must have
420
+ # at least two elements.
421
+ #
422
+ # == Examples:
423
+ # rdd = $sc.parallelize(0..50)
424
+ #
425
+ # rdd.histogram(2)
426
+ # # => [[0.0, 25.0, 50], [25, 26]]
427
+ #
428
+ # rdd.histogram([0, 5, 25, 50])
429
+ # # => [[0, 5, 25, 50], [5, 20, 26]]
430
+ #
431
+ # rdd.histogram([0, 15, 30, 45, 60])
432
+ # # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
433
+ #
434
+ def histogram(buckets)
435
+
436
+ # -----------------------------------------------------------------------
437
+ # Integer
438
+ #
439
+ if buckets.is_a?(Integer)
440
+
441
+ # Validation
442
+ if buckets < 1
443
+ raise ArgumentError, "Bucket count must be >= 1, #{buckets} given."
444
+ end
445
+
446
+ # Filter invalid values
447
+ # Nil and NaN
448
+ func = 'lambda{|x|
449
+ if x.nil? || (x.is_a?(Float) && x.nan?)
450
+ false
451
+ else
452
+ true
453
+ end
454
+ }'
455
+ filtered = self.filter(func)
456
+
457
+ # Compute the minimum and the maximum
458
+ func = 'lambda{|memo, item|
459
+ [memo[0] < item[0] ? memo[0] : item[0],
460
+ memo[1] > item[1] ? memo[1] : item[1]]
461
+ }'
462
+ min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)
463
+
464
+ # Min, max must be valid numbers
465
+ if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
466
+ raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
467
+ end
468
+
469
+ # Already finished
470
+ if min == max || buckets == 1
471
+ return [min, max], [filtered.count]
472
+ end
473
+
474
+ # Custom range
475
+ begin
476
+ span = max - min # increment
477
+ buckets = (0...buckets).map do |x|
478
+ min + (x * span) / buckets.to_f
479
+ end
480
+ buckets << max
481
+ rescue NoMethodError
482
+ raise Spark::RDDError, 'Cannot generate buckets with non-number values in RDD'
483
+ end
484
+
485
+ even = true
486
+
487
+ # -----------------------------------------------------------------------
488
+ # Array
489
+ #
490
+ elsif buckets.is_a?(Array)
491
+
492
+ if buckets.size < 2
493
+ raise ArgumentError, 'Buckets should have more than one value.'
494
+ end
495
+
496
+ if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
497
+ raise ArgumentError, 'Cannot have nil or NaN numbers in buckets.'
498
+ end
499
+
500
+ if buckets.detect{|x| buckets.count(x) > 1}
501
+ raise ArgumentError, 'Buckets should not contain duplicated values.'
502
+ end
503
+
504
+ if buckets.sort != buckets
505
+ raise ArgumentError, 'Buckets must be sorted.'
506
+ end
507
+
508
+ even = false
509
+
510
+ # -----------------------------------------------------------------------
511
+ # Other
512
+ #
513
+ else
514
+ raise Spark::RDDError, 'Buckets should be a number or an array.'
515
+ end
516
+
517
+ reduce_func = 'lambda{|memo, item|
518
+ memo.size.times do |i|
519
+ memo[i] += item[i]
520
+ end
521
+ memo
522
+ }'
523
+
524
+ return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
525
+ end
526
+
527
+ # Applies a function f to all elements of this RDD.
528
+ #
529
+ # == Example:
530
+ # rdd = $sc.parallelize(0..5)
531
+ # rdd.foreach(lambda{|x| puts x})
532
+ # # => nil
533
+ #
534
+ def foreach(f, options={})
535
+ new_rdd_from_command(Spark::Command::Foreach, f).collect
536
+ nil
537
+ end
538
+
539
+ # Applies a function f to each partition of this RDD.
540
+ #
541
+ # == Example:
542
+ # rdd = $sc.parallelize(0..5)
543
+ # rdd.foreachPartition(lambda{|x| puts x.to_s})
544
+ # # => nil
545
+ #
546
+ def foreach_partition(f, options={})
547
+ new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
548
+ nil
549
+ end
550
+
551
+
552
+ # =============================================================================
553
+ # Transformations of RDD
554
+
555
+ # Return a new RDD by applying a function to all elements of this RDD.
556
+ #
557
+ # == Example:
558
+ # rdd = $sc.parallelize(0..5)
559
+ # rdd.map(lambda {|x| x*2}).collect
560
+ # # => [0, 2, 4, 6, 8, 10]
561
+ #
562
+ def map(f)
563
+ new_rdd_from_command(Spark::Command::Map, f)
564
+ end
565
+
566
+ # Return a new RDD by first applying a function to all elements of this
567
+ # RDD, and then flattening the results.
568
+ #
569
+ # == Example:
570
+ # rdd = $sc.parallelize(0..5)
571
+ # rdd.flat_map(lambda {|x| [x, 1]}).collect
572
+ # # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
573
+ #
574
+ def flat_map(f)
575
+ new_rdd_from_command(Spark::Command::FlatMap, f)
576
+ end
577
+
578
+ # Return a new RDD by applying a function to each partition of this RDD.
579
+ #
580
+ # == Example:
581
+ # rdd = $sc.parallelize(0..10, 2)
582
+ # rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
583
+ # # => [15, 40]
584
+ #
585
+ def map_partitions(f)
586
+ new_rdd_from_command(Spark::Command::MapPartitions, f)
587
+ end
588
+
589
+ # Return a new RDD by applying a function to each partition of this RDD, while tracking the index
590
+ # of the original partition.
591
+ #
592
+ # == Example:
593
+ # rdd = $sc.parallelize(0...4, 4, batch_size: 1)
594
+ # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
595
+ # # => [0, 1, 4, 9]
596
+ #
597
+ def map_partitions_with_index(f, options={})
598
+ new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
599
+ end
600
+
601
+ # Return a new RDD containing only the elements that satisfy a predicate.
602
+ #
603
+ # == Example:
604
+ # rdd = $sc.parallelize(0..10)
605
+ # rdd.filter(lambda{|x| x.even?}).collect
606
+ # # => [0, 2, 4, 6, 8, 10]
607
+ #
608
+ def filter(f)
609
+ new_rdd_from_command(Spark::Command::Filter, f)
610
+ end
611
+
612
+ # Return a new RDD containing non-nil elements.
613
+ #
614
+ # == Example:
615
+ # rdd = $sc.parallelize([1, nil, 2, nil, 3])
616
+ # rdd.compact.collect
617
+ # # => [1, 2, 3]
618
+ #
619
+ def compact
620
+ new_rdd_from_command(Spark::Command::Compact)
621
+ end
622
+
623
+ # Return an RDD created by coalescing all elements within each partition into an array.
624
+ #
625
+ # == Example:
626
+ # rdd = $sc.parallelize(0..10, 3, batch_size: 1)
627
+ # rdd.glom.collect
628
+ # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
629
+ #
630
+ def glom
631
+ new_rdd_from_command(Spark::Command::Glom)
632
+ end
633
+
634
+ # Return a new RDD that is reduced into num_partitions partitions.
635
+ #
636
+ # == Example:
637
+ # rdd = $sc.parallelize(0..10, 3)
638
+ # rdd.coalesce(2).glom.collect
639
+ # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
640
+ #
641
+ def coalesce(num_partitions)
642
+ new_jrdd = jrdd.coalesce(num_partitions)
643
+ RDD.new(new_jrdd, context, @command.serializer, @command.deserializer)
644
+ end
645
+
646
+ # Return the Cartesian product of this RDD and another one, that is, the
647
+ # RDD of all pairs of elements `(a, b)` where `a` is in `self` and
648
+ # `b` is in `other`.
649
+ #
650
+ # == Example:
651
+ # rdd1 = $sc.parallelize([1,2,3])
652
+ # rdd2 = $sc.parallelize([4,5,6])
653
+ #
654
+ # rdd1.cartesian(rdd2).collect
655
+ # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
656
+ #
657
+ def cartesian(other)
658
+ _deserializer = Spark::Serializer::Cartesian.new.set(self.deserializer, other.deserializer)
659
+ new_jrdd = jrdd.cartesian(other.jrdd)
660
+ RDD.new(new_jrdd, context, serializer, _deserializer)
661
+ end
662
+
663
+ # Return a new RDD containing the distinct elements in this RDD.
664
+ # Ordering is not preserved because of reducing
665
+ #
666
+ # == Example:
667
+ # rdd = $sc.parallelize([1,1,1,2,3])
668
+ # rdd.distinct.collect
669
+ # # => [1, 2, 3]
670
+ #
671
+ def distinct
672
+ self.map('lambda{|x| [x, nil]}')
673
+ .reduce_by_key('lambda{|x,_| x}')
674
+ .map('lambda{|x| x[0]}')
675
+ end
676
+
677
+ # Return a shuffled RDD.
678
+ #
679
+ # == Example:
680
+ # rdd = $sc.parallelize(0..10)
681
+ # rdd.shuffle.collect
682
+ # # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
683
+ #
684
+ def shuffle(seed=nil)
685
+ seed ||= Random.new_seed
686
+
687
+ new_rdd_from_command(Spark::Command::Shuffle, seed)
688
+ end
689
+
690
+ # Return the union of this RDD and another one. Any identical elements will appear multiple
691
+ # times (use .distinct to eliminate them).
692
+ #
693
+ # == Example:
694
+ # rdd = $sc.parallelize([1, 2, 3])
695
+ # rdd.union(rdd).collect
696
+ # # => [1, 2, 3, 1, 2, 3]
697
+ #
698
+ def union(other)
699
+ if self.serializer != other.serializer
700
+ other = other.reserialize(serializer.name, serializer.batch_size)
701
+ end
702
+
703
+ new_jrdd = jrdd.union(other.jrdd)
704
+ RDD.new(new_jrdd, context, serializer, deserializer)
705
+ end
706
+
707
+ # Return a new RDD with a different serializer. This method is useful during union
708
+ # and join operations.
709
+ #
710
+ # == Example:
711
+ # rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
712
+ # rdd = rdd.map(lambda{|x| x.to_s})
713
+ # rdd.reserialize("oj").collect
714
+ # # => ["1", "2", "3"]
715
+ #
716
+ def reserialize(new_serializer, new_batch_size=nil)
717
+ new_batch_size ||= deserializer.batch_size
718
+ new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
719
+
720
+ if serializer == new_serializer
721
+ return self
722
+ end
723
+
724
+ new_command = @command.deep_copy
725
+ new_command.serializer = new_serializer
726
+
727
+ PipelinedRDD.new(self, new_command)
728
+ end
729
+
730
+ # Return the intersection of this RDD and another one. The output will not contain
731
+ # any duplicate elements, even if the input RDDs did.
732
+ #
733
+ # == Example:
734
+ # rdd1 = $sc.parallelize([1,2,3,4,5])
735
+ # rdd2 = $sc.parallelize([1,4,5,6,7])
736
+ # rdd1.intersection(rdd2).collect
737
+ # # => [1, 4, 5]
738
+ #
739
+ def intersection(other)
740
+ mapping_function = 'lambda{|item| [item, nil]}'
741
+ filter_function = 'lambda{|(key, values)| values.size > 1}'
742
+
743
+ self.map(mapping_function)
744
+ .cogroup(other.map(mapping_function))
745
+ .filter(filter_function)
746
+ .keys
747
+ end
748
+
749
+ # Return a copy of the RDD partitioned using the specified partitioner.
750
+ #
751
+ # == Example:
752
+ # rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
753
+ # rdd.partitionBy(2).glom.collect
754
+ # # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
755
+ #
756
+ def partition_by(num_partitions, partition_func=nil)
757
+ num_partitions ||= default_reduce_partitions
758
+ partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'
759
+
760
+ _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
761
+ end
762
+
763
+ # Return a sampled subset of this RDD. Operations are based on Poisson and Uniform
765
+ # distributions.
766
+ # TODO: Replace Uniform with Bernoulli
766
+ #
767
+ # == Examples:
768
+ # rdd = $sc.parallelize(0..100)
769
+ #
770
+ # rdd.sample(true, 10).collect
771
+ # # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
772
+ #
773
+ # rdd.sample(false, 0.1).collect
774
+ # # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
775
+ #
776
+ def sample(with_replacement, fraction, seed=nil)
777
+ new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
778
+ end
779
+
780
+ # Return a fixed-size sampled subset of this RDD in an array
781
+ #
782
+ # == Examples:
783
+ # rdd = $sc.parallelize(0..100)
784
+ #
785
+ # rdd.take_sample(true, 10)
786
+ # # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
787
+ #
788
+ # rdd.take_sample(false, 10)
789
+ # # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
790
+ #
791
+ def take_sample(with_replacement, num, seed=nil)
792
+
793
+ if num < 0
794
+ raise Spark::RDDError, 'Size can not be negative'
795
+ elsif num == 0
796
+ return []
797
+ end
798
+
799
+ # Taken from scala
800
+ num_st_dev = 10.0
801
+
802
+ # Number of items
803
+ initial_count = self.count
804
+ return [] if initial_count == 0
805
+
806
+ # Create new generator
807
+ seed ||= Random.new_seed
808
+ rng = Random.new(seed)
809
+
810
+ # Shuffle elements if requested num is greater than array size
811
+ if !with_replacement && num >= initial_count
812
+ return self.shuffle(seed).collect
813
+ end
814
+
815
+ # Max num
816
+ max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
817
+ if num > max_sample_size
818
+ raise Spark::RDDError, "Size can not be greater than #{max_sample_size}"
819
+ end
820
+
821
+ # Approximate fraction with tolerance
822
+ fraction = compute_fraction(num, initial_count, with_replacement)
823
+
824
+ # Compute the first sampled subset
825
+ samples = self.sample(with_replacement, fraction, seed).collect
826
+
827
+ # If the first sample didn't turn out large enough, keep trying to take samples;
828
+ # this shouldn't happen often because we use a big multiplier for their initial size.
829
+ index = 0
830
+ while samples.size < num
831
+ log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
832
+ samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
833
+ index += 1
834
+ end
835
+
836
+ samples.shuffle!(random: rng)
837
+ samples[0, num]
838
+ end
839
+
840
+ # Return an RDD created by piping elements to a forked external process.
841
+ #
842
+ # == Cmds:
843
+ # cmd = [env,] command... [,options]
844
+ #
845
+ # env: hash
846
+ # name => val : set the environment variable
847
+ # name => nil : unset the environment variable
848
+ # command...:
849
+ # commandline : command line string which is passed to the standard shell
850
+ # cmdname, arg1, ... : command name and one or more arguments (This form does
851
+ # not use the shell. See below for caveats.)
852
+ # [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
853
+ # options: hash
854
+ #
855
+ # See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
856
+ #
857
+ # == Examples:
858
+ # $sc.parallelize(0..5).pipe('cat').collect
859
+ # # => ["0", "1", "2", "3", "4", "5"]
860
+ #
861
+ # rdd = $sc.parallelize(0..5)
862
+ # rdd = rdd.pipe('cat', "awk '{print $1*10}'")
863
+ # rdd = rdd.map(lambda{|x| x.to_i + 1})
864
+ # rdd.collect
865
+ # # => [1, 11, 21, 31, 41, 51]
866
+ #
867
+ def pipe(*cmds)
868
+ new_rdd_from_command(Spark::Command::Pipe, cmds)
869
+ end
870
+
871
+
872
+ # =============================================================================
873
+ # Pair functions
874
+
875
+ # Merge the values for each key using an associative reduce function. This will also perform
876
+ # the merging locally on each mapper before sending results to a reducer, similarly to a
877
+ # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
878
+ # parallelism level.
879
+ #
880
+ # == Example:
881
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
882
+ # rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
883
+ # # => {"a"=>3, "b"=>2, "c"=>3}
884
+ #
885
+ def reduce_by_key(f, num_partitions=nil)
886
+ combine_by_key('lambda {|x| x}', f, f, num_partitions)
887
+ end
888
+
889
+ # Generic function to combine the elements for each key using a custom set of aggregation
890
+ # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
891
+ # "combined type" C. Note that V and C can be different -- for example, one might group an
892
+ # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
893
+ # functions:
894
+ #
895
+ # == Parameters:
896
+ # create_combiner:: which turns a V into a C (e.g., creates a one-element list)
897
+ # merge_value:: to merge a V into a C (e.g., adds it to the end of a list)
898
+ # merge_combiners:: to combine two C's into a single one.
899
+ #
900
+ # == Example:
901
+ # def combiner(x)
902
+ # x
903
+ # end
904
+ #
905
+ # def merge(x,y)
906
+ # x+y
907
+ # end
908
+ #
909
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2, batch_size: 1).map(lambda{|x| [x, 1]})
910
+ # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
911
+ # # => {"a"=>3, "b"=>2, "c"=>3}
912
+ #
913
+ def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
914
+ _combine_by_key(
915
+ [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
916
+ [Spark::Command::CombineByKey::Merge, merge_combiners],
917
+ num_partitions
918
+ )
919
+ end
920
+
921
+ # Return an RDD of grouped items.
922
+ #
923
+ # == Example:
924
+ # rdd = $sc.parallelize(0..5)
925
+ # rdd.group_by(lambda{|x| x%2}).collect
926
+ # # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
927
+ #
928
+ def group_by(f, num_partitions=nil)
929
+ self.key_by(f).group_by_key(num_partitions)
930
+ end
931
+
932
+ # Group the values for each key in the RDD into a single sequence. Allows controlling the
933
+ # partitioning of the resulting key-value pair RDD by passing a Partitioner.
934
+ #
935
+ # Note: If you are grouping in order to perform an aggregation (such as a sum or average)
936
+ # over each key, using reduce_by_key or combine_by_key will provide much better performance.
937
+ #
938
+ # == Example:
939
+ # rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
940
+ # rdd.group_by_key.collect
941
+ # # => [["a", [1, 2]], ["b", [3]]]
942
+ #
943
+ def group_by_key(num_partitions=nil)
944
+ create_combiner = 'lambda{|item| [item]}'
945
+ merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
946
+ merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
947
+
948
+ combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
949
+ end
950
+
951
+ # Merge the values for each key using an associative function f
952
+ # and a neutral `zero_value` which may be added to the result an
953
+ # arbitrary number of times, and must not change the result
954
+ # (e.g., 0 for addition, or 1 for multiplication).
955
+ #
956
+ # == Example:
957
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
958
+ # rdd.fold_by_key(1, lambda{|x,y| x+y})
959
+ # # => [["a", 9], ["c", 6], ["b", 3]]
960
+ #
961
+ def fold_by_key(zero_value, f, num_partitions=nil)
962
+ self.aggregate_by_key(zero_value, f, f, num_partitions)
963
+ end
964
+
965
+ # Aggregate the values of each key, using given combine functions and a neutral zero value.
966
+ #
967
+ # == Example:
968
+ # def combine(x,y)
969
+ # x+y
970
+ # end
971
+ #
972
+ # def merge(x,y)
973
+ # x*y
974
+ # end
975
+ #
976
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2, batch_size: 1)
977
+ # rdd.aggregate_by_key(1, method(:combine), method(:merge))
978
+ # # => [["b", 3], ["a", 16], ["c", 6]]
979
+ #
980
+ def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
981
+ _combine_by_key(
982
+ [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
983
+ [Spark::Command::CombineByKey::Merge, comb_func],
984
+ num_partitions
985
+ )
986
+ end
987
+
988
+ # The same functionality as cogroup, but this can group only 2 RDDs and you
989
+ # can change num_partitions.
990
+ #
991
+ # == Example:
992
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
993
+ # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
994
+ # rdd1.group_with(rdd2).collect
995
+ # # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
996
+ #
997
+ def group_with(other, num_partitions=nil)
998
+ self.union(other).group_by_key(num_partitions)
999
+ end
1000
+
1001
+ # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
1002
+ # list of values for that key in `this` as well as `other`.
1003
+ #
1004
+ # == Example:
1005
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
1006
+ # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
1007
+ # rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
1008
+ # rdd1.cogroup(rdd2, rdd3).collect
1009
+ # # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
1010
+ #
1011
+ def cogroup(*others)
1012
+ unioned = self
1013
+ others.each do |other|
1014
+ unioned = unioned.union(other)
1015
+ end
1016
+
1017
+ unioned.group_by_key
1018
+ end
1019
+
1020
+ # Return each (key, value) pair in self RDD that has no pair with matching
1021
+ # key in other RDD.
1022
+ #
1023
+ # == Example:
1024
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
1025
+ # rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
1026
+ # rdd1.subtract_by_key(rdd2).collect
1027
+ # # => [["a", 1], ["a", 2]]
1028
+ #
1029
+ def subtract_by_key(other, num_partitions=nil)
1030
+ create_combiner = 'lambda{|item| [[item]]}'
1031
+ merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
1032
+ merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
1033
+
1034
+ self.union(other)
1035
+ .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
1036
+ .filter('lambda{|(key,values)| values.size == 1}')
1037
+ .flat_map_values('lambda{|item| item.first}')
1038
+ end
1039
+
1040
+ # Return an RDD with the elements from self that are not in other.
1041
+ #
1042
+ # == Example:
1043
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
1044
+ # rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
1045
+ # rdd1.subtract(rdd2).collect
1046
+ # # => [["a", 1], ["b", 3], ["c", 4]]
1047
+ #
1048
+ def subtract(other, num_partitions=nil)
1049
+ mapping_function = 'lambda{|x| [x,nil]}'
1050
+
1051
+ self.map(mapping_function)
1052
+ .subtract_by_key(other.map(mapping_function), num_partitions)
1053
+ .keys
1054
+ end
1055
+
1056
+ # Sort the RDD by key
1057
+ #
1058
+ # == Example:
1059
+ # rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
1060
+ # rdd.sort_by_key.collect
1061
+ # # => [["a", 3], ["b", 2], ["c", 1]]
1062
+ #
1063
+ def sort_by_key(ascending=true, num_partitions=nil)
1064
+ self.sort_by('lambda{|(key, _)| key}')
1065
+ end
1066
+
1067
+ # Sorts this RDD by the given key_function
1068
+ #
1069
+ # This is a different implementation than Spark's. sort_by doesn't use
1070
+ # the key_by method first. It can be slower but takes less memory, and
1071
+ # you can always use map.sort_by_key
1072
+ #
1073
+ # == Example:
1074
+ # rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
1075
+ #
1076
+ # rdd.sort_by.collect
1077
+ # # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
1078
+ #
1079
+ # rdd.sort_by(lambda{|x| x.size}).collect
1080
+ # # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
1081
+ #
1082
+ def sort_by(key_function=nil, ascending=true, num_partitions=nil)
1083
+ key_function ||= 'lambda{|x| x}'
1084
+ num_partitions ||= default_reduce_partitions
1085
+
1086
+ command_klass = Spark::Command::SortByKey
1087
+
1088
+ # Allow spill data to disk due to memory limit
1089
+ # spilling = config['spark.shuffle.spill'] || false
1090
+ spilling = false
1091
+ memory = ''
1092
+
1093
+ # Set spilling to false if worker has unlimited memory
1094
+ if memory.empty?
1095
+ spilling = false
1096
+ memory = nil
1097
+ else
1098
+ memory = to_memory_size(memory)
1099
+ end
1100
+
1101
+ # Sorting should be done by one worker
1102
+ if num_partitions == 1
1103
+ rdd = self
1104
+ rdd = rdd.coalesce(1) if partitions_size > 1
1105
+ return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
1106
+ end
1107
+
1108
+ # Compute boundary of collection
1109
+ # Collection should be evenly distributed
1110
+ # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
1111
+ count = self.count
1112
+ sample_size = num_partitions * 20.0
1113
+ fraction = [sample_size / [count, 1].max, 1.0].min
1114
+ samples = self.sample(false, fraction, 1).map(key_function).collect
1115
+ samples.sort!
1116
+ # Reverse is much faster than reverse sort_by
1117
+ samples.reverse! if !ascending
1118
+
1119
+ # Determine part bounds
1120
+ bounds = determine_bounds(samples, num_partitions)
1121
+
1122
+ shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
1123
+ shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
1124
+ end
1125
+
1126
+ # Creates tuples of the elements in this RDD by applying function f.
1127
+ #
1128
+ # == Example:
1129
+ # rdd = $sc.parallelize(0..5)
1130
+ # rdd.key_by(lambda{|x| x%2}).collect
1131
+ # # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
1132
+ #
1133
+ def key_by(f)
1134
+ new_rdd_from_command(Spark::Command::KeyBy, f)
1135
+ end
1136
+
1137
+ # Pass each value in the key-value pair RDD through a map function without changing
1138
+ # the keys. This also retains the original RDD's partitioning.
1139
+ #
1140
+ # == Example:
1141
+ # rdd = $sc.parallelize(["ruby", "scala", "java"])
1142
+ # rdd = rdd.map(lambda{|x| [x, x]})
1143
+ # rdd = rdd.map_values(lambda{|x| x.upcase})
1144
+ # rdd.collect
1145
+ # # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
1146
+ #
1147
+ def map_values(f)
1148
+ new_rdd_from_command(Spark::Command::MapValues, f)
1149
+ end
1150
+
1151
+ # Pass each value in the key-value pair RDD through a flat_map function
1152
+ # without changing the keys; this also retains the original RDD's
1153
+ # partitioning.
1154
+ #
1155
+ # == Example:
1156
+ # rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
1157
+ # rdd = rdd.flat_map_values(lambda{|x| x*2})
1158
+ # rdd.collect
1159
+ # # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
1160
+ #
1161
+ def flat_map_values(f)
1162
+ new_rdd_from_command(Spark::Command::FlatMapValues, f)
1163
+ end
1164
+
1165
+ # Return an RDD with the first element of PairRDD
1166
+ #
1167
+ # == Example:
1168
+ # rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
1169
+ # rdd.keys.collect
1170
+ # # => [1, 3, 5]
1171
+ #
1172
+ def keys
1173
+ self.map('lambda{|(key, _)| key}')
1174
+ end
1175
+
1176
+ # Return an RDD with the second element of PairRDD
1177
+ #
1178
+ # == Example:
1179
+ # rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
1180
+ # rdd.values.collect
1181
+ # # => [2, 4, 6]
1182
+ #
1183
+ def values
1184
+ self.map('lambda{|(_, value)| value}')
1185
+ end
1186
+
1187
+
1188
+ # Aliases
1189
+ alias_method :partitionsSize, :partitions_size
1190
+ alias_method :defaultReducePartitions, :default_reduce_partitions
1191
+ alias_method :setName, :set_name
1192
+ alias_method :addLibrary, :add_library
1193
+
1194
+ alias_method :flatMap, :flat_map
1195
+ alias_method :mapPartitions, :map_partitions
1196
+ alias_method :mapPartitionsWithIndex, :map_partitions_with_index
1197
+ alias_method :reduceByKey, :reduce_by_key
1198
+ alias_method :combineByKey, :combine_by_key
1199
+ alias_method :groupByKey, :group_by_key
1200
+ alias_method :groupWith, :group_with
1201
+ alias_method :partitionBy, :partition_by
1202
+ alias_method :defaultReducePartitions, :default_reduce_partitions
1203
+ alias_method :foreachPartition, :foreach_partition
1204
+ alias_method :mapValues, :map_values
1205
+ alias_method :takeSample, :take_sample
1206
+ alias_method :sortBy, :sort_by
1207
+ alias_method :sortByKey, :sort_by_key
1208
+ alias_method :keyBy, :key_by
1209
+ alias_method :groupBy, :group_by
1210
+ alias_method :foldByKey, :fold_by_key
1211
+ alias_method :aggregateByKey, :aggregate_by_key
1212
+ alias_method :subtractByKey, :subtract_by_key
1213
+ alias_method :sampleStdev, :sample_stdev
1214
+ alias_method :sampleVariance, :sample_variance
1215
+
1216
+ private
1217
+
1218
+ # This is the base method for reduce operations. It is used by reduce, fold and aggregate.
1219
+ # The only difference is that fold has a zero value.
1220
+ #
1221
+ def _reduce(klass, seq_op, comb_op, zero_value=nil)
1222
+ if seq_op.nil?
1223
+ # Partitions are already reduced
1224
+ rdd = self
1225
+ else
1226
+ rdd = new_rdd_from_command(klass, seq_op, zero_value)
1227
+ end
1228
+
1229
+ # Send all results to one worker and combine results
1230
+ rdd = rdd.coalesce(1).compact
1231
+
1232
+ # Add the same function to new RDD
1233
+ comm = rdd.add_command(klass, comb_op, zero_value)
1234
+ comm.deserializer = @command.serializer
1235
+
1236
+ # Value is returned in array
1237
+ PipelinedRDD.new(rdd, comm).collect[0]
1238
+ end
1239
+
1240
+ def _partition_by(num_partitions, klass, *args)
1241
+ # RDD is transformed from [key, value] to [hash, [key, value]]
1242
+ keyed = new_rdd_from_command(klass, *args)
1243
+ keyed.serializer.unbatch!
1244
+
1245
+ # PairwiseRDD and PythonPartitioner are borrowed from Python
1246
+ # but work great on Ruby too
1247
+ pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
1248
+ partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
1249
+ new_jrdd = pairwise_rdd.partitionBy(partitioner).values
1250
+
1251
+ # Reset deserializer
1252
+ RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
1253
+ end
1254
+
1255
+ # For using a different combine_by_key
1256
+ #
1257
+ # == Used for:
1258
+ # * combine_by_key
1259
+ # * fold_by_key (with zero value)
1260
+ #
1261
+ def _combine_by_key(combine, merge, num_partitions)
1262
+ num_partitions ||= default_reduce_partitions
1263
+
1264
+ # Combine key
1265
+ combined = new_rdd_from_command(combine.shift, *combine)
1266
+
1267
+ # Merge items
1268
+ shuffled = combined.partition_by(num_partitions)
1269
+ merge_comm = shuffled.add_command(merge.shift, *merge)
1270
+
1271
+ PipelinedRDD.new(shuffled, merge_comm)
1272
+ end
1273
+
1274
+ end
1275
+
1276
+ # Pipelined Resilient Distributed Dataset; operations are pipelined and sent to the worker
1277
+ #
1278
+ # RDD
1279
+ # `-- map
1280
+ # `-- map
1281
+ # `-- map
1282
+ #
1283
+ # Code is executed from top to bottom
1284
+ #
1285
+ class PipelinedRDD < RDD
1286
+
1287
+ attr_reader :prev_jrdd, :command
1288
+
1289
+ def initialize(prev, command)
1290
+
1291
+ if prev.is_a?(PipelinedRDD) && prev.pipelinable?
1292
+ # Second, ... stages
1293
+ @prev_jrdd = prev.prev_jrdd
1294
+ else
1295
+ # First stage
1296
+ @prev_jrdd = prev.jrdd
1297
+ end
1298
+
1299
+ @cached = false
1300
+ @checkpointed = false
1301
+
1302
+ @context = prev.context
1303
+ @command = command
1304
+ end
1305
+
1306
+ def pipelinable?
1307
+ !(cached? || checkpointed?)
1308
+ end
1309
+
1310
+ # Serializes necessary things and sends them to RubyRDD (Scala extension)
1311
+ def jrdd
1312
+ @jrdd ||= _jrdd
1313
+ end
1314
+
1315
+ private
1316
+
1317
+ def _jrdd
1318
+ command = @command.build
1319
+
1320
+ broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
1321
+ broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))
1322
+
1323
+ ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
1324
+ ruby_rdd.asJavaRDD
1325
+ end
1326
+
1327
+ end
1328
+ end
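To tie the RDD API together, a brief end-to-end sketch in the style of the inline examples, assuming an initialized Spark::Context is available as $sc (the convention used throughout the doc comments):

    rdd = $sc.parallelize(0..10, 2)

    doubled = rdd.map(lambda{|x| x * 2})           # pipelined, no job runs yet
    even    = doubled.filter(lambda{|x| x.even?})  # map and filter collapse into one PipelinedRDD

    even.collect                          # => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20] (triggers the job)
    even.count                            # => 11
    rdd.reduce(lambda{|sum, x| sum + x})  # => 55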