ruby-spark 1.1.0.1-java

Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
data/lib/spark/mllib/ruby_matrix/vector_adapter.rb
@@ -0,0 +1,57 @@
+ require 'matrix'
+
+ # Based on Ruby 2.1
+
+ class Vector
+   def self.elements(array, copy=true)
+     DenseVector.new(convert_to_array(array, copy))
+   end
+ end
+
+ module Spark
+   module Mllib
+     class VectorAdapter < ::Vector
+
+       def self.new(*args)
+         object = self.allocate
+         object.__send__(:initialize, *args)
+         object
+       end
+
+       def initialize(*args)
+         case args.shift
+         when :dense
+           values = args.shift.dup
+         when :sparse
+           values = [0.0] * args.shift.to_i
+         else
+           raise Spark::MllibError, 'Unknown vector type.'
+         end
+
+         super(values)
+       end
+
+       def []=(index, value)
+         @elements[index] = value
+       end
+
+       def dot(other)
+         if other.is_a?(Spark::Mllib::MatrixBase)
+           other * self
+         else
+           inner_product(other)
+         end
+       end
+
+       def squared_distance(other)
+         diff = self - other
+         diff.dot(diff)
+       end
+
+       def values
+         @values || to_a
+       end
+
+     end
+   end
+ end
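
The adapter above is what lets the MLlib vector classes reuse Ruby's stdlib Vector arithmetic (inner_product, subtraction). A minimal usage sketch, assuming only the methods defined in this file plus the DenseVector subclass from vector.rb below; values are illustrative:

    a = Spark::Mllib::DenseVector.new([1.0, 2.0, 3.0])
    b = Spark::Mllib::DenseVector.new([4.0, 5.0, 6.0])

    a.dot(b)              # => 32.0, falls back to ::Vector#inner_product
    a.squared_distance(b) # => 27.0, i.e. (1-4)**2 + (2-5)**2 + (3-6)**2
    a.values              # => [1.0, 2.0, 3.0]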
data/lib/spark/mllib/stat/distribution.rb
@@ -0,0 +1,12 @@
+ ##
+ # MultivariateGaussian
+ #
+ # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
+ # the event that the covariance matrix is singular, the density will be computed in a
+ # reduced dimensional subspace under which the distribution is supported.
+ #
+ # == Arguments:
+ # mu::    The mean vector of the distribution
+ # sigma:: The covariance matrix of the distribution
+ #
+ Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
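
MultivariateGaussian is deliberately just a parameter container (a Struct); the density computation happens on the Scala/MLlib side. A hedged sketch of constructing one; DenseMatrix is assumed to be the matrix class defined in matrix.rb elsewhere in this diff:

    mu    = Spark::Mllib::DenseVector.new([0.0, 0.0])
    sigma = Spark::Mllib::DenseMatrix.new([[1.0, 0.0], [0.0, 1.0]]) # assumed constructor

    gauss = Spark::Mllib::MultivariateGaussian.new(mu, sigma)
    gauss.mu    # => the mean vector
    gauss.sigma # => the covariance matrix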
data/lib/spark/mllib/vector.rb
@@ -0,0 +1,185 @@
+ module Spark
+   module Mllib
+     module Vectors
+
+       def self.dense(*args)
+         DenseVector.new(*args)
+       end
+
+       def self.sparse(*args)
+         SparseVector.new(*args)
+       end
+
+       def self.parse(data)
+         if data.start_with?('[') && data.end_with?(']')
+           DenseVector.parse(data)
+         elsif data.start_with?('(') && data.end_with?(')')
+           SparseVector.parse(data)
+         else
+           raise ArgumentError, 'Unknown vector.'
+         end
+       end
+
+       def self.to_vector(data)
+         if data.is_a?(SparseVector) || data.is_a?(DenseVector)
+           data
+         elsif data.is_a?(Array)
+           DenseVector.new(data)
+         end
+       end
+
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     # @abstract Parent for all types of vectors
+     class VectorBase < VectorAdapter
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     ##
+     # A dense vector represented by a value array.
+     #
+     # A dense vector is a vector in which most of the elements are non-zero.
+     #
+     # == Example:
+     #   DenseVector.new([1,2,3,4,5]).values
+     #   # => [1, 2, 3, 4, 5]
+     #
+     #   DenseVector.new(1..5).values
+     #   # => [1, 2, 3, 4, 5]
+     #
+     class DenseVector < VectorBase
+
+       def initialize(values)
+         super(:dense, values.to_a)
+       end
+
+       # Convert string to vector
+       #
+       #   DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
+       #
+       def self.parse(data)
+         unless data =~ /\[[0-9., ]+\]/
+           raise ArgumentError, 'Unknown format for DenseVector.'
+         end
+
+         data.sub!('[', '')
+         data.sub!(']', '')
+
+         data = data.split(',')
+         data.map!(&:to_f)
+
+         DenseVector.new(data)
+       end
+
+       # Convert vector to string
+       #
+       #   DenseVector.new([1,2,3,4,5]).to_s
+       #   # => "[1.0,2.0,3.0,4.0,5.0]"
+       #
+       def to_s
+         "[#{values.join(',')}]"
+       end
+
+       def to_java
+         JDenseVector.new(values)
+       end
+
+       def self.from_java(object)
+         DenseVector.new(object.values)
+       end
+
+       def marshal_dump
+         values
+       end
+
+       def marshal_load(array)
+         initialize(array)
+       end
+
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     ##
+     # A sparse vector represented by an index array and a value array.
+     #
+     # A sparse vector is a vector in which most of the elements are zero.
+     #
+     # == Example:
+     #   SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     #   SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     #   SparseVector.new(4, [1, 3], [1.0, 5.5]).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     class SparseVector < VectorBase
+
+       attr_reader :indices
+
+       def initialize(arg1, arg2=nil, arg3=nil)
+         super(:sparse, arg1)
+
+         if arg2.is_a?(Hash)
+           @indices = arg2.keys
+           @values = arg2.values
+         else
+           @indices = arg2
+           @values = arg3
+         end
+
+         @indices.zip(@values).each do |(index, value)|
+           self[index] = value
+         end
+       end
+
+       # Convert string to vector
+       #
+       #   SparseVector.parse("(5,[1,4],[3.0,5.0])")
+       #
+       def self.parse(data)
+         data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
+         if data
+           size = data[1].to_i
+           indices = data[2].split(',')
+           indices.map!(&:to_i)
+           values = data[3].split(',')
+           values.map!(&:to_f)
+
+           SparseVector.new(size, indices, values)
+         else
+           raise ArgumentError, 'Unknown format for SparseVector.'
+         end
+       end
+
+       # Convert vector to string
+       #
+       #   SparseVector.new(5, {1 => 3, 4 => 5}).to_s
+       #   # => "(5,[1,4],[3.0,5.0])"
+       #
+       def to_s
+         "(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
+       end
+
+       def marshal_dump
+         [size, indices, values]
+       end
+
+       def marshal_load(array)
+         initialize(array[0], array[1], array[2])
+       end
+
+     end
+   end
+ end
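
A short sketch of the string round-trip and factory helpers defined above; the outputs follow directly from the methods shown (only to_s, indices, and dot are exercised, to avoid assuming behaviour not visible in this file):

    dense  = Spark::Mllib::Vectors.parse('[1.0,2.0,3.0]')       # DenseVector
    sparse = Spark::Mllib::Vectors.parse('(5,[1,4],[3.0,5.0])')  # SparseVector

    dense.to_s     # => "[1.0,2.0,3.0]"
    sparse.indices # => [1, 4]
    sparse.to_s    # => "(5,[1,4],[3.0,5.0])"

    dense.dot(Spark::Mllib::Vectors.dense([4.0, 5.0, 6.0]))  # => 32.0
    Spark::Mllib::Vectors.to_vector([1, 2, 3])                # => DenseVector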
data/lib/spark/rdd.rb
@@ -0,0 +1,1377 @@
+ module Spark
+   ##
+   # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
+   # partitioned collection of elements that can be operated on in parallel. This class contains the
+   # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
+   #
+   class RDD
+
+     extend Forwardable
+
+     attr_reader :jrdd, :context, :command
+
+     include Spark::Helper::Logger
+     include Spark::Helper::Parser
+     include Spark::Helper::Statistic
+
+     def_delegators :@command, :serializer, :deserializer, :libraries, :files
+
+     # Initializes the RDD. This method is the root of every PipelinedRDD - it is unique.
+     # If you call some operations on this class, they are computed in Java.
+     #
+     # == Parameters:
+     # jrdd::       org.apache.spark.api.java.JavaRDD
+     # context::    {Spark::Context}
+     # serializer:: {Spark::Serializer}
+     #
+     def initialize(jrdd, context, serializer, deserializer=nil)
+       @jrdd = jrdd
+       @context = context
+
+       @cached = false
+       @checkpointed = false
+
+       @command = Spark::CommandBuilder.new(serializer, deserializer)
+     end
+
+     def inspect
+       comms = @command.commands.join(' -> ')
+
+       result = %{#<#{self.class.name}:0x#{object_id}}
+       result << %{ (#{comms})} unless comms.empty?
+       result << %{\n}
+       result << %{ Serializer: "#{serializer}"\n}
+       result << %{Deserializer: "#{deserializer}"}
+       result << %{>}
+       result
+     end
+
+
+     # =============================================================================
+     # Operators
+
+     def +(other)
+       self.union(other)
+     end
+
+
+     # =============================================================================
+     # Command and serializer
+
+     def add_command(klass, *args)
+       @command.deep_copy.add_command(klass, *args)
+     end
+
+     # Add a Ruby library
+     # Libraries will be included before computing
+     #
+     # == Example:
+     #   rdd.add_library('pry').add_library('nio4r', 'distribution')
+     #
+     def add_library(*libraries)
+       @command.add_library(*libraries)
+       self
+     end
+
+     # Bind object to RDD
+     #
+     # == Example:
+     #   text = "test"
+     #
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd = rdd.map(lambda{|x| x.to_s + " " + text})
+     #   rdd = rdd.bind(text: text)
+     #
+     #   rdd.collect
+     #   # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
+     #
+     def bind(objects)
+       unless objects.is_a?(Hash)
+         raise ArgumentError, 'Argument must be a Hash.'
+       end
+
+       @command.bind(objects)
+       self
+     end
+
+     def new_rdd_from_command(klass, *args)
+       comm = add_command(klass, *args)
+       PipelinedRDD.new(self, comm)
+     end
+
+
+     # =============================================================================
+     # Variables and non-computing functions
+
+     def config
+       @context.config
+     end
+
+     def default_reduce_partitions
+       config['spark.default.parallelism'] || partitions_size
+     end
+
+     # Count of ParallelCollectionPartition
+     def partitions_size
+       jrdd.rdd.partitions.size
+     end
+
+     # A unique ID for this RDD (within its SparkContext).
+     def id
+       jrdd.id
+     end
+
+     # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
+     def cache
+       persist('memory_only_ser')
+     end
+
+     # Set this RDD's storage level to persist its values across operations after the first time
+     # it is computed. This can only be used to assign a new storage level if the RDD does not
+     # have a storage level set yet.
+     #
+     # See StorageLevel for the type of new_level
+     #
+     def persist(new_level)
+       @cached = true
+       jrdd.persist(Spark::StorageLevel.java_get(new_level))
+       self
+     end
+
+     # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
+     #
+     # == Parameters:
+     # blocking:: whether to block until all blocks are deleted.
+     #
+     def unpersist(blocking=true)
+       @cached = false
+       jrdd.unpersist(blocking)
+       self
+     end
+
+     def cached?
+       @cached
+     end
+
+     def checkpointed?
+       @checkpointed
+     end
+
+     # Return the name of this RDD.
+     #
+     def name
+       _name = jrdd.name
+       _name && _name.encode(Encoding::UTF_8)
+     end
+
+     # Assign a name to this RDD.
+     #
+     def set_name(name)
+       jrdd.setName(name)
+     end
+
+     def to_java
+       marshal = Spark::Serializer.marshal
+
+       if deserializer.batched?
+         ser = deserializer.deep_copy
+         ser.serializer = marshal
+       else
+         ser = Spark::Serializer.batched(marshal)
+       end
+
+       rdd = self.reserialize(ser)
+       RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
+     end
+
+
+     # =============================================================================
+     # Actions which return a value
+
+     # Return an array that contains all of the elements in this RDD.
+     # RJB raises an error if the stage is killed.
+     def collect(as_enum=false)
+       file = Tempfile.new('collect', context.temp_dir)
+
+       RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
+
+       collect_from_file(file, as_enum)
+     rescue => e
+       raise Spark::RDDError, e.message
+     end
+
+     def collect_from_file(file, as_enum=false)
+       if self.is_a?(PipelinedRDD)
+         klass = @command.serializer
+       else
+         klass = @command.deserializer
+       end
+
+       if as_enum
+         result = klass.load_from_file(file)
+       else
+         result = klass.load_from_io(file).to_a
+         file.close
+         file.unlink
+       end
+
+       result
+     end
+
+     # Convert an Array to a Hash
+     #
+     def collect_as_hash
+       Hash[collect]
+     end
+
+     # Take the first num elements of the RDD.
+     #
+     # It works by first scanning one partition, and using the results from
+     # that partition to estimate the number of additional partitions needed
+     # to satisfy the limit.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..100, 20)
+     #   rdd.take(5)
+     #   # => [0, 1, 2, 3, 4]
+     #
+     def take(count)
+       buffer = []
+
+       parts_count = self.partitions_size
+       # No parts have been scanned yet
+       last_scanned = -1
+
+       while buffer.empty?
+         last_scanned += 1
+         buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
+       end
+
+       # Assumption. Depends on batch_size and how Spark divided the data.
+       items_per_part = buffer.size
+       left = count - buffer.size
+
+       while left > 0 && last_scanned < parts_count
+         parts_to_take = (left.to_f/items_per_part).ceil
+         parts_for_scanned = Array.new(parts_to_take) do
+           last_scanned += 1
+         end
+
+         # We cannot take an exact number of items because workers are isolated from each other.
+         # => once you take e.g. 50% from the last part and left is still > 0, it is very
+         #    difficult to merge new items
+         items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
+         buffer += items
+
+         left = count - buffer.size
+         # Average size of all parts
+         items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
+       end
+
+       buffer.slice!(0, count)
+     end
+
+     # Return the first element in this RDD.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..100)
+     #   rdd.first
+     #   # => 0
+     #
+     def first
+       self.take(1)[0]
+     end
+
+     # Reduces the elements of this RDD using the specified lambda or method.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.reduce(lambda{|sum, x| sum+x})
+     #   # => 55
+     #
+     def reduce(f)
+       _reduce(Spark::Command::Reduce, f, f)
+     end
+
+     # Aggregate the elements of each partition, and then the results for all the partitions, using a
+     # given associative function and a neutral "zero value".
+     #
+     # The function f(x, y) is allowed to modify x and return it as its result value to avoid
+     # object allocation; however, it should not modify y.
+     #
+     # Be careful, zero_value is applied to all stages. See example.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 2)
+     #   rdd.fold(1, lambda{|sum, x| sum+x})
+     #   # => 58
+     #
+     def fold(zero_value, f)
+       self.aggregate(zero_value, f, f)
+     end
+
+     # Aggregate the elements of each partition, and then the results for all the partitions, using
+     # given combine functions and a neutral "zero value".
+     #
+     # This function can return a different result type. We need one operation for merging.
+     #
+     # Result must be an Array, otherwise the Serializer Array's zero value will be sent
+     # as multiple values and not just one.
+     #
+     # == Example:
+     #   # 1 2 3 4 5  => 15 + 1 = 16
+     #   # 6 7 8 9 10 => 40 + 1 = 41
+     #   # 16 * 41 = 656
+     #
+     #   seq = lambda{|x,y| x+y}
+     #   com = lambda{|x,y| x*y}
+     #
+     #   rdd = $sc.parallelize(1..10, 2)
+     #   rdd.aggregate(1, seq, com)
+     #   # => 656
+     #
+     def aggregate(zero_value, seq_op, comb_op)
+       _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
+     end
+
+     # Return the max of this RDD
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.max
+     #   # => 10
+     #
+     def max
+       self.reduce('lambda{|memo, item| memo > item ? memo : item }')
+     end
+
+     # Return the min of this RDD
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.min
+     #   # => 0
+     #
+     def min
+       self.reduce('lambda{|memo, item| memo < item ? memo : item }')
+     end
+
+     # Return the sum of this RDD
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.sum
+     #   # => 55
+     #
+     def sum
+       self.reduce('lambda{|sum, item| sum + item}')
+     end
+
+     # Return the number of values in this RDD
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.count
+     #   # => 11
+     #
+     def count
+       # nil for seq_op means all results go directly to one worker for combining
+       @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
+                      .aggregate(0, nil, 'lambda{|sum, item| sum + item }')
+     end
+
+     # Return a {Spark::StatCounter} object that captures the mean, variance
+     # and count of the RDD's elements in one operation.
+     def stats
+       @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
+     end
+
+     # Compute the mean of this RDD's elements.
+     #
+     # == Example:
+     #   $sc.parallelize([1, 2, 3]).mean
+     #   # => 2.0
+     #
+     def mean
+       stats.mean
+     end
+
+     # Compute the variance of this RDD's elements.
+     #
+     # == Example:
+     #   $sc.parallelize([1, 2, 3]).variance
+     #   # => 0.666...
+     #
+     def variance
+       stats.variance
+     end
+
+     # Compute the standard deviation of this RDD's elements.
+     #
+     # == Example:
+     #   $sc.parallelize([1, 2, 3]).stdev
+     #   # => 0.816...
+     #
+     def stdev
+       stats.stdev
+     end
+
+     # Compute the sample standard deviation of this RDD's elements (which
+     # corrects for bias in estimating the standard deviation by dividing by
+     # N-1 instead of N).
+     #
+     # == Example:
+     #   $sc.parallelize([1, 2, 3]).sample_stdev
+     #   # => 1.0
+     #
+     def sample_stdev
+       stats.sample_stdev
+     end
+
+     # Compute the sample variance of this RDD's elements (which corrects
+     # for bias in estimating the variance by dividing by N-1 instead of N).
+     #
+     # == Example:
+     #   $sc.parallelize([1, 2, 3]).sample_variance
+     #   # => 1.0
+     #
+     def sample_variance
+       stats.sample_variance
+     end
+
+     # Compute a histogram using the provided buckets. The buckets
+     # are all open to the right except for the last which is closed.
+     # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
+     # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
+     # and 50 we would have a histogram of 1,0,1.
+     #
+     # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
+     # this can be switched from an O(log n) insertion to O(1) per
+     # element (where n = number of buckets).
+     #
+     # Buckets must be sorted, must not contain any duplicates, and must
+     # have at least two elements.
+     #
+     # == Examples:
+     #   rdd = $sc.parallelize(0..50)
+     #
+     #   rdd.histogram(2)
+     #   # => [[0.0, 25.0, 50], [25, 26]]
+     #
+     #   rdd.histogram([0, 5, 25, 50])
+     #   # => [[0, 5, 25, 50], [5, 20, 26]]
+     #
+     #   rdd.histogram([0, 15, 30, 45, 60])
+     #   # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
+     #
+     def histogram(buckets)
+
+       # -----------------------------------------------------------------------
+       # Integer
+       #
+       if buckets.is_a?(Integer)
+
+         # Validation
+         if buckets < 1
+           raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted."
+         end
+
+         # Filter invalid values
+         # Nil and NaN
+         func = 'lambda{|x|
+           if x.nil? || (x.is_a?(Float) && x.nan?)
+             false
+           else
+             true
+           end
+         }'
+         filtered = self.filter(func)
+
+         # Compute the minimum and the maximum
+         func = 'lambda{|memo, item|
+           [memo[0] < item[0] ? memo[0] : item[0],
+            memo[1] > item[1] ? memo[1] : item[1]]
+         }'
+         min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)
+
+         # Min, max must be valid numbers
+         if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
+           raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
+         end
+
+         # Already finished
+         if min == max || buckets == 1
+           return [min, max], [filtered.count]
+         end
+
+         # Custom range
+         begin
+           span = max - min # increment
+           buckets = (0...buckets).map do |x|
+             min + (x * span) / buckets.to_f
+           end
+           buckets << max
+         rescue NoMethodError
+           raise Spark::RDDError, 'Can not generate buckets with non-number in RDD'
+         end
+
+         even = true
+
+       # -----------------------------------------------------------------------
+       # Array
+       #
+       elsif buckets.is_a?(Array)
+
+         if buckets.size < 2
+           raise ArgumentError, 'Buckets should have more than one value.'
+         end
+
+         if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
+           raise ArgumentError, 'Can not have nil or nan numbers in buckets.'
+         end
+
+         if buckets.detect{|x| buckets.count(x) > 1}
+           raise ArgumentError, 'Buckets should not contain duplicated values.'
+         end
+
+         if buckets.sort != buckets
+           raise ArgumentError, 'Buckets must be sorted.'
+         end
+
+         even = false
+
+       # -----------------------------------------------------------------------
+       # Other
+       #
+       else
+         raise Spark::RDDError, 'Buckets should be number or array.'
+       end
+
+       reduce_func = 'lambda{|memo, item|
+         memo.size.times do |i|
+           memo[i] += item[i]
+         end
+         memo
+       }'
+
+       return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
+     end
+
+     # Applies a function f to all elements of this RDD.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd.foreach(lambda{|x| puts x})
+     #   # => nil
+     #
+     def foreach(f, options={})
+       new_rdd_from_command(Spark::Command::Foreach, f).collect
+       nil
+     end
+
+     # Applies a function f to each partition of this RDD.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd.foreachPartition(lambda{|x| puts x.to_s})
+     #   # => nil
+     #
+     def foreach_partition(f, options={})
+       new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
+       nil
+     end
+
+
+     # =============================================================================
+     # Transformations of RDD
+
+     # Return a new RDD by applying a function to all elements of this RDD.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd.map(lambda {|x| x*2}).collect
+     #   # => [0, 2, 4, 6, 8, 10]
+     #
+     def map(f)
+       new_rdd_from_command(Spark::Command::Map, f)
+     end
+
+     # Return a new RDD by first applying a function to all elements of this
+     # RDD, and then flattening the results.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd.flat_map(lambda {|x| [x, 1]}).collect
+     #   # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
+     #
+     def flat_map(f)
+       new_rdd_from_command(Spark::Command::FlatMap, f)
+     end
+
+     # Return a new RDD by applying a function to each partition of this RDD.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 2)
+     #   rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
+     #   # => [15, 40]
+     #
+     def map_partitions(f)
+       new_rdd_from_command(Spark::Command::MapPartitions, f)
+     end
+
+     # Return a new RDD by applying a function to each partition of this RDD, while tracking the index
+     # of the original partition.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0...4, 4)
+     #   rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
+     #   # => [0, 1, 4, 9]
+     #
+     def map_partitions_with_index(f, options={})
+       new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
+     end
+
+     # Return a new RDD containing only the elements that satisfy a predicate.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.filter(lambda{|x| x.even?}).collect
+     #   # => [0, 2, 4, 6, 8, 10]
+     #
+     def filter(f)
+       new_rdd_from_command(Spark::Command::Filter, f)
+     end
+
+     # Return a new RDD containing non-nil elements.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([1, nil, 2, nil, 3])
+     #   rdd.compact.collect
+     #   # => [1, 2, 3]
+     #
+     def compact
+       new_rdd_from_command(Spark::Command::Compact)
+     end
+
+     # Return an RDD created by coalescing all elements within each partition into an array.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 3)
+     #   rdd.glom.collect
+     #   # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
+     #
+     def glom
+       new_rdd_from_command(Spark::Command::Glom)
+     end
+
+     # Return a new RDD that is reduced into num_partitions partitions.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 3)
+     #   rdd.coalesce(2).glom.collect
+     #   # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
+     #
+     def coalesce(num_partitions)
+       if self.is_a?(PipelinedRDD)
+         deser = @command.serializer
+       else
+         deser = @command.deserializer
+       end
+
+       new_jrdd = jrdd.coalesce(num_partitions)
+       RDD.new(new_jrdd, context, @command.serializer, deser)
+     end
+
+     # Return the Cartesian product of this RDD and another one, that is, the
+     # RDD of all pairs of elements `(a, b)` where `a` is in `self` and
+     # `b` is in `other`.
+     #
+     # == Example:
+     #   rdd1 = $sc.parallelize([1,2,3])
+     #   rdd2 = $sc.parallelize([4,5,6])
+     #
+     #   rdd1.cartesian(rdd2).collect
+     #   # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
+     #
+     def cartesian(other)
+       _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
+
+       new_jrdd = jrdd.cartesian(other.jrdd)
+       RDD.new(new_jrdd, context, serializer, _deserializer)
+     end
+
+     # Return a new RDD containing the distinct elements in this RDD.
+     # Ordering is not preserved because of the reduce step.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([1,1,1,2,3])
+     #   rdd.distinct.collect
+     #   # => [1, 2, 3]
+     #
+     def distinct
+       self.map('lambda{|x| [x, nil]}')
+           .reduce_by_key('lambda{|x,_| x}')
+           .map('lambda{|x| x[0]}')
+     end
+
+     # Return a shuffled RDD.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10)
+     #   rdd.shuffle.collect
+     #   # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
+     #
+     def shuffle(seed=nil)
+       seed ||= Random.new_seed
+
+       new_rdd_from_command(Spark::Command::Shuffle, seed)
+     end
+
+     # Return the union of this RDD and another one. Any identical elements will appear multiple
+     # times (use .distinct to eliminate them).
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([1, 2, 3])
+     #   rdd.union(rdd).collect
+     #   # => [1, 2, 3, 1, 2, 3]
+     #
+     def union(other)
+       if self.serializer != other.serializer
+         other = other.reserialize(serializer)
+       end
+
+       new_jrdd = jrdd.union(other.jrdd)
+       RDD.new(new_jrdd, context, serializer, deserializer)
+     end
+
+     # Return a new RDD with a different serializer. This method is useful during union
+     # and join operations.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
+     #   rdd = rdd.map(lambda{|x| x.to_s})
+     #   rdd.reserialize("oj").collect
+     #   # => ["1", "2", "3"]
+     #
+     def reserialize(new_serializer)
+       if serializer == new_serializer
+         return self
+       end
+
+       new_command = @command.deep_copy
+       new_command.serializer = new_serializer
+
+       PipelinedRDD.new(self, new_command)
+     end
+
+     # Return the intersection of this RDD and another one. The output will not contain
+     # any duplicate elements, even if the input RDDs did.
+     #
+     # == Example:
+     #   rdd1 = $sc.parallelize([1,2,3,4,5])
+     #   rdd2 = $sc.parallelize([1,4,5,6,7])
+     #   rdd1.intersection(rdd2).collect
+     #   # => [1, 4, 5]
+     #
+     def intersection(other)
+       mapping_function = 'lambda{|item| [item, nil]}'
+       filter_function = 'lambda{|(key, values)| values.size > 1}'
+
+       self.map(mapping_function)
+           .cogroup(other.map(mapping_function))
+           .filter(filter_function)
+           .keys
+     end
+
+     # Return a copy of the RDD partitioned using the specified partitioner.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
+     #   rdd.partitionBy(2).glom.collect
+     #   # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
+     #
+     def partition_by(num_partitions, partition_func=nil)
+       num_partitions ||= default_reduce_partitions
+       partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'
+
+       _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
+     end
+
+     # Return a sampled subset of this RDD. Operations are based on Poisson and Uniform
+     # distributions.
+     # TODO: Replace Uniform with Bernoulli
+     #
+     # == Examples:
+     #   rdd = $sc.parallelize(0..100)
+     #
+     #   rdd.sample(true, 10).collect
+     #   # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
+     #
+     #   rdd.sample(false, 0.1).collect
+     #   # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
+     #
+     def sample(with_replacement, fraction, seed=nil)
+       new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
+     end
+
+     # Return a fixed-size sampled subset of this RDD in an array
+     #
+     # == Examples:
+     #   rdd = $sc.parallelize(0..100)
+     #
+     #   rdd.take_sample(true, 10)
+     #   # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
+     #
+     #   rdd.take_sample(false, 10)
+     #   # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
+     #
+     def take_sample(with_replacement, num, seed=nil)
+
+       if num < 0
+         raise Spark::RDDError, 'Size has to be greater than 0'
+       elsif num == 0
+         return []
+       end
+
+       # Taken from Scala
+       num_st_dev = 10.0
+
+       # Number of items
+       initial_count = self.count
+       return [] if initial_count == 0
+
+       # Create new generator
+       seed ||= Random.new_seed
+       rng = Random.new(seed)
+
+       # Shuffle elements if the requested num is greater than the array size
+       if !with_replacement && num >= initial_count
+         return self.shuffle(seed).collect
+       end
+
+       # Max num
+       max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
+       if num > max_sample_size
+         raise Spark::RDDError, "Size can not be greater than #{max_sample_size}"
+       end
+
+       # Approximate fraction with tolerance
+       fraction = compute_fraction(num, initial_count, with_replacement)
+
+       # Compute the first sampled subset
+       samples = self.sample(with_replacement, fraction, seed).collect
+
+       # If the first sample didn't turn out large enough, keep trying to take samples;
+       # this shouldn't happen often because we use a big multiplier for their initial size.
+       index = 0
+       while samples.size < num
+         log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
+         samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
+         index += 1
+       end
+
+       samples.shuffle!(random: rng)
+       samples[0, num]
+     end
+
+     # Return an RDD created by piping elements to a forked external process.
+     #
+     # == Cmds:
+     #   cmd = [env,] command... [,options]
+     #
+     #   env: hash
+     #     name => val : set the environment variable
+     #     name => nil : unset the environment variable
+     #   command...:
+     #     commandline                 : command line string which is passed to the standard shell
+     #     cmdname, arg1, ...          : command name and one or more arguments (This form does
+     #                                   not use the shell. See below for caveats.)
+     #     [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
+     #   options: hash
+     #
+     #   See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
+     #
+     # == Examples:
+     #   $sc.parallelize(0..5).pipe('cat').collect
+     #   # => ["0", "1", "2", "3", "4", "5"]
+     #
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd = rdd.pipe('cat', "awk '{print $1*10}'")
+     #   rdd = rdd.map(lambda{|x| x.to_i + 1})
+     #   rdd.collect
+     #   # => [1, 11, 21, 31, 41, 51]
+     #
+     def pipe(*cmds)
+       new_rdd_from_command(Spark::Command::Pipe, cmds)
+     end
+
+
+     # =============================================================================
+     # Pair functions
+
+     # Merge the values for each key using an associative reduce function. This will also perform
+     # the merging locally on each mapper before sending results to a reducer, similarly to a
+     # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
+     # parallelism level.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
+     #   rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
+     #   # => {"a"=>3, "b"=>2, "c"=>3}
+     #
+     def reduce_by_key(f, num_partitions=nil)
+       combine_by_key('lambda {|x| x}', f, f, num_partitions)
+     end
+
+     # Generic function to combine the elements for each key using a custom set of aggregation
+     # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
+     # "combined type" C. Note that V and C can be different -- for example, one might group an
+     # RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
+     # functions:
+     #
+     # == Parameters:
+     # create_combiner:: which turns a V into a C (e.g., creates a one-element list)
+     # merge_value::     to merge a V into a C (e.g., adds it to the end of a list)
+     # merge_combiners:: to combine two C's into a single one.
+     #
+     # == Example:
+     #   def combiner(x)
+     #     x
+     #   end
+     #
+     #   def merge(x,y)
+     #     x+y
+     #   end
+     #
+     #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
+     #   rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
+     #   # => {"a"=>3, "b"=>2, "c"=>3}
+     #
+     def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
+       _combine_by_key(
+         [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
+         [Spark::Command::CombineByKey::Merge, merge_combiners],
+         num_partitions
+       )
+     end
+
+     # Return an RDD of grouped items.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd.group_by(lambda{|x| x%2}).collect
+     #   # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
+     #
+     def group_by(f, num_partitions=nil)
+       self.key_by(f).group_by_key(num_partitions)
+     end
+
+     # Group the values for each key in the RDD into a single sequence. Allows controlling the
+     # partitioning of the resulting key-value pair RDD by passing a Partitioner.
+     #
+     # Note: If you are grouping in order to perform an aggregation (such as a sum or average)
+     # over each key, using reduce_by_key or combine_by_key will provide much better performance.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
+     #   rdd.group_by_key.collect
+     #   # => [["a", [1, 2]], ["b", [3]]]
+     #
+     def group_by_key(num_partitions=nil)
+       create_combiner = 'lambda{|item| [item]}'
+       merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
+       merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
+
+       combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
+     end
+
+     # Merge the values for each key using an associative function f
+     # and a neutral `zero_value` which may be added to the result an
+     # arbitrary number of times, and must not change the result
+     # (e.g., 0 for addition, or 1 for multiplication).
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
+     #   rdd.fold_by_key(1, lambda{|x,y| x+y})
+     #   # => [["a", 9], ["c", 6], ["b", 3]]
+     #
+     def fold_by_key(zero_value, f, num_partitions=nil)
+       self.aggregate_by_key(zero_value, f, f, num_partitions)
+     end
+
+     # Aggregate the values of each key, using given combine functions and a neutral zero value.
+     #
+     # == Example:
+     #   def combine(x,y)
+     #     x+y
+     #   end
+     #
+     #   def merge(x,y)
+     #     x*y
+     #   end
+     #
+     #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
+     #   rdd.aggregate_by_key(1, method(:combine), method(:merge))
+     #   # => [["b", 3], ["a", 16], ["c", 6]]
+     #
+     def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
+       _combine_by_key(
+         [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
+         [Spark::Command::CombineByKey::Merge, comb_func],
+         num_partitions
+       )
+     end
+
+     # The same functionality as cogroup, but this can group only 2 RDDs and you
+     # can change num_partitions.
+     #
+     # == Example:
+     #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
+     #   rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
+     #   rdd1.group_with(rdd2).collect
+     #   # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
+     #
+     def group_with(other, num_partitions=nil)
+       self.union(other).group_by_key(num_partitions)
+     end
+
+     # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
+     # list of values for that key in `this` as well as `other`.
+     #
+     # == Example:
+     #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
+     #   rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
+     #   rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
+     #   rdd1.cogroup(rdd2, rdd3).collect
+     #   # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
+     #
+     def cogroup(*others)
+       unioned = self
+       others.each do |other|
+         unioned = unioned.union(other)
+       end
+
+       unioned.group_by_key
+     end
+
+     # Return each (key, value) pair in self RDD that has no pair with matching
+     # key in other RDD.
+     #
+     # == Example:
+     #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
+     #   rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
+     #   rdd1.subtract_by_key(rdd2).collect
+     #   # => [["a", 1], ["a", 2]]
+     #
+     def subtract_by_key(other, num_partitions=nil)
+       create_combiner = 'lambda{|item| [[item]]}'
+       merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
+       merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
+
+       self.union(other)
+           .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
+           .filter('lambda{|(key,values)| values.size == 1}')
+           .flat_map_values('lambda{|item| item.first}')
+     end
+
+     # Return an RDD with the elements from self that are not in other.
+     #
+     # == Example:
+     #   rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
+     #   rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
+     #   rdd1.subtract(rdd2).collect
+     #   # => [["a", 1], ["b", 3], ["c", 4]]
+     #
+     def subtract(other, num_partitions=nil)
+       mapping_function = 'lambda{|x| [x,nil]}'
+
+       self.map(mapping_function)
+           .subtract_by_key(other.map(mapping_function), num_partitions)
+           .keys
+     end
+
+     # Sort the RDD by key
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
+     #   rdd.sort_by_key.collect
+     #   # => [["a", 3], ["b", 2], ["c", 1]]
+     #
+     def sort_by_key(ascending=true, num_partitions=nil)
+       self.sort_by('lambda{|(key, _)| key}')
+     end
+
+     # Sort the RDD by value
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
+     #   rdd.sort_by_value.collect
+     #   # => [["b", 1], ["c", 2], ["a", 3]]
+     #
+     def sort_by_value(ascending=true, num_partitions=nil)
+       self.sort_by('lambda{|(_, value)| value}')
+     end
+
+     # Sorts this RDD by the given key_function
+     #
+     # This is a different implementation than Spark's. sort_by doesn't use the
+     # key_by method first. It can be slower but takes less memory and
+     # you can always use map.sort_by_key
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
+     #
+     #   rdd.sort_by.collect
+     #   # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
+     #
+     #   rdd.sort_by(lambda{|x| x.size}).collect
+     #   # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
+     #
+     def sort_by(key_function=nil, ascending=true, num_partitions=nil)
+       key_function ||= 'lambda{|x| x}'
+       num_partitions ||= default_reduce_partitions
+
+       command_klass = Spark::Command::SortByKey
+
+       # Allow spilling data to disk due to memory limit
+       # spilling = config['spark.shuffle.spill'] || false
+       spilling = false
+       memory = ''
+
+       # Set spilling to false if the worker has unlimited memory
+       if memory.empty?
+         spilling = false
+         memory = nil
+       else
+         memory = to_memory_size(memory)
+       end
+
+       # Sorting should be done by one worker
+       if num_partitions == 1
+         rdd = self
+         rdd = rdd.coalesce(1) if partitions_size > 1
+         return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
+       end
+
+       # Compute boundary of collection
+       # Collection should be evenly distributed
+       # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
+       count = self.count
+       sample_size = num_partitions * 20.0
+       fraction = [sample_size / [count, 1].max, 1.0].min
+       samples = self.sample(false, fraction, 1).map(key_function).collect
+       samples.sort!
+       # Reverse is much faster than reverse sort_by
+       samples.reverse! if !ascending
+
+       # Determine part bounds
+       bounds = determine_bounds(samples, num_partitions)
+
+       shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
+       shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
+     end
+
+     # Creates tuples of the elements in this RDD by applying function f.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..5)
+     #   rdd.key_by(lambda{|x| x%2}).collect
+     #   # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
+     #
+     def key_by(f)
+       new_rdd_from_command(Spark::Command::KeyBy, f)
+     end
+
+     # Pass each value in the key-value pair RDD through a map function without changing
+     # the keys. This also retains the original RDD's partitioning.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(["ruby", "scala", "java"])
+     #   rdd = rdd.map(lambda{|x| [x, x]})
+     #   rdd = rdd.map_values(lambda{|x| x.upcase})
+     #   rdd.collect
+     #   # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
+     #
+     def map_values(f)
+       new_rdd_from_command(Spark::Command::MapValues, f)
+     end
+
+     # Pass each value in the key-value pair RDD through a flat_map function
+     # without changing the keys; this also retains the original RDD's
+     # partitioning.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
+     #   rdd = rdd.flat_map_values(lambda{|x| x*2})
+     #   rdd.collect
+     #   # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
+     #
+     def flat_map_values(f)
+       new_rdd_from_command(Spark::Command::FlatMapValues, f)
+     end
+
+     # Return an RDD with the first element of each pair in a PairRDD
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
+     #   rdd.keys.collect
+     #   # => [1, 3, 5]
+     #
+     def keys
+       self.map('lambda{|(key, _)| key}')
+     end
+
+     # Return an RDD with the second element of each pair in a PairRDD
+     #
+     # == Example:
+     #   rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
+     #   rdd.values.collect
+     #   # => [2, 4, 6]
+     #
+     def values
+       self.map('lambda{|(_, value)| value}')
+     end
+
+
+     # Aliases
+     alias_method :partitionsSize, :partitions_size
+     alias_method :defaultReducePartitions, :default_reduce_partitions
+     alias_method :setName, :set_name
+     alias_method :addLibrary, :add_library
+     alias_method :require, :add_library
+
+     alias_method :flatMap, :flat_map
+     alias_method :mapPartitions, :map_partitions
+     alias_method :mapPartitionsWithIndex, :map_partitions_with_index
+     alias_method :reduceByKey, :reduce_by_key
+     alias_method :combineByKey, :combine_by_key
+     alias_method :groupByKey, :group_by_key
+     alias_method :groupWith, :group_with
+     alias_method :partitionBy, :partition_by
+     alias_method :defaultReducePartitions, :default_reduce_partitions
+     alias_method :foreachPartition, :foreach_partition
+     alias_method :mapValues, :map_values
+     alias_method :takeSample, :take_sample
+     alias_method :sortBy, :sort_by
+     alias_method :sortByKey, :sort_by_key
+     alias_method :keyBy, :key_by
+     alias_method :groupBy, :group_by
+     alias_method :foldByKey, :fold_by_key
+     alias_method :aggregateByKey, :aggregate_by_key
+     alias_method :subtractByKey, :subtract_by_key
+     alias_method :sampleStdev, :sample_stdev
+     alias_method :sampleVariance, :sample_variance
+
+     private
+
+     # This is the base method for reduce operations. It is used by reduce, fold and aggregate.
+     # The only difference is that fold has a zero value.
+     #
+     def _reduce(klass, seq_op, comb_op, zero_value=nil)
+       if seq_op.nil?
+         # Partitions are already reduced
+         rdd = self
+       else
+         rdd = new_rdd_from_command(klass, seq_op, zero_value)
+       end
+
+       # Send all results to one worker and combine results
+       rdd = rdd.coalesce(1).compact
+
+       # Add the same function to the new RDD
+       comm = rdd.add_command(klass, comb_op, zero_value)
+       comm.deserializer = @command.serializer
+
+       # Value is returned in an array
+       PipelinedRDD.new(rdd, comm).collect[0]
+     end
+
+     def _partition_by(num_partitions, klass, *args)
+       # RDD is transformed from [key, value] to [hash, [key, value]]
+       keyed = new_rdd_from_command(klass, *args)
+       keyed.serializer.unbatch!
+
+       # PairwiseRDD and PythonPartitioner are borrowed from Python
+       # but work great with Ruby too
+       pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
+       partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
+       new_jrdd = pairwise_rdd.partitionBy(partitioner).values
+
+       # Reset deserializer
+       RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
+     end
+
+     # For using a different combine_by_key
+     #
+     # == Used for:
+     # * combine_by_key
+     # * fold_by_key (with zero value)
+     #
+     def _combine_by_key(combine, merge, num_partitions)
+       num_partitions ||= default_reduce_partitions
+
+       # Combine key
+       combined = new_rdd_from_command(combine.shift, *combine)
+
+       # Merge items
+       shuffled = combined.partition_by(num_partitions)
+       merge_comm = shuffled.add_command(merge.shift, *merge)
+
+       PipelinedRDD.new(shuffled, merge_comm)
+     end
+
+   end
+
+   # Pipelined Resilient Distributed Dataset, operations are pipelined and sent to the worker
+   #
+   #   RDD
+   #   `-- map
+   #       `-- map
+   #           `-- map
+   #
+   # Code is executed from top to bottom
+   #
+   class PipelinedRDD < RDD
+
+     attr_reader :prev_jrdd, :command
+
+     def initialize(prev, command)
+
+       if prev.is_a?(PipelinedRDD) && prev.pipelinable?
+         # Second, ... stages
+         @prev_jrdd = prev.prev_jrdd
+       else
+         # First stage
+         @prev_jrdd = prev.jrdd
+       end
+
+       @cached = false
+       @checkpointed = false
+
+       @context = prev.context
+       @command = command
+     end
+
+     def pipelinable?
+       !(cached? || checkpointed?)
+     end
+
+     # Serializes necessary things and sends them to RubyRDD (Scala extension)
+     def jrdd
+       @jrdd ||= _jrdd
+     end
+
+     private
+
+     def _jrdd
+       command = @command.build
+
+       broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
+       broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))
+
+       ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
+       ruby_rdd.asJavaRDD
+     end
+
+   end
+ end
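
To tie the RDD API above together, a hedged end-to-end sketch written in the same style as the inline doc examples. It assumes a running context exposed as $sc (the README's Spark.start / Spark.sc flow) and uses only methods defined in this file:

    require 'ruby-spark'

    Spark.start          # assumed setup per the gem README
    $sc = Spark.sc

    rdd = $sc.parallelize(0..10, 2)
    rdd.map(lambda{|x| x * 2})
       .filter(lambda{|x| x > 5})
       .collect
    # => [6, 8, 10, 12, 14, 16, 18, 20]

    lines = $sc.parallelize(['a b a', 'b c'])
    lines.flat_map(lambda{|line| line.split})
         .map(lambda{|word| [word, 1]})
         .reduce_by_key(lambda{|a, b| a + b})
         .collect_as_hash
    # => {"a"=>2, "b"=>2, "c"=>1}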