ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
data/lib/spark/mllib/ruby_matrix/vector_adapter.rb
@@ -0,0 +1,57 @@
+ require 'matrix'
+
+ # Based on ruby 2.1
+
+ class Vector
+   def self.elements(array, copy=true)
+     DenseVector.new(convert_to_array(array, copy))
+   end
+ end
+
+ module Spark
+   module Mllib
+     class VectorAdapter < ::Vector
+
+       def self.new(*args)
+         object = self.allocate
+         object.__send__(:initialize, *args)
+         object
+       end
+
+       def initialize(*args)
+         case args.shift
+         when :dense
+           values = args.shift.dup
+         when :sparse
+           values = [0.0] * args.shift.to_i
+         else
+           raise Spark::MllibError, 'Unknown vector type.'
+         end
+
+         super(values)
+       end
+
+       def []=(index, value)
+         @elements[index] = value
+       end
+
+       def dot(other)
+         if other.is_a?(Spark::Mllib::MatrixBase)
+           other * self
+         else
+           inner_product(other)
+         end
+       end
+
+       def squared_distance(other)
+         diff = self - other
+         diff.dot(diff)
+       end
+
+       def values
+         @values || to_a
+       end
+
+     end
+   end
+ end
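Illustrative usage (not part of the packaged file), assuming the DenseVector subclass defined in data/lib/spark/mllib/vector.rb below; the arithmetic comes from Ruby's Vector via inner_product:

  a = Spark::Mllib::DenseVector.new([1.0, 2.0, 3.0])
  b = Spark::Mllib::DenseVector.new([4.0, 5.0, 6.0])

  a.dot(b)               # => 32.0  (falls back to inner_product)
  a.squared_distance(b)  # => 27.0  ((1-4)**2 + (2-5)**2 + (3-6)**2)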
data/lib/spark/mllib/stat/distribution.rb
@@ -0,0 +1,12 @@
+ ##
+ # MultivariateGaussian
+ #
+ # This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
+ # the event that the covariance matrix is singular, the density will be computed in a
+ # reduced dimensional subspace under which the distribution is supported.
+ #
+ # == Arguments:
+ # mu:: The mean vector of the distribution
+ # sigma:: The covariance matrix of the distribution
+ #
+ Spark::Mllib::MultivariateGaussian = Struct.new(:mu, :sigma)
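A minimal sketch of how the struct can be used (illustrative, not from the gem); since it is a plain Struct, any mean/covariance representation may be stored:

  mu    = Spark::Mllib::Vectors.dense([0.0, 0.0])
  sigma = [[1.0, 0.0], [0.0, 1.0]]   # placeholder covariance representation

  gaussian = Spark::Mllib::MultivariateGaussian.new(mu, sigma)
  gaussian.mu.values   # => [0.0, 0.0]
  gaussian.sigma       # => [[1.0, 0.0], [0.0, 1.0]]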
data/lib/spark/mllib/vector.rb
@@ -0,0 +1,185 @@
+ module Spark
+   module Mllib
+     module Vectors
+
+       def self.dense(*args)
+         DenseVector.new(*args)
+       end
+
+       def self.sparse(*args)
+         SparseVector.new(*args)
+       end
+
+       def self.parse(data)
+         if data.start_with?('[') && data.end_with?(']')
+           DenseVector.parse(data)
+         elsif data.start_with?('(') && data.end_with?(')')
+           SparseVector.parse(data)
+         else
+           raise ArgumentError, 'Unknown vector.'
+         end
+       end
+
+       def self.to_vector(data)
+         if data.is_a?(SparseVector) || data.is_a?(DenseVector)
+           data
+         elsif data.is_a?(Array)
+           DenseVector.new(data)
+         end
+       end
+
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     # @abstract Parent for all types of vectors
+     class VectorBase < VectorAdapter
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     ##
+     # A dense vector represented by a value array.
+     #
+     # A dense vector is a vector in which most of the elements are non-zero.
+     #
+     # == Example:
+     #   DenseVector.new([1,2,3,4,5]).values
+     #   # => [1, 2, 3, 4, 5]
+     #
+     #   DenseVector.new(1..5).values
+     #   # => [1, 2, 3, 4, 5]
+     #
+     class DenseVector < VectorBase
+
+       def initialize(values)
+         super(:dense, values.to_a)
+       end
+
+       # Convert a string to a vector
+       #
+       #   DenseVector.parse("[1.0,2.0,3.0,4.0,5.0]")
+       #
+       def self.parse(data)
+         unless data =~ /\[[0-9., ]+\]/
+           raise ArgumentError, 'Unknown format for DenseVector.'
+         end
+
+         data.sub!('[', '')
+         data.sub!(']', '')
+
+         data = data.split(',')
+         data.map!(&:to_f)
+
+         DenseVector.new(data)
+       end
+
+       # Convert a vector to a string
+       #
+       #   DenseVector.new([1,2,3,4,5]).to_s
+       #   # => "[1.0,2.0,3.0,4.0,5.0]"
+       #
+       def to_s
+         "[#{values.join(',')}]"
+       end
+
+       def to_java
+         JDenseVector.new(values)
+       end
+
+       def self.from_java(object)
+         DenseVector.new(object.values)
+       end
+
+       def marshal_dump
+         values
+       end
+
+       def marshal_load(array)
+         initialize(array)
+       end
+
+     end
+   end
+ end
+
+ module Spark
+   module Mllib
+     ##
+     # A sparse vector represented by an index array and a value array.
+     #
+     # A sparse vector is a vector in which most of the elements are zero.
+     #
+     # == Example:
+     #   SparseVector.new(4, {1 => 1.0, 3 => 5.5}).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     #   SparseVector.new(4, [[1, 3], [1.0, 5.5]]).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     #   SparseVector.new(4, [1, 3], [1.0, 5.5]).values
+     #   # => [0, 1.0, 0, 5.5]
+     #
+     class SparseVector < VectorBase
+
+       attr_reader :indices
+
+       def initialize(arg1, arg2=nil, arg3=nil)
+         super(:sparse, arg1)
+
+         if arg2.is_a?(Hash)
+           @indices = arg2.keys
+           @values = arg2.values
+         else
+           @indices = arg2
+           @values = arg3
+         end
+
+         @indices.zip(@values).each do |(index, value)|
+           self[index] = value
+         end
+       end
+
+       # Convert a string to a vector
+       #
+       #   SparseVector.parse("(5,[1,4],[3.0,5.0])")
+       #
+       def self.parse(data)
+         data = data.match(/\(([0-9]+)[ ]*,[ ]*\[([0-9,. ]*)\][ ]*,[ ]*\[([0-9,. ]*)\]\)/)
+         if data
+           size = data[1].to_i
+           indices = data[2].split(',')
+           indices.map!(&:to_i)
+           values = data[3].split(',')
+           values.map!(&:to_f)
+
+           SparseVector.new(size, indices, values)
+         else
+           raise ArgumentError, 'Unknown format for SparseVector.'
+         end
+       end
+
+       # Convert a vector to a string
+       #
+       #   SparseVector.new(5, {1 => 3, 4 => 5}).to_s
+       #   # => "(5,[1,4],[3.0,5.0])"
+       #
+       def to_s
+         "(#{size},[#{indices.join(',')}],[#{values.join(',')}])"
+       end
+
+       def marshal_dump
+         [size, indices, values]
+       end
+
+       def marshal_load(array)
+         initialize(array[0], array[1], array[2])
+       end
+
+     end
+   end
+ end
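A short round trip through the parse and to_s helpers shown above (illustrative, not part of the diff):

  dense = Spark::Mllib::DenseVector.parse('[1.0,2.0,3.0]')
  dense.to_s        # => "[1.0,2.0,3.0]"

  sparse = Spark::Mllib::SparseVector.parse('(5,[1,4],[3.0,5.0])')
  sparse.indices    # => [1, 4]
  sparse.to_a       # => [0.0, 3.0, 0.0, 0.0, 5.0]
  sparse.to_s       # => "(5,[1,4],[3.0,5.0])"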
data/lib/spark/rdd.rb
@@ -0,0 +1,1377 @@
1
+ module Spark
2
+ ##
3
+ # A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
4
+ # partitioned collection of elements that can be operated on in parallel. This class contains the
5
+ # basic operations available on all RDDs, such as `map`, `filter`, and `persist`.
6
+ #
7
+ class RDD
8
+
9
+ extend Forwardable
10
+
11
+ attr_reader :jrdd, :context, :command
12
+
13
+ include Spark::Helper::Logger
14
+ include Spark::Helper::Parser
15
+ include Spark::Helper::Statistic
16
+
17
+ def_delegators :@command, :serializer, :deserializer, :libraries, :files
18
+
19
+ # Initializes an RDD. This method is the root of all PipelinedRDDs and is unique:
+ # if you call some operations on this class, they will be computed in Java.
21
+ #
22
+ # == Parameters:
23
+ # jrdd:: org.apache.spark.api.java.JavaRDD
24
+ # context:: {Spark::Context}
25
+ # serializer:: {Spark::Serializer}
26
+ #
27
+ def initialize(jrdd, context, serializer, deserializer=nil)
28
+ @jrdd = jrdd
29
+ @context = context
30
+
31
+ @cached = false
32
+ @checkpointed = false
33
+
34
+ @command = Spark::CommandBuilder.new(serializer, deserializer)
35
+ end
36
+
37
+ def inspect
38
+ comms = @command.commands.join(' -> ')
39
+
40
+ result = %{#<#{self.class.name}:0x#{object_id}}
41
+ result << %{ (#{comms})} unless comms.empty?
42
+ result << %{\n}
43
+ result << %{ Serializer: "#{serializer}"\n}
44
+ result << %{Deserializer: "#{deserializer}"}
45
+ result << %{>}
46
+ result
47
+ end
48
+
49
+
50
+ # =============================================================================
51
+ # Operators
52
+
53
+ def +(other)
54
+ self.union(other)
55
+ end
56
+
57
+
58
+ # =============================================================================
59
+ # Command and serializer
60
+
61
+ def add_command(klass, *args)
62
+ @command.deep_copy.add_command(klass, *args)
63
+ end
64
+
65
+ # Add a Ruby library.
+ # Libraries will be loaded before computing.
67
+ #
68
+ # == Example:
69
+ # rdd.add_library('pry').add_library('nio4r', 'distribution')
70
+ #
71
+ def add_library(*libraries)
72
+ @command.add_library(*libraries)
73
+ self
74
+ end
75
+
76
+ # Bind object to RDD
77
+ #
78
+ # == Example:
79
+ # text = "test"
80
+ #
81
+ # rdd = $sc.parallelize(0..5)
82
+ # rdd = rdd.map(lambda{|x| x.to_s + " " + text})
83
+ # rdd = rdd.bind(text: text)
84
+ #
85
+ # rdd.collect
86
+ # # => ["0 test", "1 test", "2 test", "3 test", "4 test", "5 test"]
87
+ #
88
+ def bind(objects)
89
+ unless objects.is_a?(Hash)
90
+ raise ArgumentError, 'Argument must be a Hash.'
91
+ end
92
+
93
+ @command.bind(objects)
94
+ self
95
+ end
96
+
97
+ def new_rdd_from_command(klass, *args)
98
+ comm = add_command(klass, *args)
99
+ PipelinedRDD.new(self, comm)
100
+ end
101
+
102
+
103
+ # =============================================================================
104
+ # Variables and non-computing functions
105
+
106
+ def config
107
+ @context.config
108
+ end
109
+
110
+ def default_reduce_partitions
111
+ config['spark.default.parallelism'] || partitions_size
112
+ end
113
+
114
+ # Count of ParallelCollectionPartition
115
+ def partitions_size
116
+ jrdd.rdd.partitions.size
117
+ end
118
+
119
+ # A unique ID for this RDD (within its SparkContext).
120
+ def id
121
+ jrdd.id
122
+ end
123
+
124
+ # Persist this RDD with the default storage level MEMORY_ONLY_SER because of serialization.
125
+ def cache
126
+ persist('memory_only_ser')
127
+ end
128
+
129
+ # Set this RDD's storage level to persist its values across operations after the first time
130
+ # it is computed. This can only be used to assign a new storage level if the RDD does not
131
+ # have a storage level set yet.
132
+ #
133
+ # See StorageLevel for type of new_level
134
+ #
135
+ def persist(new_level)
136
+ @cached = true
137
+ jrdd.persist(Spark::StorageLevel.java_get(new_level))
138
+ self
139
+ end
140
+
141
+ # Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
142
+ #
143
+ # == Parameters:
144
+ # blocking:: whether to block until all blocks are deleted.
145
+ #
146
+ def unpersist(blocking=true)
147
+ @cached = false
148
+ jrdd.unpersist(blocking)
149
+ self
150
+ end
151
+
152
+ def cached?
153
+ @cached
154
+ end
155
+
156
+ def checkpointed?
157
+ @checkpointed
158
+ end
159
+
160
+ # Return the name of this RDD.
161
+ #
162
+ def name
163
+ _name = jrdd.name
164
+ _name && _name.encode(Encoding::UTF_8)
165
+ end
166
+
167
+ # Assign a name to this RDD.
168
+ #
169
+ def set_name(name)
170
+ jrdd.setName(name)
171
+ end
172
+
173
+ def to_java
174
+ marshal = Spark::Serializer.marshal
175
+
176
+ if deserializer.batched?
177
+ ser = deserializer.deep_copy
178
+ ser.serializer = marshal
179
+ else
180
+ ser = Spark::Serializer.batched(marshal)
181
+ end
182
+
183
+ rdd = self.reserialize(ser)
184
+ RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
185
+ end
186
+
187
+
188
+ # =============================================================================
189
+ # Actions which return value
190
+
191
+ # Return an array that contains all of the elements in this RDD.
192
+ # RJB raises an error if the stage is killed.
193
+ def collect(as_enum=false)
194
+ file = Tempfile.new('collect', context.temp_dir)
195
+
196
+ RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
197
+
198
+ collect_from_file(file, as_enum)
199
+ rescue => e
200
+ raise Spark::RDDError, e.message
201
+ end
202
+
203
+ def collect_from_file(file, as_enum=false)
204
+ if self.is_a?(PipelinedRDD)
205
+ klass = @command.serializer
206
+ else
207
+ klass = @command.deserializer
208
+ end
209
+
210
+ if as_enum
211
+ result = klass.load_from_file(file)
212
+ else
213
+ result = klass.load_from_io(file).to_a
214
+ file.close
215
+ file.unlink
216
+ end
217
+
218
+ result
219
+ end
220
+
221
+ # Convert an Array of key-value pairs to a Hash
222
+ #
223
+ def collect_as_hash
224
+ Hash[collect]
225
+ end
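Illustrative example (not part of the file), assuming a Spark::Context in $sc as in the inline examples:

  $sc.parallelize([['a', 1], ['b', 2]]).collect_as_hash
  # => {"a"=>1, "b"=>2}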
226
+
227
+ # Take the first num elements of the RDD.
228
+ #
229
+ # It works by first scanning one partition, and use the results from
230
+ # that partition to estimate the number of additional partitions needed
231
+ # to satisfy the limit.
232
+ #
233
+ # == Example:
234
+ # rdd = $sc.parallelize(0..100, 20)
235
+ # rdd.take(5)
236
+ # # => [0, 1, 2, 3, 4]
237
+ #
238
+ def take(count)
239
+ buffer = []
240
+
241
+ parts_count = self.partitions_size
242
+ # No parts have been scanned yet
243
+ last_scanned = -1
244
+
245
+ while buffer.empty?
246
+ last_scanned += 1
247
+ buffer += context.run_job_with_command(self, [last_scanned], true, Spark::Command::Take, 0, -1)
248
+ end
249
+
250
+ # Assumption: depends on batch_size and how Spark divided the data.
251
+ items_per_part = buffer.size
252
+ left = count - buffer.size
253
+
254
+ while left > 0 && last_scanned < parts_count
255
+ parts_to_take = (left.to_f/items_per_part).ceil
256
+ parts_for_scanned = Array.new(parts_to_take) do
257
+ last_scanned += 1
258
+ end
259
+
260
+ # We cannot take an exact number of items because workers are isolated from each other.
+ # => once you take e.g. 50% from the last part and `left` is still > 0, it is very
+ # difficult to merge the new items
263
+ items = context.run_job_with_command(self, parts_for_scanned, true, Spark::Command::Take, left, last_scanned)
264
+ buffer += items
265
+
266
+ left = count - buffer.size
267
+ # Average size of all parts
268
+ items_per_part = [items_per_part, items.size].reduce(0){|sum, x| sum + x.to_f/2}
269
+ end
270
+
271
+ buffer.slice!(0, count)
272
+ end
273
+
274
+ # Return the first element in this RDD.
275
+ #
276
+ # == Example:
277
+ # rdd = $sc.parallelize(0..100)
278
+ # rdd.first
279
+ # # => 0
280
+ #
281
+ def first
282
+ self.take(1)[0]
283
+ end
284
+
285
+ # Reduces the elements of this RDD using the specified lambda or method.
286
+ #
287
+ # == Example:
288
+ # rdd = $sc.parallelize(0..10)
289
+ # rdd.reduce(lambda{|sum, x| sum+x})
290
+ # # => 55
291
+ #
292
+ def reduce(f)
293
+ _reduce(Spark::Command::Reduce, f, f)
294
+ end
295
+
296
+ # Aggregate the elements of each partition, and then the results for all the partitions, using a
297
+ # given associative function and a neutral "zero value".
298
+ #
299
+ # The function f(x, y) is allowed to modify x and return it as its result value to avoid
300
+ # object allocation; however, it should not modify y.
301
+ #
302
+ # Be careful: zero_value is applied to all stages. See the example.
303
+ #
304
+ # == Example:
305
+ # rdd = $sc.parallelize(0..10, 2)
306
+ # rdd.fold(1, lambda{|sum, x| sum+x})
307
+ # # => 58
308
+ #
309
+ def fold(zero_value, f)
310
+ self.aggregate(zero_value, f, f)
311
+ end
312
+
313
+ # Aggregate the elements of each partition, and then the results for all the partitions, using
314
+ # given combine functions and a neutral "zero value".
315
+ #
316
+ # This function can return a different result type. We need one operation for merging.
317
+ #
318
+ # Result must be an Array, otherwise the serializer's Array zero value will be sent
+ # as multiple values and not just one.
320
+ #
321
+ # == Example:
322
+ # # 1 2 3 4 5 => 15 + 1 = 16
323
+ # # 6 7 8 9 10 => 40 + 1 = 41
324
+ # # 16 * 41 = 656
325
+ #
326
+ # seq = lambda{|x,y| x+y}
327
+ # com = lambda{|x,y| x*y}
328
+ #
329
+ # rdd = $sc.parallelize(1..10, 2)
330
+ # rdd.aggregate(1, seq, com)
331
+ # # => 656
332
+ #
333
+ def aggregate(zero_value, seq_op, comb_op)
334
+ _reduce(Spark::Command::Aggregate, seq_op, comb_op, zero_value)
335
+ end
336
+
337
+ # Return the max of this RDD
338
+ #
339
+ # == Example:
340
+ # rdd = $sc.parallelize(0..10)
341
+ # rdd.max
342
+ # # => 10
343
+ #
344
+ def max
345
+ self.reduce('lambda{|memo, item| memo > item ? memo : item }')
346
+ end
347
+
348
+ # Return the min of this RDD
349
+ #
350
+ # == Example:
351
+ # rdd = $sc.parallelize(0..10)
352
+ # rdd.min
353
+ # # => 0
354
+ #
355
+ def min
356
+ self.reduce('lambda{|memo, item| memo < item ? memo : item }')
357
+ end
358
+
359
+ # Return the sum of this RDD
360
+ #
361
+ # == Example:
362
+ # rdd = $sc.parallelize(0..10)
363
+ # rdd.sum
364
+ # # => 55
365
+ #
366
+ def sum
367
+ self.reduce('lambda{|sum, item| sum + item}')
368
+ end
369
+
370
+ # Return the number of values in this RDD
371
+ #
372
+ # == Example:
373
+ # rdd = $sc.parallelize(0..10)
374
+ # rdd.count
375
+ # # => 11
376
+ #
377
+ def count
378
+ # nil is for seq_op => it means all results go directly to one worker for combining
379
+ @count ||= self.map_partitions('lambda{|iterator| iterator.to_a.size }')
380
+ .aggregate(0, nil, 'lambda{|sum, item| sum + item }')
381
+ end
382
+
383
+ # Return a {Spark::StatCounter} object that captures the mean, variance
384
+ # and count of the RDD's elements in one operation.
385
+ def stats
386
+ @stats ||= new_rdd_from_command(Spark::Command::Stats).reduce('lambda{|memo, item| memo.merge(item)}')
387
+ end
388
+
389
+ # Compute the mean of this RDD's elements.
390
+ #
391
+ # == Example:
392
+ # $sc.parallelize([1, 2, 3]).mean
393
+ # # => 2.0
394
+ #
395
+ def mean
396
+ stats.mean
397
+ end
398
+
399
+ # Compute the variance of this RDD's elements.
400
+ #
401
+ # == Example:
402
+ # $sc.parallelize([1, 2, 3]).variance
403
+ # # => 0.666...
404
+ #
405
+ def variance
406
+ stats.variance
407
+ end
408
+
409
+ # Compute the standard deviation of this RDD's elements.
410
+ #
411
+ # == Example:
412
+ # $sc.parallelize([1, 2, 3]).stdev
413
+ # # => 0.816...
414
+ #
415
+ def stdev
416
+ stats.stdev
417
+ end
418
+
419
+ # Compute the sample standard deviation of this RDD's elements (which
420
+ # corrects for bias in estimating the standard deviation by dividing by
421
+ # N-1 instead of N).
422
+ #
423
+ # == Example:
424
+ # $sc.parallelize([1, 2, 3]).sample_stdev
425
+ # # => 1.0
426
+ #
427
+ def sample_stdev
428
+ stats.sample_stdev
429
+ end
430
+
431
+ # Compute the sample variance of this RDD's elements (which corrects
432
+ # for bias in estimating the variance by dividing by N-1 instead of N).
433
+ #
434
+ # == Example:
435
+ # $sc.parallelize([1, 2, 3]).sample_variance
436
+ # # => 1.0
437
+ #
438
+ def sample_variance
439
+ stats.sample_variance
440
+ end
441
+
442
+ # Compute a histogram using the provided buckets. The buckets
443
+ # are all open to the right except for the last which is closed.
444
+ # e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
445
+ # which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
446
+ # and 50 we would have a histogram of 1,0,1.
447
+ #
448
+ # If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
449
+ # this can be switched from an O(log n) insertion to O(1) per
+ # element (where n = # buckets).
451
+ #
452
+ # Buckets must be sorted, must not contain any duplicates, and must have
+ # at least two elements.
454
+ #
455
+ # == Examples:
456
+ # rdd = $sc.parallelize(0..50)
457
+ #
458
+ # rdd.histogram(2)
459
+ # # => [[0.0, 25.0, 50], [25, 26]]
460
+ #
461
+ # rdd.histogram([0, 5, 25, 50])
462
+ # # => [[0, 5, 25, 50], [5, 20, 26]]
463
+ #
464
+ # rdd.histogram([0, 15, 30, 45, 60])
465
+ # # => [[0, 15, 30, 45, 60], [15, 15, 15, 6]]
466
+ #
467
+ def histogram(buckets)
468
+
469
+ # -----------------------------------------------------------------------
470
+ # Integer
471
+ #
472
+ if buckets.is_a?(Integer)
473
+
474
+ # Validation
475
+ if buckets < 1
476
+ raise ArgumentError, "Bucket count must be >= 1, #{buckets} inserted."
477
+ end
478
+
479
+ # Filter invalid values
480
+ # Nil and NaN
481
+ func = 'lambda{|x|
482
+ if x.nil? || (x.is_a?(Float) && x.nan?)
483
+ false
484
+ else
485
+ true
486
+ end
487
+ }'
488
+ filtered = self.filter(func)
489
+
490
+ # Compute the minimum and the maximum
491
+ func = 'lambda{|memo, item|
492
+ [memo[0] < item[0] ? memo[0] : item[0],
493
+ memo[1] > item[1] ? memo[1] : item[1]]
494
+ }'
495
+ min, max = filtered.map('lambda{|x| [x, x]}').reduce(func)
496
+
497
+ # Min, max must be valid numbers
498
+ if (min.is_a?(Float) && !min.finite?) || (max.is_a?(Float) && !max.finite?)
499
+ raise Spark::RDDError, 'Histogram on either an empty RDD or RDD containing +/-infinity or NaN'
500
+ end
501
+
502
+ # Already finished
503
+ if min == max || buckets == 1
504
+ return [min, max], [filtered.count]
505
+ end
506
+
507
+ # Custom range
508
+ begin
509
+ span = max - min # increment
510
+ buckets = (0...buckets).map do |x|
511
+ min + (x * span) / buckets.to_f
512
+ end
513
+ buckets << max
514
+ rescue NoMethodError
515
+ raise Spark::RDDError, 'Can not generate buckets with non-number in RDD'
516
+ end
517
+
518
+ even = true
519
+
520
+ # -----------------------------------------------------------------------
521
+ # Array
522
+ #
523
+ elsif buckets.is_a?(Array)
524
+
525
+ if buckets.size < 2
526
+ raise ArgumentError, 'Buckets should have more than one value.'
527
+ end
528
+
529
+ if buckets.detect{|x| x.nil? || (x.is_a?(Float) && x.nan?)}
530
+ raise ArgumentError, 'Can not have nil or nan numbers in buckets.'
531
+ end
532
+
533
+ if buckets.detect{|x| buckets.count(x) > 1}
534
+ raise ArgumentError, 'Buckets should not contain duplicated values.'
535
+ end
536
+
537
+ if buckets.sort != buckets
538
+ raise ArgumentError, 'Buckets must be sorted.'
539
+ end
540
+
541
+ even = false
542
+
543
+ # -----------------------------------------------------------------------
544
+ # Other
545
+ #
546
+ else
547
+ raise Spark::RDDError, 'Buckets should be number or array.'
548
+ end
549
+
550
+ reduce_func = 'lambda{|memo, item|
551
+ memo.size.times do |i|
552
+ memo[i] += item[i]
553
+ end
554
+ memo
555
+ }'
556
+
557
+ return buckets, new_rdd_from_command(Spark::Command::Histogram, even, buckets).reduce(reduce_func)
558
+ end
559
+
560
+ # Applies a function f to all elements of this RDD.
561
+ #
562
+ # == Example:
563
+ # rdd = $sc.parallelize(0..5)
564
+ # rdd.foreach(lambda{|x| puts x})
565
+ # # => nil
566
+ #
567
+ def foreach(f, options={})
568
+ new_rdd_from_command(Spark::Command::Foreach, f).collect
569
+ nil
570
+ end
571
+
572
+ # Applies a function f to each partition of this RDD.
573
+ #
574
+ # == Example:
575
+ # rdd = $sc.parallelize(0..5)
576
+ # rdd.foreachPartition(lambda{|x| puts x.to_s})
577
+ # # => nil
578
+ #
579
+ def foreach_partition(f, options={})
580
+ new_rdd_from_command(Spark::Command::ForeachPartition, f).collect
581
+ nil
582
+ end
583
+
584
+
585
+ # =============================================================================
586
+ # Transformations of RDD
587
+
588
+ # Return a new RDD by applying a function to all elements of this RDD.
589
+ #
590
+ # == Example:
591
+ # rdd = $sc.parallelize(0..5)
592
+ # rdd.map(lambda {|x| x*2}).collect
593
+ # # => [0, 2, 4, 6, 8, 10]
594
+ #
595
+ def map(f)
596
+ new_rdd_from_command(Spark::Command::Map, f)
597
+ end
598
+
599
+ # Return a new RDD by first applying a function to all elements of this
600
+ # RDD, and then flattening the results.
601
+ #
602
+ # == Example:
603
+ # rdd = $sc.parallelize(0..5)
604
+ # rdd.flat_map(lambda {|x| [x, 1]}).collect
605
+ # # => [0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1]
606
+ #
607
+ def flat_map(f)
608
+ new_rdd_from_command(Spark::Command::FlatMap, f)
609
+ end
610
+
611
+ # Return a new RDD by applying a function to each partition of this RDD.
612
+ #
613
+ # == Example:
614
+ # rdd = $sc.parallelize(0..10, 2)
615
+ # rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
616
+ # # => [15, 40]
617
+ #
618
+ def map_partitions(f)
619
+ new_rdd_from_command(Spark::Command::MapPartitions, f)
620
+ end
621
+
622
+ # Return a new RDD by applying a function to each partition of this RDD, while tracking the index
623
+ # of the original partition.
624
+ #
625
+ # == Example:
626
+ # rdd = $sc.parallelize(0...4, 4)
627
+ # rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
628
+ # # => [0, 1, 4, 9]
629
+ #
630
+ def map_partitions_with_index(f, options={})
631
+ new_rdd_from_command(Spark::Command::MapPartitionsWithIndex, f)
632
+ end
633
+
634
+ # Return a new RDD containing only the elements that satisfy a predicate.
635
+ #
636
+ # == Example:
637
+ # rdd = $sc.parallelize(0..10)
638
+ # rdd.filter(lambda{|x| x.even?}).collect
639
+ # # => [0, 2, 4, 6, 8, 10]
640
+ #
641
+ def filter(f)
642
+ new_rdd_from_command(Spark::Command::Filter, f)
643
+ end
644
+
645
+ # Return a new RDD containing non-nil elements.
646
+ #
647
+ # == Example:
648
+ # rdd = $sc.parallelize([1, nil, 2, nil, 3])
649
+ # rdd.compact.collect
650
+ # # => [1, 2, 3]
651
+ #
652
+ def compact
653
+ new_rdd_from_command(Spark::Command::Compact)
654
+ end
655
+
656
+ # Return an RDD created by coalescing all elements within each partition into an array.
657
+ #
658
+ # == Example:
659
+ # rdd = $sc.parallelize(0..10, 3)
660
+ # rdd.glom.collect
661
+ # # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
662
+ #
663
+ def glom
664
+ new_rdd_from_command(Spark::Command::Glom)
665
+ end
666
+
667
+ # Return a new RDD that is reduced into num_partitions partitions.
668
+ #
669
+ # == Example:
670
+ # rdd = $sc.parallelize(0..10, 3)
671
+ # rdd.coalesce(2).glom.collect
672
+ # # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
673
+ #
674
+ def coalesce(num_partitions)
675
+ if self.is_a?(PipelinedRDD)
676
+ deser = @command.serializer
677
+ else
678
+ deser = @command.deserializer
679
+ end
680
+
681
+ new_jrdd = jrdd.coalesce(num_partitions)
682
+ RDD.new(new_jrdd, context, @command.serializer, deser)
683
+ end
684
+
685
+ # Return the Cartesian product of this RDD and another one, that is, the
686
+ # RDD of all pairs of elements `(a, b)` where `a` is in `self` and
687
+ # `b` is in `other`.
688
+ #
689
+ # == Example:
690
+ # rdd1 = $sc.parallelize([1,2,3])
691
+ # rdd2 = $sc.parallelize([4,5,6])
692
+ #
693
+ # rdd1.cartesian(rdd2).collect
694
+ # # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
695
+ #
696
+ def cartesian(other)
697
+ _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
698
+
699
+ new_jrdd = jrdd.cartesian(other.jrdd)
700
+ RDD.new(new_jrdd, context, serializer, _deserializer)
701
+ end
702
+
703
+ # Return a new RDD containing the distinct elements in this RDD.
704
+ # Ordering is not preserved because of the reduce step
705
+ #
706
+ # == Example:
707
+ # rdd = $sc.parallelize([1,1,1,2,3])
708
+ # rdd.distinct.collect
709
+ # # => [1, 2, 3]
710
+ #
711
+ def distinct
712
+ self.map('lambda{|x| [x, nil]}')
713
+ .reduce_by_key('lambda{|x,_| x}')
714
+ .map('lambda{|x| x[0]}')
715
+ end
716
+
717
+ # Return a shuffled RDD.
718
+ #
719
+ # == Example:
720
+ # rdd = $sc.parallelize(0..10)
721
+ # rdd.shuffle.collect
722
+ # # => [3, 10, 6, 7, 8, 0, 4, 2, 9, 1, 5]
723
+ #
724
+ def shuffle(seed=nil)
725
+ seed ||= Random.new_seed
726
+
727
+ new_rdd_from_command(Spark::Command::Shuffle, seed)
728
+ end
729
+
730
+ # Return the union of this RDD and another one. Any identical elements will appear multiple
731
+ # times (use .distinct to eliminate them).
732
+ #
733
+ # == Example:
734
+ # rdd = $sc.parallelize([1, 2, 3])
735
+ # rdd.union(rdd).collect
736
+ # # => [1, 2, 3, 1, 2, 3]
737
+ #
738
+ def union(other)
739
+ if self.serializer != other.serializer
740
+ other = other.reserialize(serializer)
741
+ end
742
+
743
+ new_jrdd = jrdd.union(other.jrdd)
744
+ RDD.new(new_jrdd, context, serializer, deserializer)
745
+ end
746
+
747
+ # Return a new RDD with a different serializer. This method is useful during union
748
+ # and join operations.
749
+ #
750
+ # == Example:
751
+ # rdd = $sc.parallelize([1, 2, 3], nil, serializer: "marshal")
752
+ # rdd = rdd.map(lambda{|x| x.to_s})
753
+ # rdd.reserialize("oj").collect
754
+ # # => ["1", "2", "3"]
755
+ #
756
+ def reserialize(new_serializer)
757
+ if serializer == new_serializer
758
+ return self
759
+ end
760
+
761
+ new_command = @command.deep_copy
762
+ new_command.serializer = new_serializer
763
+
764
+ PipelinedRDD.new(self, new_command)
765
+ end
766
+
767
+ # Return the intersection of this RDD and another one. The output will not contain
768
+ # any duplicate elements, even if the input RDDs did.
769
+ #
770
+ # == Example:
771
+ # rdd1 = $sc.parallelize([1,2,3,4,5])
772
+ # rdd2 = $sc.parallelize([1,4,5,6,7])
773
+ # rdd1.intersection(rdd2).collect
774
+ # # => [1, 4, 5]
775
+ #
776
+ def intersection(other)
777
+ mapping_function = 'lambda{|item| [item, nil]}'
778
+ filter_function = 'lambda{|(key, values)| values.size > 1}'
779
+
780
+ self.map(mapping_function)
781
+ .cogroup(other.map(mapping_function))
782
+ .filter(filter_function)
783
+ .keys
784
+ end
785
+
786
+ # Return a copy of the RDD partitioned using the specified partitioner.
787
+ #
788
+ # == Example:
789
+ # rdd = $sc.parallelize(["1","2","3","4","5"]).map(lambda {|x| [x, 1]})
790
+ # rdd.partitionBy(2).glom.collect
791
+ # # => [[["3", 1], ["4", 1]], [["1", 1], ["2", 1], ["5", 1]]]
792
+ #
793
+ def partition_by(num_partitions, partition_func=nil)
794
+ num_partitions ||= default_reduce_partitions
795
+ partition_func ||= 'lambda{|x| Spark::Digest.portable_hash(x.to_s)}'
796
+
797
+ _partition_by(num_partitions, Spark::Command::PartitionBy::Basic, partition_func)
798
+ end
799
+
800
+ # Return a sampled subset of this RDD. Operations are based on Poisson and Uniform
+ # distributions.
+ # TODO: Replace Uniform with Bernoulli
803
+ #
804
+ # == Examples:
805
+ # rdd = $sc.parallelize(0..100)
806
+ #
807
+ # rdd.sample(true, 10).collect
808
+ # # => [17, 17, 22, 23, 51, 52, 62, 64, 69, 70, 96]
809
+ #
810
+ # rdd.sample(false, 0.1).collect
811
+ # # => [3, 5, 9, 32, 44, 55, 66, 68, 75, 80, 86, 91, 98]
812
+ #
813
+ def sample(with_replacement, fraction, seed=nil)
814
+ new_rdd_from_command(Spark::Command::Sample, with_replacement, fraction, seed)
815
+ end
816
+
817
+ # Return a fixed-size sampled subset of this RDD in an array
818
+ #
819
+ # == Examples:
820
+ # rdd = $sc.parallelize(0..100)
821
+ #
822
+ # rdd.take_sample(true, 10)
823
+ # # => [90, 84, 74, 44, 27, 22, 72, 96, 80, 54]
824
+ #
825
+ # rdd.take_sample(false, 10)
826
+ # # => [5, 35, 30, 48, 22, 33, 40, 75, 42, 32]
827
+ #
828
+ def take_sample(with_replacement, num, seed=nil)
829
+
830
+ if num < 0
831
+ raise Spark::RDDError, 'Size has to be greater than 0'
832
+ elsif num == 0
833
+ return []
834
+ end
835
+
836
+ # Taken from scala
837
+ num_st_dev = 10.0
838
+
839
+ # Number of items
840
+ initial_count = self.count
841
+ return [] if initial_count == 0
842
+
843
+ # Create new generator
844
+ seed ||= Random.new_seed
845
+ rng = Random.new(seed)
846
+
847
+ # Shuffle elements if the requested num is greater than the collection size
848
+ if !with_replacement && num >= initial_count
849
+ return self.shuffle(seed).collect
850
+ end
851
+
852
+ # Max num
853
+ max_sample_size = Integer::MAX - (num_st_dev * Math.sqrt(Integer::MAX)).to_i
854
+ if num > max_sample_size
855
+ raise Spark::RDDError, "Size can not be greater than #{max_sample_size}"
856
+ end
857
+
858
+ # Approximate fraction with tolerance
859
+ fraction = compute_fraction(num, initial_count, with_replacement)
860
+
861
+ # Compute the first sampled subset
862
+ samples = self.sample(with_replacement, fraction, seed).collect
863
+
864
+ # If the first sample didn't turn out large enough, keep trying to take samples;
865
+ # this shouldn't happen often because we use a big multiplier for their initial size.
866
+ index = 0
867
+ while samples.size < num
868
+ log_warning("Needed to re-sample due to insufficient sample size. Repeat #{index}")
869
+ samples = self.sample(with_replacement, fraction, rng.rand(0..Integer::MAX)).collect
870
+ index += 1
871
+ end
872
+
873
+ samples.shuffle!(random: rng)
874
+ samples[0, num]
875
+ end
876
+
877
+ # Return an RDD created by piping elements to a forked external process.
878
+ #
879
+ # == Cmds:
880
+ # cmd = [env,] command... [,options]
881
+ #
882
+ # env: hash
883
+ # name => val : set the environment variable
884
+ # name => nil : unset the environment variable
885
+ # command...:
886
+ # commandline : command line string which is passed to the standard shell
887
+ # cmdname, arg1, ... : command name and one or more arguments (This form does
888
+ # not use the shell. See below for caveats.)
889
+ # [cmdname, argv0], arg1, ... : command name, argv[0] and zero or more arguments (no shell)
890
+ # options: hash
891
+ #
892
+ # See http://ruby-doc.org/core-2.2.0/Process.html#method-c-spawn
893
+ #
894
+ # == Examples:
895
+ # $sc.parallelize(0..5).pipe('cat').collect
896
+ # # => ["0", "1", "2", "3", "4", "5"]
897
+ #
898
+ # rdd = $sc.parallelize(0..5)
899
+ # rdd = rdd.pipe('cat', "awk '{print $1*10}'")
900
+ # rdd = rdd.map(lambda{|x| x.to_i + 1})
901
+ # rdd.collect
902
+ # # => [1, 11, 21, 31, 41, 51]
903
+ #
904
+ def pipe(*cmds)
905
+ new_rdd_from_command(Spark::Command::Pipe, cmds)
906
+ end
907
+
908
+
909
+ # =============================================================================
910
+ # Pair functions
911
+
912
+ # Merge the values for each key using an associative reduce function. This will also perform
913
+ # the merging locally on each mapper before sending results to a reducer, similarly to a
914
+ # "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
915
+ # parallelism level.
916
+ #
917
+ # == Example:
918
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"]).map(lambda{|x| [x, 1]})
919
+ # rdd.reduce_by_key(lambda{|x,y| x+y}).collect_as_hash
920
+ # # => {"a"=>3, "b"=>2, "c"=>3}
921
+ #
922
+ def reduce_by_key(f, num_partitions=nil)
923
+ combine_by_key('lambda {|x| x}', f, f, num_partitions)
924
+ end
925
+
926
+ # Generic function to combine the elements for each key using a custom set of aggregation
927
+ # functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
928
+ # "combined type" C * Note that V and C can be different -- for example, one might group an
929
+ # "combined type" C. Note that V and C can be different -- for example, one might group an
930
+ # functions:
931
+ #
932
+ # == Parameters:
933
+ # create_combiner:: which turns a V into a C (e.g., creates a one-element list)
934
+ # merge_value:: to merge a V into a C (e.g., adds it to the end of a list)
935
+ # merge_combiners:: to combine two C's into a single one.
936
+ #
937
+ # == Example:
938
+ # def combiner(x)
939
+ # x
940
+ # end
941
+ #
942
+ # def merge(x,y)
943
+ # x+y
944
+ # end
945
+ #
946
+ # rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
947
+ # rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
948
+ # # => {"a"=>3, "b"=>2, "c"=>3}
949
+ #
950
+ def combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions=nil)
951
+ _combine_by_key(
952
+ [Spark::Command::CombineByKey::Combine, create_combiner, merge_value],
953
+ [Spark::Command::CombineByKey::Merge, merge_combiners],
954
+ num_partitions
955
+ )
956
+ end
957
+
958
+ # Return an RDD of grouped items.
959
+ #
960
+ # == Example:
961
+ # rdd = $sc.parallelize(0..5)
962
+ # rdd.group_by(lambda{|x| x%2}).collect
963
+ # # => [[0, [0, 2, 4]], [1, [1, 3, 5]]]
964
+ #
965
+ def group_by(f, num_partitions=nil)
966
+ self.key_by(f).group_by_key(num_partitions)
967
+ end
968
+
969
+ # Group the values for each key in the RDD into a single sequence. Allows controlling the
970
+ # partitioning of the resulting key-value pair RDD by passing a Partitioner.
971
+ #
972
+ # Note: If you are grouping in order to perform an aggregation (such as a sum or average)
973
+ # over each key, using reduce_by_key or combine_by_key will provide much better performance.
974
+ #
975
+ # == Example:
976
+ # rdd = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
977
+ # rdd.group_by_key.collect
978
+ # # => [["a", [1, 2]], ["b", [3]]]
979
+ #
980
+ def group_by_key(num_partitions=nil)
981
+ create_combiner = 'lambda{|item| [item]}'
982
+ merge_value = 'lambda{|combiner, item| combiner << item; combiner}'
983
+ merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
984
+
985
+ combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
986
+ end
987
+
988
+ # Merge the values for each key using an associative function f
989
+ # and a neutral `zero_value` which may be added to the result an
990
+ # arbitrary number of times, and must not change the result
991
+ # (e.g., 0 for addition, or 1 for multiplication.).
992
+ #
993
+ # == Example:
994
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]])
995
+ # rdd.fold_by_key(1, lambda{|x,y| x+y})
996
+ # # => [["a", 9], ["c", 6], ["b", 3]]
997
+ #
998
+ def fold_by_key(zero_value, f, num_partitions=nil)
999
+ self.aggregate_by_key(zero_value, f, f, num_partitions)
1000
+ end
1001
+
1002
+ # Aggregate the values of each key, using given combine functions and a neutral zero value.
1003
+ #
1004
+ # == Example:
1005
+ # def combine(x,y)
1006
+ # x+y
1007
+ # end
1008
+ #
1009
+ # def merge(x,y)
1010
+ # x*y
1011
+ # end
1012
+ #
1013
+ # rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
1014
+ # rdd.aggregate_by_key(1, method(:combine), method(:merge))
1015
+ # # => [["b", 3], ["a", 16], ["c", 6]]
1016
+ #
1017
+ def aggregate_by_key(zero_value, seq_func, comb_func, num_partitions=nil)
1018
+ _combine_by_key(
1019
+ [Spark::Command::CombineByKey::CombineWithZero, zero_value, seq_func],
1020
+ [Spark::Command::CombineByKey::Merge, comb_func],
1021
+ num_partitions
1022
+ )
1023
+ end
1024
+
1025
+ # The same functionality as cogroup, but this can group only 2 RDDs and you
+ # can change num_partitions.
1027
+ #
1028
+ # == Example:
1029
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
1030
+ # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
1031
+ # rdd1.group_with(rdd2).collect
1032
+ # # => [["a", [1, 2, 4, 5]], ["b", [3, 6]]]
1033
+ #
1034
+ def group_with(other, num_partitions=nil)
1035
+ self.union(other).group_by_key(num_partitions)
1036
+ end
1037
+
1038
+ # For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the
1039
+ # list of values for that key in `this` as well as `other`.
1040
+ #
1041
+ # == Example:
1042
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3]])
1043
+ # rdd2 = $sc.parallelize([["a", 4], ["a", 5], ["b", 6]])
1044
+ # rdd3 = $sc.parallelize([["a", 7], ["a", 8], ["b", 9]])
1045
+ # rdd1.cogroup(rdd2, rdd3).collect
1046
+ # # => [["a", [1, 2, 4, 5, 7, 8]], ["b", [3, 6, 9]]]
1047
+ #
1048
+ def cogroup(*others)
1049
+ unioned = self
1050
+ others.each do |other|
1051
+ unioned = unioned.union(other)
1052
+ end
1053
+
1054
+ unioned.group_by_key
1055
+ end
1056
+
1057
+ # Return each (key, value) pair in this RDD that has no pair with a matching
+ # key in the other RDD.
1059
+ #
1060
+ # == Example:
1061
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
1062
+ # rdd2 = $sc.parallelize([["b", 5], ["c", 6]])
1063
+ # rdd1.subtract_by_key(rdd2).collect
1064
+ # # => [["a", 1], ["a", 2]]
1065
+ #
1066
+ def subtract_by_key(other, num_partitions=nil)
1067
+ create_combiner = 'lambda{|item| [[item]]}'
1068
+ merge_value = 'lambda{|combiner, item| combiner.first << item; combiner}'
1069
+ merge_combiners = 'lambda{|combiner_1, combiner_2| combiner_1 += combiner_2; combiner_1}'
1070
+
1071
+ self.union(other)
1072
+ .combine_by_key(create_combiner, merge_value, merge_combiners, num_partitions)
1073
+ .filter('lambda{|(key,values)| values.size == 1}')
1074
+ .flat_map_values('lambda{|item| item.first}')
1075
+ end
1076
+
1077
+ # Return an RDD with the elements from self that are not in other.
1078
+ #
1079
+ # == Example:
1080
+ # rdd1 = $sc.parallelize([["a", 1], ["a", 2], ["b", 3], ["c", 4]])
1081
+ # rdd2 = $sc.parallelize([["a", 2], ["c", 6]])
1082
+ # rdd1.subtract(rdd2).collect
1083
+ # # => [["a", 1], ["b", 3], ["c", 4]]
1084
+ #
1085
+ def subtract(other, num_partitions=nil)
1086
+ mapping_function = 'lambda{|x| [x,nil]}'
1087
+
1088
+ self.map(mapping_function)
1089
+ .subtract_by_key(other.map(mapping_function), num_partitions)
1090
+ .keys
1091
+ end
1092
+
1093
+ # Sort the RDD by key
1094
+ #
1095
+ # == Example:
1096
+ # rdd = $sc.parallelize([["c", 1], ["b", 2], ["a", 3]])
1097
+ # rdd.sort_by_key.collect
1098
+ # # => [["a", 3], ["b", 2], ["c", 1]]
1099
+ #
1100
+ def sort_by_key(ascending=true, num_partitions=nil)
1101
+ self.sort_by('lambda{|(key, _)| key}')
1102
+ end
1103
+
1104
+ # Sort the RDD by value
1105
+ #
1106
+ # == Example:
1107
+ # rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
1108
+ # rdd.sort_by_value.collect
1109
+ # # => [["b", 1], ["c", 2], ["a", 3]]
1110
+ #
1111
+ def sort_by_value(ascending=true, num_partitions=nil)
1112
+ self.sort_by('lambda{|(_, value)| value}')
1113
+ end
1114
+
1115
+ # Sorts this RDD by the given key_function
1116
+ #
1117
+ # This is a different implementation than Spark's. sort_by doesn't use
+ # the key_by method first. It can be slower but takes less memory, and
+ # you can always use map.sort_by_key
1120
+ #
1121
+ # == Example:
1122
+ # rdd = $sc.parallelize(["aaaaaaa", "cc", "b", "eeee", "ddd"])
1123
+ #
1124
+ # rdd.sort_by.collect
1125
+ # # => ["aaaaaaa", "b", "cc", "ddd", "eeee"]
1126
+ #
1127
+ # rdd.sort_by(lambda{|x| x.size}).collect
1128
+ # # => ["b", "cc", "ddd", "eeee", "aaaaaaa"]
1129
+ #
1130
+ def sort_by(key_function=nil, ascending=true, num_partitions=nil)
1131
+ key_function ||= 'lambda{|x| x}'
1132
+ num_partitions ||= default_reduce_partitions
1133
+
1134
+ command_klass = Spark::Command::SortByKey
1135
+
1136
+ # Allow spill data to disk due to memory limit
1137
+ # spilling = config['spark.shuffle.spill'] || false
1138
+ spilling = false
1139
+ memory = ''
1140
+
1141
+ # Set spilling to false if worker has unlimited memory
1142
+ if memory.empty?
1143
+ spilling = false
1144
+ memory = nil
1145
+ else
1146
+ memory = to_memory_size(memory)
1147
+ end
1148
+
1149
+ # Sorting should do one worker
1150
+ if num_partitions == 1
1151
+ rdd = self
1152
+ rdd = rdd.coalesce(1) if partitions_size > 1
1153
+ return rdd.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
1154
+ end
1155
+
1156
+ # Compute boundary of collection
1157
+ # Collection should be evenly distributed
1158
+ # 20.0 is from scala RangePartitioner (for roughly balanced output partitions)
1159
+ count = self.count
1160
+ sample_size = num_partitions * 20.0
1161
+ fraction = [sample_size / [count, 1].max, 1.0].min
1162
+ samples = self.sample(false, fraction, 1).map(key_function).collect
1163
+ samples.sort!
1164
+ # Reverse is much faster than reverse sort_by
1165
+ samples.reverse! if !ascending
1166
+
1167
+ # Determine part bounds
1168
+ bounds = determine_bounds(samples, num_partitions)
1169
+
1170
+ shuffled = _partition_by(num_partitions, Spark::Command::PartitionBy::Sorting, key_function, bounds, ascending, num_partitions)
1171
+ shuffled.new_rdd_from_command(command_klass, key_function, ascending, spilling, memory, serializer)
1172
+ end
1173
+
1174
+ # Creates pairs [f(item), item] from the elements of this RDD by applying the function f.
1175
+ #
1176
+ # == Example:
1177
+ # rdd = $sc.parallelize(0..5)
1178
+ # rdd.key_by(lambda{|x| x%2}).collect
1179
+ # # => [[0, 0], [1, 1], [0, 2], [1, 3], [0, 4], [1, 5]]
1180
+ #
1181
+ def key_by(f)
1182
+ new_rdd_from_command(Spark::Command::KeyBy, f)
1183
+ end
1184
+
1185
+ # Pass each value in the key-value pair RDD through a map function without changing
1186
+ # the keys. This also retains the original RDD's partitioning.
1187
+ #
1188
+ # == Example:
1189
+ # rdd = $sc.parallelize(["ruby", "scala", "java"])
1190
+ # rdd = rdd.map(lambda{|x| [x, x]})
1191
+ # rdd = rdd.map_values(lambda{|x| x.upcase})
1192
+ # rdd.collect
1193
+ # # => [["ruby", "RUBY"], ["scala", "SCALA"], ["java", "JAVA"]]
1194
+ #
1195
+ def map_values(f)
1196
+ new_rdd_from_command(Spark::Command::MapValues, f)
1197
+ end
1198
+
1199
+ # Pass each value in the key-value pair RDD through a flat_map function
1200
+ # without changing the keys; this also retains the original RDD's
1201
+ # partitioning.
1202
+ #
1203
+ # == Example:
1204
+ # rdd = $sc.parallelize([["a", [1,2]], ["b", [3]]])
1205
+ # rdd = rdd.flat_map_values(lambda{|x| x*2})
1206
+ # rdd.collect
1207
+ # # => [["a", 1], ["a", 2], ["a", 1], ["a", 2], ["b", 3], ["b", 3]]
1208
+ #
1209
+ def flat_map_values(f)
1210
+ new_rdd_from_command(Spark::Command::FlatMapValues, f)
1211
+ end
1212
+
1213
+ # Return an RDD with the first element of each pair in this PairRDD
1214
+ #
1215
+ # == Example:
1216
+ # rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
1217
+ # rdd.keys.collect
1218
+ # # => [1, 3, 5]
1219
+ #
1220
+ def keys
1221
+ self.map('lambda{|(key, _)| key}')
1222
+ end
1223
+
1224
+ # Return an RDD with the second element of each pair in this PairRDD
1225
+ #
1226
+ # == Example:
1227
+ # rdd = $sc.parallelize([[1,2], [3,4], [5,6]])
1228
+ # rdd.values.collect
1229
+ # # => [2, 4, 6]
1230
+ #
1231
+ def values
1232
+ self.map('lambda{|(_, value)| value}')
1233
+ end
1234
+
1235
+
1236
+ # Aliases
1237
+ alias_method :partitionsSize, :partitions_size
1238
+ alias_method :defaultReducePartitions, :default_reduce_partitions
1239
+ alias_method :setName, :set_name
1240
+ alias_method :addLibrary, :add_library
1241
+ alias_method :require, :add_library
1242
+
1243
+ alias_method :flatMap, :flat_map
1244
+ alias_method :mapPartitions, :map_partitions
1245
+ alias_method :mapPartitionsWithIndex, :map_partitions_with_index
1246
+ alias_method :reduceByKey, :reduce_by_key
1247
+ alias_method :combineByKey, :combine_by_key
1248
+ alias_method :groupByKey, :group_by_key
1249
+ alias_method :groupWith, :group_with
1250
+ alias_method :partitionBy, :partition_by
1251
+ alias_method :defaultReducePartitions, :default_reduce_partitions
1252
+ alias_method :foreachPartition, :foreach_partition
1253
+ alias_method :mapValues, :map_values
1254
+ alias_method :takeSample, :take_sample
1255
+ alias_method :sortBy, :sort_by
1256
+ alias_method :sortByKey, :sort_by_key
1257
+ alias_method :keyBy, :key_by
1258
+ alias_method :groupBy, :group_by
1259
+ alias_method :foldByKey, :fold_by_key
1260
+ alias_method :aggregateByKey, :aggregate_by_key
1261
+ alias_method :subtractByKey, :subtract_by_key
1262
+ alias_method :sampleStdev, :sample_stdev
1263
+ alias_method :sampleVariance, :sample_variance
1264
+
1265
+ private
1266
+
1267
+ # This is the base method for reduce operations. It is used by reduce, fold and aggregate.
+ # The only difference is that fold has a zero value.
1269
+ #
1270
+ def _reduce(klass, seq_op, comb_op, zero_value=nil)
1271
+ if seq_op.nil?
1272
+ # Partitions are already reduced
1273
+ rdd = self
1274
+ else
1275
+ rdd = new_rdd_from_command(klass, seq_op, zero_value)
1276
+ end
1277
+
1278
+ # Send all results to one worker and combine results
1279
+ rdd = rdd.coalesce(1).compact
1280
+
1281
+ # Add the same function to new RDD
1282
+ comm = rdd.add_command(klass, comb_op, zero_value)
1283
+ comm.deserializer = @command.serializer
1284
+
1285
+ # Value is returned in array
1286
+ PipelinedRDD.new(rdd, comm).collect[0]
1287
+ end
1288
+
1289
+ def _partition_by(num_partitions, klass, *args)
1290
+ # RDD is transform from [key, value] to [hash, [key, value]]
1291
+ keyed = new_rdd_from_command(klass, *args)
1292
+ keyed.serializer.unbatch!
1293
+
1294
+ # PairwiseRDD and PythonPartitioner are borrowed from Python
1295
+ # but work great on Ruby too
1296
+ pairwise_rdd = PairwiseRDD.new(keyed.jrdd.rdd).asJavaPairRDD
1297
+ partitioner = PythonPartitioner.new(num_partitions, args.first.object_id)
1298
+ new_jrdd = pairwise_rdd.partitionBy(partitioner).values
1299
+
1300
+ # Reset deserializer
1301
+ RDD.new(new_jrdd, context, @command.serializer, keyed.serializer)
1302
+ end
1303
+
1304
+ # For using a different combine_by_key
1305
+ #
1306
+ # == Used for:
1307
+ # * combine_by_key
1308
+ # * fold_by_key (with zero value)
1309
+ #
1310
+ def _combine_by_key(combine, merge, num_partitions)
1311
+ num_partitions ||= default_reduce_partitions
1312
+
1313
+ # Combine key
1314
+ combined = new_rdd_from_command(combine.shift, *combine)
1315
+
1316
+ # Merge items
1317
+ shuffled = combined.partition_by(num_partitions)
1318
+ merge_comm = shuffled.add_command(merge.shift, *merge)
1319
+
1320
+ PipelinedRDD.new(shuffled, merge_comm)
1321
+ end
1322
+
1323
+ end
1324
+
1325
+ # Pipelined Resilient Distributed Dataset: operations are pipelined and sent to the worker
1326
+ #
1327
+ # RDD
1328
+ # `-- map
1329
+ # `-- map
1330
+ # `-- map
1331
+ #
1332
+ # Code is executed from top to bottom
1333
+ #
1334
+ class PipelinedRDD < RDD
1335
+
1336
+ attr_reader :prev_jrdd, :command
1337
+
1338
+ def initialize(prev, command)
1339
+
1340
+ if prev.is_a?(PipelinedRDD) && prev.pipelinable?
1341
+ # Second, ... stages
1342
+ @prev_jrdd = prev.prev_jrdd
1343
+ else
1344
+ # First stage
1345
+ @prev_jrdd = prev.jrdd
1346
+ end
1347
+
1348
+ @cached = false
1349
+ @checkpointed = false
1350
+
1351
+ @context = prev.context
1352
+ @command = command
1353
+ end
1354
+
1355
+ def pipelinable?
1356
+ !(cached? || checkpointed?)
1357
+ end
1358
+
1359
+ # Serializes the necessary things and sends them to RubyRDD (Scala extension)
1360
+ def jrdd
1361
+ @jrdd ||= _jrdd
1362
+ end
1363
+
1364
+ private
1365
+
1366
+ def _jrdd
1367
+ command = @command.build
1368
+
1369
+ broadcasts = @command.bound_objects.select{|_, value| value.is_a?(Spark::Broadcast)}.values
1370
+ broadcasts = to_java_array_list(broadcasts.map(&:jbroadcast))
1371
+
1372
+ ruby_rdd = RubyRDD.new(@prev_jrdd.rdd, command, broadcasts, @context.jaccumulator)
1373
+ ruby_rdd.asJavaRDD
1374
+ end
1375
+
1376
+ end
1377
+ end
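To tie the RDD API together, an illustrative pipeline (not part of the diff) built only from methods shown above, again assuming a Spark::Context in $sc:

  rdd     = $sc.parallelize(0..10, 2)
  doubled = rdd.map(lambda{|x| x * 2})
  even    = doubled.filter(lambda{|x| x % 4 == 0})

  even.collect                           # => [0, 4, 8, 12, 16, 20]
  even.reduce(lambda{|sum, x| sum + x})  # => 60

  pairs = $sc.parallelize(%w[a b a c b a]).map(lambda{|x| [x, 1]})
  pairs.reduce_by_key(lambda{|x, y| x + y}).collect_as_hash
  # => {"a"=>3, "b"=>2, "c"=>1}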