ruby-spark 1.1.0.1 → 1.2.0

Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
- data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
+ metadata.gz: cd863f728212557da03e76f6e98eeed05695ea5d
+ data.tar.gz: 214b2022187727a50badcd1910313550e59aefdf
  SHA512:
- metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
- data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
+ metadata.gz: 23c0c7b6ab63a2f9c191cddc4836c73cde61722b9e6f3c7e25b090afed7cda2eaff0d8718074ae3337ff5c4bd57e1223dab76f6cf7772b4c7dda3e7ed69d98c6
+ data.tar.gz: 234897b1851614ae1371b3a33417c8d036b00a4551185829b99ef398a110a614ffe1eaeac556c00859a45598bed4219e59eb98ddbffbe0fe2c25c024408b8628
data/.gitignore CHANGED
@@ -35,3 +35,4 @@ ext/spark/project/project/target/*
  wiki
  /benchmark/performance/spark/*
  /benchmark/performance/rspark/*
+ /_*
data/.travis.yml ADDED
@@ -0,0 +1,15 @@
+ language: ruby
+
+ rvm:
+   - 2.2.0
+
+ before_script:
+   - bundle exec rake compile
+   - bundle exec ruby bin/ruby-spark build
+
+ cache:
+   bundler: true
+   directories:
+     - $HOME/.m2
+     - $HOME/.ivy2
+     - $HOME/.sbt
data/CHANGELOG.md ADDED
@@ -0,0 +1,8 @@
+ ## 1.2.0 (15.06.2015)
+
+ - target folder is now located at HOME
+ - better serializers
+ - error when java class does not exist
+ - default setting at ~/.ruby-spark.conf
+ - compatible with Spark 1.4.0
+ - added calling site to RDD
data/README.md CHANGED
@@ -1,8 +1,8 @@
- # Ruby-Spark
+ # Ruby-Spark [![Build Status](https://travis-ci.org/ondra-m/ruby-spark.svg?branch=master)](https://travis-ci.org/ondra-m/ruby-spark)
 
  Apache Spark™ is a fast and general engine for large-scale data processing.
 
- This Gem allows you use Spark functionality on Ruby.
+ This Gem allows you to use Spark functionality from Ruby.
 
  > Word count in Spark's Ruby API
 
@@ -16,7 +16,7 @@ file.flat_map(:split)
 
  - [Apache Spark](http://spark.apache.org)
  - [Wiki](https://github.com/ondra-m/ruby-spark/wiki)
- - [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark)
+ - [Rubydoc](http://www.rubydoc.info/gems/ruby-spark)
 
  ## Installation
 
@@ -24,6 +24,7 @@ file.flat_map(:split)
 
  - Java 7+
  - Ruby 2+
+ - wget or curl
  - MRI or JRuby
 
  Add this line to your application's Gemfile:
@@ -48,12 +49,13 @@ Run `rake compile` if you are using gem from local filesystem.
 
  ### Build Apache Spark
 
- This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target,
+ This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored in your HOME directory.
 
  ```
  $ ruby-spark build
  ```
 
+
  ## Usage
 
  You can use Ruby Spark via interactive shell (Pry is used)
@@ -62,26 +64,13 @@ You can use Ruby Spark via interactive shell (Pry is used)
  $ ruby-spark shell
  ```
 
- Or on existing project
-
- ```ruby
- require 'ruby-spark'
-
- # Create a SparkContext
- Spark.start
-
- # Context reference
- Spark.sc
- ```
+ Or use it in an existing project.
 
  If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
 
  ```ruby
  require 'ruby-spark'
 
- # Use if you have custom SPARK_HOME
- Spark.load_lib(spark_home)
-
  # Configuration
  Spark.config do
    set_app_name "RubySpark"
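
For quick reference, the startup and shutdown flow documented in this section, assembled into one snippet; only calls that appear in the README itself are used:

```ruby
require 'ruby-spark'

# Optional: configure Spark before starting it
Spark.config do
  set_app_name "RubySpark"
end

# Start Apache Spark and keep the context reference
Spark.start
sc = Spark.sc

# ... create RDDs and run operations here ...

# Stop the cluster (the interactive shell does this automatically on exit)
Spark.stop
```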
@@ -91,17 +80,21 @@ end
 
  # Start Apache Spark
  Spark.start
+
+ # Context reference
+ Spark.sc
  ```
 
- Finally, to stop the cluster. On the shell is Spark stopped automatically when you exist.
+ Finally, stop the cluster. In the shell, Spark is stopped automatically when the environment exits.
 
  ```ruby
  Spark.stop
  ```
+ After the first use, a global configuration file is created at **~/.ruby-spark.conf**. Properties for Spark and RubySpark can be specified there.
 
 
 
- ## Creating RDD (upload data)
+ ## Creating RDD (a new collection)
 
  Single text file:
 
@@ -115,28 +108,18 @@ All files on directory:
  rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
  ```
 
- Direct uploading structures from ruby (choosen serializer must be able to serialize it):
+ Directly uploading structures from Ruby:
 
  ```ruby
  rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
  rdd = sc.parallelize(1..5, workers_num, serializer=nil)
  ```
 
- ### Options
+ There are 2 conditions:
+ 1. the chosen serializer must be able to serialize it
+ 2. the data must be iterable
 
- <dl>
-   <dt>workers_num</dt>
-   <dd>
-     Min count of works computing this task.<br>
-     <i>(This value can be overwriten by spark)</i>
-   </dd>
-
-   <dt>serializer</dt>
-   <dd>
-     Custom serializer.<br>
-     <i>(default: by <b>spark.ruby.serializer</b> options)</i>
-   </dd>
- </dl>
+ If you do not specify a serializer, the default is used (defined by the spark.ruby.serializer.* options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.
 
  ## Operations
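
A minimal sketch of the serializer rules above, using only the constructor-string syntax and `parallelize` signature shown in this README; the batch size of 10 is an arbitrary example value:

```ruby
# Explicit serializer: every batch is serialized by Marshal, 10 items per batch
ser = Spark::Serializer.build('batched(marshal, 10)')

rdd_custom  = sc.parallelize(1..5, 2, ser)  # iterable data, Marshal-serializable items
rdd_default = sc.parallelize(1..5, 2)       # falls back to the spark.ruby.serializer.* defaults
```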
 
@@ -145,39 +128,150 @@ All operations can be divided into 2 groups:
  - **Transformations:** append new operation to current RDD and return new
  - **Actions:** add operation and start calculations
 
- See [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+ More information:
+
+ - [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
+ - [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
+ - [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)
+
+ You can also check the official Spark documentation. First make sure that the method is implemented here.
+
+ - [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
+ - [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)
 
  #### Transformations
 
- ```ruby
- rdd.map(lambda{|item| ...})
- rdd.flat_map(lambda{|item| ...})
- rdd.filter(lambda{|item| ...})
- rdd.union(rdd)
- rdd.map_paritions(lambda{|iterator| ...})
- # ...
- ```
+ <dl>
+   <dt><code>rdd.map(function)</code></dt>
+   <dd>Return a new RDD by applying a function to all elements of this RDD.</dd>
+
+   <dt><code>rdd.flat_map(function)</code></dt>
+   <dd>Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.</dd>
+
+   <dt><code>rdd.map_partitions(function)</code></dt>
+   <dd>Return a new RDD by applying a function to each partition of this RDD.</dd>
+
+   <dt><code>rdd.filter(function)</code></dt>
+   <dd>Return a new RDD containing only the elements that satisfy a predicate.</dd>
+
+   <dt><code>rdd.cartesian(other)</code></dt>
+   <dd>Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.</dd>
+
+   <dt><code>rdd.intersection(other)</code></dt>
+   <dd>Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.</dd>
+
+   <dt><code>rdd.sample(with_replacement, fraction, seed)</code></dt>
+   <dd>Return a sampled subset of this RDD. Operations are based on Poisson and Uniform distributions.</dd>
+
+   <dt><code>rdd.group_by_key(num_partitions)</code></dt>
+   <dd>Group the values for each key in the RDD into a single sequence.</dd>
+
+   <dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
+   <dd></dd>
+ </dl>
+
 
  #### Actions
 
- ```ruby
- rdd.count
- rdd.take(n)
- rdd.collect
- # ...
- ```
+ <dl>
+   <dt><code>rdd.take(count)</code></dt>
+   <dd>Take the first count elements of the RDD.</dd>
+
+   <dt><code>rdd.reduce(function)</code></dt>
+   <dd>Reduces the elements of this RDD using the specified lambda or method.</dd>
+
+   <dt><code>rdd.aggregate(zero_value, seq_op, comb_op)</code></dt>
+   <dd>Aggregate the elements of each partition, and then the results for all the partitions, using given combine functions and a neutral “zero value”.</dd>
+
+   <dt><code>rdd.histogram(buckets)</code></dt>
+   <dd>Compute a histogram using the provided buckets.</dd>
+
+   <dt><code>rdd.collect</code></dt>
+   <dd>Return an array that contains all of the elements in this RDD.</dd>
+
+   <dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
+   <dd></dd>
+ </dl>
 
 
  ## Examples
 
- Sum of numbers
+ ##### Basic methods
 
  ```ruby
- sc.parallelize(0..10).sum
- # => 55
+ # Every batch will be serialized by Marshal and will have size 10
+ ser = Spark::Serializer.build('batched(marshal, 10)')
+
+ # Range 0..100, 2 workers, custom serializer
+ rdd = Spark.sc.parallelize(0..100, 2, ser)
+
+
+ # Take first 5 items
+ rdd.take(5)
+ # => [0, 1, 2, 3, 4]
+
+
+ # Numbers reducing
+ rdd.reduce(lambda{|sum, x| sum+x})
+ rdd.reduce(:+)
+ rdd.sum
+ # => 5050
+
+
+ # Reducing with a zero value
+ seq = lambda{|x,y| x+y}
+ com = lambda{|x,y| x*y}
+ rdd.aggregate(1, seq, com)
+ # 1. Every worker adds its numbers
+ # => [1226, 3826]
+ # 2. Results are multiplied
+ # => 4690676
+
+
+ # Statistic method
+ rdd.stats
+ # => StatCounter: (count, mean, max, min, variance,
+ #     sample_variance, stdev, sample_stdev)
+
+
+ # Compute a histogram using the provided buckets.
+ rdd.histogram(2)
+ # => [[0.0, 50.0, 100], [50, 51]]
+
+
+ # Mapping
+ rdd.map(lambda {|x| x*2}).collect
+ # => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...]
+ rdd.map(:to_f).collect
+ # => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...]
+
+
+ # Mapping the whole collection
+ rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
+ # => [1225, 3825]
+
+
+ # Selecting
+ rdd.filter(lambda{|x| x.even?}).collect
+ # => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...]
+
+
+ # Sampling
+ rdd.sample(true, 10).collect
+ # => [3, 36, 40, 54, 58, 82, 86, 95, 98]
+
+
+ # Sampling X items
+ rdd.take_sample(true, 10)
+ # => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32]
+
+
+ # Using external process
+ rdd.pipe('cat', "awk '{print $1*10}'")
+ # => ["0", "10", "20", "30", "40", "50", ...]
  ```
 
- Words count using methods
+ ##### Word count using methods
 
  ```ruby
  # Content:
@@ -198,7 +292,7 @@ rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
  rdd.collect_as_hash
  ```
 
- Estimating PI with a custom serializer
+ ##### Estimating PI with a custom serializer
 
  ```ruby
  slices = 3
@@ -221,7 +315,7 @@ rdd = rdd.map(method(:map))
  puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
  ```
 
- Estimating PI
+ ##### Estimating PI
 
  ```ruby
  rdd = sc.parallelize([10_000], 1)
@@ -230,7 +324,16 @@ rdd = rdd.map(lambda{|x| BigMath.PI(x)})
  rdd.collect # => #<BigDecimal, '0.31415926...'>
  ```
 
- Linear regression
+ ### Mllib (Machine Learning Library)
+
+ Mllib functions use Spark's Machine Learning Library. Ruby objects are serialized and deserialized in Java, so you cannot use custom classes. Only primitive types such as strings or integers are supported.
+
+ All supported methods/models:
+
+ - [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib)
+ - [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib)
+
+ ##### Linear regression
 
  ```ruby
  # Import Mllib classes into Object
@@ -250,3 +353,27 @@ lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0]
 
  lrm.predict([0.0])
  ```
+
+ ##### K-Means
+
+ ```ruby
+ Spark::Mllib.import
+
+ # Dense vectors
+ data = [
+   DenseVector.new([0.0,0.0]),
+   DenseVector.new([1.0,1.0]),
+   DenseVector.new([9.0,8.0]),
+   DenseVector.new([8.0,9.0])
+ ]
+
+ model = KMeans.train(sc.parallelize(data), 2)
+
+ model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])
+ # => true
+ model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])
+ # => true
+ ```
+
+ ## Benchmarks
+
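
The Mllib section covers regression and clustering; below is a classification sketch in the same style. `LogisticRegressionWithSGD` and `LabeledPoint` are assumed to follow the same train/predict pattern as the `LinearRegressionWithSGD` and `KMeans` examples (the corresponding files appear in this release's changed-file list), so treat it as illustrative rather than verbatim API documentation:

```ruby
Spark::Mllib.import

# Labeled training data (LabeledPoint/LogisticRegressionWithSGD are assumed
# to mirror the LinearRegressionWithSGD example above)
data = [
  LabeledPoint.new(0.0, [0.0, 1.0]),
  LabeledPoint.new(0.0, [0.5, 0.5]),
  LabeledPoint.new(1.0, [5.0, 4.0]),
  LabeledPoint.new(1.0, [6.0, 5.0])
]

model = LogisticRegressionWithSGD.train(sc.parallelize(data))

model.predict([0.0, 1.0])   # expected => 0
model.predict([6.0, 4.0])   # expected => 1
```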
data/TODO.md CHANGED
@@ -3,4 +3,6 @@
  - add SQL
  - worker informations (time, memory, ...)
  - killing zombie workers
- - global config to ~/.ruby-spark.conf (e.g. target folder for spark)
+ - add_rb, add_inline_rb to Spark::{Context, RDD}
+ - fix broadcast for cluster
+ - dump to disk if there is memory limit
data/ext/spark/build.sbt CHANGED
@@ -6,15 +6,15 @@ assemblySettings
  val defaultScalaVersion = "2.10.4"
  val defaultSparkVersion = "1.3.0"
  val defaultSparkCoreVersion = "2.10"
- val defaultSparkHome = "target"
+ val defaultTargetDir = "target"
  val defaultHadoopVersion = "1.0.4"
 
  // Values
+ val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
  val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion)
  val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion)
  val _sparkCoreVersion = scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion)
- val _sparkHome = scala.util.Properties.envOrElse("SPARK_HOME", defaultSparkHome)
- val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
+ val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir)
 
  // Project settings
  name := "ruby-spark"
@@ -26,8 +26,8 @@ scalaVersion := _scalaVersion
  javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
 
  // Jar target folder
- artifactPath in Compile in packageBin := file(s"${_sparkHome}/ruby-spark.jar")
- outputPath in packageDependency := file(s"${_sparkHome}/ruby-spark-deps.jar")
+ artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar")
+ outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar")
 
  // Protocol buffer support
  seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)
data/ext/spark/src/main/scala/RubyWorker.scala CHANGED
@@ -123,22 +123,13 @@ object RubyWorker extends Logging {
      executorLocation = env.conf.get("spark.ruby.driver_home")
    }
    else{
-     // Ruby-spark package uri
-     val uri = env.conf.get("spark.ruby.executor.uri", "")
-
-     if(uri.isEmpty){
-       // Use gem installed on the system
-       try {
-         val homeCommand = new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))
-
-         executorLocation = homeCommand.run.readLine
-       } catch {
-         case e: java.io.IOException =>
-           throw new SparkException("Ruby-spark gem is not installed.", e)
-       }
-     }
-     else{
-       // Prepare and use gem from uri
+     // Use gem installed on the system
+     try {
+       val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run
+       executorLocation = homeCommand.readLine
+     } catch {
+       case e: Exception =>
+         throw new SparkException("Ruby-spark gem is not installed.", e)
      }
    }
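
The simplified fallback above shells out to `ruby-spark home` to locate the installed gem on each worker; conceptually that lookup resolves to something like the following RubyGems call (an illustration of the idea, not the gem's actual implementation):

```ruby
require 'rubygems'

# Roughly the question "ruby-spark home" answers: where is the gem installed?
puts Gem::Specification.find_by_name('ruby-spark').gem_dir
```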