ruby-spark 1.1.0.1 → 1.2.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd863f728212557da03e76f6e98eeed05695ea5d
+  data.tar.gz: 214b2022187727a50badcd1910313550e59aefdf
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 23c0c7b6ab63a2f9c191cddc4836c73cde61722b9e6f3c7e25b090afed7cda2eaff0d8718074ae3337ff5c4bd57e1223dab76f6cf7772b4c7dda3e7ed69d98c6
+  data.tar.gz: 234897b1851614ae1371b3a33417c8d036b00a4551185829b99ef398a110a614ffe1eaeac556c00859a45598bed4219e59eb98ddbffbe0fe2c25c024408b8628
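If you want to verify a downloaded copy of the gem against these digests, here is a minimal sketch using Ruby's standard Digest library. The file names metadata.gz and data.tar.gz are the ones listed above; the assumption that they sit in the current directory (extracted from the .gem archive) is mine.

```ruby
require 'digest'

# Hypothetical: files extracted from the downloaded .gem archive
%w[metadata.gz data.tar.gz].each do |name|
  content = File.binread(name)
  puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(content)}"
  puts "#{name} SHA512: #{Digest::SHA512.hexdigest(content)}"
end
```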
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -1,8 +1,8 @@
-# Ruby-Spark
+# Ruby-Spark [![Build Status](https://travis-ci.org/ondra-m/ruby-spark.svg?branch=master)](https://travis-ci.org/ondra-m/ruby-spark)
 
 Apache Spark™ is a fast and general engine for large-scale data processing.
 
-This Gem allows
+This Gem allows you to use Spark functionality from Ruby.
 
 > Word count in Spark's Ruby API
 
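The word-count snippet teased here sits in README lines this diff does not display (only `file.flat_map(:split)` survives as hunk context below). As a rough, non-authoritative sketch of what such a count looks like with this gem, using methods that appear elsewhere in this README (`flat_map`, `map`, `reduce_by_key`, `collect_as_hash`); the `text_file` loader name and the file path are assumptions:

```ruby
require 'ruby-spark'
Spark.start

# Hypothetical input file
file = Spark.sc.text_file('sample.txt')

counts = file.flat_map(:split)
             .map(lambda{|word| [word, 1]})
             .reduce_by_key(lambda{|a, b| a + b})

counts.collect_as_hash
# => {"some" => 12, "word" => 3, ...}
```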
@@ -16,7 +16,7 @@ file.flat_map(:split)
 
 - [Apache Spark](http://spark.apache.org)
 - [Wiki](https://github.com/ondra-m/ruby-spark/wiki)
-- [
+- [Rubydoc](http://www.rubydoc.info/gems/ruby-spark)
 
 ## Installation
 
@@ -24,6 +24,7 @@ file.flat_map(:split)
 
 - Java 7+
 - Ruby 2+
+- wget or curl
 - MRI or JRuby
 
 Add this line to your application's Gemfile:
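The Gemfile line itself falls outside this hunk; for completeness, a minimal sketch of what it typically looks like (the version pin is an assumption based on the release this diff describes):

```ruby
# Gemfile
gem 'ruby-spark', '~> 1.2.0'
```

Then run `bundle install`, followed by the `ruby-spark build` step shown in the next hunk.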
@@ -48,12 +49,13 @@ Run `rake compile` if you are using gem from local filesystem.
 
 ### Build Apache Spark
 
-This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation).
+This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information, check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Jars will be stored in your HOME directory.
 
 ```
 $ ruby-spark build
 ```
 
+
 ## Usage
 
 You can use Ruby Spark via interactive shell (Pry is used)
@@ -62,26 +64,13 @@ You can use Ruby Spark via interactive shell (Pry is used)
 $ ruby-spark shell
 ```
 
-Or on existing project
-
-```ruby
-require 'ruby-spark'
-
-# Create a SparkContext
-Spark.start
-
-# Context reference
-Spark.sc
-```
+Or in an existing project.
 
 If you want, configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
 
 ```ruby
 require 'ruby-spark'
 
-# Use if you have custom SPARK_HOME
-Spark.load_lib(spark_home)
-
 # Configuration
 Spark.config do
   set_app_name "RubySpark"
@@ -91,17 +80,21 @@ end
 
 # Start Apache Spark
 Spark.start
+
+# Context reference
+Spark.sc
 ```
 
-Finally, to stop the cluster. On the shell is Spark stopped automatically when
+Finally, stop the cluster. In the shell, Spark is stopped automatically when the environment exits.
 
 ```ruby
 Spark.stop
 ```
+
+After the first use, a global configuration file is created at **~/.ruby-spark.conf**. Properties for Spark and RubySpark can be specified there.
 
 
 
-## Creating RDD (
+## Creating RDD (a new collection)
 
 Single text file:
 
@@ -115,28 +108,18 @@ All files on directory:
 rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct uploading structures from ruby
+Uploading structures directly from Ruby:
 
 ```ruby
 rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
 rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
-
+There are 2 conditions:
+1. the chosen serializer must be able to serialize the data
+2. the data must be iterable
 
-
-
-<dt>workers_num</dt>
-<dd>
-  Min count of works computing this task.<br>
-  <i>(This value can be overwriten by spark)</i>
-</dd>
-
-<dt>serializer</dt>
-<dd>
-  Custom serializer.<br>
-  <i>(default: by <b>spark.ruby.serializer</b> options)</i>
-</dd>
-</dl>
+If you do not specify a serializer, the default one is used (defined by the spark.ruby.serializer.* options). [Check this](https://github.com/ondra-m/ruby-spark/wiki/Loading-data#custom-serializer) if you want to create a custom serializer.
 
 ## Operations
 
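To tie together the serializer and `parallelize` arguments discussed above, a small sketch using the serializer string format that also appears in the Examples section further down (the batch size of 100 is an arbitrary choice):

```ruby
# Build an explicit serializer: Marshal, batches of 100 items
ser = Spark::Serializer.build('batched(marshal, 100)')

# Iterable data (a Range), 2 workers, custom serializer
rdd = Spark.sc.parallelize(1..10, 2, ser)
rdd.collect
# => [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Omitting the serializer falls back to the spark.ruby.serializer.* defaults
rdd_default = Spark.sc.parallelize([1, 2, 3, 4, 5], 2)
rdd_default.collect
# => [1, 2, 3, 4, 5]
```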
@@ -145,39 +128,150 @@ All operations can be divided into 2 groups:
 - **Transformations:** append new operation to current RDD and return new
 - **Actions:** add operation and start calculations
 
-
+More information:
+
+- [Wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD)
+- [Rubydoc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD)
+- [rdd.rb](https://github.com/ondra-m/ruby-spark/blob/master/lib/spark/rdd.rb)
+
+You can also check the official Spark documentation; first make sure that the method is implemented here.
+
+- [Transformations](http://spark.apache.org/docs/latest/programming-guide.html#transformations)
+- [Actions](http://spark.apache.org/docs/latest/programming-guide.html#actions)
 
 #### Transformations
 
-
-rdd.map(
-
-rdd.
-
-
+<dl>
+<dt><code>rdd.map(function)</code></dt>
+<dd>Return a new RDD by applying a function to all elements of this RDD.</dd>
+
+<dt><code>rdd.flat_map(function)</code></dt>
+<dd>Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.</dd>
+
+<dt><code>rdd.map_partitions(function)</code></dt>
+<dd>Return a new RDD by applying a function to each partition of this RDD.</dd>
+
+<dt><code>rdd.filter(function)</code></dt>
+<dd>Return a new RDD containing only the elements that satisfy a predicate.</dd>
+
+<dt><code>rdd.cartesian(other)</code></dt>
+<dd>Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements `(a, b)` where `a` is in `self` and `b` is in `other`.</dd>
+
+<dt><code>rdd.intersection(other)</code></dt>
+<dd>Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.</dd>
+
+<dt><code>rdd.sample(with_replacement, fraction, seed)</code></dt>
+<dd>Return a sampled subset of this RDD. Operations are based on Poisson and Uniform distributions.</dd>
+
+<dt><code>rdd.group_by_key(num_partitions)</code></dt>
+<dd>Group the values for each key in the RDD into a single sequence.</dd>
+
+<dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
+<dd></dd>
+</dl>
+
 
 #### Actions
 
-
-rdd.count
-
-
-
+<dl>
+<dt><code>rdd.take(count)</code></dt>
+<dd>Take the first `count` elements of the RDD.</dd>
+
+<dt><code>rdd.reduce(function)</code></dt>
+<dd>Reduces the elements of this RDD using the specified lambda or method.</dd>
+
+<dt><code>rdd.aggregate(zero_value, seq_op, comb_op)</code></dt>
+<dd>Aggregate the elements of each partition, and then the results for all the partitions, using the given combine functions and a neutral “zero value”.</dd>
+
+<dt><code>rdd.histogram(buckets)</code></dt>
+<dd>Compute a histogram using the provided buckets.</dd>
+
+<dt><code>rdd.collect</code></dt>
+<dd>Return an array that contains all of the elements in this RDD.</dd>
+
+<dt><a href="http://www.rubydoc.info/gems/ruby-spark/Spark/RDD" target="_blank"><code>...many more...</code></a></dt>
+<dd></dd>
+</dl>
 
 
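A tiny illustration of the transformation/action split listed above: transformations only describe the computation and return new RDDs immediately, and nothing runs on the workers until an action is called. Method names are the ones documented in this README.

```ruby
rdd = Spark.sc.parallelize(1..1_000, 2)

# Transformations: build up the pipeline, no computation yet
doubled = rdd.map(lambda{|x| x * 2})
evens   = doubled.filter(lambda{|x| x % 4 == 0})

# Action: triggers the actual work
evens.count
# => 500
```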
 ## Examples
 
-
+##### Basic methods
 
 ```ruby
-
-
+# Every batch will be serialized by Marshal and will have size 10
+ser = Spark::Serializer.build('batched(marshal, 10)')
+
+# Range 0..100, 2 workers, custom serializer
+rdd = Spark.sc.parallelize(0..100, 2, ser)
+
+
+# Take the first 5 items
+rdd.take(5)
+# => [0, 1, 2, 3, 4]
+
+
+# Reducing numbers
+rdd.reduce(lambda{|sum, x| sum+x})
+rdd.reduce(:+)
+rdd.sum
+# => 5050
+
+
+# Reducing with a zero value
+seq = lambda{|x,y| x+y}
+com = lambda{|x,y| x*y}
+rdd.aggregate(1, seq, com)
+# 1. Every worker adds its numbers
+# => [1226, 3826]
+# 2. The results are multiplied
+# => 4690676
+
+
+# Statistics
+rdd.stats
+# => StatCounter: (count, mean, max, min, variance,
+#                  sample_variance, stdev, sample_stdev)
+
+
+# Compute a histogram using the provided buckets
+rdd.histogram(2)
+# => [[0.0, 50.0, 100], [50, 51]]
+
+
+# Mapping
+rdd.map(lambda {|x| x*2}).collect
+# => [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ...]
+rdd.map(:to_f).collect
+# => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...]
+
+
+# Mapping whole partitions
+rdd.map_partitions(lambda{|part| part.reduce(:+)}).collect
+# => [1225, 3825]
+
+
+# Selecting
+rdd.filter(lambda{|x| x.even?}).collect
+# => [0, 2, 4, 6, 8, 10, 12, 14, 16, ...]
+
+
+# Sampling
+rdd.sample(true, 10).collect
+# => [3, 36, 40, 54, 58, 82, 86, 95, 98]
+
+
+# Sampling X items
+rdd.take_sample(true, 10)
+# => [53, 87, 71, 74, 18, 75, 55, 94, 46, 32]
+
+
+# Using an external process
+rdd.pipe('cat', "awk '{print $1*10}'")
+# => ["0", "10", "20", "30", "40", "50", ...]
 ```
 
-Words count using methods
+##### Word count using methods
 
 ```ruby
 # Content:
@@ -198,7 +292,7 @@ rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
 rdd.collect_as_hash
 ```
 
-Estimating PI with a custom serializer
+##### Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -221,7 +315,7 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
-Estimating PI
+##### Estimating PI
 
 ```ruby
 rdd = sc.parallelize([10_000], 1)
@@ -230,7 +324,16 @@ rdd = rdd.map(lambda{|x| BigMath.PI(x)})
 rdd.collect # => #<BigDecimal, '0.31415926...'>
 ```
 
-
+### Mllib (Machine Learning Library)
+
+Mllib functions use Spark's Machine Learning Library. Ruby objects are serialized and deserialized in Java, so you cannot use custom classes; only primitive types such as strings and integers are supported.
+
+All supported methods/models:
+
+- [Rubydoc / Mllib](http://www.rubydoc.info/github/ondra-m/ruby-spark/Spark/Mllib)
+- [Github / Mllib](https://github.com/ondra-m/ruby-spark/tree/master/lib/spark/mllib)
+
+##### Linear regression
 
 ```ruby
 # Import Mllib classes into Object
@@ -250,3 +353,27 @@ lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0]
 
 lrm.predict([0.0])
 ```
+
+##### K-Means
+
+```ruby
+Spark::Mllib.import
+
+# Dense vectors
+data = [
+    DenseVector.new([0.0,0.0]),
+    DenseVector.new([1.0,1.0]),
+    DenseVector.new([9.0,8.0]),
+    DenseVector.new([8.0,9.0])
+]
+
+model = KMeans.train(sc.parallelize(data), 2)
+
+model.predict([0.0, 0.0]) == model.predict([1.0, 1.0])
+# => true
+model.predict([8.0, 9.0]) == model.predict([9.0, 8.0])
+# => true
+```
+
+## Benchmarks
+
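The `data` used by the linear-regression hunk above (the `LinearRegressionWithSGD.train` context lines) is not shown in this diff. A plausible construction, assuming the gem mirrors Spark Mllib's `LabeledPoint(label, features)` API as the other Mllib examples suggest; the sample values are invented for illustration:

```ruby
Spark::Mllib.import

# Label followed by a feature vector
data = [
  LabeledPoint.new(0.0, [0.0]),
  LabeledPoint.new(1.0, [1.0]),
  LabeledPoint.new(2.0, [2.0]),
  LabeledPoint.new(3.0, [3.0])
]

lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
lrm.predict([0.0])
```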
data/TODO.md
CHANGED
@@ -3,4 +3,6 @@
 - add SQL
 - worker informations (time, memory, ...)
 - killing zombie workers
-
+- add_rb, add_inline_rb to Spark::{Context, RDD}
+- fix broadcast for cluster
+- dump to disk if there is memory limit
data/ext/spark/build.sbt
CHANGED
@@ -6,15 +6,15 @@ assemblySettings
 val defaultScalaVersion = "2.10.4"
 val defaultSparkVersion = "1.3.0"
 val defaultSparkCoreVersion = "2.10"
-val
+val defaultTargetDir = "target"
 val defaultHadoopVersion = "1.0.4"
 
 // Values
+val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
 val _scalaVersion = scala.util.Properties.envOrElse("SCALA_VERSION", defaultScalaVersion)
 val _sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", defaultSparkVersion)
 val _sparkCoreVersion = scala.util.Properties.envOrElse("SPARK_CORE_VERSION", defaultSparkCoreVersion)
-val
-val _hadoopVersion = scala.util.Properties.envOrElse("HADOOP_VERSION", defaultHadoopVersion)
+val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", defaultTargetDir)
 
 // Project settings
 name := "ruby-spark"
@@ -26,8 +26,8 @@ scalaVersion := _scalaVersion
 javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
 
 // Jar target folder
-artifactPath in Compile in packageBin := file(s"${
-outputPath in packageDependency := file(s"${
+artifactPath in Compile in packageBin := file(s"${_targetDir}/ruby-spark.jar")
+outputPath in packageDependency := file(s"${_targetDir}/ruby-spark-deps.jar")
 
 // Protocol buffer support
 seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)
data/ext/spark/src/main/scala/RubyWorker.scala
CHANGED
@@ -123,22 +123,13 @@ object RubyWorker extends Logging {
       executorLocation = env.conf.get("spark.ruby.driver_home")
     }
     else{
-      //
-
-
-
-        executorLocation = homeCommand.run.readLine
-      } catch {
-        case e: java.io.IOException =>
-          throw new SparkException("Ruby-spark gem is not installed.", e)
-      }
-    }
-    else{
-      // Prepare and use gem from uri
+      // Use gem installed on the system
+      try {
+        val homeCommand = (new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))).run
+        executorLocation = homeCommand.readLine
+      } catch {
+        case e: Exception =>
+          throw new SparkException("Ruby-spark gem is not installed.", e)
       }
     }
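The Scala side above locates the executor code by running `ruby-spark home` and reading a single line. A sketch of what that command plausibly resolves to on the Ruby side; the actual CLI implementation lives in lib/spark/cli.rb and is not shown in this diff, so treat this as an assumption:

```ruby
require 'rubygems'

# Hypothetical equivalent of `ruby-spark home`:
# print the installed gem's root path on a single line
spec = Gem::Specification.find_by_name('ruby-spark')
puts spec.gem_dir
```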