ruby-spark 1.0.0 → 1.1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 13c074c360edf1875745bf728b274f5bfb6e0d0a
-  data.tar.gz: a59ba09fac91e2e4c0a58aab99dd60ffb2ec5e3f
+  metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
+  data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
 SHA512:
-  metadata.gz: 2e667dabd55b05100831cf3d0e58044941ce965965de1d6dce9a4e8fa5be843bfac1d57d5b6674ea056b9e9395650b84e3e77a86f0a256a1b1c4bfd8ca257340
-  data.tar.gz: 690b88857fa4f841c8c0a5940af75d926285f42a81ff6753337ccc926473a4a3ce510d1c7e9b07ce2ebfd2559c05dd77432cf1dba132f15629669fbf8dfc51b6
+  metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
+  data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
data/.gitignore CHANGED
@@ -32,6 +32,6 @@ mkmf.log
 ext/spark/target/*
 ext/spark/project/target/*
 ext/spark/project/project/target/*
-wiki/*
+wiki
 /benchmark/performance/spark/*
 /benchmark/performance/rspark/*
data/README.md CHANGED
@@ -20,6 +20,12 @@ file.flat_map(:split)
 
 ## Installation
 
+### Requirments
+
+- Java 7+
+- Ruby 2+
+- MRI or JRuby
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -38,33 +44,34 @@ Or install it yourself as:
 $ gem install ruby-spark
 ```
 
-### Install Apache Spark
+Run `rake compile` if you are using gem from local filesystem.
 
-To install latest supported Spark. First compile native extension:
+### Build Apache Spark
+
+This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target,
 
-```
-$ rake compile
-```
-Then build Spark, [SBT](ext/spark/build.sbt) is used for compiling.
 ```
 $ ruby-spark build
 ```
 
 ## Usage
 
-You can use Ruby Spark via interactive shell
+You can use Ruby Spark via interactive shell (Pry is used)
 
 ```
-$ ruby-spark pry
+$ ruby-spark shell
 ```
 
 Or on existing project
 
 ```ruby
 require 'ruby-spark'
+
+# Create a SparkContext
 Spark.start
 
-Spark.sc # => context
+# Context reference
+Spark.sc
 ```
 
 If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
@@ -72,36 +79,47 @@ If you want configure Spark first. See [configurations](https://github.com/ondra
 ```ruby
 require 'ruby-spark'
 
+# Use if you have custom SPARK_HOME
 Spark.load_lib(spark_home)
+
+# Configuration
 Spark.config do
   set_app_name "RubySpark"
-  set 'spark.ruby.batch_size', 100
   set 'spark.ruby.serializer', 'oj'
+  set 'spark.ruby.serializer.batch_size', 100
 end
+
+# Start Apache Spark
 Spark.start
+```
+
+Finally, to stop the cluster. On the shell is Spark stopped automatically when you exist.
 
-Spark.sc # => context
+```ruby
+Spark.stop
 ```
 
-## Uploading a data
 
-Single file
+
+## Creating RDD (upload data)
+
+Single text file:
 
 ```ruby
-$sc.text_file(FILE, workers_num, custom_options)
+rdd = sc.text_file(FILE, workers_num, serializer=nil)
 ```
 
-All files on directory
+All files on directory:
 
 ```ruby
-$sc.whole_text_files(DIRECTORY, workers_num, custom_options)
+rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct
+Direct uploading structures from ruby (choosen serializer must be able to serialize it):
 
 ```ruby
-$sc.parallelize([1,2,3,4,5], workers_num, custom_options)
-$sc.parallelize(1..5, workers_num, custom_options)
+rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
+rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
 ### Options
@@ -113,39 +131,74 @@ $sc.parallelize(1..5, workers_num, custom_options)
   <i>(This value can be overwriten by spark)</i>
   </dd>
 
-  <dt>custom_options</dt>
+  <dt>serializer</dt>
   <dd>
-    <b>serializer</b>: name of serializator used for this RDD<br>
-    <b>batch_size</b>: see configuration<br>
-    <br>
-    <i>(Available only for parallelize)</i><br>
-    <b>use</b>: <i>direct (upload direct to java)</i>, <i>file (upload throught a file)</i>
+    Custom serializer.<br>
+    <i>(default: by <b>spark.ruby.serializer</b> options)</i>
   </dd>
 </dl>
 
+## Operations
+
+All operations can be divided into 2 groups:
+
+- **Transformations:** append new operation to current RDD and return new
+- **Actions:** add operation and start calculations
+
+See [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+
+#### Transformations
+
+```ruby
+rdd.map(lambda{|item| ...})
+rdd.flat_map(lambda{|item| ...})
+rdd.filter(lambda{|item| ...})
+rdd.union(rdd)
+rdd.map_paritions(lambda{|iterator| ...})
+# ...
+```
+
+#### Actions
+
+```ruby
+rdd.count
+rdd.take(n)
+rdd.collect
+# ...
+```
+
 
 ## Examples
 
 Sum of numbers
 
 ```ruby
-$sc.parallelize(0..10).sum
+sc.parallelize(0..10).sum
 # => 55
 ```
 
 Words count using methods
 
 ```ruby
-rdd = $sc.text_file(PATH)
+# Content:
+# "first line"
+# "second line"
+rdd = sc.text_file(PATH)
 
+# ["first", "line", "second", "line"]
 rdd = rdd.flat_map(lambda{|line| line.split})
-         .map(lambda{|word| [word, 1]})
-         .reduce_by_key(lambda{|a, b| a+b})
 
+# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
+rdd = rdd.map(lambda{|word| [word, 1]})
+
+# [["first", 1], ["line", 2], ["second", 1]]
+rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
+
+# {"first"=>1, "line"=>2, "second"=>1}
 rdd.collect_as_hash
 ```
 
-Estimating pi with a custom serializer
+Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -168,18 +221,32 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
+Estimating PI
+
+```ruby
+rdd = sc.parallelize([10_000], 1)
+rdd = rdd.add_library('bigdecimal/math')
+rdd = rdd.map(lambda{|x| BigMath.PI(x)})
+rdd.collect # => #<BigDecimal, '0.31415926...'>
+```
+
 Linear regression
 
 ```ruby
-Spark::Mllib.import
+# Import Mllib classes into Object
+# Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD
+Spark::Mllib.import(Object)
 
+# Training data
 data = [
   LabeledPoint.new(0.0, [0.0]),
   LabeledPoint.new(1.0, [1.0]),
  LabeledPoint.new(3.0, [2.0]),
  LabeledPoint.new(2.0, [3.0])
 ]
-lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
+
+# Train a model
+lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
 
 lrm.predict([0.0])
 ```
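
For readers skimming the diff: the README changes above replace the global `$sc` with `sc`, move batching under the `spark.ruby.serializer.batch_size` key, and document an explicit `Spark.stop`. A minimal end-to-end sketch assembled from the documented snippets; the input path `words.txt` is a hypothetical placeholder, and the `oj` serializer is assumed to be available as the README suggests:

```ruby
require 'ruby-spark'

# Configuration keys as documented in the updated README
Spark.config do
  set_app_name 'RubySpark'
  set 'spark.ruby.serializer', 'oj'
  set 'spark.ruby.serializer.batch_size', 100
end

Spark.start
sc = Spark.sc

# Word count, mirroring the README example ('words.txt' is a placeholder path)
counts = sc.text_file('words.txt')
           .flat_map(lambda{|line| line.split})
           .map(lambda{|word| [word, 1]})
           .reduce_by_key(lambda{|a, b| a + b})
           .collect_as_hash

puts counts.inspect
Spark.stop
```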
data/TODO.md CHANGED
@@ -1,7 +1,6 @@
-- add compress
 - refactor JavaBridge: to_java, from_java
 - add Streaming
 - add SQL
-- autobatch serializer
 - worker informations (time, memory, ...)
-- encoding parameter to context.text_file
+- killing zombie workers
+- global config to ~/.ruby-spark.conf (e.g. target folder for spark)
data/benchmark/{performance → comparison}/* RENAMED (files without changes)
data/example/pi.rb CHANGED
@@ -22,7 +22,7 @@ def map(_)
   end
 end
 
-rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
+rdd = Spark.context.parallelize(1..n, slices)
 rdd = rdd.map(method(:map))
 
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
data/example/website_search.rb ADDED
@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+# Parse sitemap and search word on every page
+
+require 'optparse'
+require 'open-uri'
+require 'nokogiri'
+require 'ruby-spark'
+
+options = {
+  sitemap: 'http://fit.cvut.cz/sitemap.xml',
+  query: 'cvut',
+  workers: 2
+}
+
+opt_parser = OptionParser.new do |opts|
+  opts.banner = 'Usage: website_search.rb [options]'
+
+  opts.separator ''
+  opts.separator 'Specific options:'
+
+  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
+    options[:sitemap] = sitemap
+  end
+
+  opts.on('-q', '--query QUERY', 'Query to search') do |query|
+    options[:query] = query
+  end
+
+  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
+    options[:workers] = workers
+  end
+
+  opts.on('--quite', 'Run quitely') do |v|
+    Spark.logger.disabled
+  end
+
+  opts.on_tail('-h', '--help', 'Show this message') do
+    puts opts
+    exit
+  end
+end
+
+opt_parser.parse!
+
+@links = []
+
+def parse_sitemap(doc)
+  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
+    next_doc = Nokogiri::HTML(open(loc.text))
+    parse_sitemap(next_doc)
+  end
+
+  doc.xpath('//url/loc').each do |loc|
+    @links << loc.text
+  end
+end
+
+doc = Nokogiri::HTML(open(options[:sitemap]))
+parse_sitemap(doc)
+
+# Map function
+func = Proc.new do |url|
+  begin
+    open(url) {|f|
+      [url, f.read.scan(query).count]
+    }
+  rescue
+    [url, 0]
+  end
+end
+
+Spark.start
+
+rdd = Spark.sc.parallelize(@links, options[:workers])
+          .add_library('open-uri')
+          .bind(query: options[:query])
+          .map(func)
+          .sort_by(lambda{|(_, value)| value}, false)
+
+rdd.collect.each do |(url, count)|
+  puts "#{url} => #{count}"
+end
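
The new example above is self-contained. Assuming `nokogiri` is installed and the gem's Spark build has been run, it can be invoked roughly like this (the flags come from the OptionParser block in the diff):

```
$ ruby website_search.rb --sitemap http://fit.cvut.cz/sitemap.xml --query cvut --workers 2
```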
data/ext/spark/src/main/scala/RubyRDD.scala CHANGED
@@ -290,9 +290,21 @@ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev)
 
 object RubyRDD extends Logging {
 
+  def runJob(
+      sc: SparkContext,
+      rdd: JavaRDD[Array[Byte]],
+      partitions: ArrayList[Int],
+      allowLocal: Boolean,
+      filename: String): String = {
+    type ByteArray = Array[Byte]
+    type UnrolledPartition = Array[ByteArray]
+    val allPartitions: Array[UnrolledPartition] =
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
+    writeRDDToFile(flattenedPartition.iterator, filename)
+  }
+
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
-    // Too slow
-    // val file = new DataInputStream(new FileInputStream(filename))
     val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
     val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
     try {
@@ -308,6 +320,22 @@ object RubyRDD extends Logging {
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
 
+  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
+    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
+
+    try {
+      PythonRDD.writeIteratorToStream(items, file)
+    } finally {
+      file.close()
+    }
+
+    filename
+  }
+
+  def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
+    writeRDDToFile(rdd.collect.iterator, filename)
+  }
+
   def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
     sc.broadcast(new RubyBroadcast(path, id))
   }
data/lib/spark.rb CHANGED
@@ -190,9 +190,9 @@ require 'spark/version'
 require 'spark/error'
 
 # Make sure that Spark be always stopped
-Kernel::at_exit do
+Kernel.at_exit do
   begin
-    Spark.stop
+    Spark.started? && Spark.stop
   rescue
   end
 end
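
The exit hook is now guarded by `Spark.started?`, so requiring the gem without ever starting a context no longer triggers a shutdown attempt at exit. A minimal sketch of the same guard pattern in application code, assuming (as the diff implies) that `Spark.started?` is false until `Spark.start` has been called:

```ruby
require 'ruby-spark'

at_exit do
  begin
    # Only tear down a context that was actually started
    Spark.stop if Spark.started?
  rescue StandardError
    # errors raised during interpreter shutdown are intentionally ignored
  end
end

Spark.start
# ... work with Spark.sc ...
```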
data/lib/spark/build.rb CHANGED
@@ -30,7 +30,7 @@ module Spark
       cmd = [SBT]
       cmd << SBT_EXT
       cmd << SBT_DEPS unless only_ext
-      cmd << SBT_CLEAN unless $debug
+      cmd << SBT_CLEAN unless $DEBUG
 
       Dir.chdir(Spark.spark_ext_dir) do
         unless Kernel.system(env, cmd.join(' '))
data/lib/spark/cli.rb CHANGED
@@ -21,7 +21,7 @@ module Spark
       program :version, Spark::VERSION
       program :description, 'Ruby wrapper for Spark'
 
-      global_option('-d', '--debug', 'Logging message to stdout'){ $debug = true }
+      global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
       default_command :help
 
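
Both build.rb and cli.rb switch from a custom `$debug` global to Ruby's built-in `$DEBUG` flag, which the interpreter also sets when run with `-d`. A small illustrative sketch of the resulting behaviour; the command array below is a stand-in, not the gem's actual SBT constants:

```ruby
# $DEBUG becomes true when Ruby is started with `ruby -d`, or after the
# CLI's --debug switch above assigns it explicitly.
cmd = ['sbt', 'assembly']      # stand-in for [SBT, SBT_EXT, ...]
cmd << 'clean' unless $DEBUG   # skip the clean step while debugging
puts cmd.join(' ')
```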