RubyGems - ruby-spark - Versions diffs - 1.1.0.1 → 1.2.0 - Mend

ruby-spark 1.1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.travis.yml +15 -0
data/CHANGELOG.md +8 -0
data/README.md +184 -57
data/TODO.md +3 -1
data/ext/spark/build.sbt +5 -5
data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
data/lib/spark.rb +69 -10
data/lib/spark/accumulator.rb +8 -0
data/lib/spark/broadcast.rb +7 -0
data/lib/spark/build.rb +10 -10
data/lib/spark/cli.rb +68 -76
data/lib/spark/config.rb +13 -17
data/lib/spark/context.rb +10 -7
data/lib/spark/error.rb +4 -0
data/lib/spark/helper/statistic.rb +5 -1
data/lib/spark/java_bridge.rb +5 -3
data/lib/spark/java_bridge/base.rb +15 -15
data/lib/spark/java_bridge/jruby.rb +3 -1
data/lib/spark/java_bridge/rjb.rb +2 -0
data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
data/lib/spark/mllib/classification/svm.rb +10 -2
data/lib/spark/mllib/clustering/kmeans.rb +6 -2
data/lib/spark/mllib/regression/lasso.rb +18 -2
data/lib/spark/mllib/regression/linear.rb +11 -3
data/lib/spark/mllib/regression/ridge.rb +18 -2
data/lib/spark/rdd.rb +11 -2
data/lib/spark/serializer.rb +1 -1
data/lib/spark/serializer/auto_batched.rb +7 -0
data/lib/spark/version.rb +1 -1
data/ruby-spark.gemspec +4 -5
data/spec/generator.rb +1 -1
data/spec/lib/collect_spec.rb +10 -10
data/spec/lib/config_spec.rb +10 -10
data/spec/lib/context_spec.rb +116 -115
data/spec/lib/ext_spec.rb +17 -17
data/spec/lib/external_apps_spec.rb +1 -1
data/spec/lib/filter_spec.rb +17 -17
data/spec/lib/flat_map_spec.rb +22 -19
data/spec/lib/group_spec.rb +22 -19
data/spec/lib/helper_spec.rb +60 -12
data/spec/lib/key_spec.rb +9 -8
data/spec/lib/manipulation_spec.rb +15 -15
data/spec/lib/map_partitions_spec.rb +6 -4
data/spec/lib/map_spec.rb +22 -19
data/spec/lib/reduce_by_key_spec.rb +19 -19
data/spec/lib/reduce_spec.rb +22 -20
data/spec/lib/sample_spec.rb +13 -12
data/spec/lib/serializer_spec.rb +27 -0
data/spec/lib/sort_spec.rb +16 -14
data/spec/lib/statistic_spec.rb +4 -2
data/spec/lib/whole_text_files_spec.rb +9 -8
data/spec/spec_helper.rb +3 -3
metadata +19 -18

data/spec/generator.rb CHANGED

@@ -18,7 +18,7 @@ class Generator
   def self.lines(size=1000, letters=3)
     Array.new(size) do
       Array.new(rand(50..100)){
-        (97+rand(letters)).chr + (" " * (rand(10) == 0 ? 1 : 0))
+        (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0))
       }.join
     end
   end

data/spec/lib/collect_spec.rb CHANGED

@@ -1,40 +1,40 @@
-require "spec_helper"
+require 'spec_helper'
-RSpec::describe Spark::RDD do
+RSpec.describe Spark::RDD do
   let(:mapping) { lambda{|x| [x, 1]} }
   let(:numbers) { Generator.numbers }
-  it ".collect_as_hash" do
+  it '.collect_as_hash' do
     rdd = $sc.parallelize(numbers)
     rdd = rdd.map(mapping)
     expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)])
   end
-  context ".take" do
+  context '.take' do
     let(:size)    { 1000 }
     let(:numbers) { Generator.numbers(size) }
     let(:rdd)     { $sc.parallelize(numbers) }
-    it "nothing" do
+    it 'nothing' do
       expect(rdd.take(0)).to eql([])
     end
-    it "first" do
+    it 'first' do
       expect(rdd.first).to eql(numbers.first)
     end
-    it "less than limit" do
+    it 'less than limit' do
       _size = size / 2
       expect(rdd.take(_size)).to eql(numbers.take(_size))
     end
-    it "all" do
+    it 'all' do
       expect(rdd.take(size)).to eql(numbers)
     end
-    it "more than limit" do
+    it 'more than limit' do
       expect(rdd.take(size*2)).to eql(numbers)
     end
   end

data/spec/lib/config_spec.rb CHANGED

@@ -1,6 +1,6 @@
-require "spec_helper"
+require 'spec_helper'
-RSpec::describe Spark::Config do
+RSpec.describe Spark::Config do
   before(:context) do
     Spark.stop
@@ -10,17 +10,17 @@ RSpec::describe Spark::Config do
     spark_start
   end
-  it "should be stopped" do
+  it 'should be stopped' do
     expect(Spark.started?).to be_falsy
   end
-  context "new config" do
+  context 'new config' do
     let(:configuration) do
       {
-        "test.test1" => "test1",
-        "test.test2" => "test2",
-        "test.test3" => "test3"
+        'test.test1' => 'test1',
+        'test.test2' => 'test2',
+        'test.test3' => 'test3'
       }
     end
@@ -28,7 +28,7 @@ RSpec::describe Spark::Config do
       Spark.clear_config
     end
-    it "throught methods" do
+    it 'throught methods' do
       configuration.each do |key, value|
         Spark.config.set(key, value)
       end
@@ -38,7 +38,7 @@ RSpec::describe Spark::Config do
       end
     end
-    it "throught hash style" do
+    it 'throught hash style' do
       configuration.each do |key, value|
         Spark.config[key] = value
       end
@@ -48,7 +48,7 @@ RSpec::describe Spark::Config do
       end
     end
-    it "throught dsl" do
+    it 'throught dsl' do
       configuration.each do |key, value|
         Spark.config {
           set key, value

data/spec/lib/context_spec.rb CHANGED

@@ -46,120 +46,121 @@ RSpec.describe Spark::Context do
     )
   end
-  context '.accumulator' do
-    it 'test' do
-      accum1 = $sc.accumulator(0,)
-      accum2 = $sc.accumulator(1, :*, 1)
-      accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
-      accum1 += 1
-      accum2.add(2)
-      accum2.add(2)
-      accum2.add(2)
-      accum3.add(9)
-      accum3.add(6)
-      accum3.add(7)
-      expect(accum1.value).to eql(1)
-      expect(accum2.value).to eql(8)
-      expect(accum3.value).to eql(9)
-      func = Proc.new do |_, index|
-        accum1.add(1)
-        accum2.add(2)
-        accum3.add(index * 10)
-      end
-      rdd = $sc.parallelize(0..4, 4)
-      rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
-      rdd = rdd.map_partitions_with_index(func)
-      rdd.collect
-      # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-      sleep(1)
-      expect(accum1.value).to eql(5)
-      expect(accum2.value).to eql(128)
-      expect(accum3.value).to eql(30)
-    end
-    context 'accum param' do
-      it 'symbol' do
-        accum1 = $sc.accumulator(1, :+, 0)
-        accum2 = $sc.accumulator(5, :-, 3)
-        accum3 = $sc.accumulator(1, :*, 1)
-        accum4 = $sc.accumulator(1.0, :/, 1.0)
-        accum5 = $sc.accumulator(2, :**, 2)
-        func = Proc.new do |_|
-          accum1.add(1)
-          accum2.add(1)
-          accum3.add(2)
-          accum4.add(2)
-          accum5.add(2)
-        end
-        rdd = $sc.parallelize(0..4, 2)
-        rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
-        rdd = rdd.map_partitions(func)
-        rdd.collect
-        # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-        sleep(1)
-        expect(accum1.value).to eq(3)
-        expect(accum2.value).to eq(1)
-        expect(accum3.value).to eq(4)
-        expect(accum4.value).to eq(4)
-        expect(accum5.value).to eq(65536)
-      end
-      it 'proc' do
-        accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
-        accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
-        accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
-        func = Proc.new do |_|
-          accum1.add(1)
-          accum2.add('a')
-          accum3.add(1)
-        end
-        rdd = $sc.parallelize(0..4, 2)
-        rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
-        rdd = rdd.map_partitions(func)
-        rdd.collect
-        # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-        sleep(1)
-        expect(accum1.value).to eq(3)
-        expect(accum2.value).to eq('aaa')
-        expect(accum3.value).to eq([[1], [1]])
-      end
-      it 'string' do
-        expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
-        accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
-        func = Proc.new do |_|
-          accum.add(1)
-        end
-        rdd = $sc.parallelize(0..4, 2)
-        rdd = rdd.bind(accum: accum)
-        rdd = rdd.map_partitions(func)
-        rdd.collect
-        # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-        sleep(1)
-        expect(accum.value).to eq(3)
-      end
-    end
-  end
+  # context '.accumulator' do
+  #   it 'test' do
+  #     accum1 = $sc.accumulator(0,)
+  #     accum2 = $sc.accumulator(1, :*, 1)
+  #     accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
+  #     accum1 += 1
+  #     accum2.add(2)
+  #     accum2.add(2)
+  #     accum2.add(2)
+  #     accum3.add(9)
+  #     accum3.add(6)
+  #     accum3.add(7)
+  #     expect(accum1.value).to eql(1)
+  #     expect(accum2.value).to eql(8)
+  #     expect(accum3.value).to eql(9)
+  #     func = Proc.new do |_, index|
+  #       accum1.add(1)
+  #       accum2.add(2)
+  #       accum3.add(index * 10)
+  #     end
+  #     rdd = $sc.parallelize(0..4, 4)
+  #     rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+  #     rdd = rdd.map_partitions_with_index(func)
+  #     rdd.collect
+  #     # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #     sleep(1)
+  #     expect(accum1.value).to eql(5)
+  #     expect(accum2.value).to eql(128)
+  #     expect(accum3.value).to eql(30)
+  #   end
+  #   context 'accum param' do
+  #     it 'symbol' do
+  #       accum1 = $sc.accumulator(1, :+, 0)
+  #       accum2 = $sc.accumulator(5, :-, 3)
+  #       accum3 = $sc.accumulator(1, :*, 1)
+  #       accum4 = $sc.accumulator(1.0, :/, 1.0)
+  #       accum5 = $sc.accumulator(2, :**, 2)
+  #       func = Proc.new do |_|
+  #         accum1.add(1)
+  #         accum2.add(1)
+  #         accum3.add(2)
+  #         accum4.add(2)
+  #         accum5.add(2)
+  #       end
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+  #       expect(accum1.value).to eq(3)
+  #       expect(accum2.value).to eq(1)
+  #       expect(accum3.value).to eq(4)
+  #       expect(accum4.value).to eq(4)
+  #       expect(accum5.value).to eq(65536)
+  #     end
+  #     it 'proc' do
+  #       accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
+  #       accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
+  #       accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
+  #       func = Proc.new do |_|
+  #         accum1.add(1)
+  #         accum2.add('a')
+  #         accum3.add(1)
+  #       end
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+  #       expect(accum1.value).to eq(3)
+  #       expect(accum2.value).to eq('aaa')
+  #       expect(accum3.value).to eq([[1], [1]])
+  #     end
+  #     it 'string' do
+  #       expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
+  #       accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
+  #       func = Proc.new do |_|
+  #         accum.add(1)
+  #       end
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum: accum)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+  #       expect(accum.value).to eq(3)
+  #     end
+  #   end
+  # end
 end

data/spec/lib/ext_spec.rb CHANGED

@@ -1,18 +1,18 @@
-require "spec_helper"
+require 'spec_helper'
 RSpec.describe Array do
-  it ".deep_copy" do
-    data = ["a", "b", "c"]
+  it '.deep_copy' do
+    data = ['a', 'b', 'c']
     new_data = data.dup
-    data[0] << "a"
+    data[0] << 'a'
     expect(data).to eql(new_data)
     new_data = data.deep_copy
-    data[1] << "b"
+    data[1] << 'b'
     expect(data).to_not eql(new_data)
   end
@@ -21,19 +21,19 @@ end
 RSpec.describe Hash do
-  it ".stringify_keys!" do
+  it '.stringify_keys!' do
     data = {
-      a: "a",
-      b: "b",
-      c: "c"
+      a: 'a',
+      b: 'b',
+      c: 'c'
     }
     data.stringify_keys!
     expect(data).to eql({
-      "a" => "a",
-      "b" => "b",
-      "c" => "c"
+      'a' => 'a',
+      'b' => 'b',
+      'c' => 'c'
     })
   end
@@ -41,9 +41,9 @@ end
 RSpec.describe String do
-  it ".camelize" do
-    data = "aaa_bbb_ccc".camelize
-    expect(data).to eql("AaaBbbCcc")
+  it '.camelize' do
+    data = 'aaa_bbb_ccc'.camelize
+    expect(data).to eql('AaaBbbCcc')
   end
 end
@@ -56,14 +56,14 @@ RSpec.describe IO do
     file.write_int(1)
     file.write_long(2)
-    file.write_string("3")
+    file.write_string('3')
     file.write_data([4])
     file.rewind
     expect(file.read_int).to eq(1)
     expect(file.read_long).to eq(2)
-    expect(file.read_string).to eq("3")
+    expect(file.read_string).to eq('3')
     expect(file.read_data).to eq([4])
     file.unlink

data/spec/lib/external_apps_spec.rb CHANGED

@@ -1,6 +1,6 @@
 require 'spec_helper'
-RSpec::describe Spark::RDD do
+RSpec.describe Spark::RDD do
   context '.pipe' do
     let(:words)   { Generator.words }

data/spec/lib/filter_spec.rb CHANGED

@@ -1,12 +1,12 @@
-require "spec_helper"
+require 'spec_helper'
 def func4(item)
-  item.start_with?("a") && item.size > 3 && item[1].to_s.ord > 106
+  item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106
 end
-RSpec::shared_examples "a filtering" do |workers|
+RSpec.shared_examples 'a filtering' do |workers|
   context "with #{workers || 'default'} worker" do
-    it "when numbers" do
+    it 'when numbers' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.filter(func1)
       result = numbers.select(&func1)
@@ -20,7 +20,7 @@ RSpec::shared_examples "a filtering" do |workers|
       expect(rdd3.collect).to eql([])
     end
-    it "when words" do
+    it 'when words' do
       rdd2 = rdd_words(workers)
       rdd2 = rdd2.filter(func3)
       result = words.select{|x| func3.call(x)}
@@ -36,12 +36,12 @@ RSpec::shared_examples "a filtering" do |workers|
   end
 end
-RSpec::describe "Spark::RDD.filter" do
+RSpec.describe 'Spark::RDD.filter' do
   let(:func1) { lambda{|x| x.to_i.even?} }
   let(:func2) { lambda{|x| x.to_i.odd?} }
-  let(:func3) { lambda{|x| x.to_s.start_with?("b")} }
+  let(:func3) { lambda{|x| x.to_s.start_with?('b')} }
-  context "throught parallelize" do
+  context 'throught parallelize' do
     let(:numbers) { Generator.numbers_with_zero }
     let(:words)   { Generator.words }
@@ -53,14 +53,14 @@ RSpec::describe "Spark::RDD.filter" do
       $sc.parallelize(words, workers)
     end
-    it_behaves_like "a filtering", nil
-    it_behaves_like "a filtering", 1
-    it_behaves_like "a filtering", rand(2..10)
+    it_behaves_like 'a filtering', 2
+    # it_behaves_like 'a filtering', nil
+    # it_behaves_like 'a filtering', rand(2..10)
   end
-  context "throught text_file" do
-    let(:file_numbers) { File.join("spec", "inputs", "numbers_0_100.txt") }
-    let(:file_words)   { File.join("spec", "inputs", "lorem_300.txt") }
+  context 'throught text_file' do
+    let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
+    let(:file_words)   { File.join('spec', 'inputs', 'lorem_300.txt') }
     let(:numbers) { File.readlines(file_numbers).map(&:strip) }
     let(:words)   { File.readlines(file_words).map(&:strip) }
@@ -73,8 +73,8 @@ RSpec::describe "Spark::RDD.filter" do
       $sc.text_file(file_words, workers)
     end
-    it_behaves_like "a filtering", nil
-    it_behaves_like "a filtering", 1
-    it_behaves_like "a filtering", rand(2..10)
+    it_behaves_like 'a filtering', 2
+    # it_behaves_like 'a filtering', nil
+    # it_behaves_like 'a filtering', rand(2..10)
   end
 end