ruby-spark 1.1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
    
        data/spec/lib/flat_map_spec.rb
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec | 
| 3 | 
            +
            RSpec.shared_examples 'a flat mapping' do |workers|
         | 
| 4 4 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 5 5 | 
             
                rdd2 = rdd(workers).map(func1)
         | 
| 6 6 | 
             
                result = numbers.flat_map(&func1)
         | 
| @@ -24,7 +24,7 @@ RSpec::shared_examples "a flat mapping" do |workers| | |
| 24 24 | 
             
              end
         | 
| 25 25 | 
             
            end
         | 
| 26 26 |  | 
| 27 | 
            -
            RSpec | 
| 27 | 
            +
            RSpec.shared_examples 'a flat mapping values' do |workers|
         | 
| 28 28 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 29 29 | 
             
                rdd2 = rdd(workers).flat_map_values(func1)
         | 
| 30 30 | 
             
                result = []
         | 
| @@ -50,25 +50,26 @@ RSpec::shared_examples "a flat mapping values" do |workers| | |
| 50 50 | 
             
              end
         | 
| 51 51 | 
             
            end
         | 
| 52 52 |  | 
| 53 | 
            -
            RSpec | 
| 53 | 
            +
            RSpec.describe 'Spark::RDD' do
         | 
| 54 54 | 
             
              let(:func1) { lambda{|x| x*2} }
         | 
| 55 55 | 
             
              let(:func2) { lambda{|x| [x*3, 1, 1]} }
         | 
| 56 56 | 
             
              let(:func3) { lambda{|x| [x*4, 2, 2]} }
         | 
| 57 57 |  | 
| 58 | 
            -
              context  | 
| 59 | 
            -
                context  | 
| 58 | 
            +
              context 'throught parallelize' do
         | 
| 59 | 
            +
                context '.flat_map' do
         | 
| 60 60 | 
             
                  let(:numbers) { Generator.numbers_with_zero }
         | 
| 61 61 |  | 
| 62 62 | 
             
                  def rdd(workers)
         | 
| 63 63 | 
             
                    $sc.parallelize(numbers, workers)
         | 
| 64 64 | 
             
                  end
         | 
| 65 65 |  | 
| 66 | 
            -
                  it_behaves_like  | 
| 67 | 
            -
                  it_behaves_like  | 
| 68 | 
            -
                  it_behaves_like  | 
| 66 | 
            +
                  it_behaves_like 'a flat mapping', 1
         | 
| 67 | 
            +
                  it_behaves_like 'a flat mapping', 2
         | 
| 68 | 
            +
                  # it_behaves_like 'a flat mapping', nil
         | 
| 69 | 
            +
                  # it_behaves_like 'a flat mapping', rand(2..10)
         | 
| 69 70 | 
             
                end
         | 
| 70 71 |  | 
| 71 | 
            -
                context  | 
| 72 | 
            +
                context '.flat_map_values' do
         | 
| 72 73 | 
             
                  let(:func1) { lambda{|x| x*2} }
         | 
| 73 74 | 
             
                  let(:func2) { lambda{|x| [x.first]} }
         | 
| 74 75 | 
             
                  let(:hash_with_values) { Generator.hash_with_values }
         | 
| @@ -77,24 +78,26 @@ RSpec::describe "Spark::RDD" do | |
| 77 78 | 
             
                    $sc.parallelize(hash_with_values, workers)
         | 
| 78 79 | 
             
                  end
         | 
| 79 80 |  | 
| 80 | 
            -
                  it_behaves_like  | 
| 81 | 
            -
                  it_behaves_like  | 
| 82 | 
            -
                  it_behaves_like  | 
| 81 | 
            +
                  it_behaves_like 'a flat mapping values', 1
         | 
| 82 | 
            +
                  it_behaves_like 'a flat mapping values', 2
         | 
| 83 | 
            +
                  # it_behaves_like 'a flat mapping values', nil
         | 
| 84 | 
            +
                  # it_behaves_like 'a flat mapping values', rand(2..10)
         | 
| 83 85 | 
             
                end
         | 
| 84 86 | 
             
              end
         | 
| 85 87 |  | 
| 86 | 
            -
              context  | 
| 87 | 
            -
                context  | 
| 88 | 
            -
                  let(:file)    { File.join( | 
| 88 | 
            +
              context 'throught text_file' do
         | 
| 89 | 
            +
                context '.flat_map' do
         | 
| 90 | 
            +
                  let(:file)    { File.join('spec', 'inputs', 'numbers_0_100.txt') }
         | 
| 89 91 | 
             
                  let(:numbers) { File.readlines(file).map(&:strip) }
         | 
| 90 92 |  | 
| 91 93 | 
             
                  def rdd(workers)
         | 
| 92 94 | 
             
                    $sc.text_file(file, workers)
         | 
| 93 95 | 
             
                  end
         | 
| 94 96 |  | 
| 95 | 
            -
                  it_behaves_like  | 
| 96 | 
            -
                  it_behaves_like  | 
| 97 | 
            -
                  it_behaves_like  | 
| 97 | 
            +
                  it_behaves_like 'a flat mapping', 1
         | 
| 98 | 
            +
                  it_behaves_like 'a flat mapping', 2
         | 
| 99 | 
            +
                  # it_behaves_like 'a flat mapping', nil
         | 
| 100 | 
            +
                  # it_behaves_like 'a flat mapping', rand(2..10)
         | 
| 98 101 | 
             
                end
         | 
| 99 102 | 
             
              end
         | 
| 100 103 | 
             
            end
         | 
    
        data/spec/lib/group_spec.rb
    CHANGED
    
    | @@ -1,26 +1,26 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec | 
| 3 | 
            +
            RSpec.shared_examples 'a groupping by key' do |workers|
         | 
| 4 4 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 5 5 | 
             
                expect(rdd_result(workers)).to eql(result)
         | 
| 6 6 | 
             
              end
         | 
| 7 7 | 
             
            end
         | 
| 8 8 |  | 
| 9 | 
            -
            RSpec | 
| 9 | 
            +
            RSpec.shared_examples 'a cogroupping by key' do |workers|
         | 
| 10 10 | 
             
              context "with #{workers || 'default'} worker" do
         | 
| 11 | 
            -
                it  | 
| 11 | 
            +
                it '.group_with' do
         | 
| 12 12 | 
             
                  rdd = rdd_1(workers).group_with(rdd_2(workers))
         | 
| 13 13 | 
             
                  expect(rdd.collect_as_hash).to eql(result_12)
         | 
| 14 14 | 
             
                end
         | 
| 15 15 |  | 
| 16 | 
            -
                it  | 
| 16 | 
            +
                it '.cogroup' do
         | 
| 17 17 | 
             
                  rdd = rdd_1(workers).cogroup(rdd_2(workers), rdd_3(workers))
         | 
| 18 18 | 
             
                  expect(rdd.collect_as_hash).to eql(result_123)
         | 
| 19 19 | 
             
                end
         | 
| 20 20 | 
             
              end
         | 
| 21 21 | 
             
            end
         | 
| 22 22 |  | 
| 23 | 
            -
            RSpec | 
| 23 | 
            +
            RSpec.shared_examples 'a groupping by' do |workers|
         | 
| 24 24 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 25 25 | 
             
                rdd = rdd_numbers(workers)
         | 
| 26 26 | 
             
                rdd = rdd.group_by(key_function1)
         | 
| @@ -34,7 +34,7 @@ RSpec::shared_examples "a groupping by" do |workers| | |
| 34 34 | 
             
              end
         | 
| 35 35 | 
             
            end
         | 
| 36 36 |  | 
| 37 | 
            -
            RSpec | 
| 37 | 
            +
            RSpec.describe 'Spark::RDD' do
         | 
| 38 38 |  | 
| 39 39 | 
             
              def make_result(*hashes)
         | 
| 40 40 | 
             
                _result = {}
         | 
| @@ -47,7 +47,7 @@ RSpec::describe "Spark::RDD" do | |
| 47 47 | 
             
                _result
         | 
| 48 48 | 
             
              end
         | 
| 49 49 |  | 
| 50 | 
            -
              context  | 
| 50 | 
            +
              context '.group_by_key' do
         | 
| 51 51 | 
             
                let(:hash) { Generator.hash }
         | 
| 52 52 | 
             
                let(:result) { make_result(hash) }
         | 
| 53 53 |  | 
| @@ -56,12 +56,13 @@ RSpec::describe "Spark::RDD" do | |
| 56 56 | 
             
                  rdd.group_by_key.collect_as_hash
         | 
| 57 57 | 
             
                end
         | 
| 58 58 |  | 
| 59 | 
            -
                it_behaves_like  | 
| 60 | 
            -
                it_behaves_like  | 
| 61 | 
            -
                it_behaves_like  | 
| 59 | 
            +
                it_behaves_like 'a groupping by key', 1
         | 
| 60 | 
            +
                it_behaves_like 'a groupping by key', 2
         | 
| 61 | 
            +
                # it_behaves_like 'a groupping by key', nil
         | 
| 62 | 
            +
                # it_behaves_like 'a groupping by key', rand(2..10)
         | 
| 62 63 | 
             
              end
         | 
| 63 64 |  | 
| 64 | 
            -
              context  | 
| 65 | 
            +
              context 'cogroup' do
         | 
| 65 66 | 
             
                let(:hash1) { Generator.hash }
         | 
| 66 67 | 
             
                let(:hash2) { Generator.hash }
         | 
| 67 68 | 
             
                let(:hash3) { Generator.hash }
         | 
| @@ -81,12 +82,13 @@ RSpec::describe "Spark::RDD" do | |
| 81 82 | 
             
                  $sc.parallelize(hash3)
         | 
| 82 83 | 
             
                end
         | 
| 83 84 |  | 
| 84 | 
            -
                it_behaves_like  | 
| 85 | 
            -
                it_behaves_like  | 
| 86 | 
            -
                it_behaves_like  | 
| 85 | 
            +
                it_behaves_like 'a cogroupping by key', 1
         | 
| 86 | 
            +
                it_behaves_like 'a cogroupping by key', 2
         | 
| 87 | 
            +
                # it_behaves_like 'a cogroupping by key', nil
         | 
| 88 | 
            +
                # it_behaves_like 'a cogroupping by key', rand(2..10)
         | 
| 87 89 | 
             
              end
         | 
| 88 90 |  | 
| 89 | 
            -
              context  | 
| 91 | 
            +
              context 'group_by' do
         | 
| 90 92 | 
             
                let(:key_function1) { lambda{|x| x%2} }
         | 
| 91 93 | 
             
                let(:key_function2) { lambda{|x| x.size} }
         | 
| 92 94 |  | 
| @@ -101,9 +103,10 @@ RSpec::describe "Spark::RDD" do | |
| 101 103 | 
             
                  $sc.parallelize(words)
         | 
| 102 104 | 
             
                end
         | 
| 103 105 |  | 
| 104 | 
            -
                it_behaves_like  | 
| 105 | 
            -
                it_behaves_like  | 
| 106 | 
            -
                it_behaves_like  | 
| 106 | 
            +
                it_behaves_like 'a groupping by', 1
         | 
| 107 | 
            +
                it_behaves_like 'a groupping by', 2
         | 
| 108 | 
            +
                # it_behaves_like 'a groupping by', nil
         | 
| 109 | 
            +
                # it_behaves_like 'a groupping by', rand(2..10)
         | 
| 107 110 | 
             
              end
         | 
| 108 111 |  | 
| 109 112 | 
             
            end
         | 
    
        data/spec/lib/helper_spec.rb
    CHANGED
    
    | @@ -1,19 +1,67 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec | 
| 3 | 
            +
            RSpec.configure do |c|
         | 
| 4 4 | 
             
              c.include Spark::Helper::Parser
         | 
| 5 | 
            +
              c.include Spark::Helper::Statistic
         | 
| 5 6 | 
             
            end
         | 
| 6 7 |  | 
| 7 | 
            -
            RSpec | 
| 8 | 
            -
             | 
| 9 | 
            -
              it  | 
| 10 | 
            -
                expect(to_memory_size( | 
| 11 | 
            -
                expect(to_memory_size( | 
| 12 | 
            -
                expect(to_memory_size( | 
| 13 | 
            -
                expect(to_memory_size( | 
| 14 | 
            -
                expect(to_memory_size( | 
| 15 | 
            -
                expect(to_memory_size( | 
| 16 | 
            -
                expect(to_memory_size( | 
| 8 | 
            +
            RSpec.describe Spark::Helper do
         | 
| 9 | 
            +
             | 
| 10 | 
            +
              it 'memory size' do
         | 
| 11 | 
            +
                expect(to_memory_size('512mb')).to eql(524288.0)
         | 
| 12 | 
            +
                expect(to_memory_size('1586 mb')).to eql(1624064.0)
         | 
| 13 | 
            +
                expect(to_memory_size('3 MB')).to eql(3072.0)
         | 
| 14 | 
            +
                expect(to_memory_size('9gb')).to eql(9437184.0)
         | 
| 15 | 
            +
                expect(to_memory_size('9gb', 'mb')).to eql(9216.0)
         | 
| 16 | 
            +
                expect(to_memory_size('9mb', 'gb')).to eql(0.01)
         | 
| 17 | 
            +
                expect(to_memory_size('6652548796kb', 'mb')).to eql(6496629.68)
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              context 'statistic' do
         | 
| 21 | 
            +
                it 'compute_fraction' do
         | 
| 22 | 
            +
                  expect(compute_fraction(1, 1000, true)).to be_within(0.001).of(0.013)
         | 
| 23 | 
            +
                  expect(compute_fraction(2, 1000, true)).to be_within(0.001).of(0.018)
         | 
| 24 | 
            +
                  expect(compute_fraction(3, 1000, true)).to be_within(0.001).of(0.023)
         | 
| 25 | 
            +
                  expect(compute_fraction(4, 1000, true)).to be_within(0.001).of(0.028)
         | 
| 26 | 
            +
                  expect(compute_fraction(5, 1000, true)).to be_within(0.001).of(0.031)
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                  expect(compute_fraction(1, 1000, false)).to be_within(0.001).of(0.0249)
         | 
| 29 | 
            +
                  expect(compute_fraction(2, 1000, false)).to be_within(0.001).of(0.0268)
         | 
| 30 | 
            +
                  expect(compute_fraction(3, 1000, false)).to be_within(0.001).of(0.0287)
         | 
| 31 | 
            +
                  expect(compute_fraction(4, 1000, false)).to be_within(0.001).of(0.0305)
         | 
| 32 | 
            +
                  expect(compute_fraction(5, 1000, false)).to be_within(0.001).of(0.0322)
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                it 'bisect_right' do
         | 
| 36 | 
            +
                  data = [10, 20, 30, 40, 50, 60, 70, 80, 90]
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                  expect(bisect_right(data, 0)).to eq(0)
         | 
| 39 | 
            +
                  expect(bisect_right(data, 1)).to eq(0)
         | 
| 40 | 
            +
                  expect(bisect_right(data, 1, 2)).to eq(2)
         | 
| 41 | 
            +
                  expect(bisect_right(data, 1, 3)).to eq(3)
         | 
| 42 | 
            +
                  expect(bisect_right(data, 1, 4)).to eq(4)
         | 
| 43 | 
            +
                  expect(bisect_right(data, 9)).to eq(0)
         | 
| 44 | 
            +
                  expect(bisect_right(data, 10)).to eq(1)
         | 
| 45 | 
            +
                  expect(bisect_right(data, 40)).to eq(4)
         | 
| 46 | 
            +
                  expect(bisect_right(data, 42)).to eq(4)
         | 
| 47 | 
            +
                  expect(bisect_right(data, 72)).to eq(7)
         | 
| 48 | 
            +
                  expect(bisect_right(data, 80, 4)).to eq(8)
         | 
| 49 | 
            +
                  expect(bisect_right(data, 80, 5)).to eq(8)
         | 
| 50 | 
            +
                  expect(bisect_right(data, 80, 8)).to eq(8)
         | 
| 51 | 
            +
                  expect(bisect_right(data, 80, 9)).to eq(9)
         | 
| 52 | 
            +
                  expect(bisect_right(data, 200)).to eq(9)
         | 
| 53 | 
            +
                end
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                it 'determine_bounds' do
         | 
| 56 | 
            +
                  data = [10, 20, 30, 40, 50, 60, 70, 80, 90]
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                  expect(determine_bounds(data, 0)).to eq([])
         | 
| 59 | 
            +
                  expect(determine_bounds(data, 1)).to eq([])
         | 
| 60 | 
            +
                  expect(determine_bounds(data, 2)).to eq([50])
         | 
| 61 | 
            +
                  expect(determine_bounds(data, 3)).to eq([40, 70])
         | 
| 62 | 
            +
                  expect(determine_bounds(data, 4)).to eq([30, 50, 70])
         | 
| 63 | 
            +
                  expect(determine_bounds(data, 20)).to eq(data)
         | 
| 64 | 
            +
                end
         | 
| 17 65 | 
             
              end
         | 
| 18 66 |  | 
| 19 67 | 
             
            end
         | 
    
        data/spec/lib/key_spec.rb
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec | 
| 3 | 
            +
            RSpec.shared_examples 'a keying by' do |workers|
         | 
| 4 4 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 5 5 | 
             
                rdd = rdd_numbers(workers)
         | 
| 6 6 | 
             
                rdd = rdd.key_by(key_function1)
         | 
| @@ -16,11 +16,11 @@ RSpec::shared_examples "a keying by" do |workers| | |
| 16 16 | 
             
              end
         | 
| 17 17 | 
             
            end
         | 
| 18 18 |  | 
| 19 | 
            -
            RSpec | 
| 19 | 
            +
            RSpec.describe 'Spark::RDD' do
         | 
| 20 20 |  | 
| 21 | 
            -
              context  | 
| 21 | 
            +
              context 'key_by' do
         | 
| 22 22 | 
             
                let(:key_function1) { lambda{|x| x.even?} }
         | 
| 23 | 
            -
                let(:key_function2) { lambda{|x| x.include?( | 
| 23 | 
            +
                let(:key_function2) { lambda{|x| x.include?('a')} }
         | 
| 24 24 |  | 
| 25 25 | 
             
                let(:numbers) { Generator.numbers }
         | 
| 26 26 | 
             
                let(:words)   { Generator.words }
         | 
| @@ -33,9 +33,10 @@ RSpec::describe "Spark::RDD" do | |
| 33 33 | 
             
                  $sc.parallelize(words)
         | 
| 34 34 | 
             
                end
         | 
| 35 35 |  | 
| 36 | 
            -
                it_behaves_like  | 
| 37 | 
            -
                it_behaves_like  | 
| 38 | 
            -
                it_behaves_like  | 
| 36 | 
            +
                it_behaves_like 'a keying by', 1
         | 
| 37 | 
            +
                it_behaves_like 'a keying by', 2
         | 
| 38 | 
            +
                # it_behaves_like 'a keying by', nil
         | 
| 39 | 
            +
                # it_behaves_like 'a keying by', rand(2..10)
         | 
| 39 40 | 
             
              end
         | 
| 40 41 |  | 
| 41 42 | 
             
            end
         | 
        data/spec/lib/manipulation_spec.rb
    CHANGED
    
    | @@ -1,10 +1,10 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec | 
| 3 | 
            +
            RSpec.describe 'Spark::RDD' do
         | 
| 4 4 | 
             
              let(:numbers) { 1..100 }
         | 
| 5 5 | 
             
              let(:rand_numbers) { Generator.numbers }
         | 
| 6 6 |  | 
| 7 | 
            -
              it  | 
| 7 | 
            +
              it '.glom' do
         | 
| 8 8 | 
             
                rdd = $sc.parallelize(numbers, 1).glom
         | 
| 9 9 | 
             
                expect(rdd.collect).to eql([numbers.to_a])
         | 
| 10 10 |  | 
| @@ -14,7 +14,7 @@ RSpec::describe "Spark::RDD" do | |
| 14 14 | 
             
                expect(rdd.collect).to eql(numbers.each_slice(20).to_a)
         | 
| 15 15 | 
             
              end
         | 
| 16 16 |  | 
| 17 | 
            -
              it  | 
| 17 | 
            +
              it '.coalesce' do
         | 
| 18 18 | 
             
                rdd = $sc.parallelize(numbers, 5)
         | 
| 19 19 |  | 
| 20 20 | 
             
                rdd2 = rdd.glom
         | 
| @@ -24,7 +24,7 @@ RSpec::describe "Spark::RDD" do | |
| 24 24 | 
             
                expect(rdd3.collect.size).to eql(4)
         | 
| 25 25 | 
             
              end
         | 
| 26 26 |  | 
| 27 | 
            -
              it  | 
| 27 | 
            +
              it '.distinct' do
         | 
| 28 28 | 
             
                rdd = $sc.parallelize(rand_numbers, 5)
         | 
| 29 29 | 
             
                rdd = rdd.distinct
         | 
| 30 30 | 
             
                expect(rdd.collect.sort).to eql(rand_numbers.uniq.sort)
         | 
| @@ -35,22 +35,22 @@ RSpec::describe "Spark::RDD" do | |
| 35 35 | 
             
                expect(rdd.collect).to eql([1])
         | 
| 36 36 | 
             
              end
         | 
| 37 37 |  | 
| 38 | 
            -
              context  | 
| 39 | 
            -
                it  | 
| 38 | 
            +
              context '.union' do
         | 
| 39 | 
            +
                it 'classic method' do
         | 
| 40 40 | 
             
                  rdd = $sc.parallelize(numbers, 5)
         | 
| 41 41 | 
             
                  rdd = rdd.union(rdd).collect
         | 
| 42 42 |  | 
| 43 43 | 
             
                  expect(rdd.collect.sort).to eql((numbers.to_a+numbers.to_a).sort)
         | 
| 44 44 | 
             
                end
         | 
| 45 45 |  | 
| 46 | 
            -
                it  | 
| 46 | 
            +
                it 'with a different serializer' do
         | 
| 47 47 | 
             
                  rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) })
         | 
| 48 48 | 
             
                  rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) })
         | 
| 49 49 |  | 
| 50 50 | 
             
                  expect { rdd1.union(rdd2).collect }.to_not raise_error
         | 
| 51 51 | 
             
                end
         | 
| 52 52 |  | 
| 53 | 
            -
                it  | 
| 53 | 
            +
                it 'as operator' do
         | 
| 54 54 | 
             
                  rdd1 = $sc.parallelize(numbers)
         | 
| 55 55 | 
             
                  rdd2 = $sc.parallelize(rand_numbers)
         | 
| 56 56 |  | 
| @@ -58,7 +58,7 @@ RSpec::describe "Spark::RDD" do | |
| 58 58 | 
             
                end
         | 
| 59 59 | 
             
              end
         | 
| 60 60 |  | 
| 61 | 
            -
              it  | 
| 61 | 
            +
              it '.compact' do
         | 
| 62 62 | 
             
                data = [nil, nil , 0, 0, 1, 2, nil, 6]
         | 
| 63 63 | 
             
                result = data.compact
         | 
| 64 64 | 
             
                ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
         | 
| @@ -73,7 +73,7 @@ RSpec::describe "Spark::RDD" do | |
| 73 73 | 
             
                expect(rdd.collect).to eql(result)
         | 
| 74 74 | 
             
              end
         | 
| 75 75 |  | 
| 76 | 
            -
              it  | 
| 76 | 
            +
              it '.intersection' do
         | 
| 77 77 | 
             
                data1 = [0,1,2,3,4,5,6,7,8,9,10]
         | 
| 78 78 | 
             
                data2 = [5,6,7,8,9,10,11,12,13,14,15]
         | 
| 79 79 |  | 
| @@ -83,19 +83,19 @@ RSpec::describe "Spark::RDD" do | |
| 83 83 | 
             
                expect(rdd1.intersection(rdd2).collect.sort).to eql(data1 & data2)
         | 
| 84 84 | 
             
              end
         | 
| 85 85 |  | 
| 86 | 
            -
              it  | 
| 86 | 
            +
              it '.shuffle' do
         | 
| 87 87 | 
             
                data = Generator.numbers
         | 
| 88 88 | 
             
                rdd = $sc.parallelize(data)
         | 
| 89 89 |  | 
| 90 90 | 
             
                expect(rdd.shuffle.collect).to_not eql(data)
         | 
| 91 91 | 
             
              end
         | 
| 92 92 |  | 
| 93 | 
            -
              context  | 
| 93 | 
            +
              context '.cartesian' do
         | 
| 94 94 | 
             
                let(:data1) { Generator.numbers(100) }
         | 
| 95 95 | 
             
                let(:data2) { Generator.numbers(100) }
         | 
| 96 96 | 
             
                let(:result) { data1.product(data2).map(&:to_s).sort }
         | 
| 97 97 |  | 
| 98 | 
            -
                it  | 
| 98 | 
            +
                it 'unbatched' do
         | 
| 99 99 | 
             
                  ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
         | 
| 100 100 |  | 
| 101 101 | 
             
                  rdd1 = $sc.parallelize(data1, 2, ser)
         | 
| @@ -106,7 +106,7 @@ RSpec::describe "Spark::RDD" do | |
| 106 106 | 
             
                  expect(rdd.collect.sort).to eql(result)
         | 
| 107 107 | 
             
                end
         | 
| 108 108 |  | 
| 109 | 
            -
                it  | 
| 109 | 
            +
                it 'batched' do
         | 
| 110 110 | 
             
                  ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
         | 
| 111 111 | 
             
                  ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
         | 
| 112 112 |  | 
        data/spec/lib/map_partitions_spec.rb
    CHANGED
    
    | @@ -67,9 +67,10 @@ RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do | |
| 67 67 | 
             
                  $sc.parallelize(numbers, workers)
         | 
| 68 68 | 
             
                end
         | 
| 69 69 |  | 
| 70 | 
            -
                it_behaves_like 'a map partitions', nil
         | 
| 71 70 | 
             
                it_behaves_like 'a map partitions', 1
         | 
| 72 | 
            -
                it_behaves_like 'a map partitions',  | 
| 71 | 
            +
                it_behaves_like 'a map partitions', 2
         | 
| 72 | 
            +
                # it_behaves_like 'a map partitions', nil
         | 
| 73 | 
            +
                # it_behaves_like 'a map partitions', rand(2..10)
         | 
| 73 74 | 
             
              end
         | 
| 74 75 |  | 
| 75 76 | 
             
              context 'throught text_file' do
         | 
| @@ -80,8 +81,9 @@ RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do | |
| 80 81 | 
             
                  $sc.text_file(file, workers)
         | 
| 81 82 | 
             
                end
         | 
| 82 83 |  | 
| 83 | 
            -
                it_behaves_like 'a map partitions', nil
         | 
| 84 84 | 
             
                it_behaves_like 'a map partitions', 1
         | 
| 85 | 
            -
                it_behaves_like 'a map partitions',  | 
| 85 | 
            +
                it_behaves_like 'a map partitions', 2
         | 
| 86 | 
            +
                # it_behaves_like 'a map partitions', nil
         | 
| 87 | 
            +
                # it_behaves_like 'a map partitions', rand(2..10)
         | 
| 86 88 | 
             
              end
         | 
| 87 89 | 
             
            end
         | 
    
        data/spec/lib/map_spec.rb
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 | 
            -
            require  | 
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec::shared_examples "a mapping" do |workers|
         | 
| 3 | 
            +
            RSpec.shared_examples 'a mapping' do |workers|
         | 
| 4 4 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 5 5 | 
             
                rdd2 = rdd(workers).map(func1)
         | 
| 6 6 | 
             
                result = numbers.map(&func1)
         | 
| @@ -24,7 +24,7 @@ RSpec::shared_examples "a mapping" do |workers| | |
| 24 24 | 
             
              end
         | 
| 25 25 | 
             
            end
         | 
| 26 26 |  | 
| 27 | 
            -
            RSpec::shared_examples "a mapping values" do |workers|
         | 
| 27 | 
            +
            RSpec.shared_examples 'a mapping values' do |workers|
         | 
| 28 28 | 
             
              it "with #{workers || 'default'} worker" do
         | 
| 29 29 | 
             
                rdd2 = rdd(workers).map_values(func1)
         | 
| 30 30 | 
             
                result = hash.map{|key, value| [key, func1.call(value)]}
         | 
| @@ -43,49 +43,52 @@ RSpec::shared_examples "a mapping values" do |workers| | |
| 43 43 | 
             
              end
         | 
| 44 44 | 
             
            end
         | 
| 45 45 |  | 
| 46 | 
            -
            RSpec | 
| 46 | 
            +
            RSpec.describe 'Spark::RDD' do
         | 
| 47 47 | 
             
              let(:func1) { lambda{|x| x*2} }
         | 
| 48 48 | 
             
              let(:func2) { lambda{|x| x*3} }
         | 
| 49 49 | 
             
              let(:func3) { lambda{|x| x*4} }
         | 
| 50 50 |  | 
| 51 | 
            -
              context  | 
| 52 | 
            -
                context  | 
| 51 | 
            +
              context 'throught parallelize' do
         | 
| 52 | 
            +
                context '.map' do
         | 
| 53 53 | 
             
                  let(:numbers) { Generator.numbers }
         | 
| 54 54 |  | 
| 55 55 | 
             
                  def rdd(workers)
         | 
| 56 56 | 
             
                    $sc.parallelize(numbers, workers)
         | 
| 57 57 | 
             
                  end
         | 
| 58 58 |  | 
| 59 | 
            -
                  it_behaves_like  | 
| 60 | 
            -
                  it_behaves_like  | 
| 61 | 
            -
                  it_behaves_like  | 
| 59 | 
            +
                  it_behaves_like 'a mapping', 1
         | 
| 60 | 
            +
                  it_behaves_like 'a mapping', 2
         | 
| 61 | 
            +
                  # it_behaves_like 'a mapping', nil
         | 
| 62 | 
            +
                  # it_behaves_like 'a mapping', rand(2..10)
         | 
| 62 63 | 
             
                end
         | 
| 63 64 |  | 
| 64 | 
            -
                context  | 
| 65 | 
            +
                context '.map_values' do
         | 
| 65 66 | 
             
                  let!(:hash) { Generator.hash }
         | 
| 66 67 |  | 
| 67 68 | 
             
                  def rdd(workers)
         | 
| 68 69 | 
             
                    $sc.parallelize(hash, workers)
         | 
| 69 70 | 
             
                  end
         | 
| 70 71 |  | 
| 71 | 
            -
                  it_behaves_like  | 
| 72 | 
            -
                  it_behaves_like  | 
| 73 | 
            -
                  it_behaves_like  | 
| 72 | 
            +
                  it_behaves_like 'a mapping values', 1
         | 
| 73 | 
            +
                  it_behaves_like 'a mapping values', 2
         | 
| 74 | 
            +
                  # it_behaves_like 'a mapping values', nil
         | 
| 75 | 
            +
                  # it_behaves_like 'a mapping values', rand(2..10)
         | 
| 74 76 | 
             
                end
         | 
| 75 77 | 
             
              end
         | 
| 76 78 |  | 
| 77 | 
            -
              context  | 
| 78 | 
            -
                context  | 
| 79 | 
            -
                  let(:file) { File.join( | 
| 79 | 
            +
              context 'throught text_file' do
         | 
| 80 | 
            +
                context '.map' do
         | 
| 81 | 
            +
                  let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
         | 
| 80 82 | 
             
                  let(:numbers) { File.readlines(file).map(&:strip) }
         | 
| 81 83 |  | 
| 82 84 | 
             
                  def rdd(workers)
         | 
| 83 85 | 
             
                    $sc.text_file(file, workers)
         | 
| 84 86 | 
             
                  end
         | 
| 85 87 |  | 
| 86 | 
            -
                  it_behaves_like  | 
| 87 | 
            -
                  it_behaves_like  | 
| 88 | 
            -
                  it_behaves_like  | 
| 88 | 
            +
                  it_behaves_like 'a mapping', 1
         | 
| 89 | 
            +
                  it_behaves_like 'a mapping', 2
         | 
| 90 | 
            +
                  # it_behaves_like 'a mapping', nil
         | 
| 91 | 
            +
                  # it_behaves_like 'a mapping', rand(2..10)
         | 
| 89 92 | 
             
                end
         | 
| 90 93 | 
             
              end
         | 
| 91 94 | 
             
            end
         |