ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
    
        data/spec/lib/statistic_spec.rb
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            RSpec::shared_examples 'a stats' do |workers|
         | 
| 3 | 
            +
            RSpec.shared_examples 'a stats' do |workers|
         | 
| 4 4 | 
             
              let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }
         | 
| 5 5 |  | 
| 6 6 | 
             
              context "with #{workers || 'default'} worker" do
         | 
| @@ -29,23 +29,23 @@ RSpec::shared_examples 'a stats' do |workers| | |
| 29 29 | 
             
              end
         | 
| 30 30 | 
             
            end
         | 
| 31 31 |  | 
| 32 | 
            -
            RSpec::shared_examples 'a histogram' do |workers|
         | 
| 32 | 
            +
            RSpec.shared_examples 'a histogram' do |workers|
         | 
| 33 33 |  | 
| 34 34 | 
             
              context "with #{workers || 'default'} worker" do
         | 
| 35 35 | 
             
                it 'empty' do
         | 
| 36 | 
            -
                  rdd = $sc.parallelize([], workers,  | 
| 36 | 
            +
                  rdd = $sc.parallelize([], workers, ser)
         | 
| 37 37 |  | 
| 38 38 | 
             
                  expect( rdd.histogram([0, 10])[1] ).to eq([0])
         | 
| 39 39 | 
             
                  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
         | 
| 40 40 | 
             
                end
         | 
| 41 41 |  | 
| 42 42 | 
             
                it 'validation' do
         | 
| 43 | 
            -
                  rdd = $sc.parallelize([], workers,  | 
| 43 | 
            +
                  rdd = $sc.parallelize([], workers, ser)
         | 
| 44 44 | 
             
                  expect { rdd.histogram(0) }.to raise_error(ArgumentError)
         | 
| 45 45 | 
             
                end
         | 
| 46 46 |  | 
| 47 47 | 
             
                it 'double' do
         | 
| 48 | 
            -
                  rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers,  | 
| 48 | 
            +
                  rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)
         | 
| 49 49 | 
             
                  buckets, counts = rdd.histogram(2)
         | 
| 50 50 |  | 
| 51 51 | 
             
                  expect(buckets).to eq([1.0, 2.5, 4.0])
         | 
| @@ -53,91 +53,91 @@ RSpec::shared_examples 'a histogram' do |workers| | |
| 53 53 | 
             
                end
         | 
| 54 54 |  | 
| 55 55 | 
             
                it 'out of range' do
         | 
| 56 | 
            -
                  rdd = $sc.parallelize([10.01, -0.01], workers,  | 
| 56 | 
            +
                  rdd = $sc.parallelize([10.01, -0.01], workers, ser)
         | 
| 57 57 |  | 
| 58 58 | 
             
                  expect( rdd.histogram([0, 10])[1] ).to eq([0])
         | 
| 59 59 | 
             
                  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
         | 
| 60 60 | 
             
                end
         | 
| 61 61 |  | 
| 62 62 | 
             
                it 'in range with one bucket' do
         | 
| 63 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 4], workers,  | 
| 63 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
         | 
| 64 64 |  | 
| 65 65 | 
             
                  expect( rdd.histogram([0, 10])[1] ).to eq([4])
         | 
| 66 66 | 
             
                  expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])
         | 
| 67 67 | 
             
                end
         | 
| 68 68 |  | 
| 69 69 | 
             
                it 'in range with one bucket exact match' do
         | 
| 70 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 4], workers,  | 
| 70 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
         | 
| 71 71 | 
             
                  expect( rdd.histogram([1, 4])[1] ).to eq([4])
         | 
| 72 72 | 
             
                end
         | 
| 73 73 |  | 
| 74 74 | 
             
                it 'out of range with two buckets' do
         | 
| 75 | 
            -
                  rdd = $sc.parallelize([10.01, -0.01], workers,  | 
| 75 | 
            +
                  rdd = $sc.parallelize([10.01, -0.01], workers, ser)
         | 
| 76 76 | 
             
                  expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])
         | 
| 77 77 | 
             
                end
         | 
| 78 78 |  | 
| 79 79 | 
             
                it 'out of range with two uneven buckets' do
         | 
| 80 | 
            -
                  rdd = $sc.parallelize([10.01, -0.01], workers,  | 
| 80 | 
            +
                  rdd = $sc.parallelize([10.01, -0.01], workers, ser)
         | 
| 81 81 | 
             
                  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
         | 
| 82 82 | 
             
                end
         | 
| 83 83 |  | 
| 84 84 | 
             
                it 'in range with two buckets' do
         | 
| 85 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 5, 6], workers,  | 
| 85 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
         | 
| 86 86 | 
             
                  expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
         | 
| 87 87 | 
             
                end
         | 
| 88 88 |  | 
| 89 89 | 
             
                it 'in range with two bucket and nil' do
         | 
| 90 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers,  | 
| 90 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)
         | 
| 91 91 | 
             
                  expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
         | 
| 92 92 | 
             
                end
         | 
| 93 93 |  | 
| 94 94 | 
             
                it 'in range with two uneven buckets' do
         | 
| 95 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 5, 6], workers,  | 
| 95 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
         | 
| 96 96 | 
             
                  expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])
         | 
| 97 97 | 
             
                end
         | 
| 98 98 |  | 
| 99 99 | 
             
                it 'mixed range with two uneven buckets' do
         | 
| 100 | 
            -
                  rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers,  | 
| 100 | 
            +
                  rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)
         | 
| 101 101 | 
             
                  expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3])
         | 
| 102 102 | 
             
                end
         | 
| 103 103 |  | 
| 104 104 | 
             
                it 'mixed range with four uneven buckets' do
         | 
| 105 | 
            -
                  rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers,  | 
| 105 | 
            +
                  rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)
         | 
| 106 106 | 
             
                  expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
         | 
| 107 107 | 
             
                end
         | 
| 108 108 |  | 
| 109 109 | 
             
                it 'mixed range with uneven buckets and NaN' do
         | 
| 110 | 
            -
                  rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers,  | 
| 110 | 
            +
                  rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)
         | 
| 111 111 | 
             
                  expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
         | 
| 112 112 | 
             
                end
         | 
| 113 113 |  | 
| 114 114 | 
             
                it 'out of range with infinite buckets' do
         | 
| 115 | 
            -
                  rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers,  | 
| 115 | 
            +
                  rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)
         | 
| 116 116 | 
             
                  expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])
         | 
| 117 117 | 
             
                end
         | 
| 118 118 |  | 
| 119 119 | 
             
                it 'without buckets' do
         | 
| 120 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 4], workers,  | 
| 120 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
         | 
| 121 121 | 
             
                  expect( rdd.histogram(1) ).to eq([[1, 4], [4]])
         | 
| 122 122 | 
             
                end
         | 
| 123 123 |  | 
| 124 124 | 
             
                it 'without buckets single element' do
         | 
| 125 | 
            -
                  rdd = $sc.parallelize([1], workers,  | 
| 125 | 
            +
                  rdd = $sc.parallelize([1], workers, ser)
         | 
| 126 126 | 
             
                  expect( rdd.histogram(1) ).to eq([[1, 1], [1]])
         | 
| 127 127 | 
             
                end
         | 
| 128 128 |  | 
| 129 129 | 
             
                it 'without bucket no range' do
         | 
| 130 | 
            -
                  rdd = $sc.parallelize([1, 1, 1, 1], workers,  | 
| 130 | 
            +
                  rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)
         | 
| 131 131 | 
             
                  expect( rdd.histogram(1) ).to eq([[1, 1], [4]])
         | 
| 132 132 | 
             
                end
         | 
| 133 133 |  | 
| 134 134 | 
             
                it 'without buckets basic two' do
         | 
| 135 | 
            -
                  rdd = $sc.parallelize([1, 2, 3, 4], workers,  | 
| 135 | 
            +
                  rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
         | 
| 136 136 | 
             
                  expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])
         | 
| 137 137 | 
             
                end
         | 
| 138 138 |  | 
| 139 139 | 
             
                it 'without buckets with more requested than elements' do
         | 
| 140 | 
            -
                  rdd = $sc.parallelize([1, 2], workers,  | 
| 140 | 
            +
                  rdd = $sc.parallelize([1, 2], workers, ser)
         | 
| 141 141 | 
             
                  buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
         | 
| 142 142 | 
             
                  hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
         | 
| 143 143 |  | 
| @@ -145,7 +145,7 @@ RSpec::shared_examples 'a histogram' do |workers| | |
| 145 145 | 
             
                end
         | 
| 146 146 |  | 
| 147 147 | 
             
                it 'string' do
         | 
| 148 | 
            -
                  rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers,  | 
| 148 | 
            +
                  rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)
         | 
| 149 149 |  | 
| 150 150 | 
             
                  expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])
         | 
| 151 151 | 
             
                  expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])
         | 
| @@ -155,7 +155,9 @@ RSpec::shared_examples 'a histogram' do |workers| | |
| 155 155 | 
             
              end
         | 
| 156 156 | 
             
            end
         | 
| 157 157 |  | 
| 158 | 
            -
            RSpec | 
| 158 | 
            +
            RSpec.describe Spark::RDD do
         | 
| 159 | 
            +
              let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }
         | 
| 160 | 
            +
             | 
| 159 161 | 
             
              context '.stats' do
         | 
| 160 162 | 
             
                it_behaves_like 'a stats', 1
         | 
| 161 163 | 
             
                it_behaves_like 'a stats', rand(2..5)
         | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    | @@ -14,8 +14,7 @@ Spark::Mllib.import | |
| 14 14 | 
             
            def spark_start
         | 
| 15 15 | 
             
              Spark.logger.disable
         | 
| 16 16 | 
             
              Spark.config do
         | 
| 17 | 
            -
                set 'spark.ruby. | 
| 18 | 
            -
                set 'spark.ruby.batch_size', 100
         | 
| 17 | 
            +
                set 'spark.ruby.serializer.batch_size', 100
         | 
| 19 18 | 
             
              end
         | 
| 20 19 | 
             
              Spark.start
         | 
| 21 20 | 
             
              $sc = Spark.context
         | 
    
        metadata
    CHANGED
    
    | @@ -1,15 +1,113 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: ruby-spark
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.0.0
         | 
| 4 | 
            +
              version: 1.1.0.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Ondřej Moravčík
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2015-05- | 
| 11 | 
            +
            date: 2015-05-16 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 14 | 
            +
              name: sourcify
         | 
| 15 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 | 
            +
                requirements:
         | 
| 17 | 
            +
                - - '='
         | 
| 18 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            +
                    version: 0.6.0.rc4
         | 
| 20 | 
            +
              type: :runtime
         | 
| 21 | 
            +
              prerelease: false
         | 
| 22 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 | 
            +
                requirements:
         | 
| 24 | 
            +
                - - '='
         | 
| 25 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 26 | 
            +
                    version: 0.6.0.rc4
         | 
| 27 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 28 | 
            +
              name: method_source
         | 
| 29 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 | 
            +
                requirements:
         | 
| 31 | 
            +
                - - ">="
         | 
| 32 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 33 | 
            +
                    version: '0'
         | 
| 34 | 
            +
              type: :runtime
         | 
| 35 | 
            +
              prerelease: false
         | 
| 36 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 | 
            +
                requirements:
         | 
| 38 | 
            +
                - - ">="
         | 
| 39 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 40 | 
            +
                    version: '0'
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 42 | 
            +
              name: commander
         | 
| 43 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 | 
            +
                requirements:
         | 
| 45 | 
            +
                - - ">="
         | 
| 46 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            +
                    version: '0'
         | 
| 48 | 
            +
              type: :runtime
         | 
| 49 | 
            +
              prerelease: false
         | 
| 50 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
                requirements:
         | 
| 52 | 
            +
                - - ">="
         | 
| 53 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                    version: '0'
         | 
| 55 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 56 | 
            +
              name: pry
         | 
| 57 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 58 | 
            +
                requirements:
         | 
| 59 | 
            +
                - - ">="
         | 
| 60 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 61 | 
            +
                    version: '0'
         | 
| 62 | 
            +
              type: :runtime
         | 
| 63 | 
            +
              prerelease: false
         | 
| 64 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 65 | 
            +
                requirements:
         | 
| 66 | 
            +
                - - ">="
         | 
| 67 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 68 | 
            +
                    version: '0'
         | 
| 69 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 70 | 
            +
              name: nio4r
         | 
| 71 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 72 | 
            +
                requirements:
         | 
| 73 | 
            +
                - - ">="
         | 
| 74 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 75 | 
            +
                    version: '0'
         | 
| 76 | 
            +
              type: :runtime
         | 
| 77 | 
            +
              prerelease: false
         | 
| 78 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 79 | 
            +
                requirements:
         | 
| 80 | 
            +
                - - ">="
         | 
| 81 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 82 | 
            +
                    version: '0'
         | 
| 83 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 84 | 
            +
              name: distribution
         | 
| 85 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 86 | 
            +
                requirements:
         | 
| 87 | 
            +
                - - ">="
         | 
| 88 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 89 | 
            +
                    version: '0'
         | 
| 90 | 
            +
              type: :runtime
         | 
| 91 | 
            +
              prerelease: false
         | 
| 92 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 93 | 
            +
                requirements:
         | 
| 94 | 
            +
                - - ">="
         | 
| 95 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 96 | 
            +
                    version: '0'
         | 
| 97 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 98 | 
            +
              name: rjb
         | 
| 99 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 100 | 
            +
                requirements:
         | 
| 101 | 
            +
                - - ">="
         | 
| 102 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 103 | 
            +
                    version: '0'
         | 
| 104 | 
            +
              type: :runtime
         | 
| 105 | 
            +
              prerelease: false
         | 
| 106 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 107 | 
            +
                requirements:
         | 
| 108 | 
            +
                - - ">="
         | 
| 109 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 110 | 
            +
                    version: '0'
         | 
| 13 111 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 112 | 
             
              name: bundler
         | 
| 15 113 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -56,21 +154,22 @@ files: | |
| 56 154 | 
             
            - TODO.md
         | 
| 57 155 | 
             
            - benchmark/aggregate.rb
         | 
| 58 156 | 
             
            - benchmark/bisect.rb
         | 
| 157 | 
            +
            - benchmark/comparison/prepare.sh
         | 
| 158 | 
            +
            - benchmark/comparison/python.py
         | 
| 159 | 
            +
            - benchmark/comparison/r.r
         | 
| 160 | 
            +
            - benchmark/comparison/ruby.rb
         | 
| 161 | 
            +
            - benchmark/comparison/run-all.sh
         | 
| 162 | 
            +
            - benchmark/comparison/scala.scala
         | 
| 59 163 | 
             
            - benchmark/custom_marshal.rb
         | 
| 60 164 | 
             
            - benchmark/digest.rb
         | 
| 61 165 | 
             
            - benchmark/enumerator.rb
         | 
| 62 | 
            -
            - benchmark/performance/prepare.sh
         | 
| 63 | 
            -
            - benchmark/performance/python.py
         | 
| 64 | 
            -
            - benchmark/performance/r.r
         | 
| 65 | 
            -
            - benchmark/performance/ruby.rb
         | 
| 66 | 
            -
            - benchmark/performance/run-all.sh
         | 
| 67 | 
            -
            - benchmark/performance/scala.scala
         | 
| 68 166 | 
             
            - benchmark/serializer.rb
         | 
| 69 167 | 
             
            - benchmark/sort.rb
         | 
| 70 168 | 
             
            - benchmark/sort2.rb
         | 
| 71 169 | 
             
            - benchmark/take.rb
         | 
| 72 170 | 
             
            - bin/ruby-spark
         | 
| 73 171 | 
             
            - example/pi.rb
         | 
| 172 | 
            +
            - example/website_search.rb
         | 
| 74 173 | 
             
            - ext/ruby_c/extconf.rb
         | 
| 75 174 | 
             
            - ext/ruby_c/murmur.c
         | 
| 76 175 | 
             
            - ext/ruby_c/murmur.h
         | 
| @@ -155,13 +254,16 @@ files: | |
| 155 254 | 
             
            - lib/spark/rdd.rb
         | 
| 156 255 | 
             
            - lib/spark/sampler.rb
         | 
| 157 256 | 
             
            - lib/spark/serializer.rb
         | 
| 257 | 
            +
            - lib/spark/serializer/auto_batched.rb
         | 
| 158 258 | 
             
            - lib/spark/serializer/base.rb
         | 
| 259 | 
            +
            - lib/spark/serializer/batched.rb
         | 
| 159 260 | 
             
            - lib/spark/serializer/cartesian.rb
         | 
| 261 | 
            +
            - lib/spark/serializer/compressed.rb
         | 
| 160 262 | 
             
            - lib/spark/serializer/marshal.rb
         | 
| 161 263 | 
             
            - lib/spark/serializer/message_pack.rb
         | 
| 162 264 | 
             
            - lib/spark/serializer/oj.rb
         | 
| 163 265 | 
             
            - lib/spark/serializer/pair.rb
         | 
| 164 | 
            -
            - lib/spark/serializer/utf8.rb
         | 
| 266 | 
            +
            - lib/spark/serializer/text.rb
         | 
| 165 267 | 
             
            - lib/spark/sort.rb
         | 
| 166 268 | 
             
            - lib/spark/stat_counter.rb
         | 
| 167 269 | 
             
            - lib/spark/storage_level.rb
         | 
| @@ -245,7 +347,7 @@ rubyforge_project: | |
| 245 347 | 
             
            rubygems_version: 2.2.2
         | 
| 246 348 | 
             
            signing_key: 
         | 
| 247 349 | 
             
            specification_version: 4
         | 
| 248 | 
            -
            summary: Ruby wrapper for Spark
         | 
| 350 | 
            +
            summary: Ruby wrapper for Apache Spark
         | 
| 249 351 | 
             
            test_files:
         | 
| 250 352 | 
             
            - spec/generator.rb
         | 
| 251 353 | 
             
            - spec/inputs/lorem_300.txt
         | 
| @@ -1,25 +0,0 @@ | |
| 1 | 
            -
            module Spark
         | 
| 2 | 
            -
              module Serializer
         | 
| 3 | 
            -
                ##
         | 
| 4 | 
            -
                # Used for file
         | 
| 5 | 
            -
                #
         | 
| 6 | 
            -
                # File is sended as String but worker use serialization
         | 
| 7 | 
            -
                #
         | 
| 8 | 
            -
                class UTF8 < Base
         | 
| 9 | 
            -
             | 
| 10 | 
            -
                  def set(*)
         | 
| 11 | 
            -
                    unbatch!
         | 
| 12 | 
            -
                    self
         | 
| 13 | 
            -
                  end
         | 
| 14 | 
            -
             | 
| 15 | 
            -
                  def batched?
         | 
| 16 | 
            -
                    false
         | 
| 17 | 
            -
                  end
         | 
| 18 | 
            -
             | 
| 19 | 
            -
                  def load_next_from_io(io, lenght)
         | 
| 20 | 
            -
                    io.read(lenght).force_encoding(Encoding::UTF_8)
         | 
| 21 | 
            -
                  end
         | 
| 22 | 
            -
             | 
| 23 | 
            -
                end
         | 
| 24 | 
            -
              end
         | 
| 25 | 
            -
            end
         |