ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/.gitignore +1 -1
 - data/README.md +99 -32
 - data/TODO.md +2 -3
 - data/benchmark/{performance → comparison}/prepare.sh +0 -0
 - data/benchmark/{performance → comparison}/python.py +0 -0
 - data/benchmark/{performance → comparison}/r.r +0 -0
 - data/benchmark/{performance → comparison}/ruby.rb +0 -0
 - data/benchmark/{performance → comparison}/run-all.sh +0 -0
 - data/benchmark/{performance → comparison}/scala.scala +0 -0
 - data/example/pi.rb +1 -1
 - data/example/website_search.rb +83 -0
 - data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
 - data/lib/spark.rb +2 -2
 - data/lib/spark/build.rb +1 -1
 - data/lib/spark/cli.rb +1 -1
 - data/lib/spark/command/base.rb +4 -0
 - data/lib/spark/command_builder.rb +2 -2
 - data/lib/spark/config.rb +11 -17
 - data/lib/spark/context.rb +63 -45
 - data/lib/spark/ext/io.rb +11 -1
 - data/lib/spark/java_bridge/base.rb +2 -2
 - data/lib/spark/rdd.rb +67 -18
 - data/lib/spark/serializer.rb +68 -13
 - data/lib/spark/serializer/auto_batched.rb +59 -0
 - data/lib/spark/serializer/base.rb +30 -137
 - data/lib/spark/serializer/batched.rb +84 -0
 - data/lib/spark/serializer/cartesian.rb +5 -29
 - data/lib/spark/serializer/compressed.rb +27 -0
 - data/lib/spark/serializer/marshal.rb +6 -8
 - data/lib/spark/serializer/message_pack.rb +8 -10
 - data/lib/spark/serializer/oj.rb +8 -10
 - data/lib/spark/serializer/pair.rb +27 -13
 - data/lib/spark/serializer/text.rb +25 -0
 - data/lib/spark/version.rb +1 -1
 - data/lib/spark/worker/worker.rb +5 -2
 - data/ruby-spark.gemspec +13 -1
 - data/spec/lib/context_spec.rb +3 -1
 - data/spec/lib/manipulation_spec.rb +18 -10
 - data/spec/lib/map_partitions_spec.rb +16 -16
 - data/spec/lib/serializer_spec.rb +84 -9
 - data/spec/lib/statistic_spec.rb +26 -24
 - data/spec/spec_helper.rb +1 -2
 - metadata +112 -10
 - data/lib/spark/serializer/utf8.rb +0 -25
 
    
        data/lib/spark/serializer.rb
    CHANGED
    
    | 
         @@ -1,24 +1,79 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Spark
         
     | 
| 
      
 2 
     | 
    
         
            +
              ##
         
     | 
| 
      
 3 
     | 
    
         
            +
              # Serializer
         
     | 
| 
      
 4 
     | 
    
         
            +
              #
         
     | 
| 
       2 
5 
     | 
    
         
             
              module Serializer
         
     | 
| 
       3 
     | 
    
         
            -
                autoload :Base,        'spark/serializer/base'
         
     | 
| 
       4 
     | 
    
         
            -
                autoload :UTF8,        'spark/serializer/utf8'
         
     | 
| 
       5 
     | 
    
         
            -
                autoload :Marshal,     'spark/serializer/marshal'
         
     | 
| 
       6 
     | 
    
         
            -
                autoload :MessagePack, 'spark/serializer/message_pack'
         
     | 
| 
       7 
     | 
    
         
            -
                autoload :Oj,          'spark/serializer/oj'
         
     | 
| 
       8 
     | 
    
         
            -
                autoload :Pair,        'spark/serializer/pair'
         
     | 
| 
       9 
     | 
    
         
            -
                autoload :Cartesian,   'spark/serializer/cartesian'
         
     | 
| 
       10 
6 
     | 
    
         | 
| 
      
 7 
     | 
    
         
            +
                DEFAULT_COMPRESS = false
         
     | 
| 
       11 
8 
     | 
    
         
             
                DEFAULT_BATCH_SIZE = 1024
         
     | 
| 
       12 
9 
     | 
    
         
             
                DEFAULT_SERIALIZER_NAME = 'marshal'
         
     | 
| 
       13 
10 
     | 
    
         | 
| 
       14 
     | 
    
         
            -
                 
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
      
 11 
     | 
    
         
            +
                @@registered = {}
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                # Register class and create method for quick access.
         
     | 
| 
      
 14 
     | 
    
         
            +
                # The class is also available as __name__, for use
         
     | 
| 
      
 15 
     | 
    
         
            +
                # in build method (Proc binding problem).
         
     | 
| 
      
 16 
     | 
    
         
            +
                #
         
     | 
| 
      
 17 
     | 
    
         
            +
                # == Examples:
         
     | 
| 
      
 18 
     | 
    
         
            +
                #   register('test1', 'test2', Class)
         
     | 
| 
      
 19 
     | 
    
         
            +
                #
         
     | 
| 
      
 20 
     | 
    
         
            +
                #   Spark::Serializer.test1
         
     | 
| 
      
 21 
     | 
    
         
            +
                #   Spark::Serializer.test2
         
     | 
| 
      
 22 
     | 
    
         
            +
                #
         
     | 
| 
      
 23 
     | 
    
         
            +
                #   # Proc binding problem
         
     | 
| 
      
 24 
     | 
    
         
            +
                #   build { marshal } # => Spark::Serializer::Marshal
         
     | 
| 
      
 25 
     | 
    
         
            +
                #
         
     | 
| 
      
 26 
     | 
    
         
            +
                #   marshal = 1
         
     | 
| 
      
 27 
     | 
    
         
            +
                #   build { marshal } # => 1
         
     | 
| 
      
 28 
     | 
    
         
            +
                #
         
     | 
| 
      
 29 
     | 
    
         
            +
                #   build { __marshal__ } # => Spark::Serializer::Marshal
         
     | 
| 
      
 30 
     | 
    
         
            +
                #
         
     | 
| 
      
 31 
     | 
    
         
            +
                def self.register(*args)
                  # The last argument is the serializer class; everything before it is a name.
                  klass = args.pop

                  # One factory shared by both accessors. Its parameter is deliberately not
                  # called `args`, so it does not shadow this method's own argument list.
                  factory = proc { |*init_args| klass.new(*init_args) }

                  args.each do |name|
                    # `find` looks names up as downcased strings, so store the key the same
                    # way; this keeps Symbol or mixed-case registrations discoverable.
                    @@registered[name.to_s.downcase] = klass

                    # Spark::Serializer.<name> and Spark::Serializer.__<name>__ both build
                    # a new instance of the registered class.
                    define_singleton_method(name.to_sym, &factory)
                    define_singleton_method("__#{name}__".to_sym, &factory)
                  end
                end
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
                def self.find(name)
                  # Look the serializer up by the downcased string form of +name+.
                  # Returns nil when nothing is stored under that key.
                  key = name.to_s.downcase
                  @@registered[key]
                end
         
     | 
| 
       17 
43 
     | 
    
         | 
| 
       18 
     | 
    
         
            -
                def self. 
     | 
| 
       19 
     | 
    
         
            -
                   
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
                   
     | 
| 
      
 44 
     | 
    
         
            +
                def self.find!(name)
                  # Same lookup as +find+, but a missing serializer is an error, not nil.
                  #
                  # Raises Spark::SerializeError when no class is registered under +name+.
                  klass = find(name)
                  raise Spark::SerializeError, "Unknown serializer #{name}." if klass.nil?

                  klass
                end
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
                def self.build(text=nil, &block)
                  # A block takes precedence; it is evaluated in the context of this module.
                  return class_eval(&block) if block_given?

                  # Otherwise the string form of +text+ is evaluated as Ruby source.
                  # NOTE(review): this is an eval of +text+ - confirm callers never pass
                  # untrusted input here.
                  class_eval(text.to_s)
                end
         
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
       23 
62 
     | 
    
         
             
              end
         
     | 
| 
       24 
63 
     | 
    
         
             
            end
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
            # Parent
         
     | 
| 
      
 66 
     | 
    
         
            +
            require 'spark/serializer/base'
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
            # Basic
         
     | 
| 
      
 69 
     | 
    
         
            +
            require 'spark/serializer/oj'
         
     | 
| 
      
 70 
     | 
    
         
            +
            require 'spark/serializer/marshal'
         
     | 
| 
      
 71 
     | 
    
         
            +
            require 'spark/serializer/message_pack'
         
     | 
| 
      
 72 
     | 
    
         
            +
            require 'spark/serializer/text'
         
     | 
| 
      
 73 
     | 
    
         
            +
             
     | 
| 
      
 74 
     | 
    
         
            +
            # Others
         
     | 
| 
      
 75 
     | 
    
         
            +
            require 'spark/serializer/batched'
         
     | 
| 
      
 76 
     | 
    
         
            +
            require 'spark/serializer/auto_batched'
         
     | 
| 
      
 77 
     | 
    
         
            +
            require 'spark/serializer/compressed'
         
     | 
| 
      
 78 
     | 
    
         
            +
            require 'spark/serializer/pair'
         
     | 
| 
      
 79 
     | 
    
         
            +
            require 'spark/serializer/cartesian'
         
     | 
| 
         @@ -0,0 +1,59 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Spark
         
     | 
| 
      
 2 
     | 
    
         
            +
              module Serializer
         
     | 
| 
      
 3 
     | 
    
         
            +
                ##
         
     | 
| 
      
 4 
     | 
    
         
            +
                # AutoBatched serializer
         
     | 
| 
      
 5 
     | 
    
         
            +
                #
         
     | 
| 
      
 6 
     | 
    
         
            +
                # Batch size is computed automatically. Similar to Python's AutoBatchedSerializer.
         
     | 
| 
      
 7 
     | 
    
         
            +
                #
         
     | 
| 
      
 8 
     | 
    
         
            +
                class AutoBatched < Batched

                  # A serialized chunk larger than best_size * MAX_RATIO makes the batch shrink.
                  MAX_RATIO = 10

                  # serializer:: object whose +dump+ turns an Array chunk into a string
                  # best_size::  target size of one serialized chunk, converted with +to_i+
                  #
                  # Reports an error through +error+ when best_size is lower than 2.
                  def initialize(serializer, best_size=65536)
                    @serializer = serializer
                    @best_size = best_size.to_i

                    unless @best_size > 1
                      error('Batch size must be greater than 1')
                    end
                  end

                  # Label of this serializer including the configured best size.
                  def name
                    "AutoBatched(#{@best_size})"
                  end

                  # Serializes +data+ chunk by chunk and writes every chunk to +io+ using
                  # +write_string+, flushing +io+ at the end.
                  #
                  # The number of items per chunk starts at 2. It doubles after a chunk whose
                  # serialized size is below the best size and halves (never below 1) after a
                  # chunk that is bigger than best size * MAX_RATIO.
                  def dump_to_io(data, io)
                    check_each(data)

                    # Chunks are taken by position, which needs an Array.
                    items = data.to_a

                    offset = 0
                    batch_len = 2
                    upper_limit = @best_size * MAX_RATIO

                    loop do
                      piece = items[offset, batch_len]
                      break if piece.nil? || piece.empty?

                      payload = @serializer.dump(piece)
                      io.write_string(payload)

                      offset += batch_len

                      # Adapt the next chunk length to the size just produced.
                      payload_size = payload.bytesize
                      if payload_size < @best_size
                        batch_len *= 2
                      elsif payload_size > upper_limit && batch_len > 1
                        batch_len /= 2
                      end
                    end

                    io.flush
                  end

                end
         
     | 
| 
      
 56 
     | 
    
         
            +
              end
         
     | 
| 
      
 57 
     | 
    
         
            +
            end
         
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
            # Register the class under both spellings of its name.
            Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
         
     | 
| 
         @@ -1,168 +1,61 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Spark
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Serializer
         
     | 
| 
       3 
     | 
    
         
            -
                # @abstract Parent for all  
     | 
| 
      
 3 
     | 
    
         
            +
                # @abstract Parent for all serializers
         
     | 
| 
       4 
4 
     | 
    
         
             
                class Base
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
     | 
    
         
            -
                   
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
      
 6 
     | 
    
         
            +
                  # Reads records from +io+ and yields each one after passing it through +load+.
                  #
                  # Every record is preceded by its size, obtained with +read_int_or_eof+;
                  # reading stops when that value equals Spark::Constant::DATA_EOF.
                  # Without a block an Enumerator over the same records is returned.
                  def load_from_io(io)
                    unless block_given?
                      return to_enum(__callee__, io)
                    end

                    loop do
                      length = io.read_int_or_eof
                      break if length == Spark::Constant::DATA_EOF

                      record = load(io.read(length))
                      yield record
                    end
                  end
         
     | 
| 
       15 
16 
     | 
    
         | 
| 
       16 
     | 
    
         
            -
                  def  
     | 
| 
       17 
     | 
    
         
            -
                     
     | 
| 
       18 
     | 
    
         
            -
                  end
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # Yields every item that +load_from_io+ produces for +file+, then closes and
                  # unlinks the file.
                  #
                  # Without a block an Enumerator is returned and nothing is read (or cleaned
                  # up) until it is iterated.
                  #
                  # file:: object accepted by +load_from_io+ that also responds to +close+
                  #        and +unlink+ (presumably a Tempfile - confirm against callers)
                  # args:: extra arguments handed to +load_from_io+ unchanged
                  def load_from_file(file, *args)
                    return to_enum(__callee__, file, *args) unless block_given?

                    # The cleanup lives in +ensure+ so the file is released even when the
                    # consumer block raises or leaves early with +break+. It must stay below
                    # the guard above: the lazy Enumerator path must not close the file yet.
                    begin
                      load_from_io(file, *args).each do |item|
                        yield item
                      end
                    ensure
                      file.close
                      file.unlink
                    end
                  end
         
     | 
| 
       29 
27 
     | 
    
         | 
| 
       30 
     | 
    
         
            -
                  def  
     | 
| 
       31 
     | 
    
         
            -
                    self. 
     | 
| 
      
 28 
     | 
    
         
            +
                  # Two objects are considered equal when their +to_s+ results are equal.
                  def ==(other)
                    own_label = to_s
                    own_label == other.to_s
                  end
         
     | 
| 
       33 
31 
     | 
    
         | 
| 
       34 
     | 
    
         
            -
                  # nil, 0, 1 are considered as non-batched
         
     | 
| 
       35 
32 
     | 
    
         
             
                  # The base serializer always reports itself as not batched.
                  def batched?
                    false
                  end
         
     | 
| 
       57 
35 
     | 
    
         | 
| 
       58 
     | 
    
         
            -
                   
     | 
| 
       59 
     | 
    
         
            -
                  #
         
     | 
| 
       60 
     | 
    
         
            -
                  #   +------------+--------+
         
     | 
| 
       61 
     | 
    
         
            -
                  #   | signed int |  data  |
         
     | 
| 
       62 
     | 
    
         
            -
                  #   |     4B     |        |
         
     | 
| 
       63 
     | 
    
         
            -
                  #   +------------+--------+
         
     | 
| 
       64 
     | 
    
         
            -
                  #
         
     | 
| 
       65 
     | 
    
         
            -
                  def load_from_io(io)
         
     | 
| 
       66 
     | 
    
         
            -
                    return to_enum(__callee__, io) unless block_given?
         
     | 
| 
       67 
     | 
    
         
            -
             
     | 
| 
       68 
     | 
    
         
            -
                    loop do
         
     | 
| 
       69 
     | 
    
         
            -
                      lenght = read_int(io)
         
     | 
| 
       70 
     | 
    
         
            -
                      break if lenght == DATA_EOF
         
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
       72 
     | 
    
         
            -
                      result = load_next_from_io(io, lenght)
         
     | 
| 
       73 
     | 
    
         
            -
                      if batched? && result.respond_to?(:each)
         
     | 
| 
       74 
     | 
    
         
            -
                        result.each {|item| yield item }
         
     | 
| 
       75 
     | 
    
         
            -
                      else
         
     | 
| 
       76 
     | 
    
         
            -
                        yield result
         
     | 
| 
       77 
     | 
    
         
            -
                      end
         
     | 
| 
       78 
     | 
    
         
            -
                    end # loop
         
     | 
| 
       79 
     | 
    
         
            -
                  end # load_from_io
         
     | 
| 
       80 
     | 
    
         
            -
             
     | 
| 
       81 
     | 
    
         
            -
                  def load_next_from_io(io, lenght)
         
     | 
| 
       82 
     | 
    
         
            -
                    deserialize(io.read(lenght))
         
     | 
| 
      
 36 
     | 
    
         
            +
                  # Does nothing here and returns nil.
                  # NOTE(review): presumably overridden by batching serializers - confirm.
                  def unbatch!
                  end
         
     | 
| 
       84 
38 
     | 
    
         | 
| 
       85 
     | 
    
         
            -
                   
     | 
| 
       86 
     | 
    
         
            -
             
     | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
       88 
     | 
    
         
            -
                    result = []
         
     | 
| 
       89 
     | 
    
         
            -
                    while iterator.hasNext
         
     | 
| 
       90 
     | 
    
         
            -
                      item = iterator.next
         
     | 
| 
       91 
     | 
    
         
            -
             
     | 
| 
       92 
     | 
    
         
            -
                      # mri: data are String
         
     | 
| 
       93 
     | 
    
         
            -
                      # jruby: data are bytes Array
         
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
                      if item.is_a?(String)
         
     | 
| 
       96 
     | 
    
         
            -
                        # Serialized data
         
     | 
| 
       97 
     | 
    
         
            -
                        result << deserialize(item)
         
     | 
| 
       98 
     | 
    
         
            -
                      else
         
     | 
| 
       99 
     | 
    
         
            -
                        # Java object
         
     | 
| 
       100 
     | 
    
         
            -
                        if try(item, :getClass)
         
     | 
| 
       101 
     | 
    
         
            -
                          case item.getClass.name
         
     | 
| 
       102 
     | 
    
         
            -
                          when '[B'
         
     | 
| 
       103 
     | 
    
         
            -
                            # Array of bytes
         
     | 
| 
       104 
     | 
    
         
            -
                            result << deserialize(pack_unsigned_chars(item.to_a))
         
     | 
| 
       105 
     | 
    
         
            -
                          when 'scala.Tuple2'
         
     | 
| 
       106 
     | 
    
         
            -
                            # Tuple2
         
     | 
| 
       107 
     | 
    
         
            -
                            result << deserialize(item._1, item._2)
         
     | 
| 
       108 
     | 
    
         
            -
                          end
         
     | 
| 
       109 
     | 
    
         
            -
                        end
         
     | 
| 
       110 
     | 
    
         
            -
                      end
         
     | 
| 
       111 
     | 
    
         
            -
             
     | 
| 
      
 39 
     | 
    
         
            +
                  # Reports an error through +error+ unless +data+ responds to +each+.
                  def check_each(data)
                    error('Data must be iterable.') unless data.respond_to?(:each)
                  end
         
     | 
| 
       117 
44 
     | 
    
         | 
| 
       118 
     | 
    
         
            -
                  def  
     | 
| 
       119 
     | 
    
         
            -
                     
     | 
| 
       120 
     | 
    
         
            -
                    return DATA_EOF if bytes.nil?
         
     | 
| 
       121 
     | 
    
         
            -
                    unpack_int(bytes)
         
     | 
| 
      
 45 
     | 
    
         
            +
                  # Single exit point for serializer failures.
                  #
                  # message:: human readable description of what went wrong
                  #
                  # Always raises Spark::SerializeError carrying +message+;
                  # never returns normally.
                  def error(message)
                    raise Spark::SerializeError, message
                  end
         
     | 
| 
       123 
48 
     | 
    
         | 
| 
       124 
     | 
    
         
            -
                   
     | 
| 
       125 
     | 
    
         
            -
             
     | 
| 
       126 
     | 
    
         
            -
             
     | 
| 
       127 
     | 
    
         
            -
                  # Serialize and send data into IO. Check 'load_from_io' for data format.
         
     | 
| 
       128 
     | 
    
         
            -
                  def dump(data, io)
         
     | 
| 
       129 
     | 
    
         
            -
                    if !data.is_a?(Array) && !data.is_a?(Enumerator)
         
     | 
| 
       130 
     | 
    
         
            -
                      data = [data]
         
     | 
| 
       131 
     | 
    
         
            -
                    end
         
     | 
| 
       132 
     | 
    
         
            -
                    data = data.each_slice(batch_size) if batched?
         
     | 
| 
       133 
     | 
    
         
            -
             
     | 
| 
       134 
     | 
    
         
            -
                    data.each do |item|
         
     | 
| 
       135 
     | 
    
         
            -
                      serialized = serialize(item)
         
     | 
| 
       136 
     | 
    
         
            -
             
     | 
| 
       137 
     | 
    
         
            -
                      # Size and data can have different encoding
         
     | 
| 
       138 
     | 
    
         
            -
                      # Marshal: both ASCII
         
     | 
| 
       139 
     | 
    
         
            -
                      # Oj: ASCII and UTF-8
         
     | 
| 
       140 
     | 
    
         
            -
                      io.write(pack_int(serialized.bytesize))
         
     | 
| 
       141 
     | 
    
         
            -
                      io.write(serialized)
         
     | 
| 
       142 
     | 
    
         
            -
                    end
         
     | 
| 
       143 
     | 
    
         
            -
             
     | 
| 
       144 
     | 
    
         
            -
                    io.flush
         
     | 
| 
      
 49 
     | 
    
         
            +
                  # Short label for this serializer: the receiver's class name
                  # with every enclosing namespace stripped
                  # (e.g. "Spark::Serializer::Marshal" becomes "Marshal").
                  def name
                    qualified = self.class.name
                    segments  = qualified.split('::')
                    segments.last
                  end
         
     | 
| 
       146 
52 
     | 
    
         | 
| 
       147 
     | 
    
         
            -
                   
     | 
| 
       148 
     | 
    
         
            -
             
     | 
| 
       149 
     | 
    
         
            -
                    data.map! do |item|
         
     | 
| 
       150 
     | 
    
         
            -
                      serialize(item).to_java_bytes
         
     | 
| 
       151 
     | 
    
         
            -
                    end
         
     | 
| 
      
 53 
     | 
    
         
            +
                  # String form of the serializer. Delegates to +name+, so a
                  # class that overrides +name+ changes this output as well.
                  def to_s
                    name
                  end
         
     | 
| 
       153 
56 
     | 
    
         | 
| 
       154 
     | 
    
         
            -
                   
     | 
| 
       155 
     | 
    
         
            -
             
     | 
| 
       156 
     | 
    
         
            -
                  #   mri   => RuntimeError
         
     | 
| 
       157 
     | 
    
         
            -
                  #   jruby => NoMethodError
         
     | 
| 
       158 
     | 
    
         
            -
                  #
         
     | 
| 
       159 
     | 
    
         
            -
                  def try(object, method)
         
     | 
| 
       160 
     | 
    
         
            -
                    begin
         
     | 
| 
       161 
     | 
    
         
            -
                      object.__send__(method)
         
     | 
| 
       162 
     | 
    
         
            -
                      return true
         
     | 
| 
       163 
     | 
    
         
            -
                    rescue
         
     | 
| 
       164 
     | 
    
         
            -
                      return false
         
     | 
| 
       165 
     | 
    
         
            -
                    end
         
     | 
| 
      
 57 
     | 
    
         
            +
                  # Developer-facing representation:
                  #
                  #   #<Spark::Serializer:0x<id>  "<to_s>">
                  #
                  # The identifier follows a "0x" prefix, so it is rendered in
                  # base 16. Previously the decimal object_id was printed after
                  # "0x", which made the value look like hex when it was not.
                  #
                  # Returns a String; the quoted part is whatever +to_s+ yields.
                  def inspect
                    hex_id = object_id.to_s(16)
                    %{#<Spark::Serializer:0x#{hex_id}  "#{self}">}
                  end
         
     | 
| 
       167 
60 
     | 
    
         | 
| 
       168 
61 
     | 
    
         
             
                end
         
     | 
| 
         @@ -0,0 +1,84 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Spark
              module Serializer
                # Serializer wrapper that moves items through an IO in records.
                #
                # Encoding and decoding of each record is delegated to the wrapped
                # serializer. This class only decides how items are grouped: with
                # a batch size above 1, items are sliced into groups of that size
                # and each group becomes one record; with a batch size of 1 every
                # item is its own record.
                class Batched < Base

                  attr_writer :serializer

                  # serializer:: object providing +load+ and +dump+
                  # batch_size:: items per record; when nil (or false) the value of
                  #              Spark::Serializer::DEFAULT_BATCH_SIZE is used
                  #
                  # The size is coerced with +to_i+; a result below 1 is reported
                  # through +error+.
                  def initialize(serializer, batch_size=nil)
                    requested = batch_size || Spark::Serializer::DEFAULT_BATCH_SIZE

                    @serializer = serializer
                    @batch_size = requested.to_i

                    error('Batch size must be greater than 0') if @batch_size < 1
                  end

                  # True only when more than one item is grouped per record.
                  def batched?
                    @batch_size > 1
                  end

                  # Switches the serializer to one item per record.
                  def unbatch!
                    @batch_size = 1
                  end

                  # Decodes one record with the wrapped serializer.
                  def load(data)
                    @serializer.load(data)
                  end

                  # Encodes one record with the wrapped serializer.
                  def dump(data)
                    @serializer.dump(data)
                  end

                  # Label that includes the current batch size.
                  def name
                    "Batched(#{@batch_size})"
                  end

                  # Label followed by the string form of the wrapped serializer.
                  def to_s
                    "#{name} -> #{@serializer}"
                  end


                  # === Dump ==============================================================

                  # Writes every item of +data+ to +io+ and flushes it.
                  #
                  # data:: must respond to +each+ (verified by +check_each+)
                  # io::   must respond to +write_string+ and +flush+
                  #        (presumably +write_string+ length-prefixes the payload,
                  #        since +load_from_io+ reads a size first; confirm in the
                  #        IO extension)
                  #
                  # Returns whatever +io.flush+ returns.
                  def dump_to_io(data, io)
                    check_each(data)

                    records = batched? ? data.each_slice(@batch_size) : data
                    records.each { |record| io.write_string(dump(record)) }

                    io.flush
                  end


                  # === Load ==============================================================

                  # Reads records from +io+ until the end marker and yields the
                  # decoded items one at a time. Without a block an Enumerator
                  # over the same items is returned instead.
                  #
                  # io:: must respond to +read_int_or_eof+ and +read+
                  def load_from_io(io)
                    return enum_for(__callee__, io) unless block_given?

                    loop do
                      length = io.read_int_or_eof
                      break if length == Spark::Constant::DATA_EOF

                      payload = load(io.read(length))

                      # An unbatched record is a single item; pass it on as is.
                      next yield(payload) unless batched?

                      # A batched record holds a group of items; unpack it.
                      payload.each { |item| yield item }
                    end
                  end

                end
              end
            end

            Spark::Serializer.register('batched', Spark::Serializer::Batched)
         
     | 
| 
         @@ -1,37 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Spark
              module Serializer
                # Pair-based serializer whose combining step is a cartesian
                # product of the two sides.
                class Cartesian < Pair

                  # item1:: left-hand collection; must respond to +product+
                  #         (presumably an Array of decoded items; confirm in Pair)
                  # item2:: right-hand collection passed to +product+
                  #
                  # Returns the result of item1.product(item2), i.e. every
                  # combination of one element from each side.
                  def aggregate(item1, item2)
                    item1.product(item2)
                  end

                end
              end
            end

            Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
         
     |