ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - checksums.yaml +4 -4
 - data/.gitignore +1 -1
 - data/README.md +99 -32
 - data/TODO.md +2 -3
 - data/benchmark/{performance → comparison}/prepare.sh +0 -0
 - data/benchmark/{performance → comparison}/python.py +0 -0
 - data/benchmark/{performance → comparison}/r.r +0 -0
 - data/benchmark/{performance → comparison}/ruby.rb +0 -0
 - data/benchmark/{performance → comparison}/run-all.sh +0 -0
 - data/benchmark/{performance → comparison}/scala.scala +0 -0
 - data/example/pi.rb +1 -1
 - data/example/website_search.rb +83 -0
 - data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
 - data/lib/spark.rb +2 -2
 - data/lib/spark/build.rb +1 -1
 - data/lib/spark/cli.rb +1 -1
 - data/lib/spark/command/base.rb +4 -0
 - data/lib/spark/command_builder.rb +2 -2
 - data/lib/spark/config.rb +11 -17
 - data/lib/spark/context.rb +63 -45
 - data/lib/spark/ext/io.rb +11 -1
 - data/lib/spark/java_bridge/base.rb +2 -2
 - data/lib/spark/rdd.rb +67 -18
 - data/lib/spark/serializer.rb +68 -13
 - data/lib/spark/serializer/auto_batched.rb +59 -0
 - data/lib/spark/serializer/base.rb +30 -137
 - data/lib/spark/serializer/batched.rb +84 -0
 - data/lib/spark/serializer/cartesian.rb +5 -29
 - data/lib/spark/serializer/compressed.rb +27 -0
 - data/lib/spark/serializer/marshal.rb +6 -8
 - data/lib/spark/serializer/message_pack.rb +8 -10
 - data/lib/spark/serializer/oj.rb +8 -10
 - data/lib/spark/serializer/pair.rb +27 -13
 - data/lib/spark/serializer/text.rb +25 -0
 - data/lib/spark/version.rb +1 -1
 - data/lib/spark/worker/worker.rb +5 -2
 - data/ruby-spark.gemspec +13 -1
 - data/spec/lib/context_spec.rb +3 -1
 - data/spec/lib/manipulation_spec.rb +18 -10
 - data/spec/lib/map_partitions_spec.rb +16 -16
 - data/spec/lib/serializer_spec.rb +84 -9
 - data/spec/lib/statistic_spec.rb +26 -24
 - data/spec/spec_helper.rb +1 -2
 - metadata +112 -10
 - data/lib/spark/serializer/utf8.rb +0 -25
 
    
data/lib/spark/command/base.rb CHANGED

@@ -32,8 +32,8 @@ module Spark
     def deep_copy
       copy = self.dup
       copy.create_command
-      copy.serializer    = self.serializer. …
-      copy.deserializer  = self.deserializer. …
+      copy.serializer    = self.serializer.deep_copy
+      copy.deserializer  = self.deserializer.deep_copy
       copy.commands      = self.commands.dup
       copy.libraries     = self.libraries.dup
       copy.bound_objects = self.bound_objects.dup
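
The change above swaps a shallow `dup` of the serializer fields for a `deep_copy`. Since serializers are now mutable wrapper objects (batched, compressed; see the serializer diffs below), a shallow copy would alias them between the original command and its copy. A minimal sketch of that aliasing in plain Ruby (illustrative class, not the gem's):

    class Cmd
      attr_accessor :serializer
    end

    a = Cmd.new
    a.serializer = { batch_size: 1024 }

    b = a.dup                      # shallow: a and b share the same serializer object
    b.serializer[:batch_size] = 1  # mutate through the copy...
    a.serializer[:batch_size]      # => 1 -- the original changed too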
    
data/lib/spark/config.rb CHANGED

@@ -9,7 +9,7 @@ module Spark

     TYPES = {
       'spark.shuffle.spill' => :boolean,
-      'spark.ruby. …
+      'spark.ruby.serializer.compress' => :boolean
     }

     # Initialize java SparkConf and load default configuration.

@@ -55,8 +55,8 @@ module Spark
         errors << 'A master URL must be set in your configuration.'
       end

-      if Spark::Serializer. …
-        errors << ' …
+      if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+        errors << 'Unknow serializer.'
       end

       scanned = get('spark.ruby.executor.command').scan('%s')

@@ -137,9 +137,9 @@ module Spark
       set_app_name('RubySpark')
       set_master('local[*]')
       set('spark.ruby.driver_home', Spark.home)
-      set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
       set('spark.ruby.serializer', default_serializer)
-      set('spark.ruby. …
+      set('spark.ruby.serializer.compress', default_serializer_compress)
+      set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
       set('spark.ruby.executor.uri', default_executor_uri)
       set('spark.ruby.executor.command', default_executor_command)
       set('spark.ruby.executor.options', default_executor_options)

@@ -147,22 +147,16 @@ module Spark
       load_executor_envs
     end

-    # How to handle with data in method parallelize.
-    #
-    # == Possible options:
-    # inplace:: data are changed directly to save memory
-    # deep_copy:: data are cloned fist
-    #
-    def default_parallelize_strategy
-      ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
-    end
-
     def default_serializer
       ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
     end

-    def …
-      ENV[' …
+    def default_serializer_compress
+      ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+    end
+
+    def default_serializer_batch_size
+      ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
     end

     # Ruby executor.
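
The net effect: the `spark.ruby.parallelize_strategy` option is gone, replaced by serializer settings (`spark.ruby.serializer.compress`, `spark.ruby.serializer.batch_size`) with matching environment variables. A configuration sketch based on the key names above and the block-style `Spark.config` from the gem's README (values are illustrative):

    require 'ruby-spark'

    Spark.config do
      set_app_name 'RubySpark'
      set 'spark.ruby.serializer', 'marshal'
      set 'spark.ruby.serializer.compress', false
      set 'spark.ruby.serializer.batch_size', 2048
    end

    Spark.start

    # Equivalent environment variables, read by the defaults above:
    #   SPARK_RUBY_SERIALIZER, SPARK_RUBY_SERIALIZER_COMPRESS,
    #   SPARK_RUBY_SERIALIZER_BATCH_SIZE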
    
data/lib/spark/context.rb CHANGED

@@ -2,6 +2,7 @@
 Spark.load_lib

 module Spark
+  ##
   # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
   # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
   #

@@ -57,10 +58,38 @@ module Spark
       sc.defaultParallelism
     end

-    …
-    …
-    …
-    …
+    # Default serializer
+    #
+    # Batch -> Compress -> Basic
+    #
+    def default_serializer
+      # Basic
+      serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+      # Compress
+      if config('spark.ruby.serializer.compress')
+        serializer = Spark::Serializer.compressed(serializer)
+      end
+
+      # Bactching
+      batch_size = default_batch_size
+      if batch_size == 'auto'
+        serializer = Spark::Serializer.auto_batched(serializer)
+      else
+        serializer = Spark::Serializer.batched(serializer, batch_size)
+      end
+
+      # Finally, "container" contains serializers
+      serializer
+    end
+
+    def default_batch_size
+      size = config('spark.ruby.serializer.batch_size').to_i
+      if size >= 1
+        size
+      else
+        'auto'
+      end
     end

     # Set a local property that affects jobs submitted from this thread, such as the

@@ -93,12 +122,11 @@ module Spark
     # be changed at runtime.
     #
     def config(key=nil)
-      …
-
-      …
-
-      …
-      Spark.config
+      if key
+        Spark.config.get(key)
+      else
+        Spark.config
+      end
     end

     # Add a file to be downloaded with this Spark job on every node.

@@ -164,10 +192,7 @@ module Spark
     # == Parameters:
     # data:: Range or Array
     # num_slices:: number of slice
-    # …
-    #   - use
-    #   - serializer
-    #   - batch_size
+    # serializer:: custom serializer (default: serializer based on configuration)
     #
     # == Examples:
     #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect

@@ -176,33 +201,21 @@ module Spark
     #   $sc.parallelize(1..3).map(:to_s).collect
     #   #=> ["1", "2", "3"]
     #
-    def parallelize(data, num_slices=nil, …
+    def parallelize(data, num_slices=nil, serializer=nil)
       num_slices ||= default_parallelism
+      serializer ||= default_serializer

-      …
-      use = :file
-      serializer = get_serializer(options[:serializer], options[:batch_size])
-
-      if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
-        data = data.deep_copy
-      else
-        # For enumerator or range
-        data = data.to_a
-      end
+      serializer.check_each(data)

-      …
-      …
-
-
-      …
-        file = Tempfile.new('to_parallelize', temp_dir)
-        serializer.dump(data, file)
-        file.close # not unlink
-        jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
-        file.unlink
-      end
+      # Through file
+      file = Tempfile.new('to_parallelize', temp_dir)
+      serializer.dump_to_io(data, file)
+      file.close # not unlink
+      jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)

       Spark::RDD.new(jrdd, self, serializer)
+    ensure
+      file && file.unlink
     end

     # Read a text file from HDFS, a local file system (available on all nodes), or any

@@ -217,11 +230,12 @@ module Spark
     #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
     #   # => [1, 2]
     #
-    def text_file(path, min_partitions=nil, …
+    def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
       min_partitions ||= default_parallelism
-      serializer …
+      serializer     ||= default_serializer
+      deserializer     = Spark::Serializer.build { __text__(encoding) }

-      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, …
+      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
     end

     # Read a directory of text files from HDFS, a local file system (available on all nodes), or any

@@ -240,10 +254,10 @@ module Spark
     #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
     #   # => ["1", "2", "3", "4"]
     #
-    def whole_text_files(path, min_partitions=nil, …
+    def whole_text_files(path, min_partitions=nil, serializer=nil)
       min_partitions ||= default_parallelism
-      serializer …
-      deserializer …
+      serializer     ||= default_serializer
+      deserializer     = Spark::Serializer.build{ __pair__(__text__, __text__) }

       Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
     end

@@ -254,7 +268,7 @@ module Spark
     # If partitions is not specified, this will run over all partitions.
     #
     # == Example:
-    #   rdd = $sc.parallelize(0..10, 5 …
+    #   rdd = $sc.parallelize(0..10, 5)
     #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
     #   # => ["[0, 1]", "[4, 5]"]
     #

@@ -282,9 +296,13 @@ module Spark
       # Rjb represent Fixnum as Integer but Jruby as Long
       partitions = to_java_array_list(convert_to_java_int(partitions))

+      # File for result
+      file = Tempfile.new('collect', temp_dir)
+
       mapped = rdd.new_rdd_from_command(command, *args)
-      …
-
+      RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+      mapped.collect_from_file(file)
     end
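
In sum, `parallelize`, `text_file` and `whole_text_files` drop their options hashes in favor of an optional serializer argument, and the default serializer is now composed as batched(compressed(basic)). A usage sketch restricted to constructors that appear in this diff (assumes a started context in `$sc`):

    # Default chain, derived from configuration
    rdd = $sc.parallelize(1..10)

    # Explicit serializer via the block DSL seen above
    ser = Spark::Serializer.build { __batched__(__marshal__, 1024) }
    rdd = $sc.parallelize(1..10, 2, ser)

    # text_file now takes an encoding before the optional serializer
    lines = $sc.text_file('data.txt', nil, Encoding::UTF_8)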
    
data/lib/spark/ext/io.rb CHANGED

@@ -12,6 +12,12 @@ module Spark
         unpack_int(read(4))
       end

+      def read_int_or_eof
+        bytes = read(4)
+        return Spark::Constant::DATA_EOF if bytes.nil?
+        unpack_int(bytes)
+      end
+
       def read_long
         unpack_long(read(8))
       end

@@ -35,8 +41,11 @@ module Spark
         write(pack_long(data))
       end

+      # Size and data can have different encoding
+      # Marshal: both ASCII
+      # Oj: ASCII and UTF-8
       def write_string(data)
-        write_int(data. …
+        write_int(data.bytesize)
         write(data)
       end

@@ -55,3 +64,4 @@ module Spark
 end

 IO.__send__(:include, Spark::CoreExtension::IO)
+StringIO.__send__(:include, Spark::CoreExtension::IO)
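
The `write_string` change writes the byte length rather than the character length. The reader consumes a 4-byte length prefix and then exactly that many bytes, so a character count would under-read any multibyte payload (for example UTF-8 output from the Oj serializer). Plain Ruby illustration:

    s = "žlutý kůň"   # UTF-8
    s.size            # => 9  (characters)
    s.bytesize        # => 13 (bytes -- what the length prefix must carry)
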
data/lib/spark/java_bridge/base.rb CHANGED

@@ -145,8 +145,8 @@ module Spark
           if class_name == 'JavaRDD'
             jrdd = RubyRDD.toRuby(object)

-            serializer …
-            …
+            serializer   = Spark::Serializer.build { __batched__(__marshal__) }
+            deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }

             return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
           end
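
The `Spark::Serializer.build { ... }` block DSL used here resolves the double-underscored names to registered serializers; judging by the helper methods elsewhere in this diff, an equivalent chain can presumably be built without the DSL:

    # Both should produce a Marshal serializer batched by 2 (per the diff above):
    a = Spark::Serializer.build { __batched__(__marshal__, 2) }
    b = Spark::Serializer.batched(Spark::Serializer.marshal, 2)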
    
data/lib/spark/rdd.rb CHANGED

@@ -34,6 +34,18 @@ module Spark
     @command = Spark::CommandBuilder.new(serializer, deserializer)
   end

+  def inspect
+    comms = @command.commands.join(' -> ')
+
+    result  = %{#<#{self.class.name}:0x#{object_id}}
+    result << %{ (#{comms})} unless comms.empty?
+    result << %{\n}
+    result << %{  Serializer: "#{serializer}"\n}
+    result << %{Deserializer: "#{deserializer}"}
+    result << %{>}
+    result
+  end
+

   # =============================================================================
   # Operators

@@ -159,7 +171,16 @@ module Spark
   end

   def to_java
-    …
+    marshal = Spark::Serializer.marshal
+
+    if deserializer.batched?
+      ser = deserializer.deep_copy
+      ser.serializer = marshal
+    else
+      ser = Spark::Serializer.batched(marshal)
+    end
+
+    rdd = self.reserialize(ser)
     RubyRDD.toJava(rdd.jrdd, rdd.serializer.batched?)
   end

@@ -169,20 +190,32 @@ module Spark

   # Return an array that contains all of the elements in this RDD.
   # RJB raise an error if stage is killed.
-  def collect
-    …
+  def collect(as_enum=false)
+    file = Tempfile.new('collect', context.temp_dir)
+
+    RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
+
+    collect_from_file(file, as_enum)
   rescue => e
     raise Spark::RDDError, e.message
   end

-  def …
+  def collect_from_file(file, as_enum=false)
     if self.is_a?(PipelinedRDD)
       klass = @command.serializer
     else
       klass = @command.deserializer
     end

-    …
+    if as_enum
+      result = klass.load_from_file(file)
+    else
+      result = klass.load_from_io(file).to_a
+      file.close
+      file.unlink
+    end
+
+    result
   end

   # Convert an Array to Hash

@@ -198,7 +231,7 @@ module Spark
   # to satisfy the limit.
   #
   # == Example:
-  #   rdd = $sc.parallelize(0..100, 20 …
+  #   rdd = $sc.parallelize(0..100, 20)
   #   rdd.take(5)
   #   # => [0, 1, 2, 3, 4]
   #

@@ -293,7 +326,7 @@ module Spark
   #   seq = lambda{|x,y| x+y}
   #   com = lambda{|x,y| x*y}
   #
-  #   rdd = $sc.parallelize(1..10, 2 …
+  #   rdd = $sc.parallelize(1..10, 2)
   #   rdd.aggregate(1, seq, com)
   #   # => 656
   #

@@ -590,7 +623,7 @@ module Spark
   # of the original partition.
   #
   # == Example:
-  #   rdd = $sc.parallelize(0...4, 4 …
+  #   rdd = $sc.parallelize(0...4, 4)
   #   rdd.map_partitions_with_index(lambda{|part, index| part.first * index}).collect
   #   # => [0, 1, 4, 9]
   #

@@ -623,7 +656,7 @@ module Spark
   # Return an RDD created by coalescing all elements within each partition into an array.
   #
   # == Example:
-  #   rdd = $sc.parallelize(0..10, 3 …
+  #   rdd = $sc.parallelize(0..10, 3)
   #   rdd.glom.collect
   #   # => [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
   #

@@ -639,8 +672,14 @@ module Spark
   #   # => [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9, 10]]
   #
   def coalesce(num_partitions)
+    if self.is_a?(PipelinedRDD)
+      deser = @command.serializer
+    else
+      deser = @command.deserializer
+    end
+
     new_jrdd = jrdd.coalesce(num_partitions)
-    RDD.new(new_jrdd, context, @command.serializer, …
+    RDD.new(new_jrdd, context, @command.serializer, deser)
   end

   # Return the Cartesian product of this RDD and another one, that is, the

@@ -655,7 +694,8 @@ module Spark
   #   # => [[1, 4], [1, 5], [1, 6], [2, 4], [2, 5], [2, 6], [3, 4], [3, 5], [3, 6]]
   #
   def cartesian(other)
-    _deserializer = Spark::Serializer::Cartesian.new …
+    _deserializer = Spark::Serializer::Cartesian.new(self.deserializer, other.deserializer)
+
     new_jrdd = jrdd.cartesian(other.jrdd)
     RDD.new(new_jrdd, context, serializer, _deserializer)
   end

@@ -697,7 +737,7 @@ module Spark
   #
   def union(other)
     if self.serializer != other.serializer
-      other = other.reserialize(serializer …
+      other = other.reserialize(serializer)
     end

     new_jrdd = jrdd.union(other.jrdd)

@@ -713,10 +753,7 @@ module Spark
   #   rdd.reserialize("oj").collect
   #   # => ["1", "2", "3"]
   #
-  def reserialize(new_serializer …
-    new_batch_size ||= deserializer.batch_size
-    new_serializer = Spark::Serializer.get!(new_serializer).new(new_batch_size)
-
+  def reserialize(new_serializer)
     if serializer == new_serializer
       return self
     end

@@ -906,7 +943,7 @@ module Spark
   #     x+y
   #   end
   #
-  #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2 …
+  #   rdd = $sc.parallelize(["a","b","c","a","b","c","a","c"], 2).map(lambda{|x| [x, 1]})
   #   rdd.combine_by_key(method(:combiner), method(:merge), method(:merge)).collect_as_hash
   #   # => {"a"=>3, "b"=>2, "c"=>3}
   #

@@ -973,7 +1010,7 @@ module Spark
   #     x*y
   #   end
   #
-  #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2 …
+  #   rdd = $sc.parallelize([["a", 1], ["b", 2], ["a", 3], ["a", 4], ["c", 5]], 2)
  #   rdd.aggregate_by_key(1, method(:combine), method(:merge))
   #   # => [["b", 3], ["a", 16], ["c", 6]]
   #

@@ -1064,6 +1101,17 @@ module Spark
     self.sort_by('lambda{|(key, _)| key}')
   end

+  # Sort the RDD by value
+  #
+  # == Example:
+  #   rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])
+  #   rdd.sort_by_value.collect
+  #   # => [["b", 1], ["c", 2], ["a", 3]]
+  #
+  def sort_by_value(ascending=true, num_partitions=nil)
+    self.sort_by('lambda{|(_, value)| value}')
+  end
+
   # Sorts this RDD by the given key_function
   #
   # This is a different implementation than spark. Sort by doesn't use

@@ -1190,6 +1238,7 @@ module Spark
   alias_method :defaultReducePartitions, :default_reduce_partitions
   alias_method :setName, :set_name
   alias_method :addLibrary, :add_library
+  alias_method :require, :add_library

   alias_method :flatMap, :flat_map
   alias_method :mapPartitions, :map_partitions
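
A few of the RDD additions above in use; expected outputs are taken from the doc comments in the diff, and the `as_enum` behaviour is inferred from `collect_from_file` (assumes a started context in `$sc`):

    rdd = $sc.parallelize([["a", 3], ["b", 1], ["c", 2]])

    rdd.sort_by_value.collect
    # => [["b", 1], ["c", 2], ["a", 3]]

    rdd.collect(true)   # as_enum: presumably an enumerator streamed from the collect file
    puts rdd.inspect    # command chain plus Serializer/Deserializer description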