ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - checksums.yaml +4 -4
 - data/.gitignore +1 -1
 - data/README.md +99 -32
 - data/TODO.md +2 -3
 - data/benchmark/{performance → comparison}/prepare.sh +0 -0
 - data/benchmark/{performance → comparison}/python.py +0 -0
 - data/benchmark/{performance → comparison}/r.r +0 -0
 - data/benchmark/{performance → comparison}/ruby.rb +0 -0
 - data/benchmark/{performance → comparison}/run-all.sh +0 -0
 - data/benchmark/{performance → comparison}/scala.scala +0 -0
 - data/example/pi.rb +1 -1
 - data/example/website_search.rb +83 -0
 - data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
 - data/lib/spark.rb +2 -2
 - data/lib/spark/build.rb +1 -1
 - data/lib/spark/cli.rb +1 -1
 - data/lib/spark/command/base.rb +4 -0
 - data/lib/spark/command_builder.rb +2 -2
 - data/lib/spark/config.rb +11 -17
 - data/lib/spark/context.rb +63 -45
 - data/lib/spark/ext/io.rb +11 -1
 - data/lib/spark/java_bridge/base.rb +2 -2
 - data/lib/spark/rdd.rb +67 -18
 - data/lib/spark/serializer.rb +68 -13
 - data/lib/spark/serializer/auto_batched.rb +59 -0
 - data/lib/spark/serializer/base.rb +30 -137
 - data/lib/spark/serializer/batched.rb +84 -0
 - data/lib/spark/serializer/cartesian.rb +5 -29
 - data/lib/spark/serializer/compressed.rb +27 -0
 - data/lib/spark/serializer/marshal.rb +6 -8
 - data/lib/spark/serializer/message_pack.rb +8 -10
 - data/lib/spark/serializer/oj.rb +8 -10
 - data/lib/spark/serializer/pair.rb +27 -13
 - data/lib/spark/serializer/text.rb +25 -0
 - data/lib/spark/version.rb +1 -1
 - data/lib/spark/worker/worker.rb +5 -2
 - data/ruby-spark.gemspec +13 -1
 - data/spec/lib/context_spec.rb +3 -1
 - data/spec/lib/manipulation_spec.rb +18 -10
 - data/spec/lib/map_partitions_spec.rb +16 -16
 - data/spec/lib/serializer_spec.rb +84 -9
 - data/spec/lib/statistic_spec.rb +26 -24
 - data/spec/spec_helper.rb +1 -2
 - metadata +112 -10
 - data/lib/spark/serializer/utf8.rb +0 -25
 
    
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
+  data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
+  data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
    
        data/.gitignore
    CHANGED
    
    
    
data/README.md
CHANGED

@@ -20,6 +20,12 @@ file.flat_map(:split)
 
 ## Installation
 
+### Requirments
+
+- Java 7+
+- Ruby 2+
+- MRI or JRuby
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -38,33 +44,34 @@ Or install it yourself as:
 $ gem install ruby-spark
 ```
 
-
+Run `rake compile` if you are using gem from local filesystem.
 
-
+### Build Apache Spark
+
+This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target,
 
-```
-$ rake compile
-```
-Then build Spark, [SBT](ext/spark/build.sbt) is used for compiling.
 ```
 $ ruby-spark build
 ```
 
 ## Usage
 
-You can use Ruby Spark via interactive shell
+You can use Ruby Spark via interactive shell (Pry is used)
 
 ```
-$ ruby-spark
+$ ruby-spark shell
 ```
 
 Or on existing project
 
 ```ruby
 require 'ruby-spark'
+
+# Create a SparkContext
 Spark.start
 
-
+# Context reference
+Spark.sc
 ```
 
 If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
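Taken together, the new Usage lines describe a session like the one below (a minimal sketch built only from calls shown in this README; `data.txt` is an illustrative path):

```ruby
require 'ruby-spark'

# Create a SparkContext
Spark.start

# Work through the context reference
rdd = Spark.sc.text_file('data.txt')   # illustrative path
puts rdd.count

# Stop the cluster (the interactive shell does this automatically on exit)
Spark.stop
```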
@@ -72,36 +79,47 @@ If you want configure Spark first. See [configurations](https://github.com/ondra
 ```ruby
 require 'ruby-spark'
 
+# Use if you have custom SPARK_HOME
 Spark.load_lib(spark_home)
+
+# Configuration
 Spark.config do
    set_app_name "RubySpark"
-   set 'spark.ruby.batch_size', 100
    set 'spark.ruby.serializer', 'oj'
+   set 'spark.ruby.serializer.batch_size', 100
 end
+
+# Start Apache Spark
 Spark.start
+```
+
+Finally, to stop the cluster. On the shell is Spark stopped automatically when you exist.
 
-
+```ruby
+Spark.stop
 ```
 
-## Uploading a data
 
-
+
+## Creating RDD (upload data)
+
+Single text file:
 
 ```ruby
-
+rdd = sc.text_file(FILE, workers_num, serializer=nil)
 ```
 
-All files on directory
+All files on directory:
 
 ```ruby
-
+rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct
+Direct uploading structures from ruby (choosen serializer must be able to serialize it):
 
 ```ruby
-
-
+rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
+rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
 ### Options
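The three constructors added above share their trailing arguments: a worker count and an optional serializer, where `nil` falls back to the `spark.ruby.serializer` setting. A minimal sketch with an in-memory range (the worker count of 2 is arbitrary):

```ruby
# Split an in-memory range across 2 workers; with no serializer given,
# the spark.ruby.serializer config option applies.
rdd = Spark.sc.parallelize(1..100, 2)
rdd.collect.size # => 100
```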
@@ -113,39 +131,74 @@ $sc.parallelize(1..5, workers_num, custom_options)
    <i>(This value can be overwriten by spark)</i>
  </dd>
 
-  <dt>
+  <dt>serializer</dt>
  <dd>
-
-    <b>
-    <br>
-    <i>(Available only for parallelize)</i><br>
-    <b>use</b>: <i>direct (upload direct to java)</i>, <i>file (upload throught a file)</i>
+    Custom serializer.<br>
+    <i>(default: by <b>spark.ruby.serializer</b> options)</i>
  </dd>
 </dl>
 
+## Operations
+
+All operations can be divided into 2 groups:
+
+- **Transformations:** append new operation to current RDD and return new
+- **Actions:** add operation and start calculations
+
+See [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+
+#### Transformations
+
+```ruby
+rdd.map(lambda{|item| ...})
+rdd.flat_map(lambda{|item| ...})
+rdd.filter(lambda{|item| ...})
+rdd.union(rdd)
+rdd.map_paritions(lambda{|iterator| ...})
+# ...
+```
+
+#### Actions
+
+```ruby
+rdd.count
+rdd.take(n)
+rdd.collect
+# ...
+```
+
 
 ## Examples
 
 Sum of numbers
 
 ```ruby
-
+sc.parallelize(0..10).sum
 # => 55
 ```
 
 Words count using methods
 
 ```ruby
-
+# Content:
+# "first line"
+# "second line"
+rdd = sc.text_file(PATH)
 
+# ["first", "line", "second", "line"]
 rdd = rdd.flat_map(lambda{|line| line.split})
-         .map(lambda{|word| [word, 1]})
-         .reduce_by_key(lambda{|a, b| a+b})
 
+# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
+rdd = rdd.map(lambda{|word| [word, 1]})
+
+# [["first", 1], ["line", 2], ["second", 1]]
+rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
+
+# {"first"=>1, "line"=>2, "second"=>1}
 rdd.collect_as_hash
 ```
 
-Estimating
+Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -168,18 +221,32 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
+Estimating PI
+
+```ruby
+rdd = sc.parallelize([10_000], 1)
+rdd = rdd.add_library('bigdecimal/math')
+rdd = rdd.map(lambda{|x| BigMath.PI(x)})
+rdd.collect # => #<BigDecimal, '0.31415926...'>
+```
+
 Linear regression
 
 ```ruby
-
+# Import Mllib classes into Object
+# Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD
+Spark::Mllib.import(Object)
 
+# Training data
 data = [
   LabeledPoint.new(0.0, [0.0]),
   LabeledPoint.new(1.0, [1.0]),
   LabeledPoint.new(3.0, [2.0]),
   LabeledPoint.new(2.0, [3.0])
 ]
-
+
+# Train a model
+lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
 
 lrm.predict([0.0])
 ```
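One point implicit in the Operations section above: transformations only queue work, so the staged word count computes nothing until `collect_as_hash` runs. The same pipeline can therefore be written as a single chain (a sketch reusing only methods from this README; `PATH` is the diff's own placeholder):

```ruby
counts = sc.text_file(PATH)
           .flat_map(lambda{|line| line.split})
           .map(lambda{|word| [word, 1]})
           .reduce_by_key(lambda{|a, b| a + b})
           .collect_as_hash
# => {"first"=>1, "line"=>2, "second"=>1}
```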
    
data/TODO.md
CHANGED

@@ -1,7 +1,6 @@
-- add compress
 - refactor JavaBridge: to_java, from_java
 - add Streaming
 - add SQL
-- autobatch serializer
 - worker informations (time, memory, ...)
--
+- killing zombie workers
+- global config to ~/.ruby-spark.conf (e.g. target folder for spark)

data/benchmark/{performance → comparison}/prepare.sh
RENAMED
File without changes

data/benchmark/{performance → comparison}/python.py
RENAMED
File without changes

data/benchmark/{performance → comparison}/r.r
RENAMED
File without changes

data/benchmark/{performance → comparison}/ruby.rb
RENAMED
File without changes

data/benchmark/{performance → comparison}/run-all.sh
RENAMED
File without changes

data/benchmark/{performance → comparison}/scala.scala
RENAMED
File without changes
    
        data/example/pi.rb
    CHANGED
    
    
data/example/website_search.rb
ADDED

@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+# Parse sitemap and search word on every page
+
+require 'optparse'
+require 'open-uri'
+require 'nokogiri'
+require 'ruby-spark'
+
+options = {
+  sitemap: 'http://fit.cvut.cz/sitemap.xml',
+  query: 'cvut',
+  workers: 2
+}
+
+opt_parser = OptionParser.new do |opts|
+  opts.banner = 'Usage: website_search.rb [options]'
+
+  opts.separator ''
+  opts.separator 'Specific options:'
+
+  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
+    options[:sitemap] = sitemap
+  end
+
+  opts.on('-q', '--query QUERY', 'Query to search') do |query|
+    options[:query] = query
+  end
+
+  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
+    options[:workers] = workers
+  end
+
+  opts.on('--quite', 'Run quitely') do |v|
+    Spark.logger.disabled
+  end
+
+  opts.on_tail('-h', '--help', 'Show this message') do
+    puts opts
+    exit
+  end
+end
+
+opt_parser.parse!
+
+@links = []
+
+def parse_sitemap(doc)
+  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
+    next_doc = Nokogiri::HTML(open(loc.text))
+    parse_sitemap(next_doc)
+  end
+
+  doc.xpath('//url/loc').each do |loc|
+    @links << loc.text
+  end
+end
+
+doc = Nokogiri::HTML(open(options[:sitemap]))
+parse_sitemap(doc)
+
+# Map function
+func = Proc.new do |url|
+  begin
+    open(url) {|f|
+      [url, f.read.scan(query).count]
+    }
+  rescue
+    [url, 0]
+  end
+end
+
+Spark.start
+
+rdd = Spark.sc.parallelize(@links, options[:workers])
+              .add_library('open-uri')
+              .bind(query: options[:query])
+              .map(func)
+              .sort_by(lambda{|(_, value)| value}, false)
+
+rdd.collect.each do |(url, count)|
+  puts "#{url} => #{count}"
+end
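In the new example, `add_library('open-uri')` loads the dependency on the workers, and `bind(query: options[:query])` is what makes the local `query` value visible inside `func` there. A possible invocation, using the flags defined by the OptionParser above (the sitemap URL is the script's own default; the worker count is arbitrary):

```
$ ruby website_search.rb --sitemap http://fit.cvut.cz/sitemap.xml --query cvut --workers 4
```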
data/ext/spark/src/main/scala/RubyRDD.scala
CHANGED

@@ -290,9 +290,21 @@ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev)
 
 object RubyRDD extends Logging {
 
+  def runJob(
+      sc: SparkContext,
+      rdd: JavaRDD[Array[Byte]],
+      partitions: ArrayList[Int],
+      allowLocal: Boolean,
+      filename: String): String = {
+    type ByteArray = Array[Byte]
+    type UnrolledPartition = Array[ByteArray]
+    val allPartitions: Array[UnrolledPartition] =
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
+    writeRDDToFile(flattenedPartition.iterator, filename)
+  }
+
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
-    // Too slow
-    // val file = new DataInputStream(new FileInputStream(filename))
     val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
     val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
     try {

@@ -308,6 +320,22 @@ object RubyRDD extends Logging {
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
 
+  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
+    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
+
+    try {
+      PythonRDD.writeIteratorToStream(items, file)
+    } finally {
+      file.close()
+    }
+
+    filename
+  }
+
+  def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
+    writeRDDToFile(rdd.collect.iterator, filename)
+  }
+
   def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
     sc.broadcast(new RubyBroadcast(path, id))
   }
     | 
    
        data/lib/spark.rb
    CHANGED
    
    
    
        data/lib/spark/build.rb
    CHANGED
    
    
    
data/lib/spark/cli.rb
CHANGED

@@ -21,7 +21,7 @@ module Spark
       program :version, Spark::VERSION
       program :description, 'Ruby wrapper for Spark'
 
-      global_option('-d', '--debug', 'Logging message to stdout'){ $
+      global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
       default_command :help
 
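The old right-hand side of the `-d` option is truncated in this diff source; the new body simply sets Ruby's built-in `$DEBUG` flag. As a usage sketch (assuming commander-style global options, which accept the flag alongside any subcommand):

```
$ ruby-spark shell --debug
```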