ruby-spark 1.0.0 → 1.1.0.1

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 13c074c360edf1875745bf728b274f5bfb6e0d0a
-  data.tar.gz: a59ba09fac91e2e4c0a58aab99dd60ffb2ec5e3f
+  metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
+  data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
 SHA512:
-  metadata.gz: 2e667dabd55b05100831cf3d0e58044941ce965965de1d6dce9a4e8fa5be843bfac1d57d5b6674ea056b9e9395650b84e3e77a86f0a256a1b1c4bfd8ca257340
-  data.tar.gz: 690b88857fa4f841c8c0a5940af75d926285f42a81ff6753337ccc926473a4a3ce510d1c7e9b07ce2ebfd2559c05dd77432cf1dba132f15629669fbf8dfc51b6
+  metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
+  data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
data/.gitignore CHANGED
@@ -32,6 +32,6 @@ mkmf.log
 ext/spark/target/*
 ext/spark/project/target/*
 ext/spark/project/project/target/*
-wiki/*
+wiki
 /benchmark/performance/spark/*
 /benchmark/performance/rspark/*
data/README.md CHANGED
@@ -20,6 +20,12 @@ file.flat_map(:split)
 
 ## Installation
 
+### Requirements
+
+- Java 7+
+- Ruby 2+
+- MRI or JRuby
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -38,33 +44,34 @@ Or install it yourself as:
 $ gem install ruby-spark
 ```
 
-### Install Apache Spark
+Run `rake compile` if you are using the gem from the local filesystem.
 
-To install latest supported Spark. First compile native extension:
+### Build Apache Spark
+
+This command will download Spark and build the extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information, check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target.
 
-```
-$ rake compile
-```
-Then build Spark, [SBT](ext/spark/build.sbt) is used for compiling.
 ```
 $ ruby-spark build
 ```
 
 ## Usage
 
-You can use Ruby Spark via interactive shell
+You can use Ruby Spark via an interactive shell (Pry is used)
 
 ```
-$ ruby-spark pry
+$ ruby-spark shell
 ```
 
 Or on existing project
 
 ```ruby
 require 'ruby-spark'
+
+# Create a SparkContext
 Spark.start
 
-Spark.sc # => context
+# Context reference
+Spark.sc
 ```
 
 If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
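Collected from the hunk above, the end-to-end setup the changed README now describes (note that `ruby-spark pry` is renamed to `ruby-spark shell`):

```
$ gem install ruby-spark
$ ruby-spark build
$ ruby-spark shell
```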
@@ -72,36 +79,47 @@ If you want configure Spark first. See [configurations](https://github.com/ondra
 ```ruby
 require 'ruby-spark'
 
+# Use if you have a custom SPARK_HOME
 Spark.load_lib(spark_home)
+
+# Configuration
 Spark.config do
   set_app_name "RubySpark"
-  set 'spark.ruby.batch_size', 100
   set 'spark.ruby.serializer', 'oj'
+  set 'spark.ruby.serializer.batch_size', 100
 end
+
+# Start Apache Spark
 Spark.start
+```
+
+Finally, to stop the cluster. In the shell, Spark is stopped automatically when you exit.
 
-Spark.sc # => context
+```ruby
+Spark.stop
 ```
 
-## Uploading a data
 
-Single file
+
+## Creating RDD (upload data)
+
+Single text file:
 
 ```ruby
-$sc.text_file(FILE, workers_num, custom_options)
+rdd = sc.text_file(FILE, workers_num, serializer=nil)
 ```
 
-All files on directory
+All files in a directory:
 
 ```ruby
-$sc.whole_text_files(DIRECTORY, workers_num, custom_options)
+rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct
+Uploading structures directly from Ruby (the chosen serializer must be able to serialize them):
 
 ```ruby
-$sc.parallelize([1,2,3,4,5], workers_num, custom_options)
-$sc.parallelize(1..5, workers_num, custom_options)
+rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
+rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
 ### Options
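A hedged sketch of passing a custom serializer under the new positional signature shown above. Only the `serializer` parameter is confirmed by the README diff; the `Spark::Serializer.build` DSL below is an assumption inferred from the new serializer files in this release (`batched.rb`, `marshal.rb`):

```ruby
# Assumed construction API; labelled as such. Only the positional
# serializer argument is confirmed by the README changes above.
ser = Spark::Serializer.build { __batched__(__marshal__, 100) }
rdd = sc.parallelize(1..1000, 2, ser)
```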
@@ -113,39 +131,74 @@ $sc.parallelize(1..5, workers_num, custom_options)
   <i>(This value can be overwriten by spark)</i>
   </dd>
 
-  <dt>custom_options</dt>
+  <dt>serializer</dt>
   <dd>
-    <b>serializer</b>: name of serializator used for this RDD<br>
-    <b>batch_size</b>: see configuration<br>
-    <br>
-    <i>(Available only for parallelize)</i><br>
-    <b>use</b>: <i>direct (upload direct to java)</i>, <i>file (upload throught a file)</i>
+    Custom serializer.<br>
+    <i>(default: set by the <b>spark.ruby.serializer</b> option)</i>
   </dd>
 </dl>
 
+## Operations
+
+All operations can be divided into 2 groups:
+
+- **Transformations:** append a new operation to the current RDD and return a new one
+- **Actions:** add an operation and start the calculation
+
+See the [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+
+#### Transformations
+
+```ruby
+rdd.map(lambda{|item| ...})
+rdd.flat_map(lambda{|item| ...})
+rdd.filter(lambda{|item| ...})
+rdd.union(rdd)
+rdd.map_partitions(lambda{|iterator| ...})
+# ...
+```
+
+#### Actions
+
+```ruby
+rdd.count
+rdd.take(n)
+rdd.collect
+# ...
+```
+
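A minimal sketch of how the two groups added above interact (using `sc` as in the surrounding examples): transformations only queue work on the RDD, and the first action triggers the actual computation.

```ruby
rdd = sc.parallelize(1..10)

doubled = rdd.map(lambda{|x| x * 2})  # transformation: returns a new RDD, nothing runs yet
doubled.collect                       # action: starts the calculation
# => [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
```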
 
 ## Examples
 
 Sum of numbers
 
 ```ruby
-$sc.parallelize(0..10).sum
+sc.parallelize(0..10).sum
 # => 55
 ```
 
 Words count using methods
 
 ```ruby
-rdd = $sc.text_file(PATH)
+# Content:
+# "first line"
+# "second line"
+rdd = sc.text_file(PATH)
 
+# ["first", "line", "second", "line"]
 rdd = rdd.flat_map(lambda{|line| line.split})
-         .map(lambda{|word| [word, 1]})
-         .reduce_by_key(lambda{|a, b| a+b})
 
+# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
+rdd = rdd.map(lambda{|word| [word, 1]})
+
+# [["first", 1], ["line", 2], ["second", 1]]
+rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
+
+# {"first"=>1, "line"=>2, "second"=>1}
 rdd.collect_as_hash
 ```
 
-Estimating pi with a custom serializer
+Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -168,18 +221,32 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
+Estimating PI
+
+```ruby
+rdd = sc.parallelize([10_000], 1)
+rdd = rdd.add_library('bigdecimal/math')
+rdd = rdd.map(lambda{|x| BigMath.PI(x)})
+rdd.collect # => #<BigDecimal, '0.31415926...'>
+```
+
 Linear regression
 
 ```ruby
-Spark::Mllib.import
+# Import Mllib classes into Object
+# Otherwise they are accessible via Spark::Mllib::LinearRegressionWithSGD
+Spark::Mllib.import(Object)
 
+# Training data
 data = [
   LabeledPoint.new(0.0, [0.0]),
   LabeledPoint.new(1.0, [1.0]),
   LabeledPoint.new(3.0, [2.0]),
   LabeledPoint.new(2.0, [3.0])
 ]
-lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
+
+# Train a model
+lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
 
 lrm.predict([0.0])
 ```
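If you prefer not to import into `Object`, the new comment in the hunk above implies the fully qualified constants still work; a hedged sketch of that variant:

```ruby
# Assumption based on the comment above: without Spark::Mllib.import(Object),
# the same classes remain reachable under the Spark::Mllib namespace.
data = [
  Spark::Mllib::LabeledPoint.new(0.0, [0.0]),
  Spark::Mllib::LabeledPoint.new(1.0, [1.0])
]
lrm = Spark::Mllib::LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
lrm.predict([0.0])
```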
data/TODO.md CHANGED
@@ -1,7 +1,6 @@
-- add compress
 - refactor JavaBridge: to_java, from_java
 - add Streaming
 - add SQL
-- autobatch serializer
 - worker informations (time, memory, ...)
-- encoding parameter to context.text_file
+- killing zombie workers
+- global config to ~/.ruby-spark.conf (e.g. target folder for spark)
data/benchmark/{performance → comparison}/ RENAMED
6 files (prepare.sh, python.py, r.r, ruby.rb, run-all.sh, scala.scala) moved without content changes
data/example/pi.rb CHANGED
@@ -22,7 +22,7 @@ def map(_)
   end
 end
 
-rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
+rdd = Spark.context.parallelize(1..n, slices)
 rdd = rdd.map(method(:map))
 
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
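The per-RDD `serializer: 'oj'` option is gone; under the new API the default serializer comes from configuration. A minimal sketch of keeping Oj as the default (the keys are taken from the README changes above):

```ruby
# Equivalent effect via global configuration instead of a per-RDD option.
Spark.config do
  set 'spark.ruby.serializer', 'oj'
  set 'spark.ruby.serializer.batch_size', 100
end
Spark.start
```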
data/example/website_search.rb ADDED
@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+# Parse a sitemap and search for a word on every page
+
+require 'optparse'
+require 'open-uri'
+require 'nokogiri'
+require 'ruby-spark'
+
+options = {
+  sitemap: 'http://fit.cvut.cz/sitemap.xml',
+  query: 'cvut',
+  workers: 2
+}
+
+opt_parser = OptionParser.new do |opts|
+  opts.banner = 'Usage: website_search.rb [options]'
+
+  opts.separator ''
+  opts.separator 'Specific options:'
+
+  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
+    options[:sitemap] = sitemap
+  end
+
+  opts.on('-q', '--query QUERY', 'Query to search') do |query|
+    options[:query] = query
+  end
+
+  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
+    options[:workers] = workers
+  end
+
+  opts.on('--quiet', 'Run quietly') do |v|
+    Spark.logger.disabled
+  end
+
+  opts.on_tail('-h', '--help', 'Show this message') do
+    puts opts
+    exit
+  end
+end
+
+opt_parser.parse!
+
+@links = []
+
+def parse_sitemap(doc)
+  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
+    next_doc = Nokogiri::HTML(open(loc.text))
+    parse_sitemap(next_doc)
+  end
+
+  doc.xpath('//url/loc').each do |loc|
+    @links << loc.text
+  end
+end
+
+doc = Nokogiri::HTML(open(options[:sitemap]))
+parse_sitemap(doc)
+
+# Map function
+func = Proc.new do |url|
+  begin
+    open(url) {|f|
+      [url, f.read.scan(query).count]
+    }
+  rescue
+    [url, 0]
+  end
+end
+
+Spark.start
+
+rdd = Spark.sc.parallelize(@links, options[:workers])
+          .add_library('open-uri')
+          .bind(query: options[:query])
+          .map(func)
+          .sort_by(lambda{|(_, value)| value}, false)
+
+rdd.collect.each do |(url, count)|
+  puts "#{url} => #{count}"
+end
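A hypothetical invocation of the example above (the flags come from its OptionParser block; the sitemap URL and query are placeholders):

```
$ ruby website_search.rb --sitemap http://example.com/sitemap.xml --query ruby --workers 4
```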
data/ext/spark/src/main/scala/RubyRDD.scala CHANGED
@@ -290,9 +290,21 @@ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev)
 
 object RubyRDD extends Logging {
 
+  def runJob(
+      sc: SparkContext,
+      rdd: JavaRDD[Array[Byte]],
+      partitions: ArrayList[Int],
+      allowLocal: Boolean,
+      filename: String): String = {
+    type ByteArray = Array[Byte]
+    type UnrolledPartition = Array[ByteArray]
+    val allPartitions: Array[UnrolledPartition] =
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
+    writeRDDToFile(flattenedPartition.iterator, filename)
+  }
+
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
-    // Too slow
-    // val file = new DataInputStream(new FileInputStream(filename))
     val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
     val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
     try {
@@ -308,6 +320,22 @@ object RubyRDD extends Logging {
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
 
+  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
+    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
+
+    try {
+      PythonRDD.writeIteratorToStream(items, file)
+    } finally {
+      file.close()
+    }
+
+    filename
+  }
+
+  def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
+    writeRDDToFile(rdd.collect.iterator, filename)
+  }
+
   def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
     sc.broadcast(new RubyBroadcast(path, id))
   }
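The new `runJob` materializes job results into a file (via `PythonRDD.writeIteratorToStream`) rather than streaming them back over a socket. A hedged Ruby-side sketch of reading such a file: the 4-byte big-endian length framing mirrors the PySpark protocol, but the exact format and end-of-stream handling are assumptions (the gem's real reader lives in `lib/spark/ext/io.rb`, also changed in this release):

```ruby
# Hedged sketch: consume length-prefixed records as assumed to be
# written by PythonRDD.writeIteratorToStream.
filename = 'rdd_result.bin'  # hypothetical path passed to runJob
records = []

File.open(filename, 'rb') do |file|
  until file.eof?
    header = file.read(4)
    size = header.unpack('l>').first  # 4-byte signed big-endian length
    break if size.nil? || size < 0    # assumed end-of-stream marker
    records << file.read(size)
  end
end
```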
data/lib/spark.rb CHANGED
@@ -190,9 +190,9 @@ require 'spark/version'
 require 'spark/error'
 
 # Make sure that Spark be always stopped
-Kernel::at_exit do
+Kernel.at_exit do
   begin
-    Spark.stop
+    Spark.started? && Spark.stop
   rescue
   end
 end
data/lib/spark/build.rb CHANGED
@@ -30,7 +30,7 @@ module Spark
   cmd = [SBT]
   cmd << SBT_EXT
   cmd << SBT_DEPS unless only_ext
-  cmd << SBT_CLEAN unless $debug
+  cmd << SBT_CLEAN unless $DEBUG
 
   Dir.chdir(Spark.spark_ext_dir) do
     unless Kernel.system(env, cmd.join(' '))
data/lib/spark/cli.rb CHANGED
@@ -21,7 +21,7 @@ module Spark
   program :version, Spark::VERSION
   program :description, 'Ruby wrapper for Spark'
 
-  global_option('-d', '--debug', 'Logging message to stdout'){ $debug = true }
+  global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
   default_command :help
 
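Both this change and the matching one in `build.rb` swap the gem's ad-hoc `$debug` global for Ruby's built-in `$DEBUG` flag, which the interpreter itself sets under `ruby -d`. A small sketch of the effect (the SBT command strings are paraphrased placeholders, not the gem's constants):

```ruby
# After the change, either entry point enables debug behavior:
#   $ ruby-spark build -d   # CLI global_option sets $DEBUG = true
#   $ ruby -d script.rb     # interpreter sets $DEBUG = true
cmd = ['sbt', 'assembly']
cmd << 'clean' unless $DEBUG  # keep intermediate artifacts when debugging
puts cmd.join(' ')
```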