ruby-spark 1.0.0 → 1.1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 13c074c360edf1875745bf728b274f5bfb6e0d0a
-  data.tar.gz: a59ba09fac91e2e4c0a58aab99dd60ffb2ec5e3f
+  metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
+  data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
 SHA512:
-  metadata.gz: 2e667dabd55b05100831cf3d0e58044941ce965965de1d6dce9a4e8fa5be843bfac1d57d5b6674ea056b9e9395650b84e3e77a86f0a256a1b1c4bfd8ca257340
-  data.tar.gz: 690b88857fa4f841c8c0a5940af75d926285f42a81ff6753337ccc926473a4a3ce510d1c7e9b07ce2ebfd2559c05dd77432cf1dba132f15629669fbf8dfc51b6
+  metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
+  data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
data/.gitignore CHANGED
@@ -32,6 +32,6 @@ mkmf.log
 ext/spark/target/*
 ext/spark/project/target/*
 ext/spark/project/project/target/*
-wiki/*
+wiki
 /benchmark/performance/spark/*
 /benchmark/performance/rspark/*
data/README.md CHANGED
@@ -20,6 +20,12 @@ file.flat_map(:split)
 
 ## Installation
 
+### Requirments
+
+- Java 7+
+- Ruby 2+
+- MRI or JRuby
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -38,33 +44,34 @@ Or install it yourself as:
 $ gem install ruby-spark
 ```
 
-### Install Apache Spark
+Run `rake compile` if you are using gem from local filesystem.
 
-To install latest supported Spark. First compile native extension:
+### Build Apache Spark
+
+This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target,
 
-```
-$ rake compile
-```
-Then build Spark, [SBT](ext/spark/build.sbt) is used for compiling.
 ```
 $ ruby-spark build
 ```
 
 ## Usage
 
-You can use Ruby Spark via interactive shell
+You can use Ruby Spark via interactive shell (Pry is used)
 
 ```
-$ ruby-spark pry
+$ ruby-spark shell
 ```
 
 Or on existing project
 
 ```ruby
 require 'ruby-spark'
+
+# Create a SparkContext
 Spark.start
 
-Spark.sc # => context
+# Context reference
+Spark.sc
 ```
 
 If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
@@ -72,36 +79,47 @@ If you want configure Spark first. See [configurations](https://github.com/ondra
 ```ruby
 require 'ruby-spark'
 
+# Use if you have custom SPARK_HOME
 Spark.load_lib(spark_home)
+
+# Configuration
 Spark.config do
   set_app_name "RubySpark"
-  set 'spark.ruby.batch_size', 100
   set 'spark.ruby.serializer', 'oj'
+  set 'spark.ruby.serializer.batch_size', 100
 end
+
+# Start Apache Spark
 Spark.start
+```
+
+Finally, to stop the cluster. On the shell is Spark stopped automatically when you exist.
 
-Spark.sc # => context
+```ruby
+Spark.stop
 ```
 
-## Uploading a data
 
-Single file
+
+## Creating RDD (upload data)
+
+Single text file:
 
 ```ruby
-$sc.text_file(FILE, workers_num, custom_options)
+rdd = sc.text_file(FILE, workers_num, serializer=nil)
 ```
 
-All files on directory
+All files on directory:
 
 ```ruby
-$sc.whole_text_files(DIRECTORY, workers_num, custom_options)
+rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct
+Direct uploading structures from ruby (choosen serializer must be able to serialize it):
 
 ```ruby
-$sc.parallelize([1,2,3,4,5], workers_num, custom_options)
-$sc.parallelize(1..5, workers_num, custom_options)
+rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
+rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
 ### Options
@@ -113,39 +131,74 @@ $sc.parallelize(1..5, workers_num, custom_options)
   <i>(This value can be overwriten by spark)</i>
   </dd>
 
-  <dt>custom_options</dt>
+  <dt>serializer</dt>
   <dd>
-    <b>serializer</b>: name of serializator used for this RDD<br>
-    <b>batch_size</b>: see configuration<br>
-    <br>
-    <i>(Available only for parallelize)</i><br>
-    <b>use</b>: <i>direct (upload direct to java)</i>, <i>file (upload throught a file)</i>
+    Custom serializer.<br>
+    <i>(default: by <b>spark.ruby.serializer</b> options)</i>
   </dd>
 </dl>
 
+## Operations
+
+All operations can be divided into 2 groups:
+
+- **Transformations:** append new operation to current RDD and return new
+- **Actions:** add operation and start calculations
+
+See [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+
+#### Transformations
+
+```ruby
+rdd.map(lambda{|item| ...})
+rdd.flat_map(lambda{|item| ...})
+rdd.filter(lambda{|item| ...})
+rdd.union(rdd)
+rdd.map_paritions(lambda{|iterator| ...})
+# ...
+```
+
+#### Actions
+
+```ruby
+rdd.count
+rdd.take(n)
+rdd.collect
+# ...
+```
+
 
 ## Examples
 
 Sum of numbers
 
 ```ruby
-$sc.parallelize(0..10).sum
+sc.parallelize(0..10).sum
 # => 55
 ```
 
 Words count using methods
 
 ```ruby
-rdd = $sc.text_file(PATH)
+# Content:
+# "first line"
+# "second line"
+rdd = sc.text_file(PATH)
 
+# ["first", "line", "second", "line"]
 rdd = rdd.flat_map(lambda{|line| line.split})
-         .map(lambda{|word| [word, 1]})
-         .reduce_by_key(lambda{|a, b| a+b})
 
+# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
+rdd = rdd.map(lambda{|word| [word, 1]})
+
+# [["first", 1], ["line", 2], ["second", 1]]
+rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
+
+# {"first"=>1, "line"=>2, "second"=>1}
 rdd.collect_as_hash
 ```
 
-Estimating pi with a custom serializer
+Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -168,18 +221,32 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
+Estimating PI
+
+```ruby
+rdd = sc.parallelize([10_000], 1)
+rdd = rdd.add_library('bigdecimal/math')
+rdd = rdd.map(lambda{|x| BigMath.PI(x)})
+rdd.collect # => #<BigDecimal, '0.31415926...'>
+```
+
 Linear regression
 
 ```ruby
-Spark::Mllib.import
+# Import Mllib classes into Object
+# Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD
+Spark::Mllib.import(Object)
 
+# Training data
 data = [
   LabeledPoint.new(0.0, [0.0]),
   LabeledPoint.new(1.0, [1.0]),
  LabeledPoint.new(3.0, [2.0]),
  LabeledPoint.new(2.0, [3.0])
 ]
-lrm = LinearRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
+
+# Train a model
+lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
 
 lrm.predict([0.0])
 ```
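
For readers skimming the diff: the README changes above replace the global `$sc` with `sc`, move batching under the `spark.ruby.serializer.batch_size` key, and document an explicit `Spark.stop`. A minimal end-to-end sketch assembled from the documented snippets; the input path `words.txt` is a hypothetical placeholder, and the `oj` serializer is assumed to be available as the README suggests:

```ruby
require 'ruby-spark'

# Configuration keys as documented in the updated README
Spark.config do
  set_app_name 'RubySpark'
  set 'spark.ruby.serializer', 'oj'
  set 'spark.ruby.serializer.batch_size', 100
end

Spark.start
sc = Spark.sc

# Word count, mirroring the README example ('words.txt' is a placeholder path)
counts = sc.text_file('words.txt')
           .flat_map(lambda{|line| line.split})
           .map(lambda{|word| [word, 1]})
           .reduce_by_key(lambda{|a, b| a + b})
           .collect_as_hash

puts counts.inspect
Spark.stop
```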
data/TODO.md CHANGED
@@ -1,7 +1,6 @@
-- add compress
 - refactor JavaBridge: to_java, from_java
 - add Streaming
 - add SQL
-- autobatch serializer
 - worker informations (time, memory, ...)
-- encoding parameter to context.text_file
+- killing zombie workers
+- global config to ~/.ruby-spark.conf (e.g. target folder for spark)
data/benchmark/{performance → comparison}/* RENAMED (files without changes)
data/example/pi.rb CHANGED
@@ -22,7 +22,7 @@ def map(_)
   end
 end
 
-rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
+rdd = Spark.context.parallelize(1..n, slices)
 rdd = rdd.map(method(:map))
 
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
data/example/website_search.rb ADDED
@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+# Parse sitemap and search word on every page
+
+require 'optparse'
+require 'open-uri'
+require 'nokogiri'
+require 'ruby-spark'
+
+options = {
+  sitemap: 'http://fit.cvut.cz/sitemap.xml',
+  query: 'cvut',
+  workers: 2
+}
+
+opt_parser = OptionParser.new do |opts|
+  opts.banner = 'Usage: website_search.rb [options]'
+
+  opts.separator ''
+  opts.separator 'Specific options:'
+
+  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
+    options[:sitemap] = sitemap
+  end
+
+  opts.on('-q', '--query QUERY', 'Query to search') do |query|
+    options[:query] = query
+  end
+
+  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
+    options[:workers] = workers
+  end
+
+  opts.on('--quite', 'Run quitely') do |v|
+    Spark.logger.disabled
+  end
+
+  opts.on_tail('-h', '--help', 'Show this message') do
+    puts opts
+    exit
+  end
+end
+
+opt_parser.parse!
+
+@links = []
+
+def parse_sitemap(doc)
+  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
+    next_doc = Nokogiri::HTML(open(loc.text))
+    parse_sitemap(next_doc)
+  end
+
+  doc.xpath('//url/loc').each do |loc|
+    @links << loc.text
+  end
+end
+
+doc = Nokogiri::HTML(open(options[:sitemap]))
+parse_sitemap(doc)
+
+# Map function
+func = Proc.new do |url|
+  begin
+    open(url) {|f|
+      [url, f.read.scan(query).count]
+    }
+  rescue
+    [url, 0]
+  end
+end
+
+Spark.start
+
+rdd = Spark.sc.parallelize(@links, options[:workers])
+          .add_library('open-uri')
+          .bind(query: options[:query])
+          .map(func)
+          .sort_by(lambda{|(_, value)| value}, false)
+
+rdd.collect.each do |(url, count)|
+  puts "#{url} => #{count}"
+end
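
The new example above is self-contained. Assuming `nokogiri` is installed and the gem's Spark build has been run, it can be invoked roughly like this (the flags come from the OptionParser block in the diff):

```
$ ruby website_search.rb --sitemap http://fit.cvut.cz/sitemap.xml --query cvut --workers 2
```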
data/ext/spark/src/main/scala/RubyRDD.scala CHANGED
@@ -290,9 +290,21 @@ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev)
 
 object RubyRDD extends Logging {
 
+  def runJob(
+      sc: SparkContext,
+      rdd: JavaRDD[Array[Byte]],
+      partitions: ArrayList[Int],
+      allowLocal: Boolean,
+      filename: String): String = {
+    type ByteArray = Array[Byte]
+    type UnrolledPartition = Array[ByteArray]
+    val allPartitions: Array[UnrolledPartition] =
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
+    writeRDDToFile(flattenedPartition.iterator, filename)
+  }
+
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
-    // Too slow
-    // val file = new DataInputStream(new FileInputStream(filename))
     val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
     val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
     try {
@@ -308,6 +320,22 @@ object RubyRDD extends Logging {
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
 
+  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
+    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
+
+    try {
+      PythonRDD.writeIteratorToStream(items, file)
+    } finally {
+      file.close()
+    }
+
+    filename
+  }
+
+  def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
+    writeRDDToFile(rdd.collect.iterator, filename)
+  }
+
   def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
     sc.broadcast(new RubyBroadcast(path, id))
   }
data/lib/spark.rb CHANGED
@@ -190,9 +190,9 @@ require 'spark/version'
 require 'spark/error'
 
 # Make sure that Spark be always stopped
-Kernel::at_exit do
+Kernel.at_exit do
   begin
-    Spark.stop
+    Spark.started? && Spark.stop
   rescue
   end
 end
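
The exit hook is now guarded by `Spark.started?`, so requiring the gem without ever starting a context no longer triggers a shutdown attempt at exit. A minimal sketch of the same guard pattern in application code, assuming (as the diff implies) that `Spark.started?` is false until `Spark.start` has been called:

```ruby
require 'ruby-spark'

at_exit do
  begin
    # Only tear down a context that was actually started
    Spark.stop if Spark.started?
  rescue StandardError
    # errors raised during interpreter shutdown are intentionally ignored
  end
end

Spark.start
# ... work with Spark.sc ...
```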
data/lib/spark/build.rb CHANGED
@@ -30,7 +30,7 @@ module Spark
       cmd = [SBT]
       cmd << SBT_EXT
       cmd << SBT_DEPS unless only_ext
-      cmd << SBT_CLEAN unless $debug
+      cmd << SBT_CLEAN unless $DEBUG
 
       Dir.chdir(Spark.spark_ext_dir) do
         unless Kernel.system(env, cmd.join(' '))
data/lib/spark/cli.rb CHANGED
@@ -21,7 +21,7 @@ module Spark
       program :version, Spark::VERSION
       program :description, 'Ruby wrapper for Spark'
 
-      global_option('-d', '--debug', 'Logging message to stdout'){ $debug = true }
+      global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
       default_command :help
 
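
Both build.rb and cli.rb switch from a custom `$debug` global to Ruby's built-in `$DEBUG` flag, which the interpreter also sets when run with `-d`. A small illustrative sketch of the resulting behaviour; the command array below is a stand-in, not the gem's actual SBT constants:

```ruby
# $DEBUG becomes true when Ruby is started with `ruby -d`, or after the
# CLI's --debug switch above assigns it explicitly.
cmd = ['sbt', 'assembly']      # stand-in for [SBT, SBT_EXT, ...]
cmd << 'clean' unless $DEBUG   # skip the clean step while debugging
puts cmd.join(' ')
```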