ruby-spark 1.0.0 → 1.1.0.1
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
+  data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
+  data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
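For reference, checksums.yaml inside a .gem records digests of the package's two inner archives, which is exactly what changed above. A sketch of checking the new values by hand (the member names are the standard gem layout; the fetch step assumes network access to rubygems.org):

```
$ gem fetch ruby-spark --version 1.1.0.1
$ tar -xf ruby-spark-1.1.0.1.gem data.tar.gz metadata.gz
$ sha1sum data.tar.gz      # should print 4dd464e6... as listed above
$ sha512sum data.tar.gz metadata.gz
```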
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -20,6 +20,12 @@ file.flat_map(:split)
 
 ## Installation
 
+### Requirements
+
+- Java 7+
+- Ruby 2+
+- MRI or JRuby
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -38,33 +44,34 @@ Or install it yourself as:
 $ gem install ruby-spark
 ```
 
-
+Run `rake compile` if you are using the gem from the local filesystem.
 
-
+### Build Apache Spark
+
+This command downloads Spark and builds extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information, check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target.
 
-```
-$ rake compile
-```
-Then build Spark, [SBT](ext/spark/build.sbt) is used for compiling.
 ```
 $ ruby-spark build
 ```
 
 ## Usage
 
-You can use Ruby Spark via interactive shell
+You can use Ruby Spark via the interactive shell (Pry is used)
 
 ```
-$ ruby-spark
+$ ruby-spark shell
 ```
 
 Or in an existing project
 
 ```ruby
 require 'ruby-spark'
+
+# Create a SparkContext
 Spark.start
 
-
+# Context reference
+Spark.sc
 ```
 
 If you want to configure Spark first, see [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
@@ -72,36 +79,47 @@ If you want to configure Spark first, see [configurations](https://github.com/ondra
 ```ruby
 require 'ruby-spark'
 
+# Use if you have a custom SPARK_HOME
 Spark.load_lib(spark_home)
+
+# Configuration
 Spark.config do
   set_app_name "RubySpark"
-  set 'spark.ruby.batch_size', 100
   set 'spark.ruby.serializer', 'oj'
+  set 'spark.ruby.serializer.batch_size', 100
 end
+
+# Start Apache Spark
 Spark.start
+```
+
+Finally, stop the cluster. In the shell, Spark is stopped automatically when you exit.
 
-
+```ruby
+Spark.stop
 ```
 
-## Uploading a data
 
-
+
+## Creating an RDD (uploading data)
+
+Single text file:
 
 ```ruby
-
+rdd = sc.text_file(FILE, workers_num, serializer=nil)
 ```
 
-All files on directory
+All files in a directory:
 
 ```ruby
-
+rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct
+Directly uploading structures from Ruby (the chosen serializer must be able to serialize them):
 
 ```ruby
-
-
+rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
+rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
 ### Options
@@ -113,39 +131,74 @@ $sc.parallelize(1..5, workers_num, custom_options)
 <i>(This value can be overwritten by Spark)</i>
 </dd>
 
-<dt>
+<dt>serializer</dt>
 <dd>
-
-<b>
-<br>
-<i>(Available only for parallelize)</i><br>
-<b>use</b>: <i>direct (upload direct to java)</i>, <i>file (upload throught a file)</i>
+Custom serializer.<br>
+<i>(default: by <b>spark.ruby.serializer</b> options)</i>
 </dd>
 </dl>
 
+## Operations
+
+All operations can be divided into 2 groups:
+
+- **Transformations:** append a new operation to the current RDD and return a new one
+- **Actions:** add an operation and start the calculation
+
+See the [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+
+#### Transformations
+
+```ruby
+rdd.map(lambda{|item| ...})
+rdd.flat_map(lambda{|item| ...})
+rdd.filter(lambda{|item| ...})
+rdd.union(rdd)
+rdd.map_partitions(lambda{|iterator| ...})
+# ...
+```
+
+#### Actions
+
+```ruby
+rdd.count
+rdd.take(n)
+rdd.collect
+# ...
+```
+
 
 ## Examples
 
 Sum of numbers
 
 ```ruby
-
+sc.parallelize(0..10).sum
 # => 55
 ```
 
 Word count using methods
 
 ```ruby
-
+# Content:
+# "first line"
+# "second line"
 rdd = sc.text_file(PATH)
+
+# ["first", "line", "second", "line"]
 rdd = rdd.flat_map(lambda{|line| line.split})
-         .map(lambda{|word| [word, 1]})
-         .reduce_by_key(lambda{|a, b| a+b})
 
+# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
+rdd = rdd.map(lambda{|word| [word, 1]})
+
+# [["first", 1], ["line", 2], ["second", 1]]
+rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
+
+# {"first"=>1, "line"=>2, "second"=>1}
 rdd.collect_as_hash
 ```
 
-Estimating
+Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -168,18 +221,32 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
+Estimating PI
+
+```ruby
+rdd = sc.parallelize([10_000], 1)
+rdd = rdd.add_library('bigdecimal/math')
+rdd = rdd.map(lambda{|x| BigMath.PI(x)})
+rdd.collect # => #<BigDecimal, '0.31415926...'>
+```
+
 Linear regression
 
 ```ruby
-
+# Import MLlib classes into Object
+# Otherwise they are accessible via Spark::Mllib::LinearRegressionWithSGD
+Spark::Mllib.import(Object)
 
+# Training data
 data = [
   LabeledPoint.new(0.0, [0.0]),
   LabeledPoint.new(1.0, [1.0]),
   LabeledPoint.new(3.0, [2.0]),
   LabeledPoint.new(2.0, [3.0])
 ]
-
+
+# Train a model
+lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
 
 lrm.predict([0.0])
 ```
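Taken together, the README changes above describe one coherent flow: configure, start, build an RDD, chain transformations, and trigger an action. A minimal end-to-end sketch assembled only from calls documented in this diff ('input.txt' is a placeholder path):

```ruby
require 'ruby-spark'

Spark.config do
  set_app_name "RubySpark"
  set 'spark.ruby.serializer', 'oj'
  set 'spark.ruby.serializer.batch_size', 100
end

Spark.start
sc = Spark.sc

# Transformations are lazy; the action (collect_as_hash) runs the job.
counts = sc.text_file('input.txt')
           .flat_map(lambda{|line| line.split})
           .map(lambda{|word| [word, 1]})
           .reduce_by_key(lambda{|a, b| a + b})
           .collect_as_hash

puts counts.inspect
Spark.stop
```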
data/TODO.md
CHANGED
@@ -1,7 +1,6 @@
-- add compress
 - refactor JavaBridge: to_java, from_java
 - add Streaming
 - add SQL
-- autobatch serializer
 - worker information (time, memory, ...)
--
+- killing zombie workers
+- global config to ~/.ruby-spark.conf (e.g. target folder for spark)
data/benchmark/{performance → comparison}/prepare.sh: File without changes
data/benchmark/{performance → comparison}/python.py: File without changes
data/benchmark/{performance → comparison}/r.r: File without changes
data/benchmark/{performance → comparison}/ruby.rb: File without changes
data/benchmark/{performance → comparison}/run-all.sh: File without changes
data/benchmark/{performance → comparison}/scala.scala: File without changes
data/example/pi.rb
CHANGED
data/example/website_search.rb
ADDED

@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+# Parse a sitemap and search for a word on every page
+
+require 'optparse'
+require 'open-uri'
+require 'nokogiri'
+require 'ruby-spark'
+
+options = {
+  sitemap: 'http://fit.cvut.cz/sitemap.xml',
+  query: 'cvut',
+  workers: 2
+}
+
+opt_parser = OptionParser.new do |opts|
+  opts.banner = 'Usage: website_search.rb [options]'
+
+  opts.separator ''
+  opts.separator 'Specific options:'
+
+  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
+    options[:sitemap] = sitemap
+  end
+
+  opts.on('-q', '--query QUERY', 'Query to search') do |query|
+    options[:query] = query
+  end
+
+  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
+    options[:workers] = workers
+  end
+
+  opts.on('--quiet', 'Run quietly') do |v|
+    Spark.logger.disabled
+  end
+
+  opts.on_tail('-h', '--help', 'Show this message') do
+    puts opts
+    exit
+  end
+end
+
+opt_parser.parse!
+
+@links = []
+
+def parse_sitemap(doc)
+  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
+    next_doc = Nokogiri::HTML(open(loc.text))
+    parse_sitemap(next_doc)
+  end
+
+  doc.xpath('//url/loc').each do |loc|
+    @links << loc.text
+  end
+end
+
+doc = Nokogiri::HTML(open(options[:sitemap]))
+parse_sitemap(doc)
+
+# Map function
+func = Proc.new do |url|
+  begin
+    open(url) {|f|
+      [url, f.read.scan(query).count]
+    }
+  rescue
+    [url, 0]
+  end
+end
+
+Spark.start
+
+rdd = Spark.sc.parallelize(@links, options[:workers])
+              .add_library('open-uri')
+              .bind(query: options[:query])
+              .map(func)
+              .sort_by(lambda{|(_, value)| value}, false)
+
+rdd.collect.each do |(url, count)|
+  puts "#{url} => #{count}"
+end
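For reference, a representative invocation of the example above, using the options its OptionParser defines (the sitemap URL is just the script's default; the query and worker count are arbitrary):

```
$ ruby website_search.rb --sitemap http://fit.cvut.cz/sitemap.xml --query ruby --workers 4
```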
data/ext/spark/src/main/scala/RubyRDD.scala
CHANGED

@@ -290,9 +290,21 @@ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev)
 
 object RubyRDD extends Logging {
 
+  def runJob(
+      sc: SparkContext,
+      rdd: JavaRDD[Array[Byte]],
+      partitions: ArrayList[Int],
+      allowLocal: Boolean,
+      filename: String): String = {
+    type ByteArray = Array[Byte]
+    type UnrolledPartition = Array[ByteArray]
+    val allPartitions: Array[UnrolledPartition] =
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
+    writeRDDToFile(flattenedPartition.iterator, filename)
+  }
+
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
-    // Too slow
-    // val file = new DataInputStream(new FileInputStream(filename))
     val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
     val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
     try {
@@ -308,6 +320,22 @@ object RubyRDD extends Logging {
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
 
+  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
+    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
+
+    try {
+      PythonRDD.writeIteratorToStream(items, file)
+    } finally {
+      file.close()
+    }
+
+    filename
+  }
+
+  def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
+    writeRDDToFile(rdd.collect.iterator, filename)
+  }
+
   def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
     sc.broadcast(new RubyBroadcast(path, id))
   }
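The new runJob entry point collects the requested partitions on the driver, flattens them with Array.concat, and hands the result to writeRDDToFile, which reuses PythonRDD's framed stream format. A standalone sketch of just the unroll-and-concat step, runnable without Spark (toy byte arrays stand in for serialized partition elements):

```scala
object UnrollDemo {
  type ByteArray = Array[Byte]
  type UnrolledPartition = Array[ByteArray]

  def main(args: Array[String]): Unit = {
    // Pretend these came back from sc.runJob: one unrolled Array per partition.
    val allPartitions: Array[UnrolledPartition] = Array(
      Array("a".getBytes, "b".getBytes),
      Array("c".getBytes)
    )

    // The same flattening runJob performs before writing the exchange file.
    val flattened: UnrolledPartition = Array.concat(allPartitions: _*)
    println(flattened.map(new String(_)).mkString(", ")) // a, b, c
  }
}
```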
data/lib/spark.rb
CHANGED
data/lib/spark/build.rb
CHANGED
data/lib/spark/cli.rb
CHANGED
@@ -21,7 +21,7 @@ module Spark
   program :version, Spark::VERSION
   program :description, 'Ruby wrapper for Spark'
 
-  global_option('-d', '--debug', 'Logging message to stdout'){ $
+  global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
   default_command :help
 
 
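The replacement line completes a block that was cut off in this changelog view: the -d/--debug option simply flips Ruby's built-in $DEBUG global. A quick plain-Ruby illustration of its effect, independent of the gem:

```ruby
$DEBUG = true

# With $DEBUG enabled, MRI reports raised exceptions on stderr even when
# they are rescued, which helps when tracing worker failures.
begin
  raise 'probe'
rescue
  # rescued, but still traced on stderr while $DEBUG is true
end
```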