ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
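Most of the churn in this release sits under `data/lib/spark/serializer/`: `base.rb` shrinks sharply while new `batched.rb`, `auto_batched.rb`, `compressed.rb`, and `text.rb` serializers appear and `utf8.rb` is dropped. The gem's actual 1.1.0 serializer API is not reproduced in this diff, so the following is only an illustrative Ruby sketch of the batching idea those filenames suggest (a wrapper that encodes records in fixed-size groups); every class name in it is hypothetical.

```ruby
# Illustrative sketch only - not ruby-spark's actual classes. A batched serializer
# wraps a base serializer and encodes records in fixed-size groups, so data can be
# shipped batch-by-batch instead of one object per record.
class MarshalSerializer
  def dump(data)
    Marshal.dump(data)
  end

  def load(data)
    Marshal.load(data)
  end
end

class BatchedSerializer
  def initialize(serializer, batch_size = 1024)
    @serializer = serializer
    @batch_size = batch_size
  end

  # Returns an array of independently decodable byte strings, one per batch.
  def dump(items)
    items.each_slice(@batch_size).map { |batch| @serializer.dump(batch) }
  end

  def load(batches)
    batches.flat_map { |batch| @serializer.load(batch) }
  end
end

batched = BatchedSerializer.new(MarshalSerializer.new, 2)
p batched.load(batched.dump([1, 2, 3, 4, 5]))  # => [1, 2, 3, 4, 5]
```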
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 64125882e5773c705d62c737e57a481d7bc8bb71
+  data.tar.gz: 4dd464e678a79c9e2d5655fc5b0aae813c2a6346
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d
+  data.tar.gz: 4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13
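For reference, a minimal sketch of how the published SHA512 sums above could be checked against a locally downloaded gem; the filename `ruby-spark-1.1.0.1.gem` is an assumed local path, and the approach simply relies on a .gem being a tar archive whose top-level entries include `metadata.gz` and `data.tar.gz`.

```ruby
# Hedged sketch: compare the SHA512 digests listed in checksums.yaml with a local gem.
require 'digest'
require 'rubygems/package'

EXPECTED_SHA512 = {
  'metadata.gz' => '5d11709879c2ce1d1c1a7153dafc7b85908d51227b864f276ef1bfb257eb239ae9f6216054bb3a0e601c6df23ec756fb60df70793113b282fc8db1e73b7b9b5d',
  'data.tar.gz' => '4452699e228a9f4196a3afc583a9f6ea792ca41864524ab2b029aef04abb53d662c9ce538f8190add24a528605cca3d3fcc2f6c99e49f8ae33836df0a0181a13'
}

File.open('ruby-spark-1.1.0.1.gem', 'rb') do |io|   # hypothetical local path
  Gem::Package::TarReader.new(io).each do |entry|
    expected = EXPECTED_SHA512[entry.full_name] or next
    status = Digest::SHA512.hexdigest(entry.read) == expected ? 'OK' : 'MISMATCH'
    puts "#{entry.full_name}: #{status}"
  end
end
```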
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -20,6 +20,12 @@ file.flat_map(:split)
 
 ## Installation
 
+### Requirments
+
+- Java 7+
+- Ruby 2+
+- MRI or JRuby
+
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -38,33 +44,34 @@ Or install it yourself as:
 $ gem install ruby-spark
 ```
 
-
+Run `rake compile` if you are using gem from local filesystem.
 
-
+### Build Apache Spark
+
+This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more informations check [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target,
 
-```
-$ rake compile
-```
-Then build Spark, [SBT](ext/spark/build.sbt) is used for compiling.
 ```
 $ ruby-spark build
 ```
 
 ## Usage
 
-You can use Ruby Spark via interactive shell
+You can use Ruby Spark via interactive shell (Pry is used)
 
 ```
-$ ruby-spark
+$ ruby-spark shell
 ```
 
 Or on existing project
 
 ```ruby
 require 'ruby-spark'
+
+# Create a SparkContext
 Spark.start
 
-
+# Context reference
+Spark.sc
 ```
 
 If you want configure Spark first. See [configurations](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
@@ -72,36 +79,47 @@ If you want configure Spark first. See [configurations](https://github.com/ondra
 ```ruby
 require 'ruby-spark'
 
+# Use if you have custom SPARK_HOME
 Spark.load_lib(spark_home)
+
+# Configuration
 Spark.config do
   set_app_name "RubySpark"
-  set 'spark.ruby.batch_size', 100
   set 'spark.ruby.serializer', 'oj'
+  set 'spark.ruby.serializer.batch_size', 100
 end
+
+# Start Apache Spark
 Spark.start
+```
+
+Finally, to stop the cluster. On the shell is Spark stopped automatically when you exist.
 
-
+```ruby
+Spark.stop
 ```
 
-## Uploading a data
 
-
+
+## Creating RDD (upload data)
+
+Single text file:
 
 ```ruby
-
+rdd = sc.text_file(FILE, workers_num, serializer=nil)
 ```
 
-All files on directory
+All files on directory:
 
 ```ruby
-
+rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
 ```
 
-Direct
+Direct uploading structures from ruby (choosen serializer must be able to serialize it):
 
 ```ruby
-
-
+rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
+rdd = sc.parallelize(1..5, workers_num, serializer=nil)
 ```
 
 ### Options
@@ -113,39 +131,74 @@ $sc.parallelize(1..5, workers_num, custom_options)
 <i>(This value can be overwriten by spark)</i>
 </dd>
 
-<dt>
+<dt>serializer</dt>
 <dd>
-
-<b>
-<br>
-<i>(Available only for parallelize)</i><br>
-<b>use</b>: <i>direct (upload direct to java)</i>, <i>file (upload throught a file)</i>
+Custom serializer.<br>
+<i>(default: by <b>spark.ruby.serializer</b> options)</i>
 </dd>
 </dl>
 
+## Operations
+
+All operations can be divided into 2 groups:
+
+- **Transformations:** append new operation to current RDD and return new
+- **Actions:** add operation and start calculations
+
+See [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
+
+#### Transformations
+
+```ruby
+rdd.map(lambda{|item| ...})
+rdd.flat_map(lambda{|item| ...})
+rdd.filter(lambda{|item| ...})
+rdd.union(rdd)
+rdd.map_paritions(lambda{|iterator| ...})
+# ...
+```
+
+#### Actions
+
+```ruby
+rdd.count
+rdd.take(n)
+rdd.collect
+# ...
+```
+
 
 ## Examples
 
 Sum of numbers
 
 ```ruby
-
+sc.parallelize(0..10).sum
 # => 55
 ```
 
 Words count using methods
 
 ```ruby
-
+# Content:
+# "first line"
+# "second line"
+rdd = sc.text_file(PATH)
 
+# ["first", "line", "second", "line"]
 rdd = rdd.flat_map(lambda{|line| line.split})
-  .map(lambda{|word| [word, 1]})
-  .reduce_by_key(lambda{|a, b| a+b})
 
+# [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
+rdd = rdd.map(lambda{|word| [word, 1]})
+
+# [["first", 1], ["line", 2], ["second", 1]]
+rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
+
+# {"first"=>1, "line"=>2, "second"=>1}
 rdd.collect_as_hash
 ```
 
-Estimating
+Estimating PI with a custom serializer
 
 ```ruby
 slices = 3
@@ -168,18 +221,32 @@ rdd = rdd.map(method(:map))
 puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
 ```
 
+Estimating PI
+
+```ruby
+rdd = sc.parallelize([10_000], 1)
+rdd = rdd.add_library('bigdecimal/math')
+rdd = rdd.map(lambda{|x| BigMath.PI(x)})
+rdd.collect # => #<BigDecimal, '0.31415926...'>
+```
+
 Linear regression
 
 ```ruby
-
+# Import Mllib classes into Object
+# Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD
+Spark::Mllib.import(Object)
 
+# Training data
 data = [
   LabeledPoint.new(0.0, [0.0]),
   LabeledPoint.new(1.0, [1.0]),
   LabeledPoint.new(3.0, [2.0]),
   LabeledPoint.new(2.0, [3.0])
 ]
-
+
+# Train a model
+lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
 
 lrm.predict([0.0])
 ```
data/TODO.md
CHANGED
@@ -1,7 +1,6 @@
-- add compress
 - refactor JavaBridge: to_java, from_java
 - add Streaming
 - add SQL
-- autobatch serializer
 - worker informations (time, memory, ...)
--
+- killing zombie workers
+- global config to ~/.ruby-spark.conf (e.g. target folder for spark)
data/benchmark/{performance → comparison}/prepare.sh
File without changes
data/benchmark/{performance → comparison}/python.py
File without changes
data/benchmark/{performance → comparison}/r.r
File without changes
data/benchmark/{performance → comparison}/ruby.rb
File without changes
data/benchmark/{performance → comparison}/run-all.sh
File without changes
data/benchmark/{performance → comparison}/scala.scala
File without changes
data/example/pi.rb
CHANGED
data/example/website_search.rb
ADDED
@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+# Parse sitemap and search word on every page
+
+require 'optparse'
+require 'open-uri'
+require 'nokogiri'
+require 'ruby-spark'
+
+options = {
+  sitemap: 'http://fit.cvut.cz/sitemap.xml',
+  query: 'cvut',
+  workers: 2
+}
+
+opt_parser = OptionParser.new do |opts|
+  opts.banner = 'Usage: website_search.rb [options]'
+
+  opts.separator ''
+  opts.separator 'Specific options:'
+
+  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
+    options[:sitemap] = sitemap
+  end
+
+  opts.on('-q', '--query QUERY', 'Query to search') do |query|
+    options[:query] = query
+  end
+
+  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
+    options[:workers] = workers
+  end
+
+  opts.on('--quite', 'Run quitely') do |v|
+    Spark.logger.disabled
+  end
+
+  opts.on_tail('-h', '--help', 'Show this message') do
+    puts opts
+    exit
+  end
+end
+
+opt_parser.parse!
+
+@links = []
+
+def parse_sitemap(doc)
+  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
+    next_doc = Nokogiri::HTML(open(loc.text))
+    parse_sitemap(next_doc)
+  end
+
+  doc.xpath('//url/loc').each do |loc|
+    @links << loc.text
+  end
+end
+
+doc = Nokogiri::HTML(open(options[:sitemap]))
+parse_sitemap(doc)
+
+# Map function
+func = Proc.new do |url|
+  begin
+    open(url) {|f|
+      [url, f.read.scan(query).count]
+    }
+  rescue
+    [url, 0]
+  end
+end
+
+Spark.start
+
+rdd = Spark.sc.parallelize(@links, options[:workers])
+          .add_library('open-uri')
+          .bind(query: options[:query])
+          .map(func)
+          .sort_by(lambda{|(_, value)| value}, false)
+
+rdd.collect.each do |(url, count)|
+  puts "#{url} => #{count}"
+end
data/ext/spark/src/main/scala/RubyRDD.scala
CHANGED
@@ -290,9 +290,21 @@ class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev)
 
 object RubyRDD extends Logging {
 
+  def runJob(
+      sc: SparkContext,
+      rdd: JavaRDD[Array[Byte]],
+      partitions: ArrayList[Int],
+      allowLocal: Boolean,
+      filename: String): String = {
+    type ByteArray = Array[Byte]
+    type UnrolledPartition = Array[ByteArray]
+    val allPartitions: Array[UnrolledPartition] =
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+    val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
+    writeRDDToFile(flattenedPartition.iterator, filename)
+  }
+
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = {
-    // Too slow
-    // val file = new DataInputStream(new FileInputStream(filename))
     val file = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))
     val objs = new collection.mutable.ArrayBuffer[Array[Byte]]
     try {
@@ -308,6 +320,22 @@ object RubyRDD extends Logging {
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
 
+  def writeRDDToFile[T](items: Iterator[T], filename: String): String = {
+    val file = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))
+
+    try {
+      PythonRDD.writeIteratorToStream(items, file)
+    } finally {
+      file.close()
+    }
+
+    filename
+  }
+
+  def writeRDDToFile[T](rdd: RDD[T], filename: String): String = {
+    writeRDDToFile(rdd.collect.iterator, filename)
+  }
+
   def readBroadcastFromFile(sc: JavaSparkContext, path: String, id: java.lang.Long): Broadcast[RubyBroadcast] = {
     sc.broadcast(new RubyBroadcast(path, id))
   }
data/lib/spark.rb
CHANGED
data/lib/spark/build.rb
CHANGED
data/lib/spark/cli.rb
CHANGED
@@ -21,7 +21,7 @@ module Spark
   program :version, Spark::VERSION
   program :description, 'Ruby wrapper for Spark'
 
-  global_option('-d', '--debug', 'Logging message to stdout'){ $
+  global_option('-d', '--debug', 'Logging message to stdout'){ $DEBUG = true }
   default_command :help
 
 