ruby-spark 1.0.0 → 1.1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
data/spec/lib/statistic_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
RSpec::shared_examples 'a stats' do |workers|
|
3
|
+
RSpec.shared_examples 'a stats' do |workers|
|
4
4
|
let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }
|
5
5
|
|
6
6
|
context "with #{workers || 'default'} worker" do
|
@@ -29,23 +29,23 @@ RSpec::shared_examples 'a stats' do |workers|
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
RSpec::shared_examples 'a histogram' do |workers|
|
32
|
+
RSpec.shared_examples 'a histogram' do |workers|
|
33
33
|
|
34
34
|
context "with #{workers || 'default'} worker" do
|
35
35
|
it 'empty' do
|
36
|
-
rdd = $sc.parallelize([], workers,
|
36
|
+
rdd = $sc.parallelize([], workers, ser)
|
37
37
|
|
38
38
|
expect( rdd.histogram([0, 10])[1] ).to eq([0])
|
39
39
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
|
40
40
|
end
|
41
41
|
|
42
42
|
it 'validation' do
|
43
|
-
rdd = $sc.parallelize([], workers,
|
43
|
+
rdd = $sc.parallelize([], workers, ser)
|
44
44
|
expect { rdd.histogram(0) }.to raise_error(ArgumentError)
|
45
45
|
end
|
46
46
|
|
47
47
|
it 'double' do
|
48
|
-
rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers,
|
48
|
+
rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)
|
49
49
|
buckets, counts = rdd.histogram(2)
|
50
50
|
|
51
51
|
expect(buckets).to eq([1.0, 2.5, 4.0])
|
@@ -53,91 +53,91 @@ RSpec::shared_examples 'a histogram' do |workers|
|
|
53
53
|
end
|
54
54
|
|
55
55
|
it 'out of range' do
|
56
|
-
rdd = $sc.parallelize([10.01, -0.01], workers,
|
56
|
+
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
|
57
57
|
|
58
58
|
expect( rdd.histogram([0, 10])[1] ).to eq([0])
|
59
59
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
|
60
60
|
end
|
61
61
|
|
62
62
|
it 'in range with one bucket' do
|
63
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
63
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
64
64
|
|
65
65
|
expect( rdd.histogram([0, 10])[1] ).to eq([4])
|
66
66
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])
|
67
67
|
end
|
68
68
|
|
69
69
|
it 'in range with one bucket exact match' do
|
70
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
70
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
71
71
|
expect( rdd.histogram([1, 4])[1] ).to eq([4])
|
72
72
|
end
|
73
73
|
|
74
74
|
it 'out of range with two buckets' do
|
75
|
-
rdd = $sc.parallelize([10.01, -0.01], workers,
|
75
|
+
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
|
76
76
|
expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])
|
77
77
|
end
|
78
78
|
|
79
79
|
it 'out of range with two uneven buckets' do
|
80
|
-
rdd = $sc.parallelize([10.01, -0.01], workers,
|
80
|
+
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
|
81
81
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
|
82
82
|
end
|
83
83
|
|
84
84
|
it 'in range with two buckets' do
|
85
|
-
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers,
|
85
|
+
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
|
86
86
|
expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
|
87
87
|
end
|
88
88
|
|
89
89
|
it 'in range with two bucket and nil' do
|
90
|
-
rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers,
|
90
|
+
rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)
|
91
91
|
expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
|
92
92
|
end
|
93
93
|
|
94
94
|
it 'in range with two uneven buckets' do
|
95
|
-
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers,
|
95
|
+
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
|
96
96
|
expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])
|
97
97
|
end
|
98
98
|
|
99
99
|
it 'mixed range with two uneven buckets' do
|
100
|
-
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers,
|
100
|
+
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)
|
101
101
|
expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3])
|
102
102
|
end
|
103
103
|
|
104
104
|
it 'mixed range with four uneven buckets' do
|
105
|
-
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers,
|
105
|
+
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)
|
106
106
|
expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
|
107
107
|
end
|
108
108
|
|
109
109
|
it 'mixed range with uneven buckets and NaN' do
|
110
|
-
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers,
|
110
|
+
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)
|
111
111
|
expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
|
112
112
|
end
|
113
113
|
|
114
114
|
it 'out of range with infinite buckets' do
|
115
|
-
rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers,
|
115
|
+
rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)
|
116
116
|
expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])
|
117
117
|
end
|
118
118
|
|
119
119
|
it 'without buckets' do
|
120
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
120
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
121
121
|
expect( rdd.histogram(1) ).to eq([[1, 4], [4]])
|
122
122
|
end
|
123
123
|
|
124
124
|
it 'without buckets single element' do
|
125
|
-
rdd = $sc.parallelize([1], workers,
|
125
|
+
rdd = $sc.parallelize([1], workers, ser)
|
126
126
|
expect( rdd.histogram(1) ).to eq([[1, 1], [1]])
|
127
127
|
end
|
128
128
|
|
129
129
|
it 'without bucket no range' do
|
130
|
-
rdd = $sc.parallelize([1, 1, 1, 1], workers,
|
130
|
+
rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)
|
131
131
|
expect( rdd.histogram(1) ).to eq([[1, 1], [4]])
|
132
132
|
end
|
133
133
|
|
134
134
|
it 'without buckets basic two' do
|
135
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
135
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
136
136
|
expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])
|
137
137
|
end
|
138
138
|
|
139
139
|
it 'without buckets with more requested than elements' do
|
140
|
-
rdd = $sc.parallelize([1, 2], workers,
|
140
|
+
rdd = $sc.parallelize([1, 2], workers, ser)
|
141
141
|
buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
|
142
142
|
hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
|
143
143
|
|
@@ -145,7 +145,7 @@ RSpec::shared_examples 'a histogram' do |workers|
|
|
145
145
|
end
|
146
146
|
|
147
147
|
it 'string' do
|
148
|
-
rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers,
|
148
|
+
rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)
|
149
149
|
|
150
150
|
expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])
|
151
151
|
expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])
|
@@ -155,7 +155,9 @@ RSpec::shared_examples 'a histogram' do |workers|
|
|
155
155
|
end
|
156
156
|
end
|
157
157
|
|
158
|
-
RSpec::describe Spark::RDD do
|
158
|
+
RSpec.describe Spark::RDD do
|
159
|
+
let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }
|
160
|
+
|
159
161
|
context '.stats' do
|
160
162
|
it_behaves_like 'a stats', 1
|
161
163
|
it_behaves_like 'a stats', rand(2..5)
|
data/spec/spec_helper.rb
CHANGED
@@ -14,8 +14,7 @@ Spark::Mllib.import
|
|
14
14
|
def spark_start
|
15
15
|
Spark.logger.disable
|
16
16
|
Spark.config do
|
17
|
-
set 'spark.ruby.
|
18
|
-
set 'spark.ruby.batch_size', 100
|
17
|
+
set 'spark.ruby.serializer.batch_size', 100
|
19
18
|
end
|
20
19
|
Spark.start
|
21
20
|
$sc = Spark.context
|
metadata
CHANGED
@@ -1,15 +1,113 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-spark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ondřej Moravčík
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-05-
|
11
|
+
date: 2015-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: sourcify
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.6.0.rc4
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.6.0.rc4
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: method_source
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: commander
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nio4r
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: distribution
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rjb
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
13
111
|
- !ruby/object:Gem::Dependency
|
14
112
|
name: bundler
|
15
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -56,21 +154,22 @@ files:
|
|
56
154
|
- TODO.md
|
57
155
|
- benchmark/aggregate.rb
|
58
156
|
- benchmark/bisect.rb
|
157
|
+
- benchmark/comparison/prepare.sh
|
158
|
+
- benchmark/comparison/python.py
|
159
|
+
- benchmark/comparison/r.r
|
160
|
+
- benchmark/comparison/ruby.rb
|
161
|
+
- benchmark/comparison/run-all.sh
|
162
|
+
- benchmark/comparison/scala.scala
|
59
163
|
- benchmark/custom_marshal.rb
|
60
164
|
- benchmark/digest.rb
|
61
165
|
- benchmark/enumerator.rb
|
62
|
-
- benchmark/performance/prepare.sh
|
63
|
-
- benchmark/performance/python.py
|
64
|
-
- benchmark/performance/r.r
|
65
|
-
- benchmark/performance/ruby.rb
|
66
|
-
- benchmark/performance/run-all.sh
|
67
|
-
- benchmark/performance/scala.scala
|
68
166
|
- benchmark/serializer.rb
|
69
167
|
- benchmark/sort.rb
|
70
168
|
- benchmark/sort2.rb
|
71
169
|
- benchmark/take.rb
|
72
170
|
- bin/ruby-spark
|
73
171
|
- example/pi.rb
|
172
|
+
- example/website_search.rb
|
74
173
|
- ext/ruby_c/extconf.rb
|
75
174
|
- ext/ruby_c/murmur.c
|
76
175
|
- ext/ruby_c/murmur.h
|
@@ -155,13 +254,16 @@ files:
|
|
155
254
|
- lib/spark/rdd.rb
|
156
255
|
- lib/spark/sampler.rb
|
157
256
|
- lib/spark/serializer.rb
|
257
|
+
- lib/spark/serializer/auto_batched.rb
|
158
258
|
- lib/spark/serializer/base.rb
|
259
|
+
- lib/spark/serializer/batched.rb
|
159
260
|
- lib/spark/serializer/cartesian.rb
|
261
|
+
- lib/spark/serializer/compressed.rb
|
160
262
|
- lib/spark/serializer/marshal.rb
|
161
263
|
- lib/spark/serializer/message_pack.rb
|
162
264
|
- lib/spark/serializer/oj.rb
|
163
265
|
- lib/spark/serializer/pair.rb
|
164
|
-
- lib/spark/serializer/utf8.rb
|
266
|
+
- lib/spark/serializer/text.rb
|
165
267
|
- lib/spark/sort.rb
|
166
268
|
- lib/spark/stat_counter.rb
|
167
269
|
- lib/spark/storage_level.rb
|
@@ -245,7 +347,7 @@ rubyforge_project:
|
|
245
347
|
rubygems_version: 2.2.2
|
246
348
|
signing_key:
|
247
349
|
specification_version: 4
|
248
|
-
summary: Ruby wrapper for Spark
|
350
|
+
summary: Ruby wrapper for Apache Spark
|
249
351
|
test_files:
|
250
352
|
- spec/generator.rb
|
251
353
|
- spec/inputs/lorem_300.txt
|
@@ -1,25 +0,0 @@
|
|
1
|
-
module Spark
|
2
|
-
module Serializer
|
3
|
-
##
|
4
|
-
# Used for file
|
5
|
-
#
|
6
|
-
# File is sended as String but worker use serialization
|
7
|
-
#
|
8
|
-
class UTF8 < Base
|
9
|
-
|
10
|
-
def set(*)
|
11
|
-
unbatch!
|
12
|
-
self
|
13
|
-
end
|
14
|
-
|
15
|
-
def batched?
|
16
|
-
false
|
17
|
-
end
|
18
|
-
|
19
|
-
def load_next_from_io(io, lenght)
|
20
|
-
io.read(lenght).force_encoding(Encoding::UTF_8)
|
21
|
-
end
|
22
|
-
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|