ruby-spark 1.1.0.1 → 1.2.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
data/spec/generator.rb
CHANGED
data/spec/lib/collect_spec.rb
CHANGED
@@ -1,40 +1,40 @@
-require "spec_helper"
+require 'spec_helper'
 
-RSpec::describe Spark::RDD do
+RSpec.describe Spark::RDD do
 
   let(:mapping) { lambda{|x| [x, 1]} }
   let(:numbers) { Generator.numbers }
 
-  it ".collect_as_hash" do
+  it '.collect_as_hash' do
     rdd = $sc.parallelize(numbers)
     rdd = rdd.map(mapping)
 
     expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)])
   end
 
-  context ".take" do
+  context '.take' do
     let(:size) { 1000 }
     let(:numbers) { Generator.numbers(size) }
     let(:rdd) { $sc.parallelize(numbers) }
 
-    it "nothing" do
+    it 'nothing' do
      expect(rdd.take(0)).to eql([])
    end
 
-    it "first" do
+    it 'first' do
      expect(rdd.first).to eql(numbers.first)
    end
 
-    it "less than limit" do
+    it 'less than limit' do
      _size = size / 2
      expect(rdd.take(_size)).to eql(numbers.take(_size))
    end
 
-    it "all" do
+    it 'all' do
      expect(rdd.take(size)).to eql(numbers)
    end
 
-    it "more than limit" do
+    it 'more than limit' do
      expect(rdd.take(size*2)).to eql(numbers)
    end
  end
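For orientation, the API under test reads like plain Ruby once a context exists. A minimal sketch (assuming the gem is loaded with require 'ruby-spark' and a local context; the specs get the same thing as the global $sc from their spec_helper):

require 'ruby-spark'

Spark.start
$sc = Spark.sc

rdd = $sc.parallelize(1..5)

# collect_as_hash expects an RDD of [key, value] pairs
rdd.map(lambda { |x| [x, x * x] }).collect_as_hash
# => {1=>1, 2=>4, 3=>9, 4=>16, 5=>25}

# take(n) returns at most n elements; past the RDD size it returns everything
rdd.take(2)  # => [1, 2]
rdd.take(99) # => [1, 2, 3, 4, 5]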
data/spec/lib/config_spec.rb
CHANGED
@@ -1,6 +1,6 @@
-require "spec_helper"
+require 'spec_helper'
 
-RSpec::describe Spark::Config do
+RSpec.describe Spark::Config do
 
   before(:context) do
     Spark.stop
@@ -10,17 +10,17 @@ RSpec::describe Spark::Config do
     spark_start
   end
 
-  it "should be stopped" do
+  it 'should be stopped' do
     expect(Spark.started?).to be_falsy
   end
 
-  context "new config" do
+  context 'new config' do
 
     let(:configuration) do
       {
-        "test.test1" => "test1",
-        "test.test2" => "test2",
-        "test.test3" => "test3"
+        'test.test1' => 'test1',
+        'test.test2' => 'test2',
+        'test.test3' => 'test3'
       }
     end
 
@@ -28,7 +28,7 @@ RSpec::describe Spark::Config do
      Spark.clear_config
    end
 
-    it "throught methods" do
+    it 'throught methods' do
      configuration.each do |key, value|
        Spark.config.set(key, value)
      end
@@ -38,7 +38,7 @@ RSpec::describe Spark::Config do
      end
    end
 
-    it "throught hash style" do
+    it 'throught hash style' do
      configuration.each do |key, value|
        Spark.config[key] = value
      end
@@ -48,7 +48,7 @@ RSpec::describe Spark::Config do
      end
    end
 
-    it "throught dsl" do
+    it 'throught dsl' do
      configuration.each do |key, value|
        Spark.config {
          set key, value
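The three it blocks pin down the three interchangeable ways Spark::Config accepts settings; note the before(:context) hook stops Spark first, since configuration is applied before start-up. A minimal sketch using the same throwaway keys as the spec:

require 'ruby-spark'

Spark.config.set('test.test1', 'test1')     # plain setter
Spark.config['test.test2'] = 'test2'        # hash style
Spark.config { set 'test.test3', 'test3' }  # block DSL

Spark.start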
data/spec/lib/context_spec.rb
CHANGED
@@ -46,120 +46,121 @@ RSpec.describe Spark::Context do
     )
   end
 
-  context '.accumulator' do
-  [old lines 50-163: the body of these accumulator specs, before being commented out]
+  # context '.accumulator' do
+
+  #   it 'test' do
+  #     accum1 = $sc.accumulator(0,)
+  #     accum2 = $sc.accumulator(1, :*, 1)
+  #     accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
+
+  #     accum1 += 1
+
+  #     accum2.add(2)
+  #     accum2.add(2)
+  #     accum2.add(2)
+
+  #     accum3.add(9)
+  #     accum3.add(6)
+  #     accum3.add(7)
+
+  #     expect(accum1.value).to eql(1)
+  #     expect(accum2.value).to eql(8)
+  #     expect(accum3.value).to eql(9)
+
+  #     func = Proc.new do |_, index|
+  #       accum1.add(1)
+  #       accum2.add(2)
+  #       accum3.add(index * 10)
+  #     end
+
+  #     rdd = $sc.parallelize(0..4, 4)
+  #     rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+  #     rdd = rdd.map_partitions_with_index(func)
+  #     rdd.collect
+
+  #     # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #     sleep(1)
+
+  #     expect(accum1.value).to eql(5)
+  #     expect(accum2.value).to eql(128)
+  #     expect(accum3.value).to eql(30)
+  #   end
+
+  #   context 'accum param' do
+  #     it 'symbol' do
+  #       accum1 = $sc.accumulator(1, :+, 0)
+  #       accum2 = $sc.accumulator(5, :-, 3)
+  #       accum3 = $sc.accumulator(1, :*, 1)
+  #       accum4 = $sc.accumulator(1.0, :/, 1.0)
+  #       accum5 = $sc.accumulator(2, :**, 2)
+
+  #       func = Proc.new do |_|
+  #         accum1.add(1)
+  #         accum2.add(1)
+  #         accum3.add(2)
+  #         accum4.add(2)
+  #         accum5.add(2)
+  #       end
+
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+
+  #       expect(accum1.value).to eq(3)
+  #       expect(accum2.value).to eq(1)
+  #       expect(accum3.value).to eq(4)
+  #       expect(accum4.value).to eq(4)
+  #       expect(accum5.value).to eq(65536)
+  #     end
+
+  #     it 'proc' do
+  #       accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
+  #       accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
+  #       accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
+
+  #       func = Proc.new do |_|
+  #         accum1.add(1)
+  #         accum2.add('a')
+  #         accum3.add(1)
+  #       end
+
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+
+  #       expect(accum1.value).to eq(3)
+  #       expect(accum2.value).to eq('aaa')
+  #       expect(accum3.value).to eq([[1], [1]])
+  #     end
+
+  #     it 'string' do
+  #       expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
+
+  #       accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
+
+  #       func = Proc.new do |_|
+  #         accum.add(1)
+  #       end
+
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum: accum)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+
+  #       expect(accum.value).to eq(3)
+  #     end
+  #   end
+  # end
 
 end
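Although disabled in 1.2.0, the commented specs still document the accumulator API: $sc.accumulator(initial, op, zero_value) takes a symbol, a lambda, or a string of Ruby code (for cases where a lambda cannot be serialized), and an accumulator must be attached with RDD#bind before worker code may call add. A condensed sketch lifted from the 'symbol' example above, assuming a started context in $sc:

accum = $sc.accumulator(1, :+, 0)

func = Proc.new do |_|
  accum.add(1)
end

rdd = $sc.parallelize(0..4, 2)  # two partitions
rdd = rdd.bind(accum: accum)    # ship the accumulator with the job
rdd.map_partitions(func).collect

# Worker updates arrive asynchronously; see the Stack Overflow link above.
sleep(1)

accum.value # => 3 (initial 1, plus one add per partition)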
data/spec/lib/ext_spec.rb
CHANGED
@@ -1,18 +1,18 @@
-require "spec_helper"
+require 'spec_helper'
 
 RSpec.describe Array do
 
-  it ".deep_copy" do
-    data = ["a", "b", "c"]
+  it '.deep_copy' do
+    data = ['a', 'b', 'c']
     new_data = data.dup
 
-    data[0] << "a"
+    data[0] << 'a'
 
     expect(data).to eql(new_data)
 
     new_data = data.deep_copy
 
-    data[1] << "b"
+    data[1] << 'b'
 
     expect(data).to_not eql(new_data)
   end
@@ -21,19 +21,19 @@ end
 
 RSpec.describe Hash do
 
-  it ".stringify_keys!" do
+  it '.stringify_keys!' do
     data = {
-      a: "a",
-      b: "b",
-      c: "c"
+      a: 'a',
+      b: 'b',
+      c: 'c'
     }
 
     data.stringify_keys!
 
     expect(data).to eql({
-      "a" => "a",
-      "b" => "b",
-      "c" => "c"
+      'a' => 'a',
+      'b' => 'b',
+      'c' => 'c'
     })
   end
 
@@ -41,9 +41,9 @@ end
 
 RSpec.describe String do
 
-  it ".camelize" do
-    data = "aaa_bbb_ccc".camelize
-    expect(data).to eql("AaaBbbCcc")
+  it '.camelize' do
+    data = 'aaa_bbb_ccc'.camelize
+    expect(data).to eql('AaaBbbCcc')
   end
 
 end
@@ -56,14 +56,14 @@ RSpec.describe IO do
 
     file.write_int(1)
     file.write_long(2)
-    file.write_string("3")
+    file.write_string('3')
     file.write_data([4])
 
     file.rewind
 
     expect(file.read_int).to eq(1)
     expect(file.read_long).to eq(2)
-    expect(file.read_string).to eq("3")
+    expect(file.read_string).to eq('3')
     expect(file.read_data).to eq([4])
 
     file.unlink
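These specs document ruby-spark's core extensions: Array#deep_copy (a copy whose elements are themselves copied, unlike #dup), Hash#stringify_keys!, String#camelize, and the IO helpers that frame the binary protocol spoken to the JVM worker. A sketch, assuming the gem's extensions are loaded:

require 'ruby-spark'
require 'tempfile'

['a'].dup                 # shallow copy: inner strings are shared
['a'].deep_copy           # deep copy: inner strings are copied too
{ a: 1 }.stringify_keys!  # => {'a' => 1}
'aaa_bbb_ccc'.camelize    # => 'AaaBbbCcc'

file = Tempfile.new('io-spec')
file.write_int(1)
file.write_string('3')
file.rewind
file.read_int    # => 1
file.read_string # => "3"
file.unlink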
data/spec/lib/filter_spec.rb
CHANGED
@@ -1,12 +1,12 @@
-require "spec_helper"
+require 'spec_helper'
 
 def func4(item)
-  item.start_with?("a") && item.size > 3 && item[1].to_s.ord > 106
+  item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106
 end
 
-RSpec::shared_examples "a filtering" do |workers|
+RSpec.shared_examples 'a filtering' do |workers|
   context "with #{workers || 'default'} worker" do
-    it "when numbers" do
+    it 'when numbers' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.filter(func1)
       result = numbers.select(&func1)
@@ -20,7 +20,7 @@ RSpec::shared_examples "a filtering" do |workers|
       expect(rdd3.collect).to eql([])
     end
 
-    it "when words" do
+    it 'when words' do
       rdd2 = rdd_words(workers)
       rdd2 = rdd2.filter(func3)
       result = words.select{|x| func3.call(x)}
@@ -36,12 +36,12 @@ RSpec::shared_examples "a filtering" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD.filter" do
+RSpec.describe 'Spark::RDD.filter' do
   let(:func1) { lambda{|x| x.to_i.even?} }
   let(:func2) { lambda{|x| x.to_i.odd?} }
-  let(:func3) { lambda{|x| x.to_s.start_with?("b")} }
+  let(:func3) { lambda{|x| x.to_s.start_with?('b')} }
 
-  context "throught parallelize" do
+  context 'throught parallelize' do
     let(:numbers) { Generator.numbers_with_zero }
     let(:words) { Generator.words }
@@ -53,14 +53,14 @@ RSpec::describe "Spark::RDD.filter" do
       $sc.parallelize(words, workers)
     end
 
-    it_behaves_like "a filtering", 2
-    it_behaves_like "a filtering", nil
-    it_behaves_like "a filtering", rand(2..10)
+    it_behaves_like 'a filtering', 2
+    # it_behaves_like 'a filtering', nil
+    # it_behaves_like 'a filtering', rand(2..10)
   end
 
-  context "throught text_file" do
-    let(:file_numbers) { File.join("spec", "inputs", "numbers_0_100.txt") }
-    let(:file_words) { File.join("spec", "inputs", "lorem_300.txt") }
+  context 'throught text_file' do
+    let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
+    let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') }
 
     let(:numbers) { File.readlines(file_numbers).map(&:strip) }
     let(:words) { File.readlines(file_words).map(&:strip) }
@@ -73,8 +73,8 @@ RSpec::describe "Spark::RDD.filter" do
       $sc.text_file(file_words, workers)
     end
 
-    it_behaves_like "a filtering", 2
-    it_behaves_like "a filtering", nil
-    it_behaves_like "a filtering", rand(2..10)
+    it_behaves_like 'a filtering', 2
+    # it_behaves_like 'a filtering', nil
+    # it_behaves_like 'a filtering', rand(2..10)
   end
 end
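RDD#filter takes any callable and keeps the elements for which it returns truthy, mirroring Array#select, which is exactly what the shared examples assert. A minimal sketch of that contract, assuming a started context in $sc:

numbers = (0..100).to_a
rdd = $sc.parallelize(numbers, 2) # second argument: number of workers

even = lambda { |x| x.to_i.even? }

# The distributed result matches plain Ruby's Array#select
rdd.filter(even).collect == numbers.select(&even) # => true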