ruby-spark 1.1.0.1 → 1.2.0

Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
data/spec/generator.rb
@@ -18,7 +18,7 @@ class Generator
   def self.lines(size=1000, letters=3)
     Array.new(size) do
       Array.new(rand(50..100)){
-        (97+rand(letters)).chr + (" " * (rand(10) == 0 ? 1 : 0))
+        (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0))
       }.join
     end
   end
data/spec/lib/collect_spec.rb
@@ -1,40 +1,40 @@
-require "spec_helper"
+require 'spec_helper'
 
-RSpec::describe Spark::RDD do
+RSpec.describe Spark::RDD do
 
   let(:mapping) { lambda{|x| [x, 1]} }
   let(:numbers) { Generator.numbers }
-
-  it ".collect_as_hash" do
+
+  it '.collect_as_hash' do
     rdd = $sc.parallelize(numbers)
     rdd = rdd.map(mapping)
 
     expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)])
   end
 
-  context ".take" do
+  context '.take' do
     let(:size) { 1000 }
     let(:numbers) { Generator.numbers(size) }
     let(:rdd) { $sc.parallelize(numbers) }
 
-    it "nothing" do
+    it 'nothing' do
       expect(rdd.take(0)).to eql([])
     end
 
-    it "first" do
+    it 'first' do
       expect(rdd.first).to eql(numbers.first)
     end
 
-    it "less than limit" do
+    it 'less than limit' do
      _size = size / 2
      expect(rdd.take(_size)).to eql(numbers.take(_size))
     end
 
-    it "all" do
+    it 'all' do
      expect(rdd.take(size)).to eql(numbers)
     end
 
-    it "more than limit" do
+    it 'more than limit' do
      expect(rdd.take(size*2)).to eql(numbers)
     end
   end
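
Most of the spec churn above is mechanical (double-quoted strings to single quotes, `RSpec::describe` to `RSpec.describe`), but the collect/take specs also document the basic RDD workflow. A minimal standalone sketch of that workflow; `Spark.start` and `Spark.sc` follow the gem's README and the `$sc` global is presumably set up in spec_helper, so treat the setup lines as assumptions — the RDD calls themselves are taken from the specs:

```ruby
require 'ruby-spark'

# Assumed setup; the specs use an equivalent context exposed as $sc.
Spark.start
sc = Spark.sc

rdd = sc.parallelize(1..5)
rdd = rdd.map(lambda{|x| [x, x * x]})   # pair RDD, as in the .collect_as_hash spec

rdd.collect_as_hash   # => {1=>1, 2=>4, 3=>9, 4=>16, 5=>25}
rdd.take(2)           # => [[1, 1], [2, 4]]
rdd.first             # => [1, 1]
```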
data/spec/lib/config_spec.rb
@@ -1,6 +1,6 @@
-require "spec_helper"
+require 'spec_helper'
 
-RSpec::describe Spark::Config do
+RSpec.describe Spark::Config do
 
   before(:context) do
     Spark.stop
@@ -10,17 +10,17 @@ RSpec::describe Spark::Config do
     spark_start
   end
 
-  it "should be stopped" do
+  it 'should be stopped' do
     expect(Spark.started?).to be_falsy
   end
 
-  context "new config" do
+  context 'new config' do
 
     let(:configuration) do
       {
-        "test.test1" => "test1",
-        "test.test2" => "test2",
-        "test.test3" => "test3"
+        'test.test1' => 'test1',
+        'test.test2' => 'test2',
+        'test.test3' => 'test3'
       }
     end
 
@@ -28,7 +28,7 @@ RSpec::describe Spark::Config do
       Spark.clear_config
     end
 
-    it "throught methods" do
+    it 'throught methods' do
       configuration.each do |key, value|
         Spark.config.set(key, value)
       end
@@ -38,7 +38,7 @@ RSpec::describe Spark::Config do
       end
     end
 
-    it "throught hash style" do
+    it 'throught hash style' do
       configuration.each do |key, value|
         Spark.config[key] = value
       end
@@ -48,7 +48,7 @@ RSpec::describe Spark::Config do
      end
    end
 
-    it "throught dsl" do
+    it 'throught dsl' do
      configuration.each do |key, value|
        Spark.config {
          set key, value
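
The config specs above exercise three equivalent ways of setting options before the context starts: the `set` method, hash-style assignment, and the block DSL. A short sketch combining them; the option keys here are standard Spark settings used only for illustration, not values taken from the diff:

```ruby
require 'ruby-spark'

# Method style
Spark.config.set('spark.app.name', 'ruby-spark example')

# Hash style
Spark.config['spark.master'] = 'local[2]'

# DSL style
Spark.config do
  set 'spark.executor.memory', '512m'
end

Spark.start
```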
data/spec/lib/context_spec.rb
@@ -46,120 +46,121 @@ RSpec.describe Spark::Context do
     )
   end
 
-  context '.accumulator' do
-    it 'test' do
-      accum1 = $sc.accumulator(0,)
-      accum2 = $sc.accumulator(1, :*, 1)
-      accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
-
-      accum1 += 1
-
-      accum2.add(2)
-      accum2.add(2)
-      accum2.add(2)
-
-      accum3.add(9)
-      accum3.add(6)
-      accum3.add(7)
-
-      expect(accum1.value).to eql(1)
-      expect(accum2.value).to eql(8)
-      expect(accum3.value).to eql(9)
-
-      func = Proc.new do |_, index|
-        accum1.add(1)
-        accum2.add(2)
-        accum3.add(index * 10)
-      end
-
-      rdd = $sc.parallelize(0..4, 4)
-      rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
-      rdd = rdd.map_partitions_with_index(func)
-      rdd.collect
-
-      # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-      sleep(1)
-
-      expect(accum1.value).to eql(5)
-      expect(accum2.value).to eql(128)
-      expect(accum3.value).to eql(30)
-    end
-
-    context 'accum param' do
-      it 'symbol' do
-        accum1 = $sc.accumulator(1, :+, 0)
-        accum2 = $sc.accumulator(5, :-, 3)
-        accum3 = $sc.accumulator(1, :*, 1)
-        accum4 = $sc.accumulator(1.0, :/, 1.0)
-        accum5 = $sc.accumulator(2, :**, 2)
-
-        func = Proc.new do |_|
-          accum1.add(1)
-          accum2.add(1)
-          accum3.add(2)
-          accum4.add(2)
-          accum5.add(2)
-        end
-
-        rdd = $sc.parallelize(0..4, 2)
-        rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
-        rdd = rdd.map_partitions(func)
-        rdd.collect
-
-        # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-        sleep(1)
-
-        expect(accum1.value).to eq(3)
-        expect(accum2.value).to eq(1)
-        expect(accum3.value).to eq(4)
-        expect(accum4.value).to eq(4)
-        expect(accum5.value).to eq(65536)
-      end
-
-      it 'proc' do
-        accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
-        accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
-        accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
-
-        func = Proc.new do |_|
-          accum1.add(1)
-          accum2.add('a')
-          accum3.add(1)
-        end
-
-        rdd = $sc.parallelize(0..4, 2)
-        rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
-        rdd = rdd.map_partitions(func)
-        rdd.collect
-
-        # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-        sleep(1)
-
-        expect(accum1.value).to eq(3)
-        expect(accum2.value).to eq('aaa')
-        expect(accum3.value).to eq([[1], [1]])
-      end
-
-      it 'string' do
-        expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
-
-        accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
-
-        func = Proc.new do |_|
-          accum.add(1)
-        end
-
-        rdd = $sc.parallelize(0..4, 2)
-        rdd = rdd.bind(accum: accum)
-        rdd = rdd.map_partitions(func)
-        rdd.collect
-
-        # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
-        sleep(1)
-
-        expect(accum.value).to eq(3)
-      end
-    end
-  end
+  # context '.accumulator' do
+
+  #   it 'test' do
+  #     accum1 = $sc.accumulator(0,)
+  #     accum2 = $sc.accumulator(1, :*, 1)
+  #     accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
+
+  #     accum1 += 1
+
+  #     accum2.add(2)
+  #     accum2.add(2)
+  #     accum2.add(2)
+
+  #     accum3.add(9)
+  #     accum3.add(6)
+  #     accum3.add(7)
+
+  #     expect(accum1.value).to eql(1)
+  #     expect(accum2.value).to eql(8)
+  #     expect(accum3.value).to eql(9)
+
+  #     func = Proc.new do |_, index|
+  #       accum1.add(1)
+  #       accum2.add(2)
+  #       accum3.add(index * 10)
+  #     end
+
+  #     rdd = $sc.parallelize(0..4, 4)
+  #     rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+  #     rdd = rdd.map_partitions_with_index(func)
+  #     rdd.collect
+
+  #     # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #     sleep(1)
+
+  #     expect(accum1.value).to eql(5)
+  #     expect(accum2.value).to eql(128)
+  #     expect(accum3.value).to eql(30)
+  #   end
+
+  #   context 'accum param' do
+  #     it 'symbol' do
+  #       accum1 = $sc.accumulator(1, :+, 0)
+  #       accum2 = $sc.accumulator(5, :-, 3)
+  #       accum3 = $sc.accumulator(1, :*, 1)
+  #       accum4 = $sc.accumulator(1.0, :/, 1.0)
+  #       accum5 = $sc.accumulator(2, :**, 2)
+
+  #       func = Proc.new do |_|
+  #         accum1.add(1)
+  #         accum2.add(1)
+  #         accum3.add(2)
+  #         accum4.add(2)
+  #         accum5.add(2)
+  #       end
+
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+
+  #       expect(accum1.value).to eq(3)
+  #       expect(accum2.value).to eq(1)
+  #       expect(accum3.value).to eq(4)
+  #       expect(accum4.value).to eq(4)
+  #       expect(accum5.value).to eq(65536)
+  #     end
+
+  #     it 'proc' do
+  #       accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
+  #       accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
+  #       accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
+
+  #       func = Proc.new do |_|
+  #         accum1.add(1)
+  #         accum2.add('a')
+  #         accum3.add(1)
+  #       end
+
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+
+  #       expect(accum1.value).to eq(3)
+  #       expect(accum2.value).to eq('aaa')
+  #       expect(accum3.value).to eq([[1], [1]])
+  #     end
+
+  #     it 'string' do
+  #       expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
+
+  #       accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
+
+  #       func = Proc.new do |_|
+  #         accum.add(1)
+  #       end
+
+  #       rdd = $sc.parallelize(0..4, 2)
+  #       rdd = rdd.bind(accum: accum)
+  #       rdd = rdd.map_partitions(func)
+  #       rdd.collect
+
+  #       # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+  #       sleep(1)
+
+  #       expect(accum.value).to eq(3)
+  #     end
+  #   end
+  # end
 
 end
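
The entire `.accumulator` context is disabled in 1.2.0; only the Stack Overflow link about the Ruby-server/JVM-client deadlock is left as a hint why. For reference, a minimal sketch of the accumulator API those specs exercise, lifted from the commented-out code above; since the specs no longer run, treat the worker-side behaviour as an assumption:

```ruby
# accumulator(initial_value, accum_param, zero_value): accum_param may be a
# symbol (:+, :*, ...), a lambda, or a string containing a lambda.
accum = $sc.accumulator(0, :+, 0)

# Driver-side updates
accum.add(2)
accum += 3
accum.value   # => 5

# Worker-side updates: attach the accumulator to the RDD via #bind first
func = Proc.new do |_|
  accum.add(1)
end

rdd = $sc.parallelize(0..4, 2)
rdd = rdd.bind(accum: accum)
rdd = rdd.map_partitions(func)
rdd.collect

# The disabled specs sleep(1) here because updates arrive asynchronously.
accum.value   # => 7 (one update per partition), following the pattern in the specs
```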
data/spec/lib/ext_spec.rb
@@ -1,18 +1,18 @@
-require "spec_helper"
+require 'spec_helper'
 
 RSpec.describe Array do
 
-  it ".deep_copy" do
-    data = ["a", "b", "c"]
+  it '.deep_copy' do
+    data = ['a', 'b', 'c']
     new_data = data.dup
 
-    data[0] << "a"
+    data[0] << 'a'
 
     expect(data).to eql(new_data)
 
     new_data = data.deep_copy
 
-    data[1] << "b"
+    data[1] << 'b'
 
     expect(data).to_not eql(new_data)
   end
@@ -21,19 +21,19 @@ end
 
 RSpec.describe Hash do
 
-  it ".stringify_keys!" do
+  it '.stringify_keys!' do
     data = {
-      a: "a",
-      b: "b",
-      c: "c"
+      a: 'a',
+      b: 'b',
+      c: 'c'
     }
 
     data.stringify_keys!
 
     expect(data).to eql({
-      "a" => "a",
-      "b" => "b",
-      "c" => "c"
+      'a' => 'a',
+      'b' => 'b',
+      'c' => 'c'
     })
   end
 
@@ -41,9 +41,9 @@ end
 
 RSpec.describe String do
 
-  it ".camelize" do
-    data = "aaa_bbb_ccc".camelize
-    expect(data).to eql("AaaBbbCcc")
+  it '.camelize' do
+    data = 'aaa_bbb_ccc'.camelize
+    expect(data).to eql('AaaBbbCcc')
   end
 
 end
@@ -56,14 +56,14 @@ RSpec.describe IO do
 
     file.write_int(1)
     file.write_long(2)
-    file.write_string("3")
+    file.write_string('3')
     file.write_data([4])
 
     file.rewind
 
     expect(file.read_int).to eq(1)
     expect(file.read_long).to eq(2)
-    expect(file.read_string).to eq("3")
+    expect(file.read_string).to eq('3')
     expect(file.read_data).to eq([4])
 
     file.unlink
data/spec/lib/external_apps_spec.rb
@@ -1,6 +1,6 @@
 require 'spec_helper'
 
-RSpec::describe Spark::RDD do
+RSpec.describe Spark::RDD do
 
   context '.pipe' do
     let(:words) { Generator.words }
data/spec/lib/filter_spec.rb
@@ -1,12 +1,12 @@
-require "spec_helper"
+require 'spec_helper'
 
 def func4(item)
-  item.start_with?("a") && item.size > 3 && item[1].to_s.ord > 106
+  item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106
 end
 
-RSpec::shared_examples "a filtering" do |workers|
+RSpec.shared_examples 'a filtering' do |workers|
   context "with #{workers || 'default'} worker" do
-    it "when numbers" do
+    it 'when numbers' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.filter(func1)
       result = numbers.select(&func1)
@@ -20,7 +20,7 @@ RSpec::shared_examples "a filtering" do |workers|
       expect(rdd3.collect).to eql([])
     end
 
-    it "when words" do
+    it 'when words' do
       rdd2 = rdd_words(workers)
       rdd2 = rdd2.filter(func3)
       result = words.select{|x| func3.call(x)}
@@ -36,12 +36,12 @@ RSpec::shared_examples "a filtering" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD.filter" do
+RSpec.describe 'Spark::RDD.filter' do
   let(:func1) { lambda{|x| x.to_i.even?} }
   let(:func2) { lambda{|x| x.to_i.odd?} }
-  let(:func3) { lambda{|x| x.to_s.start_with?("b")} }
+  let(:func3) { lambda{|x| x.to_s.start_with?('b')} }
 
-  context "throught parallelize" do
+  context 'throught parallelize' do
     let(:numbers) { Generator.numbers_with_zero }
     let(:words) { Generator.words }
 
@@ -53,14 +53,14 @@ RSpec::describe "Spark::RDD.filter" do
      $sc.parallelize(words, workers)
    end
 
-    it_behaves_like "a filtering", nil
-    it_behaves_like "a filtering", 1
-    it_behaves_like "a filtering", rand(2..10)
+    it_behaves_like 'a filtering', 2
+    # it_behaves_like 'a filtering', nil
+    # it_behaves_like 'a filtering', rand(2..10)
   end
 
-  context "throught text_file" do
-    let(:file_numbers) { File.join("spec", "inputs", "numbers_0_100.txt") }
-    let(:file_words) { File.join("spec", "inputs", "lorem_300.txt") }
+  context 'throught text_file' do
+    let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
+    let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') }
 
     let(:numbers) { File.readlines(file_numbers).map(&:strip) }
     let(:words) { File.readlines(file_words).map(&:strip) }
@@ -73,8 +73,8 @@ RSpec::describe "Spark::RDD.filter" do
      $sc.text_file(file_words, workers)
    end
 
-    it_behaves_like "a filtering", nil
-    it_behaves_like "a filtering", 1
-    it_behaves_like "a filtering", rand(2..10)
+    it_behaves_like 'a filtering', 2
+    # it_behaves_like 'a filtering', nil
+    # it_behaves_like 'a filtering', rand(2..10)
   end
 end
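
Beyond the quote-style cleanup, the filter specs now pin the shared examples to 2 workers instead of cycling through nil, 1 and a random count. A minimal filter sketch in the same spirit, assuming the usual `$sc` context; `filter` and `collect` are the methods the specs above exercise:

```ruby
rdd = $sc.parallelize(0..10, 2)            # 2 partitions, matching the pinned worker count
even = rdd.filter(lambda{|x| x.even?})
even.collect                               # => [0, 2, 4, 6, 8, 10]

# The predicate is serialized and shipped to the Ruby workers, so anything it
# needs from the driver has to be attached explicitly, as the accumulator
# specs do with #bind.
```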