ruby-spark 1.1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
@@ -18,7 +18,7 @@ class Generator
18
18
  def self.lines(size=1000, letters=3)
19
19
  Array.new(size) do
20
20
  Array.new(rand(50..100)){
21
- (97+rand(letters)).chr + (" " * (rand(10) == 0 ? 1 : 0))
21
+ (97+rand(letters)).chr + (' ' * (rand(10) == 0 ? 1 : 0))
22
22
  }.join
23
23
  end
24
24
  end
@@ -1,40 +1,40 @@
1
- require "spec_helper"
1
+ require 'spec_helper'
2
2
 
3
- RSpec::describe Spark::RDD do
3
+ RSpec.describe Spark::RDD do
4
4
 
5
5
  let(:mapping) { lambda{|x| [x, 1]} }
6
6
  let(:numbers) { Generator.numbers }
7
-
8
- it ".collect_as_hash" do
7
+
8
+ it '.collect_as_hash' do
9
9
  rdd = $sc.parallelize(numbers)
10
10
  rdd = rdd.map(mapping)
11
11
 
12
12
  expect(rdd.collect_as_hash).to eql(Hash[numbers.map(&mapping)])
13
13
  end
14
14
 
15
- context ".take" do
15
+ context '.take' do
16
16
  let(:size) { 1000 }
17
17
  let(:numbers) { Generator.numbers(size) }
18
18
  let(:rdd) { $sc.parallelize(numbers) }
19
19
 
20
- it "nothing" do
20
+ it 'nothing' do
21
21
  expect(rdd.take(0)).to eql([])
22
22
  end
23
23
 
24
- it "first" do
24
+ it 'first' do
25
25
  expect(rdd.first).to eql(numbers.first)
26
26
  end
27
27
 
28
- it "less than limit" do
28
+ it 'less than limit' do
29
29
  _size = size / 2
30
30
  expect(rdd.take(_size)).to eql(numbers.take(_size))
31
31
  end
32
32
 
33
- it "all" do
33
+ it 'all' do
34
34
  expect(rdd.take(size)).to eql(numbers)
35
35
  end
36
36
 
37
- it "more than limit" do
37
+ it 'more than limit' do
38
38
  expect(rdd.take(size*2)).to eql(numbers)
39
39
  end
40
40
  end
@@ -1,6 +1,6 @@
1
- require "spec_helper"
1
+ require 'spec_helper'
2
2
 
3
- RSpec::describe Spark::Config do
3
+ RSpec.describe Spark::Config do
4
4
 
5
5
  before(:context) do
6
6
  Spark.stop
@@ -10,17 +10,17 @@ RSpec::describe Spark::Config do
10
10
  spark_start
11
11
  end
12
12
 
13
- it "should be stopped" do
13
+ it 'should be stopped' do
14
14
  expect(Spark.started?).to be_falsy
15
15
  end
16
16
 
17
- context "new config" do
17
+ context 'new config' do
18
18
 
19
19
  let(:configuration) do
20
20
  {
21
- "test.test1" => "test1",
22
- "test.test2" => "test2",
23
- "test.test3" => "test3"
21
+ 'test.test1' => 'test1',
22
+ 'test.test2' => 'test2',
23
+ 'test.test3' => 'test3'
24
24
  }
25
25
  end
26
26
 
@@ -28,7 +28,7 @@ RSpec::describe Spark::Config do
28
28
  Spark.clear_config
29
29
  end
30
30
 
31
- it "throught methods" do
31
+ it 'throught methods' do
32
32
  configuration.each do |key, value|
33
33
  Spark.config.set(key, value)
34
34
  end
@@ -38,7 +38,7 @@ RSpec::describe Spark::Config do
38
38
  end
39
39
  end
40
40
 
41
- it "throught hash style" do
41
+ it 'throught hash style' do
42
42
  configuration.each do |key, value|
43
43
  Spark.config[key] = value
44
44
  end
@@ -48,7 +48,7 @@ RSpec::describe Spark::Config do
48
48
  end
49
49
  end
50
50
 
51
- it "throught dsl" do
51
+ it 'throught dsl' do
52
52
  configuration.each do |key, value|
53
53
  Spark.config {
54
54
  set key, value
@@ -46,120 +46,121 @@ RSpec.describe Spark::Context do
46
46
  )
47
47
  end
48
48
 
49
- context '.accumulator' do
50
- it 'test' do
51
- accum1 = $sc.accumulator(0,)
52
- accum2 = $sc.accumulator(1, :*, 1)
53
- accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
54
-
55
- accum1 += 1
56
-
57
- accum2.add(2)
58
- accum2.add(2)
59
- accum2.add(2)
60
-
61
- accum3.add(9)
62
- accum3.add(6)
63
- accum3.add(7)
64
-
65
- expect(accum1.value).to eql(1)
66
- expect(accum2.value).to eql(8)
67
- expect(accum3.value).to eql(9)
68
-
69
- func = Proc.new do |_, index|
70
- accum1.add(1)
71
- accum2.add(2)
72
- accum3.add(index * 10)
73
- end
74
-
75
- rdd = $sc.parallelize(0..4, 4)
76
- rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
77
- rdd = rdd.map_partitions_with_index(func)
78
- rdd.collect
79
-
80
- # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
81
- sleep(1)
82
-
83
- expect(accum1.value).to eql(5)
84
- expect(accum2.value).to eql(128)
85
- expect(accum3.value).to eql(30)
86
- end
87
-
88
- context 'accum param' do
89
- it 'symbol' do
90
- accum1 = $sc.accumulator(1, :+, 0)
91
- accum2 = $sc.accumulator(5, :-, 3)
92
- accum3 = $sc.accumulator(1, :*, 1)
93
- accum4 = $sc.accumulator(1.0, :/, 1.0)
94
- accum5 = $sc.accumulator(2, :**, 2)
95
-
96
- func = Proc.new do |_|
97
- accum1.add(1)
98
- accum2.add(1)
99
- accum3.add(2)
100
- accum4.add(2)
101
- accum5.add(2)
102
- end
103
-
104
- rdd = $sc.parallelize(0..4, 2)
105
- rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
106
- rdd = rdd.map_partitions(func)
107
- rdd.collect
108
-
109
- # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
110
- sleep(1)
111
-
112
- expect(accum1.value).to eq(3)
113
- expect(accum2.value).to eq(1)
114
- expect(accum3.value).to eq(4)
115
- expect(accum4.value).to eq(4)
116
- expect(accum5.value).to eq(65536)
117
- end
118
-
119
- it 'proc' do
120
- accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
121
- accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
122
- accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
123
-
124
- func = Proc.new do |_|
125
- accum1.add(1)
126
- accum2.add('a')
127
- accum3.add(1)
128
- end
129
-
130
- rdd = $sc.parallelize(0..4, 2)
131
- rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
132
- rdd = rdd.map_partitions(func)
133
- rdd.collect
134
-
135
- # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
136
- sleep(1)
137
-
138
- expect(accum1.value).to eq(3)
139
- expect(accum2.value).to eq('aaa')
140
- expect(accum3.value).to eq([[1], [1]])
141
- end
142
-
143
- it 'string' do
144
- expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
145
-
146
- accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
147
-
148
- func = Proc.new do |_|
149
- accum.add(1)
150
- end
151
-
152
- rdd = $sc.parallelize(0..4, 2)
153
- rdd = rdd.bind(accum: accum)
154
- rdd = rdd.map_partitions(func)
155
- rdd.collect
156
-
157
- # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
158
- sleep(1)
159
-
160
- expect(accum.value).to eq(3)
161
- end
162
- end
163
- end
49
+ # context '.accumulator' do
50
+
51
+ # it 'test' do
52
+ # accum1 = $sc.accumulator(0,)
53
+ # accum2 = $sc.accumulator(1, :*, 1)
54
+ # accum3 = $sc.accumulator(0, lambda{|max, val| val > max ? val : max})
55
+
56
+ # accum1 += 1
57
+
58
+ # accum2.add(2)
59
+ # accum2.add(2)
60
+ # accum2.add(2)
61
+
62
+ # accum3.add(9)
63
+ # accum3.add(6)
64
+ # accum3.add(7)
65
+
66
+ # expect(accum1.value).to eql(1)
67
+ # expect(accum2.value).to eql(8)
68
+ # expect(accum3.value).to eql(9)
69
+
70
+ # func = Proc.new do |_, index|
71
+ # accum1.add(1)
72
+ # accum2.add(2)
73
+ # accum3.add(index * 10)
74
+ # end
75
+
76
+ # rdd = $sc.parallelize(0..4, 4)
77
+ # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
78
+ # rdd = rdd.map_partitions_with_index(func)
79
+ # rdd.collect
80
+
81
+ # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
82
+ # sleep(1)
83
+
84
+ # expect(accum1.value).to eql(5)
85
+ # expect(accum2.value).to eql(128)
86
+ # expect(accum3.value).to eql(30)
87
+ # end
88
+
89
+ # context 'accum param' do
90
+ # it 'symbol' do
91
+ # accum1 = $sc.accumulator(1, :+, 0)
92
+ # accum2 = $sc.accumulator(5, :-, 3)
93
+ # accum3 = $sc.accumulator(1, :*, 1)
94
+ # accum4 = $sc.accumulator(1.0, :/, 1.0)
95
+ # accum5 = $sc.accumulator(2, :**, 2)
96
+
97
+ # func = Proc.new do |_|
98
+ # accum1.add(1)
99
+ # accum2.add(1)
100
+ # accum3.add(2)
101
+ # accum4.add(2)
102
+ # accum5.add(2)
103
+ # end
104
+
105
+ # rdd = $sc.parallelize(0..4, 2)
106
+ # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3, accum4: accum4, accum5: accum5)
107
+ # rdd = rdd.map_partitions(func)
108
+ # rdd.collect
109
+
110
+ # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
111
+ # sleep(1)
112
+
113
+ # expect(accum1.value).to eq(3)
114
+ # expect(accum2.value).to eq(1)
115
+ # expect(accum3.value).to eq(4)
116
+ # expect(accum4.value).to eq(4)
117
+ # expect(accum5.value).to eq(65536)
118
+ # end
119
+
120
+ # it 'proc' do
121
+ # accum1 = $sc.accumulator(1, lambda{|mem, val| mem + val}, 0)
122
+ # accum2 = $sc.accumulator('a', lambda{|mem, val| mem + val}, '')
123
+ # accum3 = $sc.accumulator([], lambda{|mem, val| mem << val}, [])
124
+
125
+ # func = Proc.new do |_|
126
+ # accum1.add(1)
127
+ # accum2.add('a')
128
+ # accum3.add(1)
129
+ # end
130
+
131
+ # rdd = $sc.parallelize(0..4, 2)
132
+ # rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
133
+ # rdd = rdd.map_partitions(func)
134
+ # rdd.collect
135
+
136
+ # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
137
+ # sleep(1)
138
+
139
+ # expect(accum1.value).to eq(3)
140
+ # expect(accum2.value).to eq('aaa')
141
+ # expect(accum3.value).to eq([[1], [1]])
142
+ # end
143
+
144
+ # it 'string' do
145
+ # expect { $sc.accumulator(1, '0') }.to raise_error(Spark::SerializeError)
146
+
147
+ # accum = $sc.accumulator(1, 'lambda{|mem, val| mem + val}', 0)
148
+
149
+ # func = Proc.new do |_|
150
+ # accum.add(1)
151
+ # end
152
+
153
+ # rdd = $sc.parallelize(0..4, 2)
154
+ # rdd = rdd.bind(accum: accum)
155
+ # rdd = rdd.map_partitions(func)
156
+ # rdd.collect
157
+
158
+ # # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
159
+ # sleep(1)
160
+
161
+ # expect(accum.value).to eq(3)
162
+ # end
163
+ # end
164
+ # end
164
165
 
165
166
  end
@@ -1,18 +1,18 @@
1
- require "spec_helper"
1
+ require 'spec_helper'
2
2
 
3
3
  RSpec.describe Array do
4
4
 
5
- it ".deep_copy" do
6
- data = ["a", "b", "c"]
5
+ it '.deep_copy' do
6
+ data = ['a', 'b', 'c']
7
7
  new_data = data.dup
8
8
 
9
- data[0] << "a"
9
+ data[0] << 'a'
10
10
 
11
11
  expect(data).to eql(new_data)
12
12
 
13
13
  new_data = data.deep_copy
14
14
 
15
- data[1] << "b"
15
+ data[1] << 'b'
16
16
 
17
17
  expect(data).to_not eql(new_data)
18
18
  end
@@ -21,19 +21,19 @@ end
21
21
 
22
22
  RSpec.describe Hash do
23
23
 
24
- it ".stringify_keys!" do
24
+ it '.stringify_keys!' do
25
25
  data = {
26
- a: "a",
27
- b: "b",
28
- c: "c"
26
+ a: 'a',
27
+ b: 'b',
28
+ c: 'c'
29
29
  }
30
30
 
31
31
  data.stringify_keys!
32
32
 
33
33
  expect(data).to eql({
34
- "a" => "a",
35
- "b" => "b",
36
- "c" => "c"
34
+ 'a' => 'a',
35
+ 'b' => 'b',
36
+ 'c' => 'c'
37
37
  })
38
38
  end
39
39
 
@@ -41,9 +41,9 @@ end
41
41
 
42
42
  RSpec.describe String do
43
43
 
44
- it ".camelize" do
45
- data = "aaa_bbb_ccc".camelize
46
- expect(data).to eql("AaaBbbCcc")
44
+ it '.camelize' do
45
+ data = 'aaa_bbb_ccc'.camelize
46
+ expect(data).to eql('AaaBbbCcc')
47
47
  end
48
48
 
49
49
  end
@@ -56,14 +56,14 @@ RSpec.describe IO do
56
56
 
57
57
  file.write_int(1)
58
58
  file.write_long(2)
59
- file.write_string("3")
59
+ file.write_string('3')
60
60
  file.write_data([4])
61
61
 
62
62
  file.rewind
63
63
 
64
64
  expect(file.read_int).to eq(1)
65
65
  expect(file.read_long).to eq(2)
66
- expect(file.read_string).to eq("3")
66
+ expect(file.read_string).to eq('3')
67
67
  expect(file.read_data).to eq([4])
68
68
 
69
69
  file.unlink
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- RSpec::describe Spark::RDD do
3
+ RSpec.describe Spark::RDD do
4
4
 
5
5
  context '.pipe' do
6
6
  let(:words) { Generator.words }
@@ -1,12 +1,12 @@
1
- require "spec_helper"
1
+ require 'spec_helper'
2
2
 
3
3
  def func4(item)
4
- item.start_with?("a") && item.size > 3 && item[1].to_s.ord > 106
4
+ item.start_with?('a') && item.size > 3 && item[1].to_s.ord > 106
5
5
  end
6
6
 
7
- RSpec::shared_examples "a filtering" do |workers|
7
+ RSpec.shared_examples 'a filtering' do |workers|
8
8
  context "with #{workers || 'default'} worker" do
9
- it "when numbers" do
9
+ it 'when numbers' do
10
10
  rdd2 = rdd_numbers(workers)
11
11
  rdd2 = rdd2.filter(func1)
12
12
  result = numbers.select(&func1)
@@ -20,7 +20,7 @@ RSpec::shared_examples "a filtering" do |workers|
20
20
  expect(rdd3.collect).to eql([])
21
21
  end
22
22
 
23
- it "when words" do
23
+ it 'when words' do
24
24
  rdd2 = rdd_words(workers)
25
25
  rdd2 = rdd2.filter(func3)
26
26
  result = words.select{|x| func3.call(x)}
@@ -36,12 +36,12 @@ RSpec::shared_examples "a filtering" do |workers|
36
36
  end
37
37
  end
38
38
 
39
- RSpec::describe "Spark::RDD.filter" do
39
+ RSpec.describe 'Spark::RDD.filter' do
40
40
  let(:func1) { lambda{|x| x.to_i.even?} }
41
41
  let(:func2) { lambda{|x| x.to_i.odd?} }
42
- let(:func3) { lambda{|x| x.to_s.start_with?("b")} }
42
+ let(:func3) { lambda{|x| x.to_s.start_with?('b')} }
43
43
 
44
- context "throught parallelize" do
44
+ context 'throught parallelize' do
45
45
  let(:numbers) { Generator.numbers_with_zero }
46
46
  let(:words) { Generator.words }
47
47
 
@@ -53,14 +53,14 @@ RSpec::describe "Spark::RDD.filter" do
53
53
  $sc.parallelize(words, workers)
54
54
  end
55
55
 
56
- it_behaves_like "a filtering", nil
57
- it_behaves_like "a filtering", 1
58
- it_behaves_like "a filtering", rand(2..10)
56
+ it_behaves_like 'a filtering', 2
57
+ # it_behaves_like 'a filtering', nil
58
+ # it_behaves_like 'a filtering', rand(2..10)
59
59
  end
60
60
 
61
- context "throught text_file" do
62
- let(:file_numbers) { File.join("spec", "inputs", "numbers_0_100.txt") }
63
- let(:file_words) { File.join("spec", "inputs", "lorem_300.txt") }
61
+ context 'throught text_file' do
62
+ let(:file_numbers) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
63
+ let(:file_words) { File.join('spec', 'inputs', 'lorem_300.txt') }
64
64
 
65
65
  let(:numbers) { File.readlines(file_numbers).map(&:strip) }
66
66
  let(:words) { File.readlines(file_words).map(&:strip) }
@@ -73,8 +73,8 @@ RSpec::describe "Spark::RDD.filter" do
73
73
  $sc.text_file(file_words, workers)
74
74
  end
75
75
 
76
- it_behaves_like "a filtering", nil
77
- it_behaves_like "a filtering", 1
78
- it_behaves_like "a filtering", rand(2..10)
76
+ it_behaves_like 'a filtering', 2
77
+ # it_behaves_like 'a filtering', nil
78
+ # it_behaves_like 'a filtering', rand(2..10)
79
79
  end
80
80
  end