ruby-spark 1.1.0.1 → 1.2.0

Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
data/spec/lib/reduce_by_key_spec.rb CHANGED
@@ -1,4 +1,4 @@
- require "spec_helper"
+ require 'spec_helper'
 
  def flat_map(line)
  line.split
@@ -12,7 +12,7 @@ def reduce(x,y)
  x+y
  end
 
- RSpec::shared_examples "a words counting" do |workers|
+ RSpec.shared_examples 'a words counting' do |workers|
  context "with #{workers || 'default'} worker" do
  let(:result) do
  keyyed = lines.flat_map{|x| x.split}.map{|x| [x,1]}
@@ -27,7 +27,7 @@ RSpec::shared_examples "a words counting" do |workers|
  result
  end
 
- it "when lambda" do
+ it 'when lambda' do
  rdd2 = rdd(workers)
  rdd2 = rdd2.flat_map(lambda{|line| line.split})
  rdd2 = rdd2.map(lambda{|word| [word, 1]})
@@ -36,7 +36,7 @@ RSpec::shared_examples "a words counting" do |workers|
  expect(rdd2.collect_as_hash).to eql(result)
  end
 
- it "when method" do
+ it 'when method' do
  rdd2 = rdd(workers)
  rdd2 = rdd2.flat_map(method(:flat_map))
  rdd2 = rdd2.map(method(:map))
@@ -45,7 +45,7 @@ RSpec::shared_examples "a words counting" do |workers|
  expect(rdd2.collect_as_hash).to eql(result)
  end
 
- it "keys, values" do
+ it 'keys, values' do
  rdd2 = rdd(workers)
  rdd2 = rdd2.flat_map(method(:flat_map))
  rdd2 = rdd2.map(method(:map))
@@ -57,35 +57,35 @@ RSpec::shared_examples "a words counting" do |workers|
  end
  end
 
- RSpec::describe "Spark::RDD" do
- context ".reduce_by_key" do
- context "throught parallelize" do
+ RSpec.describe 'Spark::RDD' do
+ context '.reduce_by_key' do
+ context 'throught parallelize' do
  let(:lines) { Generator.lines }
 
  def rdd(workers)
  $sc.parallelize(lines, workers)
  end
 
- it_behaves_like "a words counting", nil
- it_behaves_like "a words counting", 1
- it_behaves_like "a words counting", rand(2..10)
+ it_behaves_like 'a words counting', 2
+ # it_behaves_like 'a words counting', nil
+ # it_behaves_like 'a words counting', rand(2..10)
  end
 
- context "throught text_file" do
- let(:file) { File.join("spec", "inputs", "lorem_300.txt") }
+ context 'throught text_file' do
+ let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') }
  let(:lines) { File.readlines(file).map(&:strip) }
 
  def rdd(workers)
  $sc.text_file(file, workers)
  end
 
- it_behaves_like "a words counting", nil
- it_behaves_like "a words counting", 1
- it_behaves_like "a words counting", rand(2..10)
+ it_behaves_like 'a words counting', 2
+ # it_behaves_like 'a words counting', nil
+ # it_behaves_like 'a words counting', rand(2..10)
  end
  end
 
- context ".fold_by_key" do
+ context '.fold_by_key' do
  let(:numbers) { Generator.numbers }
  let(:zero_value) { 0 }
  let(:rdd) { $sc.parallelize(numbers) }
@@ -105,11 +105,11 @@ RSpec::describe "Spark::RDD" do
  rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash
  end
 
- it "default num_partitions" do
+ it 'default num_partitions' do
  expect(fold_by_key).to eq(result)
  end
 
- it "default num_partitions" do
+ it 'default num_partitions' do
  expect(
  fold_by_key rand(1..10)
  ).to eq(result)
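The change above pins the shared examples to fixed worker counts (2, or 1 and 2 elsewhere) instead of nil and rand(2..10), making runs reproducible. For reference, the word-count pipeline these examples exercise looks like this in plain ruby-spark code (a sketch built only from calls the spec makes; $sc is the suite's global Spark::Context, and the two-argument reducer passed to reduce_by_key is assumed from the '.reduce_by_key' context name):

lines = ['lorem ipsum dolor', 'ipsum dolor sit amet']

rdd = $sc.parallelize(lines, 2)                    # 2 workers, as the spec now pins
counts = rdd.flat_map(lambda{|line| line.split})   # split lines into words
            .map(lambda{|word| [word, 1]})         # pair each word with 1
            .reduce_by_key(lambda{|a, b| a + b})   # sum the pairs per word

counts.collect_as_hash # => {'lorem' => 1, 'ipsum' => 2, 'dolor' => 2, ...}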
data/spec/lib/reduce_spec.rb CHANGED
@@ -1,12 +1,12 @@
- require "spec_helper"
+ require 'spec_helper'
 
  def longest_words(memo, word)
  memo.length > word.length ? memo : word
  end
 
- RSpec::shared_examples "a reducing" do |workers|
+ RSpec.shared_examples 'a reducing' do |workers|
  context "with #{workers || 'default'} worker" do
- it ".reduce" do
+ it '.reduce' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
  rdd2 = rdd2.reduce(func1)
@@ -30,7 +30,7 @@ RSpec::shared_examples "a reducing" do |workers|
  expect(rdd4).to eql(result)
  end
 
- it ".fold" do
+ it '.fold' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
  rdd_result = rdd2.fold(1, func1)
@@ -41,7 +41,7 @@ RSpec::shared_examples "a reducing" do |workers|
  expect(rdd_result).to eql(result)
  end
 
- it ".aggregate" do
+ it '.aggregate' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
 
@@ -55,28 +55,28 @@ RSpec::shared_examples "a reducing" do |workers|
  expect(rdd_result).to eql(result)
  end
 
- it ".max" do
+ it '.max' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
 
  expect(rdd2.max).to eql(numbers.map(&:to_i).max)
  end
 
- it ".min" do
+ it '.min' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
 
  expect(rdd2.min).to eql(numbers.map(&:to_i).min)
  end
 
- it ".sum" do
+ it '.sum' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
 
  expect(rdd2.sum).to eql(numbers.map(&:to_i).reduce(:+))
  end
 
- it ".count" do
+ it '.count' do
  rdd2 = rdd_numbers(workers)
  rdd2 = rdd2.map(to_i)
 
@@ -85,14 +85,14 @@ RSpec::shared_examples "a reducing" do |workers|
  end
  end
 
- RSpec::describe "Spark::RDD" do
+ RSpec.describe 'Spark::RDD' do
  let(:func1) { lambda{|sum, x| sum+x} }
  let(:func2) { lambda{|product, x| product*x} }
 
  let(:to_i) { lambda{|item| item.to_i} }
  let(:split) { lambda{|item| item.split} }
 
- context "throught parallelize" do
+ context 'throught parallelize' do
  let(:numbers) { Generator.numbers }
  let(:lines) { Generator.lines }
 
@@ -104,14 +104,15 @@ RSpec::describe "Spark::RDD" do
  $sc.parallelize(lines, workers)
  end
 
- it_behaves_like "a reducing", nil
- it_behaves_like "a reducing", 1
- it_behaves_like "a reducing", rand(2..10)
+ it_behaves_like 'a reducing', 1
+ it_behaves_like 'a reducing', 2
+ # it_behaves_like 'a reducing', nil
+ # it_behaves_like 'a reducing', rand(2..10)
  end
 
- context "throught text_file" do
- let(:file) { File.join("spec", "inputs", "numbers_0_100.txt") }
- let(:file_lines) { File.join("spec", "inputs", "lorem_300.txt") }
+ context 'throught text_file' do
+ let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
+ let(:file_lines) { File.join('spec', 'inputs', 'lorem_300.txt') }
 
  let(:numbers) { File.readlines(file).map(&:strip).map(&:to_i) }
  let(:lines) { File.readlines(file_lines).map(&:strip) }
@@ -124,8 +125,9 @@ RSpec::describe "Spark::RDD" do
  $sc.text_file(file_lines, workers)
  end
 
- it_behaves_like "a reducing", nil
- it_behaves_like "a reducing", 1
- it_behaves_like "a reducing", rand(2..10)
+ it_behaves_like 'a reducing', 1
+ it_behaves_like 'a reducing', 2
+ # it_behaves_like 'a reducing', nil
+ # it_behaves_like 'a reducing', rand(2..10)
  end
  end
data/spec/lib/sample_spec.rb CHANGED
@@ -1,30 +1,30 @@
- require "spec_helper"
+ require 'spec_helper'
 
  # Sample method can not be tested because of random generator
  # Just test it for raising error
 
- RSpec::shared_examples "a sampler" do |workers|
+ RSpec.shared_examples 'a sampler' do |workers|
  context "with #{workers || 'default'} worker" do
 
- context ".sample" do
- it "with replacement" do
+ context '.sample' do
+ it 'with replacement' do
  rdd2 = rdd(workers).sample(true, rand)
  expect { rdd2.collect }.to_not raise_error
  end
 
- it "without replacement" do
+ it 'without replacement' do
  rdd2 = rdd(workers).sample(false, rand)
  expect { rdd2.collect }.to_not raise_error
  end
  end
 
- context ".take_sample" do
- it "with replacement" do
+ context '.take_sample' do
+ it 'with replacement' do
  size = rand(10..999)
  expect(rdd(workers).take_sample(true, size).size).to eql(size)
  end
 
- it "without replacement" do
+ it 'without replacement' do
  size = rand(10..999)
  expect(rdd(workers).take_sample(false, size).size).to eql(size)
  end
@@ -33,14 +33,15 @@ RSpec::shared_examples "a sampler" do |workers|
  end
  end
 
- RSpec::describe "Spark::RDD" do
+ RSpec.describe 'Spark::RDD' do
  let(:numbers) { Generator.numbers(1000) }
 
  def rdd(workers)
  $sc.parallelize(numbers, workers)
  end
 
- it_behaves_like "a sampler", nil
- it_behaves_like "a sampler", 1
- it_behaves_like "a sampler", rand(2..10)
+ it_behaves_like 'a sampler', 1
+ it_behaves_like 'a sampler', 2
+ # it_behaves_like 'a sampler', nil
+ # it_behaves_like 'a sampler', rand(2..10)
  end
data/spec/lib/serializer_spec.rb CHANGED
@@ -85,4 +85,31 @@ RSpec.describe Spark::Serializer do
  Zlib::Deflate.deflate(Marshal.dump(data))
  )
  end
+
+ context 'Auto batched' do
+ let(:klass) { Spark::Serializer::AutoBatched }
+ let(:marshal) { Spark::Serializer::Marshal.new }
+ let(:numbers) { Generator.numbers }
+
+ it 'initialize' do
+ expect { klass.new }.to raise_error(ArgumentError)
+ expect { klass.new(marshal) }.to_not raise_error
+ expect { klass.new(marshal, 1) }.to raise_error(Spark::SerializeError)
+ end
+
+ it 'serialization' do
+ serializer1 = klass.new(marshal)
+ serializer2 = klass.new(marshal, 2)
+
+ rdd1 = Spark.sc.parallelize(numbers, 2, serializer1)
+ rdd2 = Spark.sc.parallelize(numbers, 2, serializer2).map(:to_i)
+
+ result = rdd1.collect
+
+ expect(rdd1.serializer).to eq(serializer1)
+ expect(result).to eq(numbers)
+ expect(result).to eq(rdd2.collect)
+ end
+
+ end
  end
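The new 'Auto batched' context documents the public surface of Spark::Serializer::AutoBatched: it wraps a base serializer, a batch-size argument of 1 raises Spark::SerializeError while 2 is accepted, and an RDD built with it round-trips data unchanged. A minimal usage sketch assembled from the calls the spec makes (it assumes a started context is reachable via Spark.sc, as in the spec):

marshal    = Spark::Serializer::Marshal.new
serializer = Spark::Serializer::AutoBatched.new(marshal) # batch size chosen automatically

rdd = Spark.sc.parallelize((1..1000).to_a, 2, serializer)
rdd.serializer # => the AutoBatched instance above
rdd.collect    # => [1, 2, ..., 1000], unchanged by the batching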
data/spec/lib/sort_spec.rb CHANGED
@@ -1,6 +1,6 @@
- require "spec_helper"
+ require 'spec_helper'
 
- RSpec::shared_examples "a sorting" do |workers|
+ RSpec.shared_examples 'a sorting' do |workers|
  it "with #{workers || 'default'} worker" do
  rdd2 = rdd(workers)
 
@@ -22,37 +22,39 @@ RSpec::shared_examples "a sorting" do |workers|
  end
 
 
- RSpec::describe "Spark::RDD" do
+ RSpec.describe 'Spark::RDD' do
  let(:split) { lambda{|x| x.split} }
  let(:map) { lambda{|x| [x.to_s, 1]} }
  let(:len_map) { lambda{|x| [x.size, x]} }
 
- context "throught parallelize" do
- context ".map" do
+ context 'throught parallelize' do
+ context '.map' do
  let(:lines) { Generator.lines }
 
  def rdd(workers)
  $sc.parallelize(lines, workers)
  end
 
- it_behaves_like "a sorting", nil
- it_behaves_like "a sorting", 1
- it_behaves_like "a sorting", rand(2..10)
+ it_behaves_like 'a sorting', 1
+ it_behaves_like 'a sorting', 2
+ # it_behaves_like 'a sorting', nil
+ # it_behaves_like 'a sorting', rand(2..10)
  end
  end
 
- context "throught text_file" do
- context ".map" do
- let(:file) { File.join("spec", "inputs", "lorem_300.txt") }
+ context 'throught text_file' do
+ context '.map' do
+ let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') }
  let(:lines) { File.readlines(file).map(&:strip) }
 
  def rdd(workers)
  $sc.text_file(file, workers)
  end
 
- it_behaves_like "a sorting", nil
- it_behaves_like "a sorting", 1
- it_behaves_like "a sorting", rand(2..10)
+ it_behaves_like 'a sorting', 1
+ it_behaves_like 'a sorting', 2
+ # it_behaves_like 'a sorting', nil
+ # it_behaves_like 'a sorting', rand(2..10)
  end
  end
  end
data/spec/lib/statistic_spec.rb CHANGED
@@ -160,11 +160,13 @@ RSpec.describe Spark::RDD do
 
  context '.stats' do
  it_behaves_like 'a stats', 1
- it_behaves_like 'a stats', rand(2..5)
+ it_behaves_like 'a stats', 2
+ # it_behaves_like 'a stats', rand(2..5)
  end
 
  context '.histogram' do
  it_behaves_like 'a histogram', 1
- it_behaves_like 'a histogram', rand(2..5)
+ it_behaves_like 'a histogram', 2
+ # it_behaves_like 'a histogram', rand(2..5)
  end
  end
data/spec/lib/whole_text_files_spec.rb CHANGED
@@ -1,6 +1,6 @@
- require "spec_helper"
+ require 'spec_helper'
 
- RSpec::shared_examples "a whole_text_files" do |workers|
+ RSpec.shared_examples 'a whole_text_files' do |workers|
  it "with #{workers || 'default'} worker" do
  rdd2 = rdd(workers).map(get_numbers)
  result = files.size
@@ -17,17 +17,18 @@ RSpec::shared_examples "a whole_text_files" do |workers|
  end
  end
 
- RSpec::describe "Spark::Context" do
+ RSpec.describe 'Spark::Context' do
  let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} }
 
- let(:dir) { File.join("spec", "inputs", "numbers") }
- let(:files) { Dir.glob(File.join(dir, "*")) }
+ let(:dir) { File.join('spec', 'inputs', 'numbers') }
+ let(:files) { Dir.glob(File.join(dir, '*')) }
 
  def rdd(workers)
  $sc.whole_text_files(dir, workers)
  end
 
- it_behaves_like "a whole_text_files", nil
- it_behaves_like "a whole_text_files", 1
- it_behaves_like "a whole_text_files", rand(2..10)
+ it_behaves_like 'a whole_text_files', 1
+ it_behaves_like 'a whole_text_files', 2
+ # it_behaves_like 'a whole_text_files', nil
+ # it_behaves_like 'a whole_text_files', rand(2..10)
  end
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,5 @@
- # require 'simplecov'
- # SimpleCov.start
+ require 'simplecov'
+ SimpleCov.start
 
  $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
  require 'ruby-spark'
@@ -7,7 +7,7 @@ require 'generator'
 
  # Loading
  Spark.load_lib
- Spark.jb.load_test
+ Spark.jb.import_all_test
  Spark::Mllib.import
 
  # Keep it on method because its called from config test
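With SimpleCov now required unconditionally at the top of spec_helper.rb, every spec run writes an HTML coverage report to coverage/. A common refinement (an assumption, not part of this diff) is to exclude the spec files themselves via a filter block:

require 'simplecov'
SimpleCov.start do
  add_filter '/spec/' # keep the specs out of the coverage report
end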
metadata CHANGED
@@ -1,15 +1,29 @@
  --- !ruby/object:Gem::Specification
  name: ruby-spark
  version: !ruby/object:Gem::Version
- version: 1.1.0.1
+ version: 1.2.0
  platform: ruby
  authors:
  - Ondřej Moravčík
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-05-16 00:00:00.000000000 Z
+ date: 2015-06-15 00:00:00.000000000 Z
  dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rjb
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
  - !ruby/object:Gem::Dependency
  name: sourcify
  requirement: !ruby/object:Gem::Requirement
@@ -94,20 +108,6 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
- - !ruby/object:Gem::Dependency
- name: rjb
- requirement: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '0'
- type: :runtime
- prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '0'
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
@@ -146,6 +146,8 @@ extensions:
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".travis.yml"
+ - CHANGELOG.md
  - Gemfile
  - Guardfile
  - LICENSE.txt
@@ -344,7 +346,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - java, scala
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.4.5
  signing_key:
  specification_version: 4
  summary: Ruby wrapper for Apache Spark
@@ -400,4 +402,3 @@ test_files:
  - spec/lib/statistic_spec.rb
  - spec/lib/whole_text_files_spec.rb
  - spec/spec_helper.rb
- has_rdoc:
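The metadata diff records the version bump to 1.2.0, the new release date, and rjb moving to the top of the runtime dependency list. A dependency entry like the rjb block above is what RubyGems serializes from a single gemspec declaration; a sketch of that idiom (not the gem's verbatim ruby-spark.gemspec):

Gem::Specification.new do |spec|
  spec.name    = 'ruby-spark'
  spec.version = '1.2.0'
  spec.summary = 'Ruby wrapper for Apache Spark'

  spec.add_dependency 'rjb' # serialized into metadata as type: :runtime, ">= 0"
end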