ruby-spark 1.1.0.1 → 1.2.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
data/spec/lib/reduce_by_key_spec.rb CHANGED

@@ -1,4 +1,4 @@
-require "spec_helper"
+require 'spec_helper'
 
 def flat_map(line)
   line.split
@@ -12,7 +12,7 @@ def reduce(x,y)
   x+y
 end
 
-RSpec::shared_examples "a words counting" do |workers|
+RSpec.shared_examples 'a words counting' do |workers|
   context "with #{workers || 'default'} worker" do
     let(:result) do
       keyyed = lines.flat_map{|x| x.split}.map{|x| [x,1]}
@@ -27,7 +27,7 @@ RSpec::shared_examples "a words counting" do |workers|
       result
     end
 
-    it "when lambda" do
+    it 'when lambda' do
       rdd2 = rdd(workers)
       rdd2 = rdd2.flat_map(lambda{|line| line.split})
       rdd2 = rdd2.map(lambda{|word| [word, 1]})
@@ -36,7 +36,7 @@ RSpec::shared_examples "a words counting" do |workers|
       expect(rdd2.collect_as_hash).to eql(result)
     end
 
-    it "when method" do
+    it 'when method' do
       rdd2 = rdd(workers)
       rdd2 = rdd2.flat_map(method(:flat_map))
       rdd2 = rdd2.map(method(:map))
@@ -45,7 +45,7 @@ RSpec::shared_examples "a words counting" do |workers|
       expect(rdd2.collect_as_hash).to eql(result)
     end
 
-    it "keys, values" do
+    it 'keys, values' do
       rdd2 = rdd(workers)
       rdd2 = rdd2.flat_map(method(:flat_map))
       rdd2 = rdd2.map(method(:map))
@@ -57,35 +57,35 @@ RSpec::shared_examples "a words counting" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD" do
-  context ".reduce_by_key" do
-    context "throught parallelize" do
+RSpec.describe 'Spark::RDD' do
+  context '.reduce_by_key' do
+    context 'throught parallelize' do
       let(:lines) { Generator.lines }
 
       def rdd(workers)
         $sc.parallelize(lines, workers)
       end
 
-      it_behaves_like "a words counting", nil
-      it_behaves_like "a words counting", 1
-      it_behaves_like "a words counting", rand(2..10)
+      it_behaves_like 'a words counting', 2
+      # it_behaves_like 'a words counting', nil
+      # it_behaves_like 'a words counting', rand(2..10)
     end
 
-    context "throught text_file" do
-      let(:file) { File.join("spec", "inputs", "lorem_300.txt") }
+    context 'throught text_file' do
+      let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') }
       let(:lines) { File.readlines(file).map(&:strip) }
 
       def rdd(workers)
         $sc.text_file(file, workers)
       end
 
-      it_behaves_like "a words counting", nil
-      it_behaves_like "a words counting", 1
-      it_behaves_like "a words counting", rand(2..10)
+      it_behaves_like 'a words counting', 2
+      # it_behaves_like 'a words counting', nil
+      # it_behaves_like 'a words counting', rand(2..10)
     end
   end
 
-  context ".fold_by_key" do
+  context '.fold_by_key' do
     let(:numbers) { Generator.numbers }
     let(:zero_value) { 0 }
     let(:rdd) { $sc.parallelize(numbers) }
@@ -105,11 +105,11 @@ RSpec::describe "Spark::RDD" do
       rdd.map(map).fold_by_key(zero_value, add, num_partitions).collect_as_hash
     end
 
-    it "default num_partitions" do
+    it 'default num_partitions' do
       expect(fold_by_key).to eq(result)
     end
 
-    it "default num_partitions" do
+    it 'default num_partitions' do
       expect(
         fold_by_key rand(1..10)
       ).to eq(result)
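The shared examples above all drive the same word-count pipeline. Pulled out of RSpec, it reads roughly as follows; this is a sketch based on the calls the spec makes (flat_map, map, collect_as_hash), and the reduce_by_key step with its summing lambda is an assumption, since the spec elides that line in the hunks shown:

    # Word count over an in-memory collection (2 workers, as the spec now pins).
    rdd = $sc.parallelize(lines, 2)
    counts = rdd.flat_map(lambda{|line| line.split})    # lines  -> words
                .map(lambda{|word| [word, 1]})          # words  -> [word, 1] pairs
                .reduce_by_key(lambda{|a, b| a + b})    # sum counts per word (assumed step)
                .collect_as_hash                        # => {'word' => count, ...}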
data/spec/lib/reduce_spec.rb CHANGED

@@ -1,12 +1,12 @@
-require "spec_helper"
+require 'spec_helper'
 
 def longest_words(memo, word)
   memo.length > word.length ? memo : word
 end
 
-RSpec::shared_examples "a reducing" do |workers|
+RSpec.shared_examples 'a reducing' do |workers|
   context "with #{workers || 'default'} worker" do
-    it ".reduce" do
+    it '.reduce' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.map(to_i)
       rdd2 = rdd2.reduce(func1)
@@ -30,7 +30,7 @@ RSpec::shared_examples "a reducing" do |workers|
       expect(rdd4).to eql(result)
     end
 
-    it ".fold" do
+    it '.fold' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.map(to_i)
       rdd_result = rdd2.fold(1, func1)
@@ -41,7 +41,7 @@ RSpec::shared_examples "a reducing" do |workers|
       expect(rdd_result).to eql(result)
     end
 
-    it ".aggregate" do
+    it '.aggregate' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.map(to_i)
 
@@ -55,28 +55,28 @@ RSpec::shared_examples "a reducing" do |workers|
       expect(rdd_result).to eql(result)
     end
 
-    it ".max" do
+    it '.max' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.map(to_i)
 
       expect(rdd2.max).to eql(numbers.map(&:to_i).max)
     end
 
-    it ".min" do
+    it '.min' do
       rdd2 = rdd_numbers(workers)
      rdd2 = rdd2.map(to_i)
 
       expect(rdd2.min).to eql(numbers.map(&:to_i).min)
     end
 
-    it ".sum" do
+    it '.sum' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.map(to_i)
 
       expect(rdd2.sum).to eql(numbers.map(&:to_i).reduce(:+))
     end
 
-    it ".count" do
+    it '.count' do
       rdd2 = rdd_numbers(workers)
       rdd2 = rdd2.map(to_i)
 
@@ -85,14 +85,14 @@ RSpec::shared_examples "a reducing" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD" do
+RSpec.describe 'Spark::RDD' do
   let(:func1) { lambda{|sum, x| sum+x} }
   let(:func2) { lambda{|product, x| product*x} }
 
   let(:to_i) { lambda{|item| item.to_i} }
   let(:split) { lambda{|item| item.split} }
 
-  context "throught parallelize" do
+  context 'throught parallelize' do
     let(:numbers) { Generator.numbers }
     let(:lines) { Generator.lines }
 
@@ -104,14 +104,15 @@ RSpec::describe "Spark::RDD" do
       $sc.parallelize(lines, workers)
     end
 
-    it_behaves_like "a reducing", nil
-    it_behaves_like "a reducing", 1
-    it_behaves_like "a reducing", rand(2..10)
+    it_behaves_like 'a reducing', 1
+    it_behaves_like 'a reducing', 2
+    # it_behaves_like 'a reducing', nil
+    # it_behaves_like 'a reducing', rand(2..10)
   end
 
-  context "throught text_file" do
-    let(:file) { File.join("spec", "inputs", "numbers_0_100.txt") }
-    let(:file_lines) { File.join("spec", "inputs", "lorem_300.txt") }
+  context 'throught text_file' do
+    let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
+    let(:file_lines) { File.join('spec', 'inputs', 'lorem_300.txt') }
 
     let(:numbers) { File.readlines(file).map(&:strip).map(&:to_i) }
     let(:lines) { File.readlines(file_lines).map(&:strip) }
@@ -124,8 +125,9 @@ RSpec::describe "Spark::RDD" do
       $sc.text_file(file_lines, workers)
     end
 
-    it_behaves_like "a reducing", nil
-    it_behaves_like "a reducing", 1
-    it_behaves_like "a reducing", rand(2..10)
+    it_behaves_like 'a reducing', 1
+    it_behaves_like 'a reducing', 2
+    # it_behaves_like 'a reducing', nil
+    # it_behaves_like 'a reducing', rand(2..10)
   end
 end
data/spec/lib/sample_spec.rb CHANGED

@@ -1,30 +1,30 @@
-require "spec_helper"
+require 'spec_helper'
 
 # Sample method can not be tested because of random generator
 # Just test it for raising error
 
-RSpec::shared_examples "a sampler" do |workers|
+RSpec.shared_examples 'a sampler' do |workers|
   context "with #{workers || 'default'} worker" do
 
-    context ".sample" do
-      it "with replacement" do
+    context '.sample' do
+      it 'with replacement' do
         rdd2 = rdd(workers).sample(true, rand)
         expect { rdd2.collect }.to_not raise_error
       end
 
-      it "without replacement" do
+      it 'without replacement' do
         rdd2 = rdd(workers).sample(false, rand)
         expect { rdd2.collect }.to_not raise_error
       end
     end
 
-    context ".take_sample" do
-      it "with replacement" do
+    context '.take_sample' do
+      it 'with replacement' do
         size = rand(10..999)
         expect(rdd(workers).take_sample(true, size).size).to eql(size)
       end
 
-      it "without replacement" do
+      it 'without replacement' do
         size = rand(10..999)
         expect(rdd(workers).take_sample(false, size).size).to eql(size)
       end
@@ -33,14 +33,15 @@ RSpec::shared_examples "a sampler" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD" do
+RSpec.describe 'Spark::RDD' do
   let(:numbers) { Generator.numbers(1000) }
 
   def rdd(workers)
     $sc.parallelize(numbers, workers)
   end
 
-  it_behaves_like "a sampler", nil
-  it_behaves_like "a sampler", 1
-  it_behaves_like "a sampler", rand(2..10)
+  it_behaves_like 'a sampler', 1
+  it_behaves_like 'a sampler', 2
+  # it_behaves_like 'a sampler', nil
+  # it_behaves_like 'a sampler', rand(2..10)
 end
data/spec/lib/serializer_spec.rb CHANGED

@@ -85,4 +85,31 @@ RSpec.describe Spark::Serializer do
       Zlib::Deflate.deflate(Marshal.dump(data))
     )
   end
+
+  context 'Auto batched' do
+    let(:klass) { Spark::Serializer::AutoBatched }
+    let(:marshal) { Spark::Serializer::Marshal.new }
+    let(:numbers) { Generator.numbers }
+
+    it 'initialize' do
+      expect { klass.new }.to raise_error(ArgumentError)
+      expect { klass.new(marshal) }.to_not raise_error
+      expect { klass.new(marshal, 1) }.to raise_error(Spark::SerializeError)
+    end
+
+    it 'serialization' do
+      serializer1 = klass.new(marshal)
+      serializer2 = klass.new(marshal, 2)
+
+      rdd1 = Spark.sc.parallelize(numbers, 2, serializer1)
+      rdd2 = Spark.sc.parallelize(numbers, 2, serializer2).map(:to_i)
+
+      result = rdd1.collect
+
+      expect(rdd1.serializer).to eq(serializer1)
+      expect(result).to eq(numbers)
+      expect(result).to eq(rdd2.collect)
+    end
+
+  end
 end
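This new context is the spec-side half of data/lib/spark/serializer/auto_batched.rb (+7 in the file list). A minimal sketch of the API exactly as the spec exercises it; the input array here is arbitrary:

    marshal    = Spark::Serializer::Marshal.new
    serializer = Spark::Serializer::AutoBatched.new(marshal)   # batch size tuned automatically
    # AutoBatched.new(marshal, 1) raises Spark::SerializeError, per the spec above

    rdd = Spark.sc.parallelize((1..100).to_a, 2, serializer)
    rdd.collect   # round-trips the data through the Ruby workers via the new serializer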
data/spec/lib/sort_spec.rb CHANGED

@@ -1,6 +1,6 @@
-require "spec_helper"
+require 'spec_helper'
 
-RSpec::shared_examples "a sorting" do |workers|
+RSpec.shared_examples 'a sorting' do |workers|
   it "with #{workers || 'default'} worker" do
     rdd2 = rdd(workers)
 
@@ -22,37 +22,39 @@ RSpec::shared_examples "a sorting" do |workers|
 end
 
 
-RSpec::describe "Spark::RDD" do
+RSpec.describe 'Spark::RDD' do
   let(:split) { lambda{|x| x.split} }
   let(:map) { lambda{|x| [x.to_s, 1]} }
   let(:len_map) { lambda{|x| [x.size, x]} }
 
-  context "throught parallelize" do
-    context ".map" do
+  context 'throught parallelize' do
+    context '.map' do
       let(:lines) { Generator.lines }
 
      def rdd(workers)
        $sc.parallelize(lines, workers)
      end
 
-      it_behaves_like "a sorting", nil
-      it_behaves_like "a sorting", 1
-      it_behaves_like "a sorting", rand(2..10)
+      it_behaves_like 'a sorting', 1
+      it_behaves_like 'a sorting', 2
+      # it_behaves_like 'a sorting', nil
+      # it_behaves_like 'a sorting', rand(2..10)
     end
   end
 
-  context "throught text_file" do
-    context ".map" do
-      let(:file) { File.join("spec", "inputs", "lorem_300.txt") }
+  context 'throught text_file' do
+    context '.map' do
+      let(:file) { File.join('spec', 'inputs', 'lorem_300.txt') }
       let(:lines) { File.readlines(file).map(&:strip) }
 
       def rdd(workers)
         $sc.text_file(file, workers)
       end
 
-      it_behaves_like "a sorting", nil
-      it_behaves_like "a sorting", 1
-      it_behaves_like "a sorting", rand(2..10)
+      it_behaves_like 'a sorting', 1
+      it_behaves_like 'a sorting', 2
+      # it_behaves_like 'a sorting', nil
+      # it_behaves_like 'a sorting', rand(2..10)
     end
   end
 end
data/spec/lib/statistic_spec.rb CHANGED

@@ -160,11 +160,13 @@ RSpec.describe Spark::RDD do
 
   context '.stats' do
     it_behaves_like 'a stats', 1
-    it_behaves_like 'a stats', rand(2..5)
+    it_behaves_like 'a stats', 2
+    # it_behaves_like 'a stats', rand(2..5)
   end
 
   context '.histogram' do
     it_behaves_like 'a histogram', 1
-    it_behaves_like 'a histogram', rand(2..5)
+    it_behaves_like 'a histogram', 2
+    # it_behaves_like 'a histogram', rand(2..5)
   end
 end
data/spec/lib/whole_text_files_spec.rb CHANGED

@@ -1,6 +1,6 @@
-require "spec_helper"
+require 'spec_helper'
 
-RSpec::shared_examples "a whole_text_files" do |workers|
+RSpec.shared_examples 'a whole_text_files' do |workers|
   it "with #{workers || 'default'} worker" do
     rdd2 = rdd(workers).map(get_numbers)
     result = files.size
@@ -17,17 +17,18 @@ RSpec::shared_examples "a whole_text_files" do |workers|
   end
 end
 
-RSpec::describe "Spark::Context" do
+RSpec.describe 'Spark::Context' do
   let(:get_numbers) { lambda{|file, content| content.split.map(&:to_i)} }
 
-  let(:dir) { File.join("spec", "inputs", "numbers") }
-  let(:files) { Dir.glob(File.join(dir, "*")) }
+  let(:dir) { File.join('spec', 'inputs', 'numbers') }
+  let(:files) { Dir.glob(File.join(dir, '*')) }
 
   def rdd(workers)
     $sc.whole_text_files(dir, workers)
   end
 
-  it_behaves_like "a whole_text_files", nil
-  it_behaves_like "a whole_text_files", 1
-  it_behaves_like "a whole_text_files", rand(2..10)
+  it_behaves_like 'a whole_text_files', 1
+  it_behaves_like 'a whole_text_files', 2
+  # it_behaves_like 'a whole_text_files', nil
+  # it_behaves_like 'a whole_text_files', rand(2..10)
 end
data/spec/spec_helper.rb CHANGED

@@ -1,5 +1,5 @@
-# require 'simplecov'
-# SimpleCov.start
+require 'simplecov'
+SimpleCov.start
 
 $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
 require 'ruby-spark'
@@ -7,7 +7,7 @@ require 'generator'
 
 # Loading
 Spark.load_lib
-Spark.jb.load_test
+Spark.jb.import_all_test
 Spark::Mllib.import
 
 # Keep it on method because its called from config test
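Un-commenting SimpleCov works only because it sits at the very top of spec_helper.rb: coverage libraries instrument files as they are loaded, so anything required before SimpleCov.start is invisible to the report. The ordering constraint, as a sketch:

    require 'simplecov'
    SimpleCov.start        # start coverage first...
    require 'ruby-spark'   # ...so the library's files are instrumented on load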
metadata CHANGED

@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spark
 version: !ruby/object:Gem::Version
-  version: 1.1.0.1
+  version: 1.2.0
 platform: ruby
 authors:
 - Ondřej Moravčík
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-05-16 00:00:00.000000000 Z
+date: 2015-06-15 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: rjb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: sourcify
   requirement: !ruby/object:Gem::Requirement
@@ -94,20 +108,6 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-- !ruby/object:Gem::Dependency
-  name: rjb
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -146,6 +146,8 @@ extensions:
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".travis.yml"
+- CHANGELOG.md
 - Gemfile
 - Guardfile
 - LICENSE.txt
@@ -344,7 +346,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements:
 - java, scala
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Ruby wrapper for Apache Spark
@@ -400,4 +402,3 @@ test_files:
 - spec/lib/statistic_spec.rb
 - spec/lib/whole_text_files_spec.rb
 - spec/spec_helper.rb
-has_rdoc:
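The dependency block that moves to the front of the metadata is generated from ruby-spark.gemspec (+4 -5 in the file list above). In the standard RubyGems DSL, the runtime declaration amounts to something like the following sketch; everything except the rjb line is illustrative:

    Gem::Specification.new do |spec|
      spec.name    = 'ruby-spark'
      spec.version = '1.2.0'
      # rjb now leads the runtime dependency list, ahead of sourcify
      spec.add_dependency 'rjb'
      spec.add_dependency 'sourcify'
    end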