ruby-spark 1.0.0 → 1.1.0.1

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/serializer/compressed.rb ADDED
@@ -0,0 +1,27 @@
+module Spark
+  module Serializer
+    class Compressed < Base
+
+      def initialize(serializer)
+        @serializer = serializer
+      end
+
+      def dump(data)
+        Zlib::Deflate.deflate(@serializer.dump(data))
+      end
+
+      def load(data)
+        @serializer.load(Zlib::Inflate.inflate(data))
+      end
+
+    end
+  end
+end
+
+begin
+  # TODO: require only if it is necessary
+  require 'zlib'
+
+  Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed)
+rescue LoadError
+end
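The new Compressed serializer is a decorator: it wraps any other serializer and deflates its output with Zlib. A minimal round-trip sketch, using only the classes added or reworked in this release:

    require 'zlib'

    # Wrap the Marshal serializer (also reworked in this release).
    marshal    = Spark::Serializer::Marshal.new
    compressed = Spark::Serializer::Compressed.new(marshal)

    bytes = compressed.dump([1, 2, 3])   # Marshal.dump, then Zlib::Deflate.deflate
    compressed.load(bytes)               # => [1, 2, 3]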
data/lib/spark/serializer/marshal.rb CHANGED
@@ -2,18 +2,16 @@ module Spark
   module Serializer
     class Marshal < Base
 
-      def name
-        'marshal'
+      def dump(data)
+        ::Marshal.dump(data)
       end
 
-      def serialize(data)
-        ::Marshal::dump(data)
-      end
-
-      def deserialize(data)
-        ::Marshal::load(data)
+      def load(data)
+        ::Marshal.load(data)
       end
 
     end
   end
 end
+
+Spark::Serializer.register('marshal', Spark::Serializer::Marshal)
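The serializer contract changes here: 1.0.0 identified a serializer by its `name` method and used `serialize`/`deserialize`, while 1.1.0 uses `dump`/`load` plus central registration, so lookup goes through the registry. A sketch of the difference:

    # 1.0.0
    Spark::Serializer::Marshal.new.serialize(data)

    # 1.1.0: registered names resolve through the registry
    klass = Spark::Serializer.find('marshal')   # => Spark::Serializer::Marshal
    klass.new.dump(data)                        # same bytes as ::Marshal.dump(data)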
data/lib/spark/serializer/message_pack.rb CHANGED
@@ -1,17 +1,13 @@
 module Spark
   module Serializer
-    class MessagePack < Marshal
+    class MessagePack < Base
 
-      def name
-        'message_pack'
+      def dump(data)
+        ::MessagePack.dump(data)
       end
 
-      def self.serialize(data)
-        ::MessagePack::dump(data)
-      end
-
-      def self.deserialize(data)
-        ::MessagePack::load(data)
+      def load(data)
+        ::MessagePack.load(data)
       end
 
     end
@@ -19,7 +15,9 @@ module Spark
 end
 
 begin
+  # TODO: require only if it is necessary
   require 'msgpack'
+
+  Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack)
 rescue LoadError
-  Spark::Serializer::MessagePack = Spark::Serializer::Marshal
 end
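Note the behavioural change in the rescue branch: 1.0.0 silently aliased MessagePack to Marshal when the msgpack gem was missing, while 1.1.0 simply leaves the names unregistered, so the failure becomes explicit. Based on the lookup API exercised in serializer_spec.rb below:

    # without the msgpack gem installed:
    Spark::Serializer.find('msgpack')    # => nil
    Spark::Serializer.find!('msgpack')   # raises Spark::SerializeError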
data/lib/spark/serializer/oj.rb CHANGED
@@ -1,17 +1,13 @@
 module Spark
   module Serializer
-    class Oj < Marshal
+    class Oj < Base
 
-      def name
-        'oj'
+      def dump(data)
+        ::Oj.dump(data)
       end
 
-      def serialize(data)
-        ::Oj::dump(data)
-      end
-
-      def deserialize(data)
-        ::Oj::load(data)
+      def load(data)
+        ::Oj.load(data)
       end
 
     end
@@ -19,7 +15,9 @@ module Spark
 end
 
 begin
+  # TODO: require only if it is necessary
   require 'oj'
+
+  Spark::Serializer.register('oj', Spark::Serializer::Oj)
 rescue LoadError
-  Spark::Serializer::Oj = Spark::Serializer::Marshal
 end
data/lib/spark/serializer/pair.rb CHANGED
@@ -2,26 +2,40 @@ module Spark
   module Serializer
     class Pair < Base
 
-      attr_reader :first, :second
+      def initialize(serializer1, serializer2)
+        @serializer1 = serializer1
+        @serializer2 = serializer2
+      end
 
-      def set(first, second)
-        unbatch!
-        @first = first
-        @second = second
-        self
+      def to_s
+        "#{name}(#{@serializer1}, #{@serializer2})"
       end
 
-      def batched?
-        false
+      def aggregate(item1, item2)
+        item1.zip(item2)
       end
 
-      def load_next_from_io(io, lenght)
-        key_value = []
-        key_value << @first.load_next_from_io(io, lenght)
-        key_value << @second.load_next_from_io(io, read_int(io))
-        key_value
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          item1 = @serializer1.load(io.read(size))
+          item2 = @serializer2.load(io.read_string)
+
+          item1 = [item1] unless @serializer1.batched?
+          item2 = [item2] unless @serializer2.batched?
+
+          aggregate(item1, item2).each do |item|
+            yield item
+          end
+        end
       end
 
     end
   end
 end
+
+Spark::Serializer.register('pair', Spark::Serializer::Pair)
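Pair now takes its two serializers in the constructor and streams key-value pairs itself: for each record it reads a size-prefixed payload per side, normalizes unbatched values to one-element arrays, and zips the two batches via #aggregate. A small sketch of the zip step:

    pair = Spark::Serializer::Pair.new(
      Spark::Serializer::Marshal.new,
      Spark::Serializer::Marshal.new
    )

    # once both sides are decoded into batches, #aggregate pairs them up:
    pair.aggregate([1, 2], ['a', 'b'])   # => [[1, 'a'], [2, 'b']]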
data/lib/spark/serializer/text.rb ADDED
@@ -0,0 +1,25 @@
+module Spark
+  module Serializer
+    class Text < Base
+
+      attr_reader :encoding
+
+      def initialize(encoding=Encoding::UTF_8)
+        error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding)
+
+        @encoding = encoding
+      end
+
+      def load(data)
+        data.to_s.force_encoding(@encoding)
+      end
+
+      def to_s
+        "Text(#{@encoding})"
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('string', 'text', Spark::Serializer::Text)
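Text is decode-only: `load` tags the raw bytes with the configured encoding (UTF-8 unless another Encoding instance is given) and there is no `dump`. A usage sketch:

    text = Spark::Serializer.build('text')   # registered as 'string' and 'text'
    s = text.load("caf\xC3\xA9".b)           # binary input
    s          # => "café"
    s.encoding # => #<Encoding:UTF-8>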
data/lib/spark/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Spark
-  VERSION = '1.0.0'
+  VERSION = '1.1.0.1'
 end
data/lib/spark/worker/worker.rb CHANGED
@@ -73,13 +73,16 @@ module Worker
     @command = socket.read_data
 
     # Load iterator
-    @iterator = @command.deserializer.load(socket).lazy
+    @iterator = @command.deserializer.load_from_io(socket).lazy
 
     # Compute
     @iterator = @command.execute(@iterator, @split_index)
 
+    # Result is not iterable
+    @iterator = [@iterator] unless @iterator.respond_to?(:each)
+
     # Send result
-    @command.serializer.dump(@iterator, socket)
+    @command.serializer.dump_to_io(@iterator, socket)
   end
 
   def send_error(e)
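The new guard covers commands whose result is a single value (a reduce, say) rather than a collection: `dump_to_io` iterates its input, so scalars must be wrapped first. Roughly, with a hypothetical write loop for illustration only:

    result = 42
    result = [result] unless result.respond_to?(:each)   # scalar -> one-element batch
    result.each { |item| write_to_socket(item) }         # hypothetical sink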
data/ruby-spark.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.version = Spark::VERSION
   spec.authors = ['Ondřej Moravčík']
   spec.email = ['moravcik.ondrej@gmail.com']
-  spec.summary = %q{Ruby wrapper for Spark}
+  spec.summary = %q{Ruby wrapper for Apache Spark}
   spec.description = %q{}
   spec.homepage = ''
   spec.license = 'MIT'
@@ -31,6 +31,18 @@ Gem::Specification.new do |spec|
 
   spec.requirements << 'java, scala'
 
+  spec.add_dependency 'sourcify', '0.6.0.rc4'
+  spec.add_dependency 'method_source'
+  spec.add_dependency 'commander'
+  spec.add_dependency 'pry'
+  spec.add_dependency 'nio4r'
+  spec.add_dependency 'distribution'
+
+  if RUBY_PLATFORM =~ /java/
+  else
+    spec.add_dependency 'rjb'
+  end
+
   spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'rake'
 end
data/spec/lib/context_spec.rb CHANGED
@@ -7,7 +7,9 @@ RSpec.describe Spark::Context do
     numbers = (0...100).to_a
     func = lambda{|part| part.size}
 
-    rdd = $sc.parallelize(numbers, workers, batch_size: 1)
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
+
+    rdd = $sc.parallelize(numbers, workers, ser)
 
     rdd_result = $sc.run_job(rdd, func)
     result = numbers.each_slice(numbers.size/workers).map(&func)
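This is the pattern repeated across the specs: the old `batch_size:`/`serializer:` options are gone, and callers build a serializer object explicitly (the double-underscore aliases such as `__marshal__` appear interchangeable with the bare names, presumably to dodge local-variable shadowing inside the block). Old versus new spelling of the same RDD:

    # 1.0.0
    rdd = $sc.parallelize(numbers, workers, batch_size: 1)

    # 1.1.0
    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
    rdd = $sc.parallelize(numbers, workers, ser)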
data/spec/lib/manipulation_spec.rb CHANGED
@@ -8,7 +8,9 @@ RSpec::describe "Spark::RDD" do
     rdd = $sc.parallelize(numbers, 1).glom
     expect(rdd.collect).to eql([numbers.to_a])
 
-    rdd = $sc.parallelize(numbers, 5, batch_size: 1).glom
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
+
+    rdd = $sc.parallelize(numbers, 5, ser).glom
     expect(rdd.collect).to eql(numbers.each_slice(20).to_a)
   end
 
@@ -42,9 +44,9 @@ RSpec::describe "Spark::RDD" do
   end
 
   it "with a different serializer" do
-    rdd1 = $sc.parallelize(numbers, 1, serializer: "marshal")
-    rdd2 = $sc.parallelize(numbers, 1, serializer: "oj")
-
+    rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) })
+    rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) })
+
     expect { rdd1.union(rdd2).collect }.to_not raise_error
   end
 
@@ -59,14 +61,15 @@ RSpec::describe "Spark::RDD" do
   it ".compact" do
     data = [nil, nil , 0, 0, 1, 2, nil, 6]
     result = data.compact
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
 
     rdd = $sc.parallelize(data, 1).compact
     expect(rdd.collect).to eql(result)
 
-    rdd = $sc.parallelize(data, 5, batch_size: 1).compact
+    rdd = $sc.parallelize(data, 5, ser).compact
     expect(rdd.collect).to eql(result)
 
-    rdd = $sc.parallelize(data, 1, batch_size: 1).compact
+    rdd = $sc.parallelize(data, 1, ser).compact
     expect(rdd.collect).to eql(result)
   end
 
@@ -93,8 +96,10 @@ RSpec::describe "Spark::RDD" do
   let(:result) { data1.product(data2).map(&:to_s).sort }
 
   it "unbatched" do
-    rdd1 = $sc.parallelize(data1, 2, batch_size: 1)
-    rdd2 = $sc.parallelize(data2, 2, batch_size: 1)
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
+
+    rdd1 = $sc.parallelize(data1, 2, ser)
+    rdd2 = $sc.parallelize(data2, 2, ser)
 
     rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})
 
@@ -102,8 +107,11 @@ RSpec::describe "Spark::RDD" do
   end
 
   it "batched" do
-    rdd1 = $sc.parallelize(data1, 2, batch_size: rand(4..10))
-    rdd2 = $sc.parallelize(data2, 2, batch_size: rand(4..10))
+    ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
+    ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
+
+    rdd1 = $sc.parallelize(data1, 2, ser1)
+    rdd2 = $sc.parallelize(data2, 2, ser2)
 
     rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})
 
data/spec/lib/map_partitions_spec.rb CHANGED
@@ -1,18 +1,18 @@
-require "spec_helper"
+require 'spec_helper'
 
 def func3(x)
   x.map(&:to_i).reduce(:+)
 end
 
 def func4_with_index(data, index)
-  {
+  [{
     index => data.map(&:to_i).reduce(:*)
-  }
+  }]
 end
 
-RSpec::shared_examples "a map partitions" do |workers|
+RSpec.shared_examples 'a map partitions' do |workers|
   context "with #{workers || 'default'} worker" do
-    it "without index" do
+    it 'without index' do
       rdd2 = rdd(workers).map_partitions(func1)
      result = func1.call(numbers)
 
@@ -35,7 +35,7 @@ RSpec::shared_examples "a map partitions" do |workers|
       expect(rdd4.collect).to eql(rdd3.collect)
     end
 
-    it "with index" do
+    it 'with index' do
       rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index))
       result = rdd2.collect
 
@@ -52,7 +52,7 @@ RSpec::shared_examples "a map partitions" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD.map_partitions(_with_index)" do
+RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do
   let(:func1) { lambda{|x| x.map(&:to_i)} }
   let(:func2) {
     lambda{|x|
@@ -60,28 +60,28 @@ RSpec::describe "Spark::RDD.map_partitions(_with_index)" do
     }
   }
 
-  context "throught parallelize" do
+  context 'throught parallelize' do
     let(:numbers) { 0..1000 }
 
     def rdd(workers)
       $sc.parallelize(numbers, workers)
     end
 
-    it_behaves_like "a map partitions", nil
-    it_behaves_like "a map partitions", 1
-    it_behaves_like "a map partitions", rand(2..10)
+    it_behaves_like 'a map partitions', nil
+    it_behaves_like 'a map partitions', 1
+    it_behaves_like 'a map partitions', rand(2..10)
   end
 
-  context "throught text_file" do
-    let(:file) { File.join("spec", "inputs", "numbers_0_100.txt") }
+  context 'throught text_file' do
+    let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
     let(:numbers) { File.readlines(file).map(&:strip) }
 
     def rdd(workers)
       $sc.text_file(file, workers)
     end
 
-    it_behaves_like "a map partitions", nil
-    it_behaves_like "a map partitions", 1
-    it_behaves_like "a map partitions", rand(2..10)
+    it_behaves_like 'a map partitions', nil
+    it_behaves_like 'a map partitions', 1
+    it_behaves_like 'a map partitions', rand(2..10)
   end
 end
data/spec/lib/serializer_spec.rb CHANGED
@@ -1,13 +1,88 @@
-require "spec_helper"
+require 'spec_helper'
+require 'zlib'
 
-RSpec::describe Spark::Serializer do
-
-  it ".get" do
-    expect(described_class.get(nil)).to eql(nil)
-    expect(described_class.get("MARSHAL")).to eql(nil)
-    expect(described_class.get("Marshal")).to eql(described_class::Marshal)
-    expect(described_class.get("marshal")).to eql(described_class::Marshal)
-    expect(described_class.get("message_pack")).to eql(described_class::MessagePack)
+RSpec.describe Spark::Serializer do
+  let(:data) { [1, 'test', 2.0, [3], {key: 'value'}, :test, String] }
+
+  it 'find' do
+    expect(described_class.find('not_existed_class')).to eql(nil)
+
+    expect(described_class.find('Marshal')).to eq(described_class::Marshal)
+    expect(described_class.find('marshal')).to eq(described_class::Marshal)
+    expect(described_class.find(:marshal)).to eq(described_class::Marshal)
+    expect(described_class.find('batched')).to eq(described_class::Batched)
+  end
+
+  it 'find!' do
+    expect { expect(described_class.find!('not_existed_class')) }.to raise_error(Spark::SerializeError)
+    expect { expect(described_class.find!('marshal')) }.to_not raise_error
+    expect { expect(described_class.find!('batched')) }.to_not raise_error
+  end
+
+  it 'register' do
+    NewSerializer = Class.new
+
+    expect(described_class.find('new_serializer_1')).to eql(nil)
+    expect(described_class.find('new_serializer_2')).to eql(nil)
+    expect(described_class.find('new_serializer_3')).to eql(nil)
+
+    described_class.register('new_serializer_1', 'new_serializer_2', 'new_serializer_3', NewSerializer)
+
+    expect(described_class.find('new_serializer_1')).to eql(NewSerializer)
+    expect(described_class.find('new_serializer_2')).to eql(NewSerializer)
+    expect(described_class.find('new_serializer_3')).to eql(NewSerializer)
+  end
+
+  it '==' do
+    # One class
+    marshal1 = described_class::Marshal.new
+    marshal2 = described_class::Marshal.new
+
+    expect(marshal1).to eq(marshal1)
+    expect(marshal1).to eq(marshal2)
+
+    # Two classes
+    compressed1 = described_class::Compressed.new(marshal1)
+    compressed2 = described_class::Compressed.new(marshal2)
+
+    expect(compressed1).to eq(compressed1)
+    expect(compressed1).to eq(compressed2)
+
+    # Three classes
+    batched1 = described_class::Batched.new(compressed1, 1)
+    batched2 = described_class::Batched.new(compressed2, 1)
+    batched3 = described_class::Batched.new(compressed1, 2)
+
+    expect(batched1).to eq(batched2)
+    expect(batched1).to_not eq(batched3)
   end
 
+  context 'build' do
+    let(:marshal1) { described_class::Marshal.new }
+    let(:compressed1) { described_class::Compressed.new(marshal1) }
+    let(:batched1) { described_class::Batched.new(compressed1, 1) }
+
+    it 'block' do
+      expect(described_class.build{ marshal }).to eq(marshal1)
+      expect(described_class.build{ marshal }).to eq(described_class.build{ __marshal__ })
+      expect(described_class.build{ compressed(marshal) }).to eq(compressed1)
+      expect(described_class.build{ batched(compressed(marshal), 1) }).to eq(batched1)
+    end
+
+    it 'text' do
+      expect(described_class.build('marshal')).to eq(marshal1)
+      expect(described_class.build('compressed(marshal)')).to eq(compressed1)
+      expect(described_class.build('batched(compressed(marshal), 1)')).to eq(batched1)
+    end
+  end
+
+  it 'serialization' do
+    marshal1 = described_class.build{ marshal }
+    compressed1 = described_class.build{ compressed(marshal) }
+
+    expect(marshal1.dump(data)).to eq(Marshal.dump(data))
+    expect(compressed1.dump(data)).to eq(
+      Zlib::Deflate.deflate(Marshal.dump(data))
+    )
+  end
 end
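Taken together, the specs pin down the new composable design: serializers nest into a stack, the string and block forms of `build` are equivalent, and equality is structural. A closing round-trip sketch using only behaviour asserted above:

    require 'zlib'

    ser = Spark::Serializer.build('compressed(marshal)')
    ser == Spark::Serializer.build { compressed(marshal) }      # => true

    bytes = ser.dump(answer: 42)
    bytes == Zlib::Deflate.deflate(Marshal.dump(answer: 42))    # => true
    ser.load(bytes)                                             # => {:answer=>42}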