ruby-spark 1.0.0 → 1.1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/serializer/compressed.rb ADDED
@@ -0,0 +1,27 @@
+module Spark
+  module Serializer
+    class Compressed < Base
+
+      def initialize(serializer)
+        @serializer = serializer
+      end
+
+      def dump(data)
+        Zlib::Deflate.deflate(@serializer.dump(data))
+      end
+
+      def load(data)
+        @serializer.load(Zlib::Inflate.inflate(data))
+      end
+
+    end
+  end
+end
+
+begin
+  # TODO: require only if it is necessary
+  require 'zlib'
+
+  Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed)
+rescue LoadError
+end
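Note: the new Compressed serializer is a pure decorator — it defers to whatever serializer it wraps and only adds a Zlib deflate/inflate pass around the payload. A minimal round-trip sketch, assuming the gem is already loaded and using the 'compressed(marshal)' build form that the serializer specs at the end of this diff exercise:

    require 'zlib'

    ser = Spark::Serializer.build { compressed(marshal) }

    packed = ser.dump([1, 2, 3])   # Marshal.dump, then Zlib::Deflate.deflate
    ser.load(packed)               # => [1, 2, 3]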
data/lib/spark/serializer/marshal.rb CHANGED
@@ -2,18 +2,16 @@ module Spark
   module Serializer
     class Marshal < Base
 
-      def name
-        'marshal'
+      def dump(data)
+        ::Marshal.dump(data)
       end
 
-      def serialize(data)
-        ::Marshal::dump(data)
-      end
-
-      def deserialize(data)
-        ::Marshal::load(data)
+      def load(data)
+        ::Marshal.load(data)
       end
 
     end
   end
 end
+
+Spark::Serializer.register('marshal', Spark::Serializer::Marshal)
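With 1.1.0, each serializer file registers its class under one or more aliases instead of exposing a `name` method, and lookup goes through `Spark::Serializer.find`/`find!`, as the updated serializer specs at the end of this diff confirm. A quick sketch of the lookup behaviour:

    Spark::Serializer.find('marshal')        # => Spark::Serializer::Marshal
    Spark::Serializer.find(:marshal)         # symbols are accepted too
    Spark::Serializer.find('no_such_name')   # => nil
    Spark::Serializer.find!('no_such_name')  # raises Spark::SerializeError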
data/lib/spark/serializer/message_pack.rb CHANGED
@@ -1,17 +1,13 @@
 module Spark
   module Serializer
-    class MessagePack < Marshal
+    class MessagePack < Base
 
-      def name
-        'message_pack'
+      def dump(data)
+        ::MessagePack.dump(data)
       end
 
-      def self.serialize(data)
-        ::MessagePack::dump(data)
-      end
-
-      def self.deserialize(data)
-        ::MessagePack::load(data)
+      def load(data)
+        ::MessagePack.load(data)
       end
 
     end
@@ -19,7 +15,9 @@ module Spark
 end
 
 begin
+  # TODO: require only if it is necessary
   require 'msgpack'
+
+  Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack)
 rescue LoadError
-  Spark::Serializer::MessagePack = Spark::Serializer::Marshal
 end
data/lib/spark/serializer/oj.rb CHANGED
@@ -1,17 +1,13 @@
 module Spark
   module Serializer
-    class Oj < Marshal
+    class Oj < Base
 
-      def name
-        'oj'
+      def dump(data)
+        ::Oj.dump(data)
       end
 
-      def serialize(data)
-        ::Oj::dump(data)
-      end
-
-      def deserialize(data)
-        ::Oj::load(data)
+      def load(data)
+        ::Oj.load(data)
       end
 
     end
@@ -19,7 +15,9 @@ module Spark
 end
 
 begin
+  # TODO: require only if it is necessary
   require 'oj'
+
+  Spark::Serializer.register('oj', Spark::Serializer::Oj)
 rescue LoadError
-  Spark::Serializer::Oj = Spark::Serializer::Marshal
 end
data/lib/spark/serializer/pair.rb CHANGED
@@ -2,26 +2,40 @@ module Spark
   module Serializer
     class Pair < Base
 
-      attr_reader :first, :second
+      def initialize(serializer1, serializer2)
+        @serializer1 = serializer1
+        @serializer2 = serializer2
+      end
 
-      def set(first, second)
-        unbatch!
-        @first = first
-        @second = second
-        self
+      def to_s
+        "#{name}(#{@serializer1}, #{@serializer2})"
       end
 
-      def batched?
-        false
+      def aggregate(item1, item2)
+        item1.zip(item2)
       end
 
-      def load_next_from_io(io, lenght)
-        key_value = []
-        key_value << @first.load_next_from_io(io, lenght)
-        key_value << @second.load_next_from_io(io, read_int(io))
-        key_value
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          item1 = @serializer1.load(io.read(size))
+          item2 = @serializer2.load(io.read_string)
+
+          item1 = [item1] unless @serializer1.batched?
+          item2 = [item2] unless @serializer2.batched?
+
+          aggregate(item1, item2).each do |item|
+            yield item
+          end
+        end
       end
 
     end
   end
 end
+
+Spark::Serializer.register('pair', Spark::Serializer::Pair)
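Pair now takes its two inner serializers in the constructor and zips their decoded batches together; a single unbatched item is first wrapped in an array so `zip` still pairs values positionally. The aggregation step in plain Ruby, with hypothetical data:

    item1 = [:a, :b]     # batch decoded by @serializer1
    item2 = [1, 2]       # batch decoded by @serializer2
    item1.zip(item2)     # => [[:a, 1], [:b, 2]]

    # An unbatched serializer yields one item, so it is wrapped first:
    [:a].zip([1])        # => [[:a, 1]]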
data/lib/spark/serializer/text.rb ADDED
@@ -0,0 +1,25 @@
+module Spark
+  module Serializer
+    class Text < Base
+
+      attr_reader :encoding
+
+      def initialize(encoding=Encoding::UTF_8)
+        error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding)
+
+        @encoding = encoding
+      end
+
+      def load(data)
+        data.to_s.force_encoding(@encoding)
+      end
+
+      def to_s
+        "Text(#{@encoding})"
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('string', 'text', Spark::Serializer::Text)
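Text is deliberately load-only: it takes raw bytes and tags them with the configured encoding. A short sketch (the byte string is a made-up example):

    ser = Spark::Serializer::Text.new      # defaults to Encoding::UTF_8
    line = ser.load("caf\xC3\xA9".b)       # binary input
    line.encoding                          # => #<Encoding:UTF-8>
    line                                   # => "café"

    Spark::Serializer::Text.new('utf-8')   # rejected: a String is not an Encoding instance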
data/lib/spark/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Spark
-  VERSION = '1.0.0'
+  VERSION = '1.1.0.1'
 end
data/lib/spark/worker/worker.rb CHANGED
@@ -73,13 +73,16 @@ module Worker
     @command = socket.read_data
 
     # Load iterator
-    @iterator = @command.deserializer.load(socket).lazy
+    @iterator = @command.deserializer.load_from_io(socket).lazy
 
     # Compute
     @iterator = @command.execute(@iterator, @split_index)
 
+    # Result is not iterable
+    @iterator = [@iterator] unless @iterator.respond_to?(:each)
+
     # Send result
-    @command.serializer.dump(@iterator, socket)
+    @command.serializer.dump_to_io(@iterator, socket)
   end
 
   def send_error(e)
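The new guard handles commands whose result is a single value rather than a collection: anything that does not respond to `each` is wrapped in a one-element array before being streamed back. In plain Ruby:

    result = 42
    result = [result] unless result.respond_to?(:each)
    result   # => [42]

    result = [1, 2, 3]
    result = [result] unless result.respond_to?(:each)
    result   # => [1, 2, 3] (already iterable, left as-is)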
data/ruby-spark.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.version = Spark::VERSION
   spec.authors = ['Ondřej Moravčík']
   spec.email = ['moravcik.ondrej@gmail.com']
-  spec.summary = %q{Ruby wrapper for Spark}
+  spec.summary = %q{Ruby wrapper for Apache Spark}
   spec.description = %q{}
   spec.homepage = ''
   spec.license = 'MIT'
@@ -31,6 +31,18 @@ Gem::Specification.new do |spec|
 
   spec.requirements << 'java, scala'
 
+  spec.add_dependency 'sourcify', '0.6.0.rc4'
+  spec.add_dependency 'method_source'
+  spec.add_dependency 'commander'
+  spec.add_dependency 'pry'
+  spec.add_dependency 'nio4r'
+  spec.add_dependency 'distribution'
+
+  if RUBY_PLATFORM =~ /java/
+  else
+    spec.add_dependency 'rjb'
+  end
+
   spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'rake'
 end
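The `rjb` Java bridge is only needed on MRI; JRuby already runs on the JVM, so the dependency is skipped when `RUBY_PLATFORM` matches /java/. The empty then-branch could be written more compactly (a style note, not part of the gem):

    spec.add_dependency 'rjb' unless RUBY_PLATFORM =~ /java/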
data/spec/lib/context_spec.rb CHANGED
@@ -7,7 +7,9 @@ RSpec.describe Spark::Context do
     numbers = (0...100).to_a
     func = lambda{|part| part.size}
 
-    rdd = $sc.parallelize(numbers, workers, batch_size: 1)
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
+
+    rdd = $sc.parallelize(numbers, workers, ser)
 
     rdd_result = $sc.run_job(rdd, func)
     result = numbers.each_slice(numbers.size/workers).map(&func)
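This spec change shows the migration pattern repeated throughout the suite: the old `batch_size:` and `serializer:` options to `parallelize` are replaced by an explicit serializer object built with `Spark::Serializer.build`. Side by side, using only calls visible in this diff:

    # 1.0.0
    rdd = $sc.parallelize(numbers, workers, batch_size: 1)
    rdd = $sc.parallelize(numbers, 1, serializer: 'oj')

    # 1.1.0.1
    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
    rdd = $sc.parallelize(numbers, workers, ser)
    rdd = $sc.parallelize(numbers, 1, Spark::Serializer.build { __batched__(__oj__) })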
data/spec/lib/manipulation_spec.rb CHANGED
@@ -8,7 +8,9 @@ RSpec::describe "Spark::RDD" do
     rdd = $sc.parallelize(numbers, 1).glom
     expect(rdd.collect).to eql([numbers.to_a])
 
-    rdd = $sc.parallelize(numbers, 5, batch_size: 1).glom
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
+
+    rdd = $sc.parallelize(numbers, 5, ser).glom
     expect(rdd.collect).to eql(numbers.each_slice(20).to_a)
   end
 
@@ -42,9 +44,9 @@ RSpec::describe "Spark::RDD" do
   end
 
   it "with a different serializer" do
-    rdd1 = $sc.parallelize(numbers, 1, serializer: "marshal")
-    rdd2 = $sc.parallelize(numbers, 1, serializer: "oj")
-
+    rdd1 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__marshal__) })
+    rdd2 = $sc.parallelize(numbers, 1, Spark::Serializer.build{ __batched__(__oj__) })
+
     expect { rdd1.union(rdd2).collect }.to_not raise_error
   end
 
@@ -59,14 +61,15 @@ RSpec::describe "Spark::RDD" do
   it ".compact" do
     data = [nil, nil , 0, 0, 1, 2, nil, 6]
     result = data.compact
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
 
    rdd = $sc.parallelize(data, 1).compact
    expect(rdd.collect).to eql(result)
 
-    rdd = $sc.parallelize(data, 5, batch_size: 1).compact
+    rdd = $sc.parallelize(data, 5, ser).compact
    expect(rdd.collect).to eql(result)
 
-    rdd = $sc.parallelize(data, 1, batch_size: 1).compact
+    rdd = $sc.parallelize(data, 1, ser).compact
    expect(rdd.collect).to eql(result)
   end
 
@@ -93,8 +96,10 @@ RSpec::describe "Spark::RDD" do
   let(:result) { data1.product(data2).map(&:to_s).sort }
 
   it "unbatched" do
-    rdd1 = $sc.parallelize(data1, 2, batch_size: 1)
-    rdd2 = $sc.parallelize(data2, 2, batch_size: 1)
+    ser = Spark::Serializer.build { __batched__(__marshal__, 1) }
+
+    rdd1 = $sc.parallelize(data1, 2, ser)
+    rdd2 = $sc.parallelize(data2, 2, ser)
 
     rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})
 
@@ -102,8 +107,11 @@ RSpec::describe "Spark::RDD" do
   end
 
   it "batched" do
-    rdd1 = $sc.parallelize(data1, 2, batch_size: rand(4..10))
-    rdd2 = $sc.parallelize(data2, 2, batch_size: rand(4..10))
+    ser1 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
+    ser2 = Spark::Serializer.build { __batched__(__marshal__, rand(4..10)) }
+
+    rdd1 = $sc.parallelize(data1, 2, ser1)
+    rdd2 = $sc.parallelize(data2, 2, ser2)
 
     rdd = rdd1.cartesian(rdd2).map(lambda{|x| x.to_s})
 
data/spec/lib/map_partitions_spec.rb CHANGED
@@ -1,18 +1,18 @@
-require "spec_helper"
+require 'spec_helper'
 
 def func3(x)
   x.map(&:to_i).reduce(:+)
 end
 
 def func4_with_index(data, index)
-  {
+  [{
     index => data.map(&:to_i).reduce(:*)
-  }
+  }]
 end
 
-RSpec::shared_examples "a map partitions" do |workers|
+RSpec.shared_examples 'a map partitions' do |workers|
   context "with #{workers || 'default'} worker" do
-    it "without index" do
+    it 'without index' do
       rdd2 = rdd(workers).map_partitions(func1)
       result = func1.call(numbers)
 
@@ -35,7 +35,7 @@ RSpec::shared_examples "a map partitions" do |workers|
       expect(rdd4.collect).to eql(rdd3.collect)
     end
 
-    it "with index" do
+    it 'with index' do
       rdd2 = rdd(workers).map_partitions_with_index(method(:func4_with_index))
       result = rdd2.collect
 
@@ -52,7 +52,7 @@ RSpec::shared_examples "a map partitions" do |workers|
   end
 end
 
-RSpec::describe "Spark::RDD.map_partitions(_with_index)" do
+RSpec::describe 'Spark::RDD.map_partitions(_with_index)' do
   let(:func1) { lambda{|x| x.map(&:to_i)} }
   let(:func2) {
     lambda{|x|
@@ -60,28 +60,28 @@ RSpec::describe "Spark::RDD.map_partitions(_with_index)" do
     }
   }
 
-  context "throught parallelize" do
+  context 'throught parallelize' do
     let(:numbers) { 0..1000 }
 
     def rdd(workers)
       $sc.parallelize(numbers, workers)
     end
 
-    it_behaves_like "a map partitions", nil
-    it_behaves_like "a map partitions", 1
-    it_behaves_like "a map partitions", rand(2..10)
+    it_behaves_like 'a map partitions', nil
+    it_behaves_like 'a map partitions', 1
+    it_behaves_like 'a map partitions', rand(2..10)
   end
 
-  context "throught text_file" do
-    let(:file) { File.join("spec", "inputs", "numbers_0_100.txt") }
+  context 'throught text_file' do
+    let(:file) { File.join('spec', 'inputs', 'numbers_0_100.txt') }
     let(:numbers) { File.readlines(file).map(&:strip) }
 
     def rdd(workers)
      $sc.text_file(file, workers)
    end
 
-    it_behaves_like "a map partitions", nil
-    it_behaves_like "a map partitions", 1
-    it_behaves_like "a map partitions", rand(2..10)
+    it_behaves_like 'a map partitions', nil
+    it_behaves_like 'a map partitions', 1
+    it_behaves_like 'a map partitions', rand(2..10)
   end
 end
data/spec/lib/serializer_spec.rb CHANGED
@@ -1,13 +1,88 @@
-require "spec_helper"
+require 'spec_helper'
+require 'zlib'
 
-RSpec::describe Spark::Serializer do
-
-  it ".get" do
-    expect(described_class.get(nil)).to eql(nil)
-    expect(described_class.get("MARSHAL")).to eql(nil)
-    expect(described_class.get("Marshal")).to eql(described_class::Marshal)
-    expect(described_class.get("marshal")).to eql(described_class::Marshal)
-    expect(described_class.get("message_pack")).to eql(described_class::MessagePack)
+RSpec.describe Spark::Serializer do
+  let(:data) { [1, 'test', 2.0, [3], {key: 'value'}, :test, String] }
+
+  it 'find' do
+    expect(described_class.find('not_existed_class')).to eql(nil)
+
+    expect(described_class.find('Marshal')).to eq(described_class::Marshal)
+    expect(described_class.find('marshal')).to eq(described_class::Marshal)
+    expect(described_class.find(:marshal)).to eq(described_class::Marshal)
+    expect(described_class.find('batched')).to eq(described_class::Batched)
+  end
+
+  it 'find!' do
+    expect { expect(described_class.find!('not_existed_class')) }.to raise_error(Spark::SerializeError)
+    expect { expect(described_class.find!('marshal')) }.to_not raise_error
+    expect { expect(described_class.find!('batched')) }.to_not raise_error
+  end
+
+  it 'register' do
+    NewSerializer = Class.new
+
+    expect(described_class.find('new_serializer_1')).to eql(nil)
+    expect(described_class.find('new_serializer_2')).to eql(nil)
+    expect(described_class.find('new_serializer_3')).to eql(nil)
+
+    described_class.register('new_serializer_1', 'new_serializer_2', 'new_serializer_3', NewSerializer)
+
+    expect(described_class.find('new_serializer_1')).to eql(NewSerializer)
+    expect(described_class.find('new_serializer_2')).to eql(NewSerializer)
+    expect(described_class.find('new_serializer_3')).to eql(NewSerializer)
+  end
+
+  it '==' do
+    # One class
+    marshal1 = described_class::Marshal.new
+    marshal2 = described_class::Marshal.new
+
+    expect(marshal1).to eq(marshal1)
+    expect(marshal1).to eq(marshal2)
+
+    # Two classes
+    compressed1 = described_class::Compressed.new(marshal1)
+    compressed2 = described_class::Compressed.new(marshal2)
+
+    expect(compressed1).to eq(compressed1)
+    expect(compressed1).to eq(compressed2)
+
+    # Three classes
+    batched1 = described_class::Batched.new(compressed1, 1)
+    batched2 = described_class::Batched.new(compressed2, 1)
+    batched3 = described_class::Batched.new(compressed1, 2)
+
+    expect(batched1).to eq(batched2)
+    expect(batched1).to_not eq(batched3)
   end
 
+  context 'build' do
+    let(:marshal1) { described_class::Marshal.new }
+    let(:compressed1) { described_class::Compressed.new(marshal1) }
+    let(:batched1) { described_class::Batched.new(compressed1, 1) }
+
+    it 'block' do
+      expect(described_class.build{ marshal }).to eq(marshal1)
+      expect(described_class.build{ marshal }).to eq(described_class.build{ __marshal__ })
+      expect(described_class.build{ compressed(marshal) }).to eq(compressed1)
+      expect(described_class.build{ batched(compressed(marshal), 1) }).to eq(batched1)
+    end
+
+    it 'text' do
+      expect(described_class.build('marshal')).to eq(marshal1)
+      expect(described_class.build('compressed(marshal)')).to eq(compressed1)
+      expect(described_class.build('batched(compressed(marshal), 1)')).to eq(batched1)
+    end
+  end
+
+  it 'serialization' do
+    marshal1 = described_class.build{ marshal }
+    compressed1 = described_class.build{ compressed(marshal) }
+
+    expect(marshal1.dump(data)).to eq(Marshal.dump(data))
+    expect(compressed1.dump(data)).to eq(
+      Zlib::Deflate.deflate(Marshal.dump(data))
+    )
+  end
 end