ruby-spark 1.0.0 → 1.1.0.1

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25

data/lib/spark/serializer.rb
@@ -1,24 +1,79 @@
 module Spark
+  ##
+  # Serializer
+  #
   module Serializer
-    autoload :Base, 'spark/serializer/base'
-    autoload :UTF8, 'spark/serializer/utf8'
-    autoload :Marshal, 'spark/serializer/marshal'
-    autoload :MessagePack, 'spark/serializer/message_pack'
-    autoload :Oj, 'spark/serializer/oj'
-    autoload :Pair, 'spark/serializer/pair'
-    autoload :Cartesian, 'spark/serializer/cartesian'

+    DEFAULT_COMPRESS = false
     DEFAULT_BATCH_SIZE = 1024
     DEFAULT_SERIALIZER_NAME = 'marshal'

-    def self.get(suggestion)
-      const_get(suggestion.to_s.camelize) rescue nil
+    @@registered = {}
+
+    # Register class and create method for quick access.
+    # Class will be available also as __name__ for using
+    # in build method (Proc binding problem).
+    #
+    # == Examples:
+    #   register('test1', 'test2', Class)
+    #
+    #   Spark::Serializer.test1
+    #   Spark::Serializer.test2
+    #
+    #   # Proc binding problem
+    #   build { marshal }     # => Spark::Serializer::Marshal
+    #
+    #   marshal = 1
+    #   build { marshal }     # => 1
+    #
+    #   build { __marshal__ } # => Spark::Serializer::Marshal
+    #
+    def self.register(*args)
+      klass = args.pop
+      args.each do |arg|
+        @@registered[arg] = klass
+        define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
+        define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
+      end
+    end
+
+    def self.find(name)
+      @@registered[name.to_s.downcase]
     end

-    def self.get!(suggestion)
-      const_get(suggestion.to_s.camelize)
-    rescue
-      raise Spark::NotImplemented, "Serializer #{suggestion.to_s.camelize} not exist."
+    def self.find!(name)
+      klass = find(name)
+
+      if klass.nil?
+        raise Spark::SerializeError, "Unknow serializer #{name}."
+      end
+
+      klass
     end
+
+    def self.build(text=nil, &block)
+      if block_given?
+        class_eval(&block)
+      else
+        class_eval(text.to_s)
+      end
+    end
+
   end
 end
+
+# Parent
+require 'spark/serializer/base'
+
+# Basic
+require 'spark/serializer/oj'
+require 'spark/serializer/marshal'
+require 'spark/serializer/message_pack'
+require 'spark/serializer/text'
+
+# Others
+require 'spark/serializer/batched'
+require 'spark/serializer/auto_batched'
+require 'spark/serializer/compressed'
+require 'spark/serializer/pair'
+require 'spark/serializer/cartesian'
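
The registry above replaces the old get/get! constant lookup. As a rough usage sketch (assuming the gem is loaded and that 'marshal' and 'batched' are registered names, as the requires above suggest):

    # Look up serializer classes by their registered names.
    Spark::Serializer.find('marshal')    # => Spark::Serializer::Marshal
    Spark::Serializer.find!('missing')   # raises Spark::SerializeError

    # register() also defines quick-access constructors on the module.
    plain = Spark::Serializer.marshal

    # build evaluates its block inside the Serializer module, so registered
    # names resolve to new serializer instances; the __name__ aliases avoid
    # clashes with local variables (the "Proc binding problem" noted above).
    chained = Spark::Serializer.build { batched(__marshal__, 1024) }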

data/lib/spark/serializer/auto_batched.rb (new file)
@@ -0,0 +1,59 @@
+module Spark
+  module Serializer
+    ##
+    # AutoBatched serializator
+    #
+    # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer.
+    #
+    class AutoBatched < Batched
+
+      MAX_RATIO = 10
+
+      def initialize(serializer, best_size=65536)
+        @serializer = serializer
+        @best_size = best_size.to_i
+
+        error('Batch size must be greater than 1') if @best_size < 2
+      end
+
+      def name
+        "AutoBatched(#{@best_size})"
+      end
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        # Only Array have .slice
+        data = data.to_a
+
+        index = 0
+        batch = 2
+        max = @best_size * MAX_RATIO
+
+        loop do
+          chunk = data.slice(index, batch)
+          if chunk.nil? || chunk.empty?
+            break
+          end
+
+          serialized = @serializer.dump(chunk)
+          io.write_string(serialized)
+
+          index += batch
+
+          size = serialized.bytesize
+          if size < @best_size
+            batch *= 2
+          elsif size > max && batch > 1
+            batch /= 2
+          end
+        end
+
+        io.flush
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
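
The sizing rule in dump_to_io is easiest to see in isolation. A standalone sketch (plain Ruby, not the gem's API) of how the item batch grows and shrinks around best_size:

    best_size = 65_536            # default target for one serialized chunk
    max       = best_size * 10    # best_size * MAX_RATIO
    batch     = 2                 # items per chunk, starts small

    # Pretend serialized chunk sizes (in bytes) for successive writes:
    [1_200, 30_000, 70_000, 900_000].each do |bytes|
      if bytes < best_size
        batch *= 2                # chunks are small -> pack more items
      elsif bytes > max && batch > 1
        batch /= 2                # chunks are huge -> back off
      end
      puts "chunk #{bytes} B -> next batch of #{batch} items"
    end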

data/lib/spark/serializer/base.rb
@@ -1,168 +1,61 @@
 module Spark
   module Serializer
-    # @abstract Parent for all type of serializers
+    # @abstract Parent for all serializers
     class Base

-      include Spark::Helper::Serialize
-      include Spark::Constant
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?

-      attr_reader :batch_size
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF

-      # Set default values
-      def initialize(batch_size=nil)
-        self.batch_size = batch_size
+          yield load(io.read(size))
+        end
       end

-      def ==(other)
-        self.class == other.class && self.batch_size == other.batch_size
-      end
+      def load_from_file(file, *args)
+        return to_enum(__callee__, file, *args) unless block_given?

-      # Set values given by user
-      def set(batch_size)
-        self.batch_size = batch_size unless batch_size.nil?
-        self
-      end
+        load_from_io(file, *args).each do |item|
+          yield item
+        end

-      def batch_size=(size)
-        @batch_size = size.to_i
+        file.close
+        file.unlink
       end

-      def unbatch!
-        self.batch_size = 1
+      def ==(other)
+        self.to_s == other.to_s
       end

-      # nil, 0, 1 are considered as non-batched
       def batched?
-        batch_size > 1
-      end
-
-      # ===========================================================================
-      # Load
-
-      # Load and deserialize an Array from IO, Array of Java iterator
-      #   mri:   respond_to?(:iterator) => false
-      #   jruby: respond_to?(:iterator) => true
-      #
-      def load(source)
-        # Tempfile is Delegator for File so it is not IO
-        # second wasy is __getobj__.is_a?(IO)
-        if source.is_a?(IO) || source.is_a?(Tempfile)
-          load_from_io(source)
-        # elsif source.is_a?(Array)
-        #   load_from_array(source)
-        elsif try(source, :iterator)
-          load_from_iterator(source.iterator)
-        end
+        false
       end

-      # Load data from IO. Data must have a format:
-      #
-      #   +------------+--------+
-      #   | signed int |  data  |
-      #   |     4B     |        |
-      #   +------------+--------+
-      #
-      def load_from_io(io)
-        return to_enum(__callee__, io) unless block_given?
-
-        loop do
-          lenght = read_int(io)
-          break if lenght == DATA_EOF
-
-          result = load_next_from_io(io, lenght)
-          if batched? && result.respond_to?(:each)
-            result.each {|item| yield item }
-          else
-            yield result
-          end
-        end # loop
-      end # load_from_io
-
-      def load_next_from_io(io, lenght)
-        deserialize(io.read(lenght))
+      def unbatch!
       end

-      # Load from Java iterator by calling hasNext and next
-      #
-      def load_from_iterator(iterator)
-        result = []
-        while iterator.hasNext
-          item = iterator.next
-
-          # mri:   data are String
-          # jruby: data are bytes Array
-
-          if item.is_a?(String)
-            # Serialized data
-            result << deserialize(item)
-          else
-            # Java object
-            if try(item, :getClass)
-              case item.getClass.name
-              when '[B'
-                # Array of bytes
-                result << deserialize(pack_unsigned_chars(item.to_a))
-              when 'scala.Tuple2'
-                # Tuple2
-                result << deserialize(item._1, item._2)
-              end
-            end
-          end
-
+      def check_each(data)
+        unless data.respond_to?(:each)
+          error('Data must be iterable.')
         end
-
-        result.flatten!(1) if batched?
-        result
       end

-      def read_int(io)
-        bytes = io.read(4)
-        return DATA_EOF if bytes.nil?
-        unpack_int(bytes)
+      def error(message)
+        raise Spark::SerializeError, message
       end

-      # ===========================================================================
-      # Dump
-
-      # Serialize and send data into IO. Check 'load_from_io' for data format.
-      def dump(data, io)
-        if !data.is_a?(Array) && !data.is_a?(Enumerator)
-          data = [data]
-        end
-        data = data.each_slice(batch_size) if batched?
-
-        data.each do |item|
-          serialized = serialize(item)
-
-          # Size and data can have different encoding
-          #   Marshal: both ASCII
-          #   Oj:      ASCII and UTF-8
-          io.write(pack_int(serialized.bytesize))
-          io.write(serialized)
-        end
-
-        io.flush
+      def name
+        self.class.name.split('::').last
       end

-      # For direct serialization
-      def dump_to_java(data)
-        data.map! do |item|
-          serialize(item).to_java_bytes
-        end
+      def to_s
+        name
       end

-      # Rescue cannot be defined
-      #
-      #   mri   => RuntimeError
-      #   jruby => NoMethodError
-      #
-      def try(object, method)
-        begin
-          object.__send__(method)
-          return true
-        rescue
-          return false
-        end
+      def inspect
+        %{#<Spark::Serializer:0x#{object_id} "#{self}">}
       end

     end
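
Under the slimmed-down contract, Base only expects a subclass to provide dump and load; framing (load_from_io), file handling, equality, naming and error helpers are inherited. A hypothetical serializer (illustrative only, not part of the gem) could look like:

    require 'json'

    module Spark
      module Serializer
        # Illustrative JSON-backed serializer built on the new Base contract.
        class JsonExample < Base
          def dump(data)
            JSON.generate(data)
          end

          def load(data)
            JSON.parse(data)
          end
        end
      end
    end

    # Registering it makes it usable via find/build like the built-in ones.
    Spark::Serializer.register('json_example', Spark::Serializer::JsonExample)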

data/lib/spark/serializer/batched.rb (new file)
@@ -0,0 +1,84 @@
+module Spark
+  module Serializer
+    class Batched < Base
+
+      attr_writer :serializer
+
+      def initialize(serializer, batch_size=nil)
+        batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
+
+        @serializer = serializer
+        @batch_size = batch_size.to_i
+
+        error('Batch size must be greater than 0') if @batch_size < 1
+      end
+
+      # Really batched
+      def batched?
+        @batch_size > 1
+      end
+
+      def unbatch!
+        @batch_size = 1
+      end
+
+      def load(data)
+        @serializer.load(data)
+      end
+
+      def dump(data)
+        @serializer.dump(data)
+      end
+
+      def name
+        "Batched(#{@batch_size})"
+      end
+
+      def to_s
+        "#{name} -> #{@serializer}"
+      end
+
+
+      # === Dump ==============================================================
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        if batched?
+          data = data.each_slice(@batch_size)
+        end
+
+        data.each do |item|
+          serialized = dump(item)
+          io.write_string(serialized)
+        end
+
+        io.flush
+      end
+
+
+      # === Load ==============================================================
+
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          data = io.read(size)
+          data = load(data)
+
+          if batched?
+            data.each{|item| yield item }
+          else
+            yield data
+          end
+        end
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('batched', Spark::Serializer::Batched)
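
A rough usage sketch of the wrapper (method names taken from the code above; the length-prefixed IO helpers write_string and read_int_or_eof come from lib/spark/ext/io.rb, which is also changed in this release):

    plain   = Spark::Serializer.marshal
    batched = Spark::Serializer.batched(plain, 512)

    batched.batched?   # => true, batch_size > 1
    batched.to_s       # => "Batched(512) -> Marshal"

    batched.unbatch!   # batch_size becomes 1
    batched.batched?   # => false: each item gets its own length-prefixed frame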

data/lib/spark/serializer/cartesian.rb
@@ -1,37 +1,13 @@
 module Spark
   module Serializer
-    class Cartesian < Base
+    class Cartesian < Pair

-      attr_reader :first, :second
-
-      def set(first, second)
-        @first = first
-        @second = second
-        self
-      end
-
-      # Little hack
-      # Data does not have to be batched but items are added by <<
-      def batched?
-        true
-      end
-
-      def load_next_from_io(io, lenght)
-        item1 = io.read(lenght)
-        item2 = io.read_string
-        deserialize(item1, item2)
-      end
-
-      def deserialize(item1, item2)
-        deserialized_item1 = @first.deserialize(item1)
-        deserialized_item2 = @second.deserialize(item2)
-
-        deserialized_item1 = [deserialized_item1] unless @first.batched?
-        deserialized_item2 = [deserialized_item2] unless @second.batched?
-
-        deserialized_item1.product(deserialized_item2)
+      def aggregate(item1, item2)
+        item1.product(item2)
       end

     end
   end
 end
+
+Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
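
With the framing logic moved into Pair (whose diff is not shown here), Cartesian only decides how two decoded batches are combined. Assuming Pair is constructed with the two wrapped serializers, a sketch:

    # The combination rule itself is just Array#product:
    [1, 2].product([:a, :b])   # => [[1, :a], [1, :b], [2, :a], [2, :b]]

    # Hypothetical construction of a serializer for a cartesian of two RDDs:
    ser = Spark::Serializer.build do
      cartesian(__batched__(__marshal__), __batched__(__marshal__))
    end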