ruby-spark 1.0.0 → 1.1.0.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/serializer.rb
@@ -1,24 +1,79 @@
 module Spark
+  ##
+  # Serializer
+  #
   module Serializer
-    autoload :Base, 'spark/serializer/base'
-    autoload :UTF8, 'spark/serializer/utf8'
-    autoload :Marshal, 'spark/serializer/marshal'
-    autoload :MessagePack, 'spark/serializer/message_pack'
-    autoload :Oj, 'spark/serializer/oj'
-    autoload :Pair, 'spark/serializer/pair'
-    autoload :Cartesian, 'spark/serializer/cartesian'
 
+    DEFAULT_COMPRESS = false
     DEFAULT_BATCH_SIZE = 1024
     DEFAULT_SERIALIZER_NAME = 'marshal'
 
-    def self.get(suggestion)
-      const_get(suggestion.to_s.camelize) rescue nil
+    @@registered = {}
+
+    # Register class and create method for quick access.
+    # Class will be available also as __name__ for using
+    # in build method (Proc binding problem).
+    #
+    # == Examples:
+    #   register('test1', 'test2', Class)
+    #
+    #   Spark::Serializer.test1
+    #   Spark::Serializer.test2
+    #
+    #   # Proc binding problem
+    #   build { marshal } # => Spark::Serializer::Marshal
+    #
+    #   marshal = 1
+    #   build { marshal } # => 1
+    #
+    #   build { __marshal__ } # => Spark::Serializer::Marshal
+    #
+    def self.register(*args)
+      klass = args.pop
+      args.each do |arg|
+        @@registered[arg] = klass
+        define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
+        define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
+      end
+    end
+
+    def self.find(name)
+      @@registered[name.to_s.downcase]
     end
 
-    def self.get!(suggestion)
-      const_get(suggestion.to_s.camelize)
-    rescue
-      raise Spark::NotImplemented, "Serializer #{suggestion.to_s.camelize} not exist."
+    def self.find!(name)
+      klass = find(name)
+
+      if klass.nil?
+        raise Spark::SerializeError, "Unknow serializer #{name}."
+      end
+
+      klass
     end
+
+    def self.build(text=nil, &block)
+      if block_given?
+        class_eval(&block)
+      else
+        class_eval(text.to_s)
+      end
+    end
+
   end
 end
+
+# Parent
+require 'spark/serializer/base'
+
+# Basic
+require 'spark/serializer/oj'
+require 'spark/serializer/marshal'
+require 'spark/serializer/message_pack'
+require 'spark/serializer/text'
+
+# Others
+require 'spark/serializer/batched'
+require 'spark/serializer/auto_batched'
+require 'spark/serializer/compressed'
+require 'spark/serializer/pair'
+require 'spark/serializer/cartesian'
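
The autoload-plus-const_get lookup (get/get!) is replaced by an explicit registry: register stores a class under one or more names, defines shortcut singleton methods (plus __name__ aliases that dodge local-variable capture inside blocks), and build evaluates a block or string inside the module. A rough usage sketch based on the doc comment in the hunk above; it assumes marshal.rb registers itself under 'marshal', as the other serializers in this diff do:

    # Lookup and shortcut methods generated by register()
    Spark::Serializer.find('marshal')    # => Spark::Serializer::Marshal (the class)
    Spark::Serializer.find!('missing')   # raises Spark::SerializeError
    Spark::Serializer.marshal            # => a new Marshal serializer instance

    # build evaluates the block inside Spark::Serializer, so registered names
    # resolve to the shortcuts above; the __name__ aliases avoid clashes with
    # local variables captured by the block's binding:
    marshal = 1
    Spark::Serializer.build { marshal }                      # => 1 (the local wins)
    Spark::Serializer.build { __marshal__ }                  # => Marshal serializer instance
    Spark::Serializer.build { batched(__marshal__, 1024) }   # => Batched(1024) -> Marshal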
data/lib/spark/serializer/auto_batched.rb
@@ -0,0 +1,59 @@
+module Spark
+  module Serializer
+    ##
+    # AutoBatched serializator
+    #
+    # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer.
+    #
+    class AutoBatched < Batched
+
+      MAX_RATIO = 10
+
+      def initialize(serializer, best_size=65536)
+        @serializer = serializer
+        @best_size = best_size.to_i
+
+        error('Batch size must be greater than 1') if @best_size < 2
+      end
+
+      def name
+        "AutoBatched(#{@best_size})"
+      end
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        # Only Array have .slice
+        data = data.to_a
+
+        index = 0
+        batch = 2
+        max = @best_size * MAX_RATIO
+
+        loop do
+          chunk = data.slice(index, batch)
+          if chunk.nil? || chunk.empty?
+            break
+          end
+
+          serialized = @serializer.dump(chunk)
+          io.write_string(serialized)
+
+          index += batch
+
+          size = serialized.bytesize
+          if size < @best_size
+            batch *= 2
+          elsif size > max && batch > 1
+            batch /= 2
+          end
+        end
+
+        io.flush
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
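
AutoBatched starts with two items per chunk and adapts as it writes: the chunk doubles while the serialized output stays under best_size (64 KiB by default) and halves once a chunk exceeds best_size * MAX_RATIO. A hedged usage sketch; the no-argument Marshal constructor is assumed from the file list rather than shown in this hunk:

    # Wrap a concrete serializer so the batch size adapts to the data
    ser = Spark::Serializer::AutoBatched.new(Spark::Serializer::Marshal.new)
    ser.name   # => "AutoBatched(65536)"

    # Or through the shortcuts created by register()
    ser = Spark::Serializer.auto_batched(Spark::Serializer.marshal, 128 * 1024)
    ser.name   # => "AutoBatched(131072)"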
data/lib/spark/serializer/base.rb
@@ -1,168 +1,61 @@
 module Spark
   module Serializer
-    # @abstract Parent for all type of serializers
+    # @abstract Parent for all serializers
     class Base
 
-      include Spark::Helper::Serialize
-      include Spark::Constant
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
 
-      attr_reader :batch_size
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
 
-      # Set default values
-      def initialize(batch_size=nil)
-        self.batch_size = batch_size
+          yield load(io.read(size))
+        end
       end
 
-      def ==(other)
-        self.class == other.class && self.batch_size == other.batch_size
-      end
+      def load_from_file(file, *args)
+        return to_enum(__callee__, file, *args) unless block_given?
 
-      # Set values given by user
-      def set(batch_size)
-        self.batch_size = batch_size unless batch_size.nil?
-        self
-      end
+        load_from_io(file, *args).each do |item|
+          yield item
+        end
 
-      def batch_size=(size)
-        @batch_size = size.to_i
+        file.close
+        file.unlink
       end
 
-      def unbatch!
-        self.batch_size = 1
+      def ==(other)
+        self.to_s == other.to_s
       end
 
-      # nil, 0, 1 are considered as non-batched
       def batched?
-        batch_size > 1
-      end
-
-      # ===========================================================================
-      # Load
-
-      # Load and deserialize an Array from IO, Array of Java iterator
-      #   mri: respond_to?(:iterator) => false
-      #   jruby: respond_to?(:iterator) => true
-      #
-      def load(source)
-        # Tempfile is Delegator for File so it is not IO
-        # second wasy is __getobj__.is_a?(IO)
-        if source.is_a?(IO) || source.is_a?(Tempfile)
-          load_from_io(source)
-        # elsif source.is_a?(Array)
-        #   load_from_array(source)
-        elsif try(source, :iterator)
-          load_from_iterator(source.iterator)
-        end
+        false
       end
 
-      # Load data from IO. Data must have a format:
-      #
-      #   +------------+--------+
-      #   | signed int |  data  |
-      #   |     4B     |        |
-      #   +------------+--------+
-      #
-      def load_from_io(io)
-        return to_enum(__callee__, io) unless block_given?
-
-        loop do
-          lenght = read_int(io)
-          break if lenght == DATA_EOF
-
-          result = load_next_from_io(io, lenght)
-          if batched? && result.respond_to?(:each)
-            result.each {|item| yield item }
-          else
-            yield result
-          end
-        end # loop
-      end # load_from_io
-
-      def load_next_from_io(io, lenght)
-        deserialize(io.read(lenght))
+      def unbatch!
       end
 
-      # Load from Java iterator by calling hasNext and next
-      #
-      def load_from_iterator(iterator)
-        result = []
-        while iterator.hasNext
-          item = iterator.next
-
-          # mri: data are String
-          # jruby: data are bytes Array
-
-          if item.is_a?(String)
-            # Serialized data
-            result << deserialize(item)
-          else
-            # Java object
-            if try(item, :getClass)
-              case item.getClass.name
-              when '[B'
-                # Array of bytes
-                result << deserialize(pack_unsigned_chars(item.to_a))
-              when 'scala.Tuple2'
-                # Tuple2
-                result << deserialize(item._1, item._2)
-              end
-            end
-          end
-
+      def check_each(data)
+        unless data.respond_to?(:each)
+          error('Data must be iterable.')
         end
-
-        result.flatten!(1) if batched?
-        result
       end
 
-      def read_int(io)
-        bytes = io.read(4)
-        return DATA_EOF if bytes.nil?
-        unpack_int(bytes)
+      def error(message)
+        raise Spark::SerializeError, message
       end
 
-      # ===========================================================================
-      # Dump
-
-      # Serialize and send data into IO. Check 'load_from_io' for data format.
-      def dump(data, io)
-        if !data.is_a?(Array) && !data.is_a?(Enumerator)
-          data = [data]
-        end
-        data = data.each_slice(batch_size) if batched?
-
-        data.each do |item|
-          serialized = serialize(item)
-
-          # Size and data can have different encoding
-          #   Marshal: both ASCII
-          #   Oj: ASCII and UTF-8
-          io.write(pack_int(serialized.bytesize))
-          io.write(serialized)
-        end
-
-        io.flush
+      def name
+        self.class.name.split('::').last
       end
 
-      # For direct serialization
-      def dump_to_java(data)
-        data.map! do |item|
-          serialize(item).to_java_bytes
-        end
+      def to_s
+        name
       end
 
-      # Rescue cannot be defined
-      #
-      #   mri   => RuntimeError
-      #   jruby => NoMethodError
-      #
-      def try(object, method)
-        begin
-          object.__send__(method)
-          return true
-        rescue
-          return false
-        end
+      def inspect
+        %{#<Spark::Serializer:0x#{object_id} "#{self}">}
       end
 
     end
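
Base shrinks to a thin abstract parent: the batching, Java-iterator and direct-IO logic moves out, and a subclass now only supplies dump/load for a single payload while Base provides the size-prefixed IO framing (records terminated by DATA_EOF), equality, and naming. A minimal sketch of a custom serializer under the new contract; the Yaml class is hypothetical and not part of the gem, and in practice it would be wrapped in Batched before being handed to an RDD:

    require 'yaml'

    module Spark
      module Serializer
        # Hypothetical serializer: only dump/load are needed, Base does the rest
        class Yaml < Base
          def dump(data)
            YAML.dump(data)
          end

          def load(data)
            YAML.load(data)
          end
        end
      end
    end

    Spark::Serializer.register('yaml', Spark::Serializer::Yaml)
    Spark::Serializer.build { batched(__yaml__, 1024) }   # => Batched(1024) -> Yaml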
data/lib/spark/serializer/batched.rb
@@ -0,0 +1,84 @@
+module Spark
+  module Serializer
+    class Batched < Base
+
+      attr_writer :serializer
+
+      def initialize(serializer, batch_size=nil)
+        batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
+
+        @serializer = serializer
+        @batch_size = batch_size.to_i
+
+        error('Batch size must be greater than 0') if @batch_size < 1
+      end
+
+      # Really batched
+      def batched?
+        @batch_size > 1
+      end
+
+      def unbatch!
+        @batch_size = 1
+      end
+
+      def load(data)
+        @serializer.load(data)
+      end
+
+      def dump(data)
+        @serializer.dump(data)
+      end
+
+      def name
+        "Batched(#{@batch_size})"
+      end
+
+      def to_s
+        "#{name} -> #{@serializer}"
+      end
+
+
+      # === Dump ==============================================================
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        if batched?
+          data = data.each_slice(@batch_size)
+        end
+
+        data.each do |item|
+          serialized = dump(item)
+          io.write_string(serialized)
+        end
+
+        io.flush
+      end
+
+
+      # === Load ==============================================================
+
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          data = io.read(size)
+          data = load(data)
+
+          if batched?
+            data.each{|item| yield item }
+          else
+            yield data
+          end
+        end
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('batched', Spark::Serializer::Batched)
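
Batched takes over the wire format previously embedded in Base#dump: it slices the data into groups of @batch_size items, delegates the actual encoding to the wrapped serializer, and writes each group as a size-prefixed record (write_string and read_int_or_eof come from the data/lib/spark/ext/io.rb changes listed above). A hedged sketch of the delegation, assuming the registered marshal serializer behaves like Ruby's Marshal:

    marshal = Spark::Serializer.marshal
    batched = Spark::Serializer.batched(marshal, 2)

    batched.name        # => "Batched(2)"
    batched.to_s        # => "Batched(2) -> Marshal"
    batched.batched?    # => true

    # dump/load only delegate; slicing happens in dump_to_io
    bytes = batched.dump([1, 2])
    batched.load(bytes)   # => [1, 2]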
data/lib/spark/serializer/cartesian.rb
@@ -1,37 +1,13 @@
 module Spark
   module Serializer
-    class Cartesian < Base
+    class Cartesian < Pair
 
-      attr_reader :first, :second
-
-      def set(first, second)
-        @first = first
-        @second = second
-        self
-      end
-
-      # Little hack
-      # Data does not have to be batched but items are added by <<
-      def batched?
-        true
-      end
-
-      def load_next_from_io(io, lenght)
-        item1 = io.read(lenght)
-        item2 = io.read_string
-        deserialize(item1, item2)
-      end
-
-      def deserialize(item1, item2)
-        deserialized_item1 = @first.deserialize(item1)
-        deserialized_item2 = @second.deserialize(item2)
-
-        deserialized_item1 = [deserialized_item1] unless @first.batched?
-        deserialized_item2 = [deserialized_item2] unless @second.batched?
-
-        deserialized_item1.product(deserialized_item2)
+      def aggregate(item1, item2)
+        item1.product(item2)
       end
 
     end
   end
 end
+
+Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
+ Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)