ruby-spark 1.0.0 → 1.1.0.1
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/serializer.rb
CHANGED
@@ -1,24 +1,79 @@
 module Spark
+  ##
+  # Serializer
+  #
   module Serializer
-    autoload :Base, 'spark/serializer/base'
-    autoload :UTF8, 'spark/serializer/utf8'
-    autoload :Marshal, 'spark/serializer/marshal'
-    autoload :MessagePack, 'spark/serializer/message_pack'
-    autoload :Oj, 'spark/serializer/oj'
-    autoload :Pair, 'spark/serializer/pair'
-    autoload :Cartesian, 'spark/serializer/cartesian'

+    DEFAULT_COMPRESS = false
     DEFAULT_BATCH_SIZE = 1024
     DEFAULT_SERIALIZER_NAME = 'marshal'

-
-
+    @@registered = {}
+
+    # Register class and create method for quick access.
+    # Class will be available also as __name__ for using
+    # in build method (Proc binding problem).
+    #
+    # == Examples:
+    #   register('test1', 'test2', Class)
+    #
+    #   Spark::Serializer.test1
+    #   Spark::Serializer.test2
+    #
+    #   # Proc binding problem
+    #   build { marshal } # => Spark::Serializer::Marshal
+    #
+    #   marshal = 1
+    #   build { marshal } # => 1
+    #
+    #   build { __marshal__ } # => Spark::Serializer::Marshal
+    #
+    def self.register(*args)
+      klass = args.pop
+      args.each do |arg|
+        @@registered[arg] = klass
+        define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
+        define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
+      end
+    end
+
+    def self.find(name)
+      @@registered[name.to_s.downcase]
     end

-    def self.
-
-
-
+    def self.find!(name)
+      klass = find(name)
+
+      if klass.nil?
+        raise Spark::SerializeError, "Unknow serializer #{name}."
+      end
+
+      klass
     end
+
+    def self.build(text=nil, &block)
+      if block_given?
+        class_eval(&block)
+      else
+        class_eval(text.to_s)
+      end
+    end
+
   end
 end
+
+# Parent
+require 'spark/serializer/base'
+
+# Basic
+require 'spark/serializer/oj'
+require 'spark/serializer/marshal'
+require 'spark/serializer/message_pack'
+require 'spark/serializer/text'
+
+# Others
+require 'spark/serializer/batched'
+require 'spark/serializer/auto_batched'
+require 'spark/serializer/compressed'
+require 'spark/serializer/pair'
+require 'spark/serializer/cartesian'
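The autoload list is replaced by an explicit registry: `register` stores each serializer class under one or more names and defines matching singleton-method shortcuts (plus `__name__` aliases), while `build` evaluates a string or block inside the module so those shortcuts can be composed. A minimal usage sketch, assuming `spark/serializer/marshal.rb` registers itself under `'marshal'` the same way the `Batched` and `Cartesian` files further down register themselves:

```ruby
# Quick-access constructors created by register:
Spark::Serializer.marshal            # => a Spark::Serializer::Marshal instance
Spark::Serializer.find!('marshal')   # => Spark::Serializer::Marshal (the class)

# build runs the block with class_eval, so a local variable can shadow
# the generated shortcut -- the __name__ alias sidesteps that:
marshal = 1
Spark::Serializer.build { marshal }                    # => 1 (the local wins)
Spark::Serializer.build { batched(__marshal__, 1024) } # => a Batched(1024) -> Marshal serializer
```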
data/lib/spark/serializer/auto_batched.rb
ADDED
@@ -0,0 +1,59 @@
+module Spark
+  module Serializer
+    ##
+    # AutoBatched serializator
+    #
+    # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer.
+    #
+    class AutoBatched < Batched
+
+      MAX_RATIO = 10
+
+      def initialize(serializer, best_size=65536)
+        @serializer = serializer
+        @best_size = best_size.to_i
+
+        error('Batch size must be greater than 1') if @best_size < 2
+      end
+
+      def name
+        "AutoBatched(#{@best_size})"
+      end
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        # Only Array have .slice
+        data = data.to_a
+
+        index = 0
+        batch = 2
+        max = @best_size * MAX_RATIO
+
+        loop do
+          chunk = data.slice(index, batch)
+          if chunk.nil? || chunk.empty?
+            break
+          end
+
+          serialized = @serializer.dump(chunk)
+          io.write_string(serialized)
+
+          index += batch
+
+          size = serialized.bytesize
+          if size < @best_size
+            batch *= 2
+          elsif size > max && batch > 1
+            batch /= 2
+          end
+        end
+
+        io.flush
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
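The policy above starts with two elements per chunk, doubles the batch while a serialized chunk stays under `best_size`, and halves it once a chunk grows past `best_size * MAX_RATIO`. A standalone illustration of that growth (plain Ruby, with `Marshal.dump` standing in for the wrapped serializer and the IO writes skipped):

```ruby
best_size = 65_536
max       = best_size * 10   # MAX_RATIO = 10
batch     = 2
batches   = []

data  = Array.new(10_000) { |i| "record-#{i}" }
index = 0

loop do
  chunk = data.slice(index, batch)
  break if chunk.nil? || chunk.empty?

  serialized = Marshal.dump(chunk)   # stand-in for @serializer.dump(chunk)
  batches << batch
  index   += batch

  size = serialized.bytesize
  if size < best_size
    batch *= 2                       # chunks still small: grow
  elsif size > max && batch > 1
    batch /= 2                       # chunks far too big: shrink
  end
end

batches.first(8)  # => [2, 4, 8, 16, 32, 64, 128, 256]
```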
data/lib/spark/serializer/base.rb
CHANGED
@@ -1,168 +1,61 @@
 module Spark
   module Serializer
-    # @abstract Parent for all
+    # @abstract Parent for all serializers
     class Base

-
-
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?

-
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF

-
-
-        self.batch_size = batch_size
+          yield load(io.read(size))
+        end
       end

-      def
-
-      end
+      def load_from_file(file, *args)
+        return to_enum(__callee__, file, *args) unless block_given?

-
-
-
-        self
-      end
+        load_from_io(file, *args).each do |item|
+          yield item
+        end

-
-
+        file.close
+        file.unlink
       end

-      def
-        self.
+      def ==(other)
+        self.to_s == other.to_s
       end

-      # nil, 0, 1 are considered as non-batched
       def batched?
-
-      end
-
-      # ===========================================================================
-      # Load
-
-      # Load and deserialize an Array from IO, Array of Java iterator
-      # mri: respond_to?(:iterator) => false
-      # jruby: respond_to?(:iterator) => true
-      #
-      def load(source)
-        # Tempfile is Delegator for File so it is not IO
-        # second wasy is __getobj__.is_a?(IO)
-        if source.is_a?(IO) || source.is_a?(Tempfile)
-          load_from_io(source)
-        # elsif source.is_a?(Array)
-        #   load_from_array(source)
-        elsif try(source, :iterator)
-          load_from_iterator(source.iterator)
-        end
+        false
       end

-
-      #
-      # +------------+--------+
-      # | signed int |  data  |
-      # |     4B     |        |
-      # +------------+--------+
-      #
-      def load_from_io(io)
-        return to_enum(__callee__, io) unless block_given?
-
-        loop do
-          lenght = read_int(io)
-          break if lenght == DATA_EOF
-
-          result = load_next_from_io(io, lenght)
-          if batched? && result.respond_to?(:each)
-            result.each {|item| yield item }
-          else
-            yield result
-          end
-        end # loop
-      end # load_from_io
-
-      def load_next_from_io(io, lenght)
-        deserialize(io.read(lenght))
+      def unbatch!
       end

-
-
-
-        result = []
-        while iterator.hasNext
-          item = iterator.next
-
-          # mri: data are String
-          # jruby: data are bytes Array
-
-          if item.is_a?(String)
-            # Serialized data
-            result << deserialize(item)
-          else
-            # Java object
-            if try(item, :getClass)
-              case item.getClass.name
-              when '[B'
-                # Array of bytes
-                result << deserialize(pack_unsigned_chars(item.to_a))
-              when 'scala.Tuple2'
-                # Tuple2
-                result << deserialize(item._1, item._2)
-              end
-            end
-          end
-
+      def check_each(data)
+        unless data.respond_to?(:each)
+          error('Data must be iterable.')
         end
-
-        result.flatten!(1) if batched?
-        result
       end

-      def
-
-        return DATA_EOF if bytes.nil?
-        unpack_int(bytes)
+      def error(message)
+        raise Spark::SerializeError, message
       end

-
-
-
-      # Serialize and send data into IO. Check 'load_from_io' for data format.
-      def dump(data, io)
-        if !data.is_a?(Array) && !data.is_a?(Enumerator)
-          data = [data]
-        end
-        data = data.each_slice(batch_size) if batched?
-
-        data.each do |item|
-          serialized = serialize(item)
-
-          # Size and data can have different encoding
-          # Marshal: both ASCII
-          # Oj: ASCII and UTF-8
-          io.write(pack_int(serialized.bytesize))
-          io.write(serialized)
-        end
-
-        io.flush
+      def name
+        self.class.name.split('::').last
       end

-
-
-        data.map! do |item|
-          serialize(item).to_java_bytes
-        end
+      def to_s
+        name
       end

-
-
-      # mri => RuntimeError
-      # jruby => NoMethodError
-      #
-      def try(object, method)
-        begin
-          object.__send__(method)
-          return true
-        rescue
-          return false
-        end
+      def inspect
+        %{#<Spark::Serializer:0x#{object_id} "#{self}">}
       end

     end
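`Base#load_from_io` now consumes a stream of length-prefixed frames and stops at a `DATA_EOF` sentinel read through the new `io.read_int_or_eof` helper (added in `data/lib/spark/ext/io.rb`, whose diff is not part of this section). A self-contained sketch of that framing, assuming a 4-byte big-endian signed length and a placeholder sentinel value; the real packing directive and constant live in `lib/spark/ext/io.rb` and `Spark::Constant`:

```ruby
require 'stringio'

DATA_EOF = -2  # placeholder for the real Spark::Constant::DATA_EOF value

# Assumed equivalent of io.write_string: 4-byte length prefix, then the bytes.
def write_string(io, str)
  io.write([str.bytesize].pack('l>'))
  io.write(str)
end

io = StringIO.new
write_string(io, Marshal.dump([1, 2, 3]))
write_string(io, Marshal.dump([4, 5]))
io.write([DATA_EOF].pack('l>'))
io.rewind

# What the batched load_from_io loop does, in miniature:
records = []
loop do
  size = io.read(4).unpack1('l>')
  break if size == DATA_EOF
  records.concat(Marshal.load(io.read(size)))
end
records  # => [1, 2, 3, 4, 5]
```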
data/lib/spark/serializer/batched.rb
ADDED
@@ -0,0 +1,84 @@
+module Spark
+  module Serializer
+    class Batched < Base
+
+      attr_writer :serializer
+
+      def initialize(serializer, batch_size=nil)
+        batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
+
+        @serializer = serializer
+        @batch_size = batch_size.to_i
+
+        error('Batch size must be greater than 0') if @batch_size < 1
+      end
+
+      # Really batched
+      def batched?
+        @batch_size > 1
+      end
+
+      def unbatch!
+        @batch_size = 1
+      end
+
+      def load(data)
+        @serializer.load(data)
+      end
+
+      def dump(data)
+        @serializer.dump(data)
+      end
+
+      def name
+        "Batched(#{@batch_size})"
+      end
+
+      def to_s
+        "#{name} -> #{@serializer}"
+      end
+
+
+      # === Dump ==============================================================
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        if batched?
+          data = data.each_slice(@batch_size)
+        end
+
+        data.each do |item|
+          serialized = dump(item)
+          io.write_string(serialized)
+        end
+
+        io.flush
+      end
+
+
+      # === Load ==============================================================
+
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          data = io.read(size)
+          data = load(data)
+
+          if batched?
+            data.each{|item| yield item }
+          else
+            yield data
+          end
+        end
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('batched', Spark::Serializer::Batched)
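`Batched` wraps a concrete serializer, slices the outgoing stream into `@batch_size` groups, and delegates the per-chunk `dump`/`load` to the wrapped object. A small composition sketch, again assuming the Marshal serializer registers itself as `'marshal'` (its file is not shown in this section):

```ruby
ser = Spark::Serializer.build { batched(__marshal__, 512) }

ser.to_s       # => "Batched(512) -> Marshal"
ser.batched?   # => true

# unbatch! drops back to one item per frame:
ser.unbatch!
ser.batched?   # => false

# dump/load are delegated to the wrapped serializer
# (presumably Ruby's ::Marshal underneath):
payload = ser.dump([1, 2, 3])
ser.load(payload)   # => [1, 2, 3]
```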
data/lib/spark/serializer/cartesian.rb
CHANGED
@@ -1,37 +1,13 @@
 module Spark
   module Serializer
-    class Cartesian <
+    class Cartesian < Pair

-
-
-      def set(first, second)
-        @first = first
-        @second = second
-        self
-      end
-
-      # Little hack
-      # Data does not have to be batched but items are added by <<
-      def batched?
-        true
-      end
-
-      def load_next_from_io(io, lenght)
-        item1 = io.read(lenght)
-        item2 = io.read_string
-        deserialize(item1, item2)
-      end
-
-      def deserialize(item1, item2)
-        deserialized_item1 = @first.deserialize(item1)
-        deserialized_item2 = @second.deserialize(item2)
-
-        deserialized_item1 = [deserialized_item1] unless @first.batched?
-        deserialized_item2 = [deserialized_item2] unless @second.batched?
-
-        deserialized_item1.product(deserialized_item2)
+      def aggregate(item1, item2)
+        item1.product(item2)
       end

     end
   end
 end
+
+Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
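With `Cartesian` now inheriting from `Pair`, the IO handling lives in the parent and the subclass only defines how two already-deserialized batches are combined: `aggregate` takes their cross product (this assumes the reworked `Pair` serializer, changed in this release but not shown in this section, calls `aggregate` on the two deserialized sides). In plain Ruby terms:

```ruby
# Every element of the first batch is paired with every element of the second,
# which is what a cartesian RDD needs when reassembling partition pairs.
[1, 2].product(%w[a b])
# => [[1, "a"], [1, "b"], [2, "a"], [2, "b"]]
```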