ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
data/lib/spark/serializer.rb
CHANGED
```diff
@@ -1,24 +1,79 @@
 module Spark
+  ##
+  # Serializer
+  #
   module Serializer
-    autoload :Base, 'spark/serializer/base'
-    autoload :UTF8, 'spark/serializer/utf8'
-    autoload :Marshal, 'spark/serializer/marshal'
-    autoload :MessagePack, 'spark/serializer/message_pack'
-    autoload :Oj, 'spark/serializer/oj'
-    autoload :Pair, 'spark/serializer/pair'
-    autoload :Cartesian, 'spark/serializer/cartesian'
 
+    DEFAULT_COMPRESS = false
     DEFAULT_BATCH_SIZE = 1024
     DEFAULT_SERIALIZER_NAME = 'marshal'
 
-
-
+    @@registered = {}
+
+    # Register class and create method for quick access.
+    # Class will be available also as __name__ for using
+    # in build method (Proc binding problem).
+    #
+    # == Examples:
+    #   register('test1', 'test2', Class)
+    #
+    #   Spark::Serializer.test1
+    #   Spark::Serializer.test2
+    #
+    #   # Proc binding problem
+    #   build { marshal } # => Spark::Serializer::Marshal
+    #
+    #   marshal = 1
+    #   build { marshal } # => 1
+    #
+    #   build { __marshal__ } # => Spark::Serializer::Marshal
+    #
+    def self.register(*args)
+      klass = args.pop
+      args.each do |arg|
+        @@registered[arg] = klass
+        define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
+        define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
+      end
+    end
+
+    def self.find(name)
+      @@registered[name.to_s.downcase]
     end
 
-    def self.
-
-
-
+    def self.find!(name)
+      klass = find(name)
+
+      if klass.nil?
+        raise Spark::SerializeError, "Unknow serializer #{name}."
+      end
+
+      klass
     end
+
+    def self.build(text=nil, &block)
+      if block_given?
+        class_eval(&block)
+      else
+        class_eval(text.to_s)
+      end
+    end
+
   end
 end
+
+# Parent
+require 'spark/serializer/base'
+
+# Basic
+require 'spark/serializer/oj'
+require 'spark/serializer/marshal'
+require 'spark/serializer/message_pack'
+require 'spark/serializer/text'
+
+# Others
+require 'spark/serializer/batched'
+require 'spark/serializer/auto_batched'
+require 'spark/serializer/compressed'
+require 'spark/serializer/pair'
+require 'spark/serializer/cartesian'
```
data/lib/spark/serializer/auto_batched.rb
ADDED
```diff
@@ -0,0 +1,59 @@
+module Spark
+  module Serializer
+    ##
+    # AutoBatched serializator
+    #
+    # Batch size is computed automatically. Simillar to Python's AutoBatchedSerializer.
+    #
+    class AutoBatched < Batched
+
+      MAX_RATIO = 10
+
+      def initialize(serializer, best_size=65536)
+        @serializer = serializer
+        @best_size = best_size.to_i
+
+        error('Batch size must be greater than 1') if @best_size < 2
+      end
+
+      def name
+        "AutoBatched(#{@best_size})"
+      end
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        # Only Array have .slice
+        data = data.to_a
+
+        index = 0
+        batch = 2
+        max = @best_size * MAX_RATIO
+
+        loop do
+          chunk = data.slice(index, batch)
+          if chunk.nil? || chunk.empty?
+            break
+          end
+
+          serialized = @serializer.dump(chunk)
+          io.write_string(serialized)
+
+          index += batch
+
+          size = serialized.bytesize
+          if size < @best_size
+            batch *= 2
+          elsif size > max && batch > 1
+            batch /= 2
+          end
+        end
+
+        io.flush
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
```
data/lib/spark/serializer/base.rb
CHANGED
```diff
@@ -1,168 +1,61 @@
 module Spark
   module Serializer
-    # @abstract Parent for all
+    # @abstract Parent for all serializers
     class Base
 
-
-
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
 
-
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
 
-
-
-        self.batch_size = batch_size
+          yield load(io.read(size))
+        end
       end
 
-      def
-
-      end
+      def load_from_file(file, *args)
+        return to_enum(__callee__, file, *args) unless block_given?
 
-
-
-
-        self
-      end
+        load_from_io(file, *args).each do |item|
+          yield item
+        end
 
-
-
+        file.close
+        file.unlink
       end
 
-      def
-        self.
+      def ==(other)
+        self.to_s == other.to_s
       end
 
-      # nil, 0, 1 are considered as non-batched
       def batched?
-
-      end
-
-      # ===========================================================================
-      # Load
-
-      # Load and deserialize an Array from IO, Array of Java iterator
-      # mri: respond_to?(:iterator) => false
-      # jruby: respond_to?(:iterator) => true
-      #
-      def load(source)
-        # Tempfile is Delegator for File so it is not IO
-        # second wasy is __getobj__.is_a?(IO)
-        if source.is_a?(IO) || source.is_a?(Tempfile)
-          load_from_io(source)
-        # elsif source.is_a?(Array)
-        #   load_from_array(source)
-        elsif try(source, :iterator)
-          load_from_iterator(source.iterator)
-        end
+        false
       end
 
-
-      #
-      # +------------+--------+
-      # | signed int | data   |
-      # |     4B     |        |
-      # +------------+--------+
-      #
-      def load_from_io(io)
-        return to_enum(__callee__, io) unless block_given?
-
-        loop do
-          lenght = read_int(io)
-          break if lenght == DATA_EOF
-
-          result = load_next_from_io(io, lenght)
-          if batched? && result.respond_to?(:each)
-            result.each {|item| yield item }
-          else
-            yield result
-          end
-        end # loop
-      end # load_from_io
-
-      def load_next_from_io(io, lenght)
-        deserialize(io.read(lenght))
+      def unbatch!
       end
 
-
-
-
-        result = []
-        while iterator.hasNext
-          item = iterator.next
-
-          # mri: data are String
-          # jruby: data are bytes Array
-
-          if item.is_a?(String)
-            # Serialized data
-            result << deserialize(item)
-          else
-            # Java object
-            if try(item, :getClass)
-              case item.getClass.name
-              when '[B'
-                # Array of bytes
-                result << deserialize(pack_unsigned_chars(item.to_a))
-              when 'scala.Tuple2'
-                # Tuple2
-                result << deserialize(item._1, item._2)
-              end
-            end
-          end
-
+      def check_each(data)
+        unless data.respond_to?(:each)
+          error('Data must be iterable.')
         end
-
-        result.flatten!(1) if batched?
-        result
       end
 
-      def
-
-        return DATA_EOF if bytes.nil?
-        unpack_int(bytes)
+      def error(message)
+        raise Spark::SerializeError, message
       end
 
-
-
-
-      # Serialize and send data into IO. Check 'load_from_io' for data format.
-      def dump(data, io)
-        if !data.is_a?(Array) && !data.is_a?(Enumerator)
-          data = [data]
-        end
-        data = data.each_slice(batch_size) if batched?
-
-        data.each do |item|
-          serialized = serialize(item)
-
-          # Size and data can have different encoding
-          # Marshal: both ASCII
-          # Oj: ASCII and UTF-8
-          io.write(pack_int(serialized.bytesize))
-          io.write(serialized)
-        end
-
-        io.flush
+      def name
+        self.class.name.split('::').last
       end
 
-
-
-        data.map! do |item|
-          serialize(item).to_java_bytes
-        end
+      def to_s
+        name
       end
 
-
-
-      # mri => RuntimeError
-      # jruby => NoMethodError
-      #
-      def try(object, method)
-        begin
-          object.__send__(method)
-          return true
-        rescue
-          return false
-        end
+      def inspect
+        %{#<Spark::Serializer:0x#{object_id} "#{self}">}
      end
 
     end
```
data/lib/spark/serializer/batched.rb
ADDED
```diff
@@ -0,0 +1,84 @@
+module Spark
+  module Serializer
+    class Batched < Base
+
+      attr_writer :serializer
+
+      def initialize(serializer, batch_size=nil)
+        batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
+
+        @serializer = serializer
+        @batch_size = batch_size.to_i
+
+        error('Batch size must be greater than 0') if @batch_size < 1
+      end
+
+      # Really batched
+      def batched?
+        @batch_size > 1
+      end
+
+      def unbatch!
+        @batch_size = 1
+      end
+
+      def load(data)
+        @serializer.load(data)
+      end
+
+      def dump(data)
+        @serializer.dump(data)
+      end
+
+      def name
+        "Batched(#{@batch_size})"
+      end
+
+      def to_s
+        "#{name} -> #{@serializer}"
+      end
+
+
+      # === Dump ==============================================================
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        if batched?
+          data = data.each_slice(@batch_size)
+        end
+
+        data.each do |item|
+          serialized = dump(item)
+          io.write_string(serialized)
+        end
+
+        io.flush
+      end
+
+
+      # === Load ==============================================================
+
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          data = io.read(size)
+          data = load(data)
+
+          if batched?
+            data.each{|item| yield item }
+          else
+            yield data
+          end
+        end
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('batched', Spark::Serializer::Batched)
```
data/lib/spark/serializer/cartesian.rb
CHANGED
```diff
@@ -1,37 +1,13 @@
 module Spark
   module Serializer
-    class Cartesian <
+    class Cartesian < Pair
 
-
-
-      def set(first, second)
-        @first = first
-        @second = second
-        self
-      end
-
-      # Little hack
-      # Data does not have to be batched but items are added by <<
-      def batched?
-        true
-      end
-
-      def load_next_from_io(io, lenght)
-        item1 = io.read(lenght)
-        item2 = io.read_string
-        deserialize(item1, item2)
-      end
-
-      def deserialize(item1, item2)
-        deserialized_item1 = @first.deserialize(item1)
-        deserialized_item2 = @second.deserialize(item2)
-
-        deserialized_item1 = [deserialized_item1] unless @first.batched?
-        deserialized_item2 = [deserialized_item2] unless @second.batched?
-
-        deserialized_item1.product(deserialized_item2)
+      def aggregate(item1, item2)
+        item1.product(item2)
       end
 
     end
   end
 end
+
+Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
```