ruby-spark 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/lib/spark/sampler.rb
@@ -0,0 +1,92 @@
require 'distribution'

# Random Generators
module Spark
  module RandomGenerator
    class Poisson

      def initialize(mean, seed)
        generator = Random.new(seed)
        @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator)
      end

      def rand
        t = 0.0
        number = 0

        loop{
          t += @exp_rng.call
          if t > 1
            return number
          end
          number += 1
        }
      end

    end
  end
end

# Samplers
module Spark
  module Sampler

    class Base
      attr_reader :fraction, :seed

      def initialize(fraction, seed=nil)
        @fraction = fraction
        @seed = seed || Random.new_seed
      end
    end

    # Poisson Sampler
    # -------------------------------------------------------------------------
    class Poisson < Base

      def sample(iterator)
        iterator.map! do |item|
          count = rng.rand
          Array.new(count) { item }
        end
        iterator.flatten!
        iterator.compact!
        iterator
      end

      def lazy_sample(iterator)
        Enumerator::Lazy.new(iterator) do |yielder, value|
          count = rng.rand
          count.times { yielder << value }
        end
      end

      def rng
        @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed)
      end

    end

    # Uniform Sampler
    # -------------------------------------------------------------------------
    class Uniform < Base

      def sample(iterator)
        iterator.select!{|item| rng.rand <= fraction}
        iterator
      end

      def lazy_sample(iterator)
        iterator.select do |item|
          rng.rand <= fraction
        end
      end

      def rng
        @rng ||= Random.new(seed)
      end

    end

  end
end
data/lib/spark/serializer.rb
@@ -0,0 +1,24 @@
module Spark
  module Serializer
    autoload :Base,        'spark/serializer/base'
    autoload :UTF8,        'spark/serializer/utf8'
    autoload :Marshal,     'spark/serializer/marshal'
    autoload :MessagePack, 'spark/serializer/message_pack'
    autoload :Oj,          'spark/serializer/oj'
    autoload :Pair,        'spark/serializer/pair'
    autoload :Cartesian,   'spark/serializer/cartesian'

    DEFAULT_BATCH_SIZE = 1024
    DEFAULT_SERIALIZER_NAME = 'marshal'

    def self.get(suggestion)
      const_get(suggestion.to_s.camelize) rescue nil
    end

    def self.get!(suggestion)
      const_get(suggestion.to_s.camelize)
    rescue
      raise Spark::NotImplemented, "Serializer #{suggestion.to_s.camelize} does not exist."
    end
  end
end
data/lib/spark/serializer/base.rb
@@ -0,0 +1,170 @@
module Spark
  module Serializer
    # @abstract Parent for all types of serializers
    class Base

      include Spark::Helper::Serialize
      include Spark::Constant

      attr_reader :batch_size

      # Set default values
      def initialize(batch_size=nil)
        self.batch_size = batch_size
      end

      def ==(other)
        self.class == other.class && self.batch_size == other.batch_size
      end

      # Set values given by user
      def set(batch_size)
        self.batch_size = batch_size unless batch_size.nil?
        self
      end

      def batch_size=(size)
        @batch_size = size.to_i
      end

      def unbatch!
        self.batch_size = 1
      end

      # nil, 0 and 1 are considered non-batched
      def batched?
        batch_size > 1
      end

      # =========================================================================
      # Load

      # Load and deserialize an Array from an IO, an Array, or a Java iterator
      #   mri:   respond_to?(:iterator) => false
      #   jruby: respond_to?(:iterator) => true
      #
      def load(source)
        # Tempfile is a Delegator for File, so it is not an IO;
        # another way to check is __getobj__.is_a?(IO)
        if source.is_a?(IO) || source.is_a?(Tempfile)
          load_from_io(source)
        # elsif source.is_a?(Array)
        #   load_from_array(source)
        elsif try(source, :iterator)
          load_from_iterator(source.iterator)
        end
      end

      # Load data from IO. Data must have the format:
      #
      #   +------------+--------+
      #   | signed int | data   |
      #   | 4B         |        |
      #   +------------+--------+
      #
      def load_from_io(io)
        return to_enum(__callee__, io) unless block_given?

        loop do
          length = read_int(io)
          break if length == DATA_EOF

          result = load_next_from_io(io, length)
          if batched? && result.respond_to?(:each)
            result.each {|item| yield item }
          else
            yield result
          end
        end # loop
      end # load_from_io

      def load_next_from_io(io, length)
        deserialize(io.read(length))
      end

      # Load from a Java iterator by calling hasNext and next
      #
      def load_from_iterator(iterator)
        result = []
        while iterator.hasNext
          item = iterator.next

          # mri:   data are String
          # jruby: data are a bytes Array

          if item.is_a?(String)
            # Serialized data
            result << deserialize(item)
          else
            # Java object
            if try(item, :getClass)
              case item.getClass.name
              when '[B'
                # Array of bytes
                result << deserialize(pack_unsigned_chars(item.to_a))
              when 'scala.Tuple2'
                # Tuple2
                result << deserialize(item._1, item._2)
              end
            end
          end

        end

        result.flatten!(1) if batched?
        result
      end

      def read_int(io)
        bytes = io.read(4)
        return DATA_EOF if bytes.nil?
        unpack_int(bytes)
      end

      # =========================================================================
      # Dump

      # Serialize and send data into IO. See 'load_from_io' for the data format.
      def dump(data, io)
        if !data.is_a?(Array) && !data.is_a?(Enumerator)
          data = [data]
        end
        data = data.each_slice(batch_size) if batched?

        data.each do |item|
          serialized = serialize(item)

          # Size and data can have different encodings
          #   Marshal: both ASCII
          #   Oj:      ASCII and UTF-8
          io.write(pack_int(serialized.bytesize))
          io.write(serialized)
        end

        io.flush
      end

      # For direct serialization
      def dump_to_java(data)
        data.map! do |item|
          serialize(item).to_java_bytes
        end
      end

      # A specific rescue class cannot be used:
      #   mri   => RuntimeError
      #   jruby => NoMethodError
      #
      def try(object, method)
        begin
          object.__send__(method)
          return true
        rescue
          return false
        end
      end

    end
  end
end
data/lib/spark/serializer/cartesian.rb
@@ -0,0 +1,37 @@
module Spark
  module Serializer
    class Cartesian < Base

      attr_reader :first, :second

      def set(first, second)
        @first = first
        @second = second
        self
      end

      # Little hack:
      # data do not have to be batched, but items are added by <<
      def batched?
        true
      end

      def load_next_from_io(io, length)
        item1 = io.read(length)
        item2 = io.read_string
        deserialize(item1, item2)
      end

      def deserialize(item1, item2)
        deserialized_item1 = @first.deserialize(item1)
        deserialized_item2 = @second.deserialize(item2)

        deserialized_item1 = [deserialized_item1] unless @first.batched?
        deserialized_item2 = [deserialized_item2] unless @second.batched?

        deserialized_item1.product(deserialized_item2)
      end

    end
  end
end
data/lib/spark/serializer/message_pack.rb
@@ -0,0 +1,25 @@
module Spark
  module Serializer
    class MessagePack < Marshal

      def name
        'message_pack'
      end

      def self.serialize(data)
        ::MessagePack::dump(data)
      end

      def self.deserialize(data)
        ::MessagePack::load(data)
      end

    end
  end
end

begin
  require 'msgpack'
rescue LoadError
  Spark::Serializer::MessagePack = Spark::Serializer::Marshal
end
data/lib/spark/serializer/oj.rb
@@ -0,0 +1,25 @@
module Spark
  module Serializer
    class Oj < Marshal

      def name
        'oj'
      end

      def serialize(data)
        ::Oj::dump(data)
      end

      def deserialize(data)
        ::Oj::load(data)
      end

    end
  end
end

begin
  require 'oj'
rescue LoadError
  Spark::Serializer::Oj = Spark::Serializer::Marshal
end
data/lib/spark/serializer/pair.rb
@@ -0,0 +1,27 @@
module Spark
  module Serializer
    class Pair < Base

      attr_reader :first, :second

      def set(first, second)
        unbatch!
        @first = first
        @second = second
        self
      end

      def batched?
        false
      end

      def load_next_from_io(io, length)
        key_value = []
        key_value << @first.load_next_from_io(io, length)
        key_value << @second.load_next_from_io(io, read_int(io))
        key_value
      end

    end
  end
end
data/lib/spark/serializer/utf8.rb
@@ -0,0 +1,25 @@
module Spark
  module Serializer
    ##
    # Used for files
    #
    # A file is sent as a String, but the worker uses serialization
    #
    class UTF8 < Base

      def set(*)
        unbatch!
        self
      end

      def batched?
        false
      end

      def load_next_from_io(io, length)
        io.read(length).force_encoding(Encoding::UTF_8)
      end

    end
  end
end