ruby-spark 1.1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0
data/lib/spark/sampler.rb
@@ -0,0 +1,92 @@
+require 'distribution'
+
+# Random Generators
+module Spark
+  module RandomGenerator
+    class Poisson
+
+      def initialize(mean, seed)
+        generator = Random.new(seed)
+        @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator)
+      end
+
+      def rand
+        t = 0.0
+        number = 0
+
+        loop{
+          t += @exp_rng.call
+          if t > 1
+            return number
+          end
+          number += 1
+        }
+      end
+
+    end
+  end
+end
+
+# Samplers
+module Spark
+  module Sampler
+
+    class Base
+      attr_reader :fraction, :seed
+
+      def initialize(fraction, seed=nil)
+        @fraction = fraction
+        @seed = seed || Random.new_seed
+      end
+    end
+
+    # Poisson Sampler
+    # -------------------------------------------------------------------------
+    class Poisson < Base
+
+      def sample(iterator)
+        iterator.map! do |item|
+          count = rng.rand
+          Array.new(count) { item }
+        end
+        iterator.flatten!
+        iterator.compact!
+        iterator
+      end
+
+      def lazy_sample(iterator)
+        Enumerator::Lazy.new(iterator) do |yielder, value|
+          count = rng.rand
+          count.times { yielder << value }
+        end
+      end
+
+      def rng
+        @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed)
+      end
+
+    end
+
+    # Uniform Sampler
+    # -------------------------------------------------------------------------
+    class Uniform < Base
+
+      def sample(iterator)
+        iterator.select!{|item| rng.rand <= fraction}
+        iterator
+      end
+
+      def lazy_sample(iterator)
+        iterator.select do |item|
+          rng.rand <= fraction
+        end
+      end
+
+      def rng
+        @rng ||= Random.new(seed)
+      end
+
+    end
+
+  end
+end
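A minimal usage sketch of the samplers above (assuming the gem is loaded and, for Poisson, that the distribution gem is installed; outputs are seed-dependent examples, not fixed values):

  require 'ruby-spark'

  # Uniform: keeps each element independently with probability `fraction`.
  uniform = Spark::Sampler::Uniform.new(0.5, 42)
  uniform.sample((1..10).to_a)   # e.g. [1, 4, 5, 8, 10]

  # Poisson: repeats each element a Poisson-distributed number of times
  # (mean = fraction), so items can be dropped or duplicated.
  poisson = Spark::Sampler::Poisson.new(1.0, 42)
  poisson.sample((1..5).to_a)    # e.g. [1, 1, 3, 4, 4, 5]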
data/lib/spark/serializer.rb
@@ -0,0 +1,79 @@
+module Spark
+  ##
+  # Serializer
+  #
+  module Serializer
+
+    DEFAULT_COMPRESS = false
+    DEFAULT_BATCH_SIZE = 1024
+    DEFAULT_SERIALIZER_NAME = 'marshal'
+
+    @@registered = {}
+
+    # Register class and create method for quick access.
+    # Class will also be available as __name__ for use
+    # in the build method (Proc binding problem).
+    #
+    # == Examples:
+    #   register('test1', 'test2', Class)
+    #
+    #   Spark::Serializer.test1
+    #   Spark::Serializer.test2
+    #
+    #   # Proc binding problem
+    #   build { marshal } # => Spark::Serializer::Marshal
+    #
+    #   marshal = 1
+    #   build { marshal } # => 1
+    #
+    #   build { __marshal__ } # => Spark::Serializer::Marshal
+    #
+    def self.register(*args)
+      klass = args.pop
+      args.each do |arg|
+        @@registered[arg] = klass
+        define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
+        define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
+      end
+    end
+
+    def self.find(name)
+      @@registered[name.to_s.downcase]
+    end
+
+    def self.find!(name)
+      klass = find(name)
+
+      if klass.nil?
+        raise Spark::SerializeError, "Unknown serializer #{name}."
+      end
+
+      klass
+    end
+
+    def self.build(text=nil, &block)
+      if block_given?
+        class_eval(&block)
+      else
+        class_eval(text.to_s)
+      end
+    end
+
+  end
+end
+
+# Parent
+require 'spark/serializer/base'
+
+# Basic
+require 'spark/serializer/oj'
+require 'spark/serializer/marshal'
+require 'spark/serializer/message_pack'
+require 'spark/serializer/text'
+
+# Others
+require 'spark/serializer/batched'
+require 'spark/serializer/auto_batched'
+require 'spark/serializer/compressed'
+require 'spark/serializer/pair'
+require 'spark/serializer/cartesian'
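The comment block above documents why each serializer is registered under two method names: inside build's class_eval, a caller's local variable can shadow a registered name, while the __name__ alias is unlikely to collide. A short sketch of the registry in use (names as registered by the files required above):

  Spark::Serializer.find('marshal')    # => Spark::Serializer::Marshal
  Spark::Serializer.find!('nonsense')  # raises Spark::SerializeError

  # Quick-access methods created by `register`:
  Spark::Serializer.marshal            # => instance of Spark::Serializer::Marshal

  # `build` evaluates the block with the module as self, so registered
  # names can be composed declaratively:
  Spark::Serializer.build { batched(__marshal__, 512) }.to_s
  # => "Batched(512) -> Marshal"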
data/lib/spark/serializer/auto_batched.rb
@@ -0,0 +1,59 @@
+module Spark
+  module Serializer
+    ##
+    # AutoBatched serializer
+    #
+    # Batch size is computed automatically. Similar to Python's AutoBatchedSerializer.
+    #
+    class AutoBatched < Batched
+
+      MAX_RATIO = 10
+
+      def initialize(serializer, best_size=65536)
+        @serializer = serializer
+        @best_size = best_size.to_i
+
+        error('Batch size must be greater than 1') if @best_size < 2
+      end
+
+      def name
+        "AutoBatched(#{@best_size})"
+      end
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        # Only Array has .slice
+        data = data.to_a
+
+        index = 0
+        batch = 2
+        max = @best_size * MAX_RATIO
+
+        loop do
+          chunk = data.slice(index, batch)
+          if chunk.nil? || chunk.empty?
+            break
+          end
+
+          serialized = @serializer.dump(chunk)
+          io.write_string(serialized)
+
+          index += batch
+
+          size = serialized.bytesize
+          if size < @best_size
+            batch *= 2
+          elsif size > max && batch > 1
+            batch /= 2
+          end
+        end
+
+        io.flush
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
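The sizing loop above starts at two items per batch, doubles the batch while serialized chunks come in under best_size, and halves it when a chunk exceeds MAX_RATIO times the target. A standalone illustration of the same policy using Marshal directly (no Spark IO involved; all names here are local to the sketch):

  data  = Array.new(10_000) { |i| "item-#{i}" }
  best  = 65_536
  batch = 2
  index = 0

  while (chunk = data.slice(index, batch)) && !chunk.empty?
    serialized = Marshal.dump(chunk)
    index += batch

    if serialized.bytesize < best
      batch *= 2            # chunks are small: grow geometrically
    elsif serialized.bytesize > best * 10 && batch > 1
      batch /= 2            # chunk blew past 10x the target: back off
    end
  end

  batch  # grew geometrically while chunks stayed under ~64 KiB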
data/lib/spark/serializer/base.rb
@@ -0,0 +1,63 @@
+module Spark
+  module Serializer
+    # @abstract Parent for all serializers
+    class Base
+
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          yield load(io.read(size))
+        end
+      end
+
+      def load_from_file(file, *args)
+        return to_enum(__callee__, file, *args) unless block_given?
+
+        load_from_io(file, *args).each do |item|
+          yield item
+        end
+
+        file.close
+        file.unlink
+      end
+
+      def ==(other)
+        self.to_s == other.to_s
+      end
+
+      def batched?
+        false
+      end
+
+      def unbatch!
+      end
+
+      def check_each(data)
+        unless data.respond_to?(:each)
+          error('Data must be iterable.')
+        end
+      end
+
+      def error(message)
+        raise Spark::SerializeError, message
+      end
+
+      def name
+        self.class.name.split('::').last
+      end
+
+      def to_s
+        name
+      end
+
+      def inspect
+        %{#<Spark::Serializer:0x#{object_id} "#{self}">}
+      end
+
+    end
+  end
+end
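Base leaves only dump and load to subclasses; everything else (streaming from IO, equality, naming) is inherited. As a hedged sketch of extending it, a JSON serializer could look like this (the Json class and the 'json' name are illustrative, not part of the gem):

  require 'json'

  module Spark
    module Serializer
      class Json < Base
        def dump(data)
          ::JSON.generate(data)
        end

        def load(data)
          ::JSON.parse(data)
        end
      end
    end
  end

  Spark::Serializer.register('json', Spark::Serializer::Json)
  Spark::Serializer.json.dump([1, 2])  # => "[1,2]"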
data/lib/spark/serializer/batched.rb
@@ -0,0 +1,84 @@
+module Spark
+  module Serializer
+    class Batched < Base
+
+      attr_writer :serializer
+
+      def initialize(serializer, batch_size=nil)
+        batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
+
+        @serializer = serializer
+        @batch_size = batch_size.to_i
+
+        error('Batch size must be greater than 0') if @batch_size < 1
+      end
+
+      # Really batched
+      def batched?
+        @batch_size > 1
+      end
+
+      def unbatch!
+        @batch_size = 1
+      end
+
+      def load(data)
+        @serializer.load(data)
+      end
+
+      def dump(data)
+        @serializer.dump(data)
+      end
+
+      def name
+        "Batched(#{@batch_size})"
+      end
+
+      def to_s
+        "#{name} -> #{@serializer}"
+      end
+
+
+      # === Dump ==============================================================
+
+      def dump_to_io(data, io)
+        check_each(data)
+
+        if batched?
+          data = data.each_slice(@batch_size)
+        end
+
+        data.each do |item|
+          serialized = dump(item)
+          io.write_string(serialized)
+        end
+
+        io.flush
+      end
+
+
+      # === Load ==============================================================
+
+      def load_from_io(io)
+        return to_enum(__callee__, io) unless block_given?
+
+        loop do
+          size = io.read_int_or_eof
+          break if size == Spark::Constant::DATA_EOF
+
+          data = io.read(size)
+          data = load(data)
+
+          if batched?
+            data.each{|item| yield item }
+          else
+            yield data
+          end
+        end
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('batched', Spark::Serializer::Batched)
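Batched changes only how items are grouped on the wire; serialization itself is delegated to the wrapped serializer. A brief sketch (assuming the gem is loaded):

  ser = Spark::Serializer.batched(Spark::Serializer.marshal, 2)
  ser.to_s      # => "Batched(2) -> Marshal"
  ser.batched?  # => true

  ser.unbatch!  # force one item per write
  ser.batched?  # => false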
data/lib/spark/serializer/compressed.rb
@@ -0,0 +1,27 @@
+module Spark
+  module Serializer
+    class Compressed < Base
+
+      def initialize(serializer)
+        @serializer = serializer
+      end
+
+      def dump(data)
+        Zlib::Deflate.deflate(@serializer.dump(data))
+      end
+
+      def load(data)
+        @serializer.load(Zlib::Inflate.inflate(data))
+      end
+
+    end
+  end
+end
+
+begin
+  # TODO: require only if it is necessary
+  require 'zlib'
+
+  Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed)
+rescue LoadError
+end
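Compressed wraps any serializer with zlib deflate/inflate, which pays off for repetitive payloads. A round-trip sketch (assuming zlib was available so the registration above ran):

  ser = Spark::Serializer.build { compressed(__marshal__) }
  payload = ser.dump(['a'] * 1_000)  # deflated Marshal bytes
  ser.load(payload).size             # => 1000
  payload.bytesize < ::Marshal.dump(['a'] * 1_000).bytesize  # => true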
data/lib/spark/serializer/marshal.rb
@@ -0,0 +1,17 @@
+module Spark
+  module Serializer
+    class Marshal < Base
+
+      def dump(data)
+        ::Marshal.dump(data)
+      end
+
+      def load(data)
+        ::Marshal.load(data)
+      end
+
+    end
+  end
+end
+
+Spark::Serializer.register('marshal', Spark::Serializer::Marshal)
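Marshal is the default serializer (DEFAULT_SERIALIZER_NAME above); the :: prefix disambiguates Ruby's core Marshal from this wrapper class. Round trip:

  ser = Spark::Serializer.marshal
  ser.load(ser.dump(key: [1, 2, 3]))  # => {:key=>[1, 2, 3]}
  ser == Spark::Serializer.marshal    # => true (Base#== compares names)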
data/lib/spark/serializer/message_pack.rb
@@ -0,0 +1,23 @@
+module Spark
+  module Serializer
+    class MessagePack < Base
+
+      def dump(data)
+        ::MessagePack.dump(data)
+      end
+
+      def load(data)
+        ::MessagePack.load(data)
+      end
+
+    end
+  end
+end
+
+begin
+  # TODO: require only if it is necessary
+  require 'msgpack'
+
+  Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack)
+rescue LoadError
+end
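As with the zlib guard above, registration is skipped when the msgpack gem is not installed. When it is, all four aliases resolve to the same class:

  Spark::Serializer.find('msgpack')   # => Spark::Serializer::MessagePack
  ser = Spark::Serializer.message_pack
  ser.load(ser.dump([1, 'two', 3.0])) # => [1, "two", 3.0]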