ruby-spark 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/lib/spark/command.rb
@@ -0,0 +1,86 @@
+module Spark
+  ##
+  # Container that holds all commands and other data needed by a worker.
+  # Every RDD has its own copy of Command.
+  #
+  class Command
+
+    attr_accessor :serializer, :deserializer, :commands, :libraries, :bound_objects
+
+    def initialize
+      @serializer = nil
+      @deserializer = nil
+      @commands = []
+      @libraries = []
+      @bound_objects = {}
+    end
+
+    def execute(iterator, split_index)
+      # Require necessary libraries
+      libraries.each{|lib| require lib}
+
+      # Prepare bound objects
+      @commands.each do |command|
+        command.__objects__ = bound_objects
+      end
+
+      # Prepare for running
+      @commands.each(&:prepare)
+
+      # Run all tasks
+      @commands.each do |command|
+        iterator = command.execute(iterator, split_index)
+      end
+
+      # Return the changed iterator. This is not necessary for tasks that
+      # change the iterator in place, but some tasks (for example reduce)
+      # return only a single value.
+      iterator
+    end
+
+    def last
+      @commands.last
+    end
+
+    def bound_objects
+      # Objects from users
+      # Already initialized objects on the worker
+      return @bound_objects if @bound_objects
+
+      if @serialized_bound_objects
+        # Still serialized
+        @bound_objects = Marshal.load(@serialized_bound_objects)
+      else
+        # Something else
+        @bound_objects = {}
+      end
+    end
+
+    # Bound objects can depend on a library which is loaded during #execute.
+    # Deserializing them eagerly would make the worker raise "undefined class/module".
+    def marshal_dump
+      [@serializer, @deserializer, @commands, @libraries, serialized_bound_objects]
+    end
+
+    def marshal_load(array)
+      @serializer = array.shift
+      @deserializer = array.shift
+      @commands = array.shift
+      @libraries = array.shift
+      @serialized_bound_objects = array.shift
+    end
+
+    private
+
+      def serialized_bound_objects
+        @serialized_bound_objects ||= Marshal.dump(@bound_objects)
+      end
+
+  end
+end
+
+require 'spark/command/base'
+require 'spark/command/basic'
+require 'spark/command/pair'
+require 'spark/command/statistic'
+require 'spark/command/sort'
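The marshal hooks above are what move a Command from the driver to a worker. A minimal sketch of that round trip (an editor's example, not part of the gem; only the API shown above and stdlib Marshal are used):

    command = Spark::Command.new
    command.libraries << 'json'
    command.bound_objects[:factor] = 10

    payload  = Marshal.dump(command)   # driver side: calls marshal_dump
    restored = Marshal.load(payload)   # worker side: calls marshal_load

    # Bound objects stay serialized until first access, so #execute can
    # require the listed libraries before they are deserialized.
    restored.bound_objects[:factor]    # => 10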
data/lib/spark/command/base.rb
@@ -0,0 +1,154 @@
+##
+# Spark::Command::Base
+#
+# Parent of all commands (Map, FlatMap, Sort, ...)
+#
+class Spark::Command::Base
+
+  DEFAULT_VARIABLE_OPTIONS = {
+    type: Hash,
+    function: true
+  }
+
+  def initialize(*args)
+    settings.variables.each do |name, options|
+      instance_variable_set("@#{name}", args.shift)
+    end
+  end
+
+  def self.error(message)
+    raise Spark::CommandError, message
+  end
+
+  def error(message)
+    self.class.error(message)
+  end
+
+  def log(message=nil)
+    $stdout.puts %{==> #{Time.now.strftime("%H:%M:%S")} [#{self.class.name}] #{message}}
+    $stdout.flush
+  end
+
+
+  # ===============================================================================================
+  # Methods called during class loading
+  # Not the nicest approach, but these methods set/get class variables for the child classes.
+
+  # Settings for the command (variables)
+  def self.settings
+    init_settings
+    class_variable_get(:@@settings)
+  end
+
+  def settings
+    self.class.settings
+  end
+
+  # Init empty settings
+  def self.init_settings
+    if !class_variable_defined?(:@@settings)
+      struct = Struct.new(:variables)
+
+      class_variable_set(:@@settings, struct.new)
+      settings.variables = {}
+    end
+  end
+
+  # New variable for the command
+  #
+  # == Example:
+  #
+  #   class Map < Spark::Command::Base
+  #     variable :map_function
+  #   end
+  #
+  #   command = Map.new(1)
+  #
+  #   command.instance_variables
+  #   # => [:@map_function]
+  #   command.instance_variable_get(:@map_function)
+  #   # => 1
+  #
+  def self.variable(name, options={})
+    if settings.variables.has_key?(name)
+      error "Function #{name} already exists."
+    end
+
+    settings.variables[name] = DEFAULT_VARIABLE_OPTIONS.merge(options)
+  end
+
+
+  # ===============================================================================================
+  # Executing methods
+
+  # Execute the command for data and a split index
+  def execute(iterator, split_index)
+    # Implemented on Base but can be overridden
+    before_run
+
+    # `run` has to be implemented by the child
+    if iterator.is_a?(Enumerator::Lazy) && respond_to?(:lazy_run)
+      return lazy_run(iterator, split_index)
+    end
+
+    iterator = iterator.to_a
+    run(iterator, split_index)
+  end
+
+  def prepared?
+    !!@prepared
+  end
+
+  # This is called before execution. Execution will be stopped if
+  # some command contains an error (e.g. a badly serialized lambda).
+  #
+  # == What does it do?
+  # * evaluate lambda
+  # * evaluate method
+  # * make new lambda
+  #
+  def prepare
+    return if prepared?
+
+    to_function = settings.variables.select {|_, options| options[:function]}
+    to_function.each do |name, options|
+      name = "@#{name}"
+      data = instance_variable_get(name)
+
+      case data[:type]
+      when 'proc'
+        result = eval(data[:content])
+      when 'symbol'
+        result = lambda(&data[:content])
+      when 'method'
+        # The method must be added to the instance, not the class
+        instance_eval(data[:content])
+        # The method will be available as a Proc
+        result = lambda(&method(data[:name]))
+      end
+
+      instance_variable_set(name, result)
+    end
+
+    @prepared = true
+  end
+
+  # This method is called before every execution.
+  def before_run
+  end
+
+
+  # ===============================================================================================
+  # Bound objects
+
+  attr_accessor :__objects__
+
+  def method_missing(method, *args, &block)
+    if __objects__ && __objects__.has_key?(method)
+      return __objects__[method]
+    end
+
+    super
+  end
+
+end
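Putting the pieces together, a hypothetical subclass (an editor's example, not in the gem) shows how `prepare` rebuilds a serialized proc and how a bound object is reached through `method_missing`; the `{type: 'proc', content: ...}` hash is the shape `prepare` expects:

    class UpcaseCommand < Spark::Command::Base
      variable :map_function

      def run(iterator, *)
        iterator.map { |item| @map_function.call(item) }
      end
    end

    command = UpcaseCommand.new(type: 'proc', content: 'lambda{|x| x.upcase + suffix}')
    command.__objects__ = {suffix: '!'}   # `suffix` resolves via method_missing
    command.prepare                       # eval's the proc source into a lambda
    command.run(['a', 'b'], 0)            # => ["A!", "B!"]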
data/lib/spark/command/basic.rb
@@ -0,0 +1,345 @@
+_Base = Spark::Command::Base
+
+# -------------------------------------------------------------------------------------------------
+# Map
+
+class Spark::Command::Map < _Base
+  variable :map_function
+
+  def run(iterator, *)
+    iterator.map! do |item|
+      @map_function.call(item)
+    end
+    iterator
+  end
+
+  def lazy_run(iterator, *)
+    iterator.map do |item|
+      @map_function.call(item)
+    end
+  end
+end
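A hypothetical call (assuming base.rb above is loaded) showing the eager and lazy paths side by side:

    map = Spark::Command::Map.new(type: 'proc', content: 'lambda{|x| x * 2}')
    map.prepare
    map.run([1, 2, 3], 0)                    # => [2, 4, 6] (in place, via map!)
    map.execute([1, 2, 3].lazy, 0).first(2)  # => [2, 4]    (lazy_run, no full pass)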
+
+# -------------------------------------------------------------------------------------------------
+# FlatMap
+
+class Spark::Command::FlatMap < Spark::Command::Map
+  def run(iterator, *)
+    iterator = super
+    iterator.flatten!(1)
+    iterator
+  end
+
+  def lazy_run(iterator, *)
+    iterator.flat_map do |item|
+      @map_function.call(item)
+    end
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# MapPartitionsWithIndex
+
+class Spark::Command::MapPartitionsWithIndex < _Base
+  variable :partition_function
+
+  def run(iterator, index)
+    iterator = @partition_function.call(iterator, index)
+    iterator
+  end
+
+  # The user should control whether an Enumerator is returned or not
+  # alias_method :lazy_run, :run
+end
+
+# -------------------------------------------------------------------------------------------------
+# MapPartitions
+
+class Spark::Command::MapPartitions < Spark::Command::MapPartitionsWithIndex
+  def run(iterator, *)
+    # Do not use `super` because `@partition_function` can be a method with 1 argument
+    iterator = @partition_function.call(iterator)
+    iterator
+  end
+  # alias_method :lazy_run, :run
+end
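Unlike Map, the partition function receives the whole partition at once; a hypothetical call:

    cmd = Spark::Command::MapPartitions.new(
      {type: 'proc', content: 'lambda{|part| [part.reduce(:+)]}'}
    )
    cmd.prepare
    cmd.run([1, 2, 3, 4], 0)   # => [10] (one value per partition)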
+
+# -------------------------------------------------------------------------------------------------
+# Filter
+
+class Spark::Command::Filter < _Base
+  variable :filter_function
+
+  def run(iterator, *)
+    iterator.select! do |item|
+      @filter_function.call(item)
+    end
+    iterator
+  end
+
+  def lazy_run(iterator, *)
+    iterator.select do |item|
+      @filter_function.call(item)
+    end
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# Compact
+
+class Spark::Command::Compact < _Base
+  def run(iterator, *)
+    iterator.compact!
+    iterator
+  end
+
+  def lazy_run(iterator, *)
+    iterator.select do |item|
+      !item.nil?
+    end
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# Glom
+
+class Spark::Command::Glom < _Base
+  def run(iterator, *)
+    [iterator]
+  end
+
+  def lazy_run(iterator, *)
+    run(iterator.to_a)
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# Shuffle
+
+class Spark::Command::Shuffle < _Base
+  variable :seed, function: false, type: Integer
+
+  def run(iterator, *)
+    iterator.shuffle!(random: rng)
+    iterator
+  end
+
+  def rng
+    Random.new(@seed)
+  end
+end
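Because `rng` rebuilds the generator from `@seed` on each call, the permutation is reproducible; a hypothetical check:

    shuffle = Spark::Command::Shuffle.new(42)   # seed = 42
    shuffle.run([1, 2, 3, 4, 5], 0)             # same order on every run for this seed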
+
+# -------------------------------------------------------------------------------------------------
+# PartitionBy
+
+class Spark::Command::PartitionBy
+
+  class Base < Spark::Command::Base
+    include Spark::Helper::Serialize
+
+    def prepare
+      super
+
+      # Default. Keep it after super because Sorting has its own key_function.
+      @key_function ||= lambda{|x| x[0]}
+    end
+
+    def run(iterator, *)
+      iterator.map! do |item|
+        make_partition_item(item)
+      end
+      iterator.flatten!(1)
+      iterator
+    end
+
+    def lazy_run(iterator, *)
+      iterator.flat_map do |item|
+        make_partition_item(item)
+      end
+    end
+
+    private
+
+      def make_partition_item(item)
+        [
+          pack_long(@partition_func.call(@key_function[item])),
+          item
+        ]
+      end
+  end
+
+  class Basic < Base
+    variable :partition_func
+  end
+
+  class Sorting < Base
+    variable :key_function
+    variable :bounds, function: false, type: Array
+    variable :ascending, function: false, type: [TrueClass, FalseClass]
+    variable :num_partitions, function: false, type: Numeric
+
+    def prepare
+      super
+
+      # Index by the bisect algorithm
+      @partition_func ||= Proc.new do |key|
+        count = 0
+        @bounds.each{|i|
+          break if i >= key
+          count += 1
+        }
+
+        if @ascending
+          count
+        else
+          @num_partitions - 1 - count
+        end
+      end
+    end
+
+  end # Sorting
+end # PartitionBy
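How the default bisect partitioner buckets keys, in a hypothetical session (an identity key_function is passed only so `prepare` has a proc to evaluate; bounds [10, 20] split keys into three partitions):

    sorting = Spark::Command::PartitionBy::Sorting.new(
      {type: 'proc', content: 'lambda{|x| x}'},  # key_function
      [10, 20],                                  # bounds
      true,                                      # ascending
      3                                          # num_partitions
    )
    sorting.prepare

    func = sorting.instance_variable_get(:@partition_func)
    func.call(5)    # => 0 (below the first bound)
    func.call(15)   # => 1
    func.call(25)   # => 2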
+
+# -------------------------------------------------------------------------------------------------
+# Aggregate
+
+class Spark::Command::Aggregate < _Base
+  variable :reduce_func
+  variable :zero_value, function: false, type: Object
+
+  def run(iterator, *)
+    [iterator.reduce(@zero_value, &@reduce_func)]
+  end
+
+  def lazy_run(iterator, *)
+    run(iterator)
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# Reduce
+
+class Spark::Command::Reduce < Spark::Command::Aggregate
+  def run(iterator, *)
+    [iterator.reduce(&@reduce_func)]
+  end
+end
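Both commands collapse a partition to a one-element array; a hypothetical run of Aggregate with a zero value:

    agg = Spark::Command::Aggregate.new(
      {type: 'proc', content: 'lambda{|memo, item| memo + item}'},
      100   # zero_value
    )
    agg.prepare
    agg.run([1, 2, 3], 0)   # => [106]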
+
+# -------------------------------------------------------------------------------------------------
+# Foreach
+
+class Spark::Command::Foreach < _Base
+  variable :each_function
+
+  def run(iterator, *)
+    iterator.each do |item|
+      @each_function.call(item)
+    end
+    nil
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# ForeachPartition
+
+class Spark::Command::ForeachPartition < _Base
+  variable :partition_function
+
+  def run(iterator, *)
+    @partition_function.call(iterator)
+    nil
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# KeyBy
+
+class Spark::Command::KeyBy < _Base
+  variable :key_function
+
+  def run(iterator, *)
+    iterator.map! do |item|
+      [@key_function.call(item), item]
+    end
+    iterator
+  end
+
+  def lazy_run(iterator, *)
+    iterator.map do |item|
+      [@key_function.call(item), item]
+    end
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# Take
+
+class Spark::Command::Take < _Base
+  variable :total, function: false, type: Numeric
+  variable :last_part, function: false, type: Numeric
+
+  def run(iterator, index)
+    if index == @last_part && iterator.size > @total
+      return iterator.slice!(0, @total)
+    end
+
+    iterator
+  end
+end
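Take trims only the partition whose index equals `last_part`; a hypothetical check:

    take = Spark::Command::Take.new(3, 1)   # total = 3, last_part = 1
    take.run([1, 2, 3, 4, 5], 0)            # => [1, 2, 3, 4, 5] (not the last part)
    take.run([1, 2, 3, 4, 5], 1)            # => [1, 2, 3]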
+
+# -------------------------------------------------------------------------------------------------
+# Pipe
+
+class Spark::Command::Pipe < _Base
+  variable :cmds, function: false, type: Array
+
+  def before_run
+    require 'open3'
+
+    @in, @out, @threads = Open3.pipeline_rw(*@cmds)
+  end
+
+  def run(iterator, *)
+    create_writing_thread(iterator)
+
+    new_iterator = []
+
+    # Read the full input
+    begin
+      loop {
+        new_iterator << @out.readline.rstrip
+      }
+    rescue EOFError
+    end
+
+    new_iterator
+  end
+
+  def lazy_run(iterator, *)
+    create_writing_thread(iterator)
+
+    Enumerator::Lazy.new([nil]) do |yielder, _|
+      begin
+        loop {
+          yielder << @out.readline.rstrip
+        }
+      rescue EOFError
+      end
+    end
+  end
+
+  private
+
+    def create_writing_thread(iterator)
+      @writing_thread = Thread.new do
+        # Send the complete iterator to the pipe
+        iterator.each do |item|
+          @in.puts(item.to_s.rstrip)
+        end
+
+        # The input must be closed so the reader gets EOFError
+        @in.close
+      end
+    end
+
+end
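A hypothetical end-to-end run of Pipe (assumes `cat` and `sort` are available on PATH; `execute` calls `before_run`, which opens the Open3 pipeline):

    pipe = Spark::Command::Pipe.new(['cat', 'sort'])
    pipe.execute([3, 1, 2], 0)   # => ["1", "2", "3"]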