ruby-spark 1.1.0.1-java
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0
data/lib/spark/command_validator.rb
ADDED
@@ -0,0 +1,34 @@
module Spark
  module CommandValidator

    def validate(value, options)
      validate_type(value, options[:type])
    end

    def valid?(value, options)
      begin
        validate(value, options)
        return true
      rescue
        return false
      end
    end

    def validate_type(value, types)
      types = [types] if !types.is_a?(Array)

      types.each do |type|
        return if value.is_a?(type)
      end

      error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
    end

    def validate_size(array1, array2)
      if array1.size != array2.size
        error "Wrong number of arguments (#{array1.size} for #{array2.size})"
      end
    end

  end
end
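Spark::CommandValidator is a mixin: it calls an error helper that it does not define, so the including class (in this gem, presumably the command classes under data/lib/spark/command/) is expected to supply one. Below is a minimal sketch of exercising the mixin on its own; the DemoValidator class and its error method are purely illustrative stand-ins, not part of the gem.

    require 'ruby-spark'

    # Hypothetical host class -- only here to exercise the mixin.
    class DemoValidator
      include Spark::CommandValidator

      # CommandValidator expects the including class to provide `error`.
      def error(message)
        raise ArgumentError, message
      end
    end

    v = DemoValidator.new
    v.valid?(42, type: Numeric)              # => true
    v.valid?('42', type: [Numeric, Symbol])  # => false
    v.validate_size([1, 2], [:a, :b])        # sizes match, so no error is raised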
data/lib/spark/config.rb
ADDED
@@ -0,0 +1,238 @@
# Necessary libraries
Spark.load_lib

module Spark
  # Common configuration for RubySpark and Spark
  class Config

    include Spark::Helper::System

    TYPES = {
      'spark.shuffle.spill' => :boolean,
      'spark.ruby.serializer.compress' => :boolean
    }

    # Initialize Java SparkConf and load default configuration.
    def initialize
      @spark_conf = SparkConf.new(true)
      set_default
    end

    def from_file(file)
      check_read_only

      if file && File.exist?(file)
        file = File.expand_path(file)
        RubyUtils.loadPropertiesFile(spark_conf, file)
      end
    end

    def [](key)
      get(key)
    end

    def []=(key, value)
      set(key, value)
    end

    def spark_conf
      if Spark.started?
        # Get the latest configuration
        Spark.context.jcontext.conf
      else
        @spark_conf
      end
    end

    def valid!
      errors = []

      if !contains?('spark.app.name')
        errors << 'An application name must be set in your configuration.'
      end

      if !contains?('spark.master')
        errors << 'A master URL must be set in your configuration.'
      end

      if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
        errors << 'Unknown serializer.'
      end

      scanned = get('spark.ruby.executor.command').scan('%s')

      if scanned.size == 0
        errors << "Executor command must contain '%s'."
      end

      if scanned.size > 1
        errors << "Executor command can contain only one '%s'."
      end

      if errors.any?
        errors.map!{|error| "- #{error}"}

        raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
      end
    end

    def read_only?
      Spark.started?
    end

    # Rescue from NoSuchElementException
    def get(key)
      value = spark_conf.get(key.to_s)

      case TYPES[key]
      when :boolean
        parse_boolean(value)
      when :integer
        parse_integer(value)
      else
        value
      end
    rescue
      nil
    end

    def get_all
      Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
    end

    def contains?(key)
      spark_conf.contains(key.to_s)
    end

    def set(key, value)
      check_read_only
      spark_conf.set(key.to_s, value.to_s)
    end

    def set_app_name(name)
      set('spark.app.name', name)
    end

    def set_master(master)
      set('spark.master', master)
    end

    def parse_boolean(value)
      case value
      when 'true'
        true
      when 'false'
        false
      end
    end

    def parse_integer(value)
      value.to_i
    end

    # =============================================================================
    # Defaults

    def set_default
      set_app_name('RubySpark')
      set_master('local[*]')
      set('spark.ruby.driver_home', Spark.home)
      set('spark.ruby.serializer', default_serializer)
      set('spark.ruby.serializer.compress', default_serializer_compress)
      set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
      set('spark.ruby.executor.uri', default_executor_uri)
      set('spark.ruby.executor.command', default_executor_command)
      set('spark.ruby.executor.options', default_executor_options)
      set('spark.ruby.worker.type', default_worker_type)
      load_executor_envs
    end

    def default_serializer
      ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
    end

    def default_serializer_compress
      ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
    end

    def default_serializer_batch_size
      ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
    end

    # Ruby executor.
    #
    # == Options:
    # nil::
    #   The system gem (ruby-spark) is loaded.
    #
    # other::
    #   Path of the library which will be used.
    #   The current ruby-spark gem is used.
    #   (default)
    #
    def default_executor_uri
      ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
    end

    # Command template which is applied when Scala wants to create a Ruby
    # process (e.g. master, home request). The command is represented by '%s'.
    #
    # == Example:
    #   bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
    #
    def default_executor_command
      ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
    end

    # Options for every worker.
    #
    # == Examples:
    #   -J-Xmx512m
    #
    def default_executor_options
      ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
    end

    # Type of worker.
    #
    # == Options:
    # process:: (default)
    # thread:: (experimental)
    #
    def default_worker_type
      ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
    end

    # Load environment variables for the executor from ENV.
    #
    # == Examples:
    #   SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
    #   SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
    #
    def load_executor_envs
      prefix = 'SPARK_RUBY_EXECUTOR_ENV_'

      envs = ENV.select{|key, _| key.start_with?(prefix)}
      envs.each do |key, value|
        key = key.dup # ENV keys are frozen
        key.slice!(0, prefix.size)

        set("spark.ruby.executor.env.#{key}", value)
      end
    end

    # Aliases
    alias_method :getAll, :get_all
    alias_method :setAppName, :set_app_name
    alias_method :setMaster, :set_master

    private

      def check_read_only
        if read_only?
          raise Spark::ConfigurationError, 'Configuration is read only'
        end
      end

  end
end
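A short usage sketch of the configuration API above. It assumes the Spark.config accessor that Context uses below; everything must be set before the context is started, because set calls check_read_only and raises once Spark is running. The specific keys and values are only examples.

    require 'ruby-spark'

    # Executor env variables are harvested from ENV by load_executor_envs,
    # so they have to be set before the configuration is first built.
    ENV['SPARK_RUBY_EXECUTOR_ENV_TZ'] = 'UTC'

    conf = Spark.config                               # Spark::Config instance
    conf.set_app_name('WordCount')                    # alias: setAppName
    conf.set_master('local[2]')
    conf['spark.ruby.serializer.batch_size'] = 2048   # same as conf.set(...)

    conf.get('spark.ruby.serializer.compress')        # parsed as a boolean via TYPES
    conf.valid!                                       # raises Spark::ConfigurationError on bad config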
data/lib/spark/constant.rb
ADDED
@@ -0,0 +1,14 @@
module Spark
  # Common constants for Ruby and Spark
  module Constant
    DATA_EOF = -2
    WORKER_ERROR = -1
    WORKER_DONE = 0
    CREATE_WORKER = 1
    KILL_WORKER = 2
    KILL_WORKER_AND_WAIT = 3
    SUCCESSFULLY_KILLED = 4
    UNSUCCESSFUL_KILLING = 5
    ACCUMULATOR_ACK = 6
  end
end
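These integers appear to be control codes shared between the Ruby worker processes and the Scala side (compare RubyWorker.scala and data/lib/spark/worker/master.rb in the file list above). Purely as an illustration of their intent, and not the gem's actual wire protocol, a dispatcher over the codes could look like this:

    require 'ruby-spark'

    # Illustrative only: the real handling lives in lib/spark/worker/master.rb.
    def describe(code)
      case code
      when Spark::Constant::CREATE_WORKER        then 'spawn a new worker'
      when Spark::Constant::KILL_WORKER          then 'kill the worker'
      when Spark::Constant::KILL_WORKER_AND_WAIT then 'kill the worker and confirm'
      when Spark::Constant::WORKER_DONE          then 'worker finished its task'
      when Spark::Constant::WORKER_ERROR         then 'worker failed'
      else "unhandled code #{code}"
      end
    end

    puts describe(Spark::Constant::KILL_WORKER_AND_WAIT)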
data/lib/spark/context.rb
ADDED
@@ -0,0 +1,322 @@
# Necessary libraries
Spark.load_lib

module Spark
  ##
  # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
  # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
  #
  class Context

    include Spark::Helper::System
    include Spark::Helper::Parser
    include Spark::Helper::Logger

    attr_reader :jcontext, :jaccumulator, :temp_dir

    # Constructor for the Ruby context. Configuration is taken automatically
    # from Spark. Config will be set to defaults if the user starts the
    # context first.
    #
    def initialize
      Spark.config.valid!
      @jcontext = JavaSparkContext.new(Spark.config.spark_conf)
      @jcontext.addJar(Spark.ruby_spark_jar)

      # Does not work on 1.2
      # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))

      spark_local_dir = JUtils.getLocalDir(sc.conf)
      @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath

      accum_server = Spark::Accumulator::Server
      accum_server.start
      @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))

      log_info("Ruby accumulator server is running on port #{accum_server.port}")

      set_call_site('Ruby') # description of stage
    end

    def stop
      Spark::Accumulator::Server.stop
      log_info('Ruby accumulator server was stopped')
      @jcontext.stop
    end

    def sc
      @jcontext.sc
    end

    def ui
      sc.ui
    end

    # Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)
    #
    def default_parallelism
      sc.defaultParallelism
    end

    # Default serializer
    #
    # Batch -> Compress -> Basic
    #
    def default_serializer
      # Basic
      serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new

      # Compress
      if config('spark.ruby.serializer.compress')
        serializer = Spark::Serializer.compressed(serializer)
      end

      # Batching
      batch_size = default_batch_size
      if batch_size == 'auto'
        serializer = Spark::Serializer.auto_batched(serializer)
      else
        serializer = Spark::Serializer.batched(serializer, batch_size)
      end

      # Finally, the resulting "container" wraps the whole serializer chain
      serializer
    end

    def default_batch_size
      size = config('spark.ruby.serializer.batch_size').to_i
      if size >= 1
        size
      else
        'auto'
      end
    end

    # Set a local property that affects jobs submitted from this thread, such as the
    # Spark fair scheduler pool.
    #
    def set_local_property(key, value)
      jcontext.setLocalProperty(key, value)
    end

    # Get a local property set in this thread, or null if it is missing
    #
    def get_local_property(key)
      jcontext.getLocalProperty(key)
    end

    # Support function for API backtraces.
    #
    def set_call_site(site)
      set_local_property('externalCallSite', site)
    end

    # Capture the current user callsite and return a formatted version for printing. If the user
    # has overridden the call site, this will return the user's version.
    #
    def get_call_site
      jcontext.getCallSite
    end

    # Return a copy of this SparkContext's configuration. The configuration *cannot*
    # be changed at runtime.
    #
    def config(key=nil)
      if key
        Spark.config.get(key)
      else
        Spark.config
      end
    end

    # Add a file to be downloaded with this Spark job on every node.
    # The path of file passed can be either a local file, a file in HDFS
    # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
    #
    # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
    # filename to find its download location.
    #
    # == Example:
    #   `echo 10 > test.txt`
    #
    #   $sc.add_file('test.txt')
    #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
    #   # => [0, 10, 20, 30, 40, 50]
    #
    def add_file(*files)
      files.each do |file|
        sc.addFile(file)
      end
    end

    # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
    # object for reading it in distributed functions. The variable will
    # be sent to each cluster only once.
    #
    # == Example:
    #   broadcast1 = $sc.broadcast('a')
    #   broadcast2 = $sc.broadcast('b')
    #
    #   rdd = $sc.parallelize(0..5, 4)
    #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
    #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
    #   rdd.collect
    #   # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
    #
    def broadcast(value)
      Spark::Broadcast.new(self, value)
    end

    # Create an Accumulator with the given initial value, using a given
    # accum_param helper object to define how to add values of the
    # data type if provided.
    #
    # == Example:
    #   accum = $sc.accumulator(7)
    #
    #   rdd = $sc.parallelize(0..5, 4)
    #   rdd = rdd.bind(accum: accum)
    #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
    #   rdd = rdd.collect
    #
    #   accum.value
    #   # => 11
    #
    def accumulator(value, accum_param=:+, zero_value=0)
      Spark::Accumulator.new(value, accum_param, zero_value)
    end

    # Distribute a local Ruby collection to form an RDD.
    # The direct method can be slow, so be careful: this method updates the data in place.
    #
    # == Parameters:
    # data:: Range or Array
    # num_slices:: number of slices
    # serializer:: custom serializer (default: serializer based on configuration)
    #
    # == Examples:
    #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
    #   #=> [1, 2, 3]
    #
    #   $sc.parallelize(1..3).map(:to_s).collect
    #   #=> ["1", "2", "3"]
    #
    def parallelize(data, num_slices=nil, serializer=nil)
      num_slices ||= default_parallelism
      serializer ||= default_serializer

      serializer.check_each(data)

      # Through file
      file = Tempfile.new('to_parallelize', temp_dir)
      serializer.dump_to_io(data, file)
      file.close # not unlink
      jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)

      Spark::RDD.new(jrdd, self, serializer)
    ensure
      file && file.unlink
    end

    # Read a text file from HDFS, a local file system (available on all nodes), or any
    # Hadoop-supported file system URI, and return it as an RDD of Strings.
    #
    # == Example:
    #   f = Tempfile.new("test")
    #   f.puts("1")
    #   f.puts("2")
    #   f.close
    #
    #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
    #   # => [1, 2]
    #
    def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
      min_partitions ||= default_parallelism
      serializer ||= default_serializer
      deserializer = Spark::Serializer.build { __text__(encoding) }

      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
    end

    # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
    # Hadoop-supported file system URI. Each file is read as a single record and returned in a
    # key-value pair, where the key is the path of each file, the value is the content of each file.
    #
    # == Example:
    #   dir = Dir.mktmpdir
    #   f1 = Tempfile.new("test1", dir)
    #   f2 = Tempfile.new("test2", dir)
    #   f1.puts("1"); f1.puts("2");
    #   f2.puts("3"); f2.puts("4");
    #   f1.close
    #   f2.close
    #
    #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
    #   # => ["1", "2", "3", "4"]
    #
    def whole_text_files(path, min_partitions=nil, serializer=nil)
      min_partitions ||= default_parallelism
      serializer ||= default_serializer
      deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }

      Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
    end

    # Executes the given partition function f on the specified set of partitions,
    # returning the result as an array of elements.
    #
    # If partitions is not specified, this will run over all partitions.
    #
    # == Example:
    #   rdd = $sc.parallelize(0..10, 5)
    #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
    #   # => ["[0, 1]", "[4, 5]"]
    #
    def run_job(rdd, f, partitions=nil, allow_local=false)
      run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
    end

    # Execute the given command on a specific set of partitions.
    #
    def run_job_with_command(rdd, partitions, allow_local, command, *args)
      if !partitions.nil? && !partitions.is_a?(Array)
        raise Spark::ContextError, 'Partitions must be nil or Array'
      end

      partitions_size = rdd.partitions_size

      # Execute all parts
      if partitions.nil?
        partitions = (0...partitions_size).to_a
      end

      # Can happen when you use coalesce
      partitions.delete_if {|part| part >= partitions_size}

      # Rjb represents Fixnum as Integer but JRuby as Long
      partitions = to_java_array_list(convert_to_java_int(partitions))

      # File for result
      file = Tempfile.new('collect', temp_dir)

      mapped = rdd.new_rdd_from_command(command, *args)
      RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)

      mapped.collect_from_file(file)
    end


    # Aliases
    alias_method :textFile, :text_file
    alias_method :wholeTextFiles, :whole_text_files
    alias_method :defaultParallelism, :default_parallelism
    alias_method :setLocalProperty, :set_local_property
    alias_method :getLocalProperty, :get_local_property
    alias_method :setCallSite, :set_call_site
    alias_method :getCallSite, :get_call_site
    alias_method :runJob, :run_job
    alias_method :runJobWithCommand, :run_job_with_command
    alias_method :addFile, :add_file

  end
end
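Taken together, here is a minimal end-to-end sketch of the Context API documented above. Spark.start and Spark.sc are assumptions about the bootstrap in data/lib/spark.rb, which is not shown in this diff; parallelize, broadcast, bind, map and collect follow the doc comments above.

    require 'ruby-spark'

    Spark.start                    # assumed bootstrap; creates the Spark::Context
    sc = Spark.sc                  # assumed accessor for the running context

    factor = sc.broadcast(10)

    rdd = sc.parallelize(1..10, 2)
    rdd = rdd.bind(factor: factor)                   # make the broadcast visible to workers
    rdd = rdd.map(lambda { |x| x * factor.value })

    p rdd.collect                  # => [10, 20, 30, ..., 100]

    sc.stop                        # Context#stop, defined above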