ruby-spark 1.1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0
data/lib/spark/command_validator.rb
ADDED
@@ -0,0 +1,34 @@
+module Spark
+  module CommandValidator
+
+    def validate(value, options)
+      validate_type(value, options[:type])
+    end
+
+    def valid?(value, options)
+      begin
+        validate(value, options)
+        return true
+      rescue
+        return false
+      end
+    end
+
+    def validate_type(value, types)
+      types = [types] if !types.is_a?(Array)
+
+      types.each do |type|
+        return if value.is_a?(type)
+      end
+
+      error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
+    end
+
+    def validate_size(array1, array2)
+      if array1.size != array2.size
+        error "Wrong number of arguments (#{array1.size} for #{array2.size})"
+      end
+    end
+
+  end
+end
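For orientation, a minimal usage sketch (not part of the diff above): a class that includes Spark::CommandValidator is expected to supply the `error` helper that validate_type and validate_size call. In the gem that includer is Spark::Command::Base; the ValidatedOptions class below and its ArgumentError-based `error` are illustrative assumptions only.

require 'ruby-spark'

# Hypothetical includer: the gem's real user of this module is
# Spark::Command::Base, which provides its own `error` helper.
class ValidatedOptions
  include Spark::CommandValidator

  # Raising is enough for valid? to rescue and return false.
  def error(message)
    raise ArgumentError, message
  end
end

checker = ValidatedOptions.new
checker.valid?(10, type: Numeric)            # => true
checker.valid?('10', type: [Numeric, Array]) # => false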
data/lib/spark/config.rb
ADDED
@@ -0,0 +1,238 @@
+# Necessary libraries
+Spark.load_lib
+
+module Spark
+  # Common configuration for RubySpark and Spark
+  class Config
+
+    include Spark::Helper::System
+
+    TYPES = {
+      'spark.shuffle.spill' => :boolean,
+      'spark.ruby.serializer.compress' => :boolean
+    }
+
+    # Initialize java SparkConf and load default configuration.
+    def initialize
+      @spark_conf = SparkConf.new(true)
+      set_default
+    end
+
+    def from_file(file)
+      check_read_only
+
+      if file && File.exist?(file)
+        file = File.expand_path(file)
+        RubyUtils.loadPropertiesFile(spark_conf, file)
+      end
+    end
+
+    def [](key)
+      get(key)
+    end
+
+    def []=(key, value)
+      set(key, value)
+    end
+
+    def spark_conf
+      if Spark.started?
+        # Get latest configuration
+        Spark.context.jcontext.conf
+      else
+        @spark_conf
+      end
+    end
+
+    def valid!
+      errors = []
+
+      if !contains?('spark.app.name')
+        errors << 'An application name must be set in your configuration.'
+      end
+
+      if !contains?('spark.master')
+        errors << 'A master URL must be set in your configuration.'
+      end
+
+      if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+        errors << 'Unknow serializer.'
+      end
+
+      scanned = get('spark.ruby.executor.command').scan('%s')
+
+      if scanned.size == 0
+        errors << "Executor command must contain '%s'."
+      end
+
+      if scanned.size > 1
+        errors << "Executor command can contain only one '%s'."
+      end
+
+      if errors.any?
+        errors.map!{|error| "- #{error}"}
+
+        raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
+      end
+    end
+
+    def read_only?
+      Spark.started?
+    end
+
+    # Rescue from NoSuchElementException
+    def get(key)
+      value = spark_conf.get(key.to_s)
+
+      case TYPES[key]
+      when :boolean
+        parse_boolean(value)
+      when :integer
+        parse_integer(value)
+      else
+        value
+      end
+    rescue
+      nil
+    end
+
+    def get_all
+      Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
+    end
+
+    def contains?(key)
+      spark_conf.contains(key.to_s)
+    end
+
+    def set(key, value)
+      check_read_only
+      spark_conf.set(key.to_s, value.to_s)
+    end
+
+    def set_app_name(name)
+      set('spark.app.name', name)
+    end
+
+    def set_master(master)
+      set('spark.master', master)
+    end
+
+    def parse_boolean(value)
+      case value
+      when 'true'
+        true
+      when 'false'
+        false
+      end
+    end
+
+    def parse_integer(value)
+      value.to_i
+    end
+
+    # =============================================================================
+    # Defaults
+
+    def set_default
+      set_app_name('RubySpark')
+      set_master('local[*]')
+      set('spark.ruby.driver_home', Spark.home)
+      set('spark.ruby.serializer', default_serializer)
+      set('spark.ruby.serializer.compress', default_serializer_compress)
+      set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
+      set('spark.ruby.executor.uri', default_executor_uri)
+      set('spark.ruby.executor.command', default_executor_command)
+      set('spark.ruby.executor.options', default_executor_options)
+      set('spark.ruby.worker.type', default_worker_type)
+      load_executor_envs
+    end
+
+    def default_serializer
+      ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
+    end
+
+    def default_serializer_compress
+      ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+    end
+
+    def default_serializer_batch_size
+      ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
+    end
+
+    # Ruby executor.
+    #
+    # == Options:
+    # nil::
+    #   System's gem is loaded (ruby-spark).
+    #
+    # other::
+    #   Path of library which will be used.
+    #   Current ruby-spark gem is used.
+    #   (default)
+    #
+    def default_executor_uri
+      ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
+    end
+
+    # Command template which is applied when scala want create a ruby
+    # process (e.g. master, home request). Command is represented by '%s'.
+    #
+    # == Example:
+    #   bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
+    #
+    def default_executor_command
+      ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
+    end
+
+    # Options for every worker.
+    #
+    # == Examples:
+    #   -J-Xmx512m
+    #
+    def default_executor_options
+      ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
+    end
+
+    # Type of worker.
+    #
+    # == Options:
+    # process:: (default)
+    # thread:: (experimental)
+    #
+    def default_worker_type
+      ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
+    end
+
+    # Load environment variables for executor from ENV.
+    #
+    # == Examples:
+    #   SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
+    #   SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
+    #
+    def load_executor_envs
+      prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
+
+      envs = ENV.select{|key, _| key.start_with?(prefix)}
+      envs.each do |key, value|
+        key = key.dup # ENV keys are frozen
+        key.slice!(0, prefix.size)
+
+        set("spark.ruby.executor.env.#{key}", value)
+      end
+    end
+
+    # Aliases
+    alias_method :getAll, :get_all
+    alias_method :setAppName, :set_app_name
+    alias_method :setMaster, :set_master
+
+    private
+
+    def check_read_only
+      if read_only?
+        raise Spark::ConfigurationError, 'Configuration is ready only'
+      end
+    end
+
+  end
+end
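A hedged sketch of how this configuration class is typically driven. Spark.config and Spark.start live in data/lib/spark.rb, which is not reproduced in this excerpt, so the bootstrap calls below are assumptions; the [], []=, valid! and read_only? calls are the methods shown above.

require 'ruby-spark'

config = Spark.config                 # assumed accessor for the Spark::Config instance
config['spark.app.name'] = 'MyApp'    # []= delegates to #set
config['spark.master']   = 'local[2]'
config.valid!                         # raises Spark::ConfigurationError if something is missing

Spark.start                           # assumed entry point from data/lib/spark.rb
config.read_only?                     # => true; further #set calls now raise Spark::ConfigurationError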
data/lib/spark/constant.rb
ADDED
@@ -0,0 +1,14 @@
+module Spark
+  # Commond constant for Ruby and Spark
+  module Constant
+    DATA_EOF = -2
+    WORKER_ERROR = -1
+    WORKER_DONE = 0
+    CREATE_WORKER = 1
+    KILL_WORKER = 2
+    KILL_WORKER_AND_WAIT = 3
+    SUCCESSFULLY_KILLED = 4
+    UNSUCCESSFUL_KILLING = 5
+    ACCUMULATOR_ACK = 6
+  end
+end
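These codes form the small control protocol shared by the Ruby worker scripts and the Scala side (see RubyWorker.scala and data/lib/spark/worker in the file list). The dispatcher below is purely illustrative and not part of the gem; it only shows the constants being matched against.

require 'ruby-spark'

# Illustrative dispatcher over the shared control codes (hypothetical helper).
def handle(signal)
  case signal
  when Spark::Constant::CREATE_WORKER        then :spawn_worker
  when Spark::Constant::KILL_WORKER          then :kill_worker
  when Spark::Constant::KILL_WORKER_AND_WAIT then :kill_worker_and_wait
  when Spark::Constant::WORKER_DONE          then :finished
  else :ignore
  end
end

handle(Spark::Constant::KILL_WORKER) # => :kill_worker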
data/lib/spark/context.rb
ADDED
@@ -0,0 +1,322 @@
+# Necessary libraries
+Spark.load_lib
+
+module Spark
+  ##
+  # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
+  # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
+  #
+  class Context
+
+    include Spark::Helper::System
+    include Spark::Helper::Parser
+    include Spark::Helper::Logger
+
+    attr_reader :jcontext, :jaccumulator, :temp_dir
+
+    # Constructor for Ruby context. Configuration is automatically is taken
+    # from Spark. Config will be automatically set to default if user start
+    # context first.
+    #
+    def initialize
+      Spark.config.valid!
+      @jcontext = JavaSparkContext.new(Spark.config.spark_conf)
+      @jcontext.addJar(Spark.ruby_spark_jar)
+
+      # Does not work on 1.2
+      # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
+
+      spark_local_dir = JUtils.getLocalDir(sc.conf)
+      @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
+
+      accum_server = Spark::Accumulator::Server
+      accum_server.start
+      @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
+
+      log_info("Ruby accumulator server is running on port #{accum_server.port}")
+
+      set_call_site('Ruby') # description of stage
+    end
+
+    def stop
+      Spark::Accumulator::Server.stop
+      log_info('Ruby accumulator server was stopped')
+      @jcontext.stop
+    end
+
+    def sc
+      @jcontext.sc
+    end
+
+    def ui
+      sc.ui
+    end
+
+    # Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)
+    #
+    def default_parallelism
+      sc.defaultParallelism
+    end
+
+    # Default serializer
+    #
+    # Batch -> Compress -> Basic
+    #
+    def default_serializer
+      # Basic
+      serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+      # Compress
+      if config('spark.ruby.serializer.compress')
+        serializer = Spark::Serializer.compressed(serializer)
+      end
+
+      # Bactching
+      batch_size = default_batch_size
+      if batch_size == 'auto'
+        serializer = Spark::Serializer.auto_batched(serializer)
+      else
+        serializer = Spark::Serializer.batched(serializer, batch_size)
+      end
+
+      # Finally, "container" contains serializers
+      serializer
+    end
+
+    def default_batch_size
+      size = config('spark.ruby.serializer.batch_size').to_i
+      if size >= 1
+        size
+      else
+        'auto'
+      end
+    end
+
+    # Set a local property that affects jobs submitted from this thread, such as the
+    # Spark fair scheduler pool.
+    #
+    def set_local_property(key, value)
+      jcontext.setLocalProperty(key, value)
+    end
+
+    # Get a local property set in this thread, or null if it is missing
+    #
+    def get_local_property(key)
+      jcontext.getLocalProperty(key)
+    end
+
+    # Support function for API backtraces.
+    #
+    def set_call_site(site)
+      set_local_property('externalCallSite', site)
+    end
+
+    # Capture the current user callsite and return a formatted version for printing. If the user
+    # has overridden the call site, this will return the user's version.
+    #
+    def get_call_site
+      jcontext.getCallSite
+    end
+
+    # Return a copy of this SparkContext's configuration. The configuration *cannot*
+    # be changed at runtime.
+    #
+    def config(key=nil)
+      if key
+        Spark.config.get(key)
+      else
+        Spark.config
+      end
+    end
+
+    # Add a file to be downloaded with this Spark job on every node.
+    # The path of file passed can be either a local file, a file in HDFS
+    # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
+    #
+    # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
+    # filename to find its download location.
+    #
+    # == Example:
+    #   `echo 10 > test.txt`
+    #
+    #   $sc.add_file('test.txt')
+    #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
+    #   # => [0, 10, 20, 30, 40, 50]
+    #
+    def add_file(*files)
+      files.each do |file|
+        sc.addFile(file)
+      end
+    end
+
+    # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
+    # object for reading it in distributed functions. The variable will
+    # be sent to each cluster only once.
+    #
+    # == Example:
+    #   broadcast1 = $sc.broadcast('a')
+    #   broadcast2 = $sc.broadcast('b')
+    #
+    #   rdd = $sc.parallelize(0..5, 4)
+    #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
+    #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
+    #   rdd.collect
+    #   # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
+    #
+    def broadcast(value)
+      Spark::Broadcast.new(self, value)
+    end
+
+    # Create an Accumulator with the given initial value, using a given
+    # accum_param helper object to define how to add values of the
+    # data type if provided.
+    #
+    # == Example:
+    #   accum = $sc.accumulator(7)
+    #
+    #   rdd = $sc.parallelize(0..5, 4)
+    #   rdd = rdd.bind(accum: accum)
+    #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
+    #   rdd = rdd.collect
+    #
+    #   accum.value
+    #   # => 11
+    #
+    def accumulator(value, accum_param=:+, zero_value=0)
+      Spark::Accumulator.new(value, accum_param, zero_value)
+    end
+
+    # Distribute a local Ruby collection to form an RDD
+    # Direct method can be slow so be careful, this method update data inplace
+    #
+    # == Parameters:
+    # data:: Range or Array
+    # num_slices:: number of slice
+    # serializer:: custom serializer (default: serializer based on configuration)
+    #
+    # == Examples:
+    #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
+    #   #=> [1, 2, 3]
+    #
+    #   $sc.parallelize(1..3).map(:to_s).collect
+    #   #=> ["1", "2", "3"]
+    #
+    def parallelize(data, num_slices=nil, serializer=nil)
+      num_slices ||= default_parallelism
+      serializer ||= default_serializer
+
+      serializer.check_each(data)
+
+      # Through file
+      file = Tempfile.new('to_parallelize', temp_dir)
+      serializer.dump_to_io(data, file)
+      file.close # not unlink
+      jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
+
+      Spark::RDD.new(jrdd, self, serializer)
+    ensure
+      file && file.unlink
+    end
+
+    # Read a text file from HDFS, a local file system (available on all nodes), or any
+    # Hadoop-supported file system URI, and return it as an RDD of Strings.
+    #
+    # == Example:
+    #   f = Tempfile.new("test")
+    #   f.puts("1")
+    #   f.puts("2")
+    #   f.close
+    #
+    #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
+    #   # => [1, 2]
+    #
+    def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
+      min_partitions ||= default_parallelism
+      serializer ||= default_serializer
+      deserializer = Spark::Serializer.build { __text__(encoding) }
+
+      Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
+    end
+
+    # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
+    # Hadoop-supported file system URI. Each file is read as a single record and returned in a
+    # key-value pair, where the key is the path of each file, the value is the content of each file.
+    #
+    # == Example:
+    #   dir = Dir.mktmpdir
+    #   f1 = Tempfile.new("test1", dir)
+    #   f2 = Tempfile.new("test2", dir)
+    #   f1.puts("1"); f1.puts("2");
+    #   f2.puts("3"); f2.puts("4");
+    #   f1.close
+    #   f2.close
+    #
+    #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
+    #   # => ["1", "2", "3", "4"]
+    #
+    def whole_text_files(path, min_partitions=nil, serializer=nil)
+      min_partitions ||= default_parallelism
+      serializer ||= default_serializer
+      deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
+
+      Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
+    end
+
+    # Executes the given partition function f on the specified set of partitions,
+    # returning the result as an array of elements.
+    #
+    # If partitions is not specified, this will run over all partitions.
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10, 5)
+    #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
+    #   # => ["[0, 1]", "[4, 5]"]
+    #
+    def run_job(rdd, f, partitions=nil, allow_local=false)
+      run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
+    end
+
+    # Execute the given command on specific set of partitions.
+    #
+    def run_job_with_command(rdd, partitions, allow_local, command, *args)
+      if !partitions.nil? && !partitions.is_a?(Array)
+        raise Spark::ContextError, 'Partitions must be nil or Array'
+      end
+
+      partitions_size = rdd.partitions_size
+
+      # Execute all parts
+      if partitions.nil?
+        partitions = (0...partitions_size).to_a
+      end
+
+      # Can happend when you use coalesce
+      partitions.delete_if {|part| part >= partitions_size}
+
+      # Rjb represent Fixnum as Integer but Jruby as Long
+      partitions = to_java_array_list(convert_to_java_int(partitions))
+
+      # File for result
+      file = Tempfile.new('collect', temp_dir)
+
+      mapped = rdd.new_rdd_from_command(command, *args)
+      RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+      mapped.collect_from_file(file)
+    end
+
+
+    # Aliases
+    alias_method :textFile, :text_file
+    alias_method :wholeTextFiles, :whole_text_files
+    alias_method :defaultParallelism, :default_parallelism
+    alias_method :setLocalProperty, :set_local_property
+    alias_method :getLocalProperty, :get_local_property
+    alias_method :setCallSite, :set_call_site
+    alias_method :getCallSite, :get_call_site
+    alias_method :runJob, :run_job
+    alias_method :runJobWithCommand, :run_job_with_command
+    alias_method :addFile, :add_file
+
+  end
+end
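To tie the Context API together, here is a hedged end-to-end sketch in the style of the doc comments above. Spark.start and Spark.sc come from data/lib/spark.rb, which is not reproduced in this excerpt, so that bootstrap is an assumption; bind, map and collect are RDD methods referenced by the examples above but defined in data/lib/spark/rdd.rb.

require 'ruby-spark'

Spark.start          # assumed bootstrap from data/lib/spark.rb
$sc = Spark.sc       # the doc comments above use a global $sc

# parallelize + map, as in the inline examples
$sc.parallelize(1..5).map(lambda{|x| x * x}).collect
# => [1, 4, 9, 16, 25]

# broadcast values and accumulators must be bound to the RDD before use
factor  = $sc.broadcast(10)
counter = $sc.accumulator(0)

rdd = $sc.parallelize(1..4, 2).bind(factor: factor, counter: counter)
rdd.map(lambda{|x| counter.add(1); x * factor.value}).collect
# => [10, 20, 30, 40]

$sc.stop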