ruby-spark 1.0.0
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/lib/spark/accumulator.rb
ADDED
@@ -0,0 +1,260 @@
```ruby
module Spark
  ##
  # A shared variable that can be accumulated, i.e., has a commutative and associative "add"
  # operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=`
  # operator, but only the driver program is allowed to access its value, using value.
  # Updates from the workers get propagated automatically to the driver program.
  #
  # == Arguments:
  # value::
  #   Initial value for accumulator. This values is stored only on driver process
  #
  # accum_param::
  #   How merge 2 value on worker or driver process.
  #   Symbol or Proc (or String)
  #
  # zero_value::
  #   Initial value for worker process
  #
  #
  # == Examples:
  #
  #   accum1 = $sc.accumulator(1)
  #   accum2 = $sc.accumulator(2, :*, 1)
  #   accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max})
  #
  #   accum1 += 1
  #
  #   accum2.add(2)
  #   accum2.add(2)
  #   accum2.add(2)
  #
  #   accum3.add(9)
  #   accum3.add(6)
  #   accum3.add(7)
  #
  #   accum1.value # => 2
  #   accum2.value # => 16
  #   accum3.value # => 9
  #
  #   func = Proc.new do |_, index|
  #     accum1.add(1)
  #     accum2.add(2)
  #     accum3.add(index * 10)
  #   end
  #
  #   rdd = $sc.parallelize(0..4, 4)
  #   rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
  #   rdd = rdd.map_partitions_with_index(func)
  #   rdd.collect
  #
  #   accum1.value # => 6
  #   accum2.value # => 256
  #   accum3.value # => 30
  #
  class Accumulator

    attr_reader :id, :value, :accum_param, :zero_value

    @@instances = {}
    @@changed = []

    SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**]


    # =========================================================================
    # Creating and selecting Spark::Accumulator

    def initialize(value, accum_param=:+, zero_value=0)
      @id = object_id
      @value = value
      @accum_param = accum_param
      @zero_value = zero_value
      @driver = true

      valid_accum_param

      @@instances[@id] = self
    end

    def self.changed
      @@changed
    end

    def self.instances
      @@instances
    end

    def valid_accum_param
      if @accum_param.is_a?(Symbol)
        raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param)
        @serialized_accum_param = @accum_param
        return
      end

      if @accum_param.is_a?(Proc)
        begin
          @serialized_accum_param = @accum_param.to_source
          return
        rescue
          raise Spark::SerializeError, 'Proc can not be serialized. Use String instead.'
        end
      end

      if @accum_param.is_a?(String)
        @serialized_accum_param = @accum_param
        @accum_param = eval(@accum_param)

        unless @accum_param.is_a?(Proc)
          raise Spark::SerializeError, 'Yours param is not a Proc.'
        end

        return
      end

      raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.'
    end

    # Driver process or worker
    def driver?
      @driver
    end


    # =========================================================================
    # Operations

    def add(term)
      if !driver? && !@@changed.include?(self)
        @@changed << self
      end

      if @accum_param.is_a?(Proc)
        @value = @accum_param.call(@value, term)
      else
        add_by_symbol(term)
      end
    end

    def +(term)
      add(term)
      self
    end

    def add_by_symbol(term)
      case @accum_param
      when :+
        @value += term
      when :-
        @value -= term
      when :*
        @value *= term
      when :/
        @value /= term
      when :**
        @value **= term
      end
    end


    # =========================================================================
    # Dump and load

    def marshal_dump
      [@id, @zero_value, @serialized_accum_param]
    end

    def marshal_load(array)
      @id, @zero_value, @serialized_accum_param = array

      @value = @zero_value
      @driver = false
      load_accum_param
    end

    def load_accum_param
      if @serialized_accum_param.is_a?(String)
        @accum_param = eval(@serialized_accum_param)
      else
        @accum_param = @serialized_accum_param
      end
    end

  end
end

# =============================================================================
# Server for handeling Accumulator update
#
module Spark
  class Accumulator
    class Server

      attr_reader :server, :host, :port

      def self.start
        @instance ||= Spark::Accumulator::Server.new
      end

      def self.stop
        @instance && @instance.stop
      end

      def self.host
        start
        @instance.host
      end

      def self.port
        start
        @instance.port
      end

      def initialize
        @server = TCPServer.new(0)
        @host = @server.hostname
        @port = @server.port

        @threads = []
        handle_accept
      end

      def stop
        @threads.each(&:kill)
      rescue
        nil
      end

      def handle_accept
        @threads << Thread.new do
          loop {
            handle_connection(@server.accept)
          }
        end

      end

      def handle_connection(socket)
        @threads << Thread.new do
          until socket.closed?
            count = socket.read_int
            count.times do
              data = socket.read_data
              accum = Spark::Accumulator.instances[data[0]]
              if accum
                accum.add(data[1])
              else
                Spark.logger.warn("Accumulator with id #{data[0]} does not exist.")
              end
            end

            # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
            # socket.write_int(Spark::Constant::ACCUMULATOR_ACK)
          end

        end
      end

    end
  end
end
```
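The accumulator docstring above already carries a full usage example; condensed into a minimal sketch (assuming the gem is installed, `Spark.start` has been called, and `$sc` holds the context, as in that docstring):

```ruby
# Minimal accumulator sketch, following the docstring above.
accum = $sc.accumulator(0)                    # driver-side initial value

func = Proc.new do |_, index|
  accum.add(index)                            # each worker adds its partition index
end

rdd = $sc.parallelize(0..4, 4)                # 4 partitions => indexes 0..3
rdd = rdd.bind(accum: accum)                  # make the accumulator visible to workers
rdd.map_partitions_with_index(func).collect   # run the job

accum.value                                   # => 6 (0 + 1 + 2 + 3) once updates propagate back
```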
data/lib/spark/broadcast.rb
ADDED
@@ -0,0 +1,98 @@
```ruby
module Spark
  ##
  # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
  # object for reading it in distributed functions. The variable will
  # be sent to each cluster only once.
  #
  # == Example:
  #
  #   broadcast1 = $sc.broadcast('a')
  #   broadcast2 = $sc.broadcast('b')
  #   broadcast3 = $sc.broadcast([1,2,3])
  #
  #   func = Proc.new do |part, index|
  #     [
  #       broadcast1.value * index,
  #       broadcast2.value * index,
  #       broadcast3.value.reduce(:+)
  #     ]
  #   end
  #
  #   rdd = $sc.parallelize(0..5, 4)
  #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3)
  #   rdd = rdd.map_partitions_with_index(func)
  #   rdd.collect
  #   # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6]
  #
  class Broadcast

    LOADED = 0 # id, value, path
    NOT_LOADED = 1 # id, path
    WITHOUT_PATH = 2 # id

    attr_reader :id, :state, :path, :jbroadcast

    @@registered = {}

    # =========================================================================
    # Creating broadcast for SparkContext

    # Create new Broadcast and dump value to the disk
    #
    #   b = $sc.broadcast('a')
    #
    #   b.value # => 'a'
    #   b.path
    #   b.jbroadcast
    #
    def initialize(sc, value)
      @id = object_id
      @value = value
      @state = LOADED

      file = Tempfile.create('broadcast', sc.temp_dir)
      file.binmode
      file.write(Marshal.dump(value))
      file.close

      @path = file.path
      @jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id))

      ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
    end

    def self.register(id, path)
      @@registered[id] = path
    end

    def value
      case state
      when LOADED
        @value
      when NOT_LOADED
        @value = Marshal.load(File.read(@path))
        @state = LOADED
        @value
      when WITHOUT_PATH
        @path = @@registered[id]

        if @path
          @state = NOT_LOADED
          value
        else
          raise Spark::BroadcastError, "Broadcast #{@id} do not have registered path."
        end
      end
    end

    def marshal_dump
      @id
    end

    def marshal_load(id)
      @id = id
      @state = WITHOUT_PATH
    end

  end
end
```
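Condensing the broadcast docstring in the same way (same assumptions: installed gem, running context in `$sc`):

```ruby
# Minimal broadcast sketch, following the docstring above.
numbers = $sc.broadcast([1, 2, 3])            # marshalled to disk once on the driver

func = Proc.new do |_, _|
  [numbers.value.reduce(:+)]                  # workers lazily load the value via the registered path
end

rdd = $sc.parallelize(0..5, 4)
rdd = rdd.bind(numbers: numbers)              # ship the broadcast handle with the command
rdd.map_partitions_with_index(func).collect   # => [6, 6, 6, 6], one sum per partition
```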
data/lib/spark/build.rb
ADDED
@@ -0,0 +1,43 @@
```ruby
module Spark
  module Build

    DEFAULT_SCALA_VERSION = '2.10.4'
    DEFAULT_CORE_VERSION = '2.10'
    DEFAULT_SPARK_VERSION = '1.3.0'
    DEFAULT_HADOOP_VERSION = '1.0.4'

    SBT = 'sbt/sbt'
    SBT_DEPS = 'assemblyPackageDependency'
    SBT_EXT = 'package'
    SBT_CLEAN = 'clean'

    def self.build(options)
      spark_home = options.spark_home || Spark.target_dir
      scala_version = options.scala_version || DEFAULT_SCALA_VERSION
      spark_core = options.spark_core || DEFAULT_CORE_VERSION
      spark_version = options.spark_version || DEFAULT_SPARK_VERSION
      hadoop_version = options.hadoop_version || DEFAULT_HADOOP_VERSION
      only_ext = options.only_ext

      env = {
        'SCALA_VERSION' => scala_version,
        'SPARK_VERSION' => spark_version,
        'SPARK_CORE_VERSION' => spark_core,
        'HADOOP_VERSION' => hadoop_version,
        'SPARK_HOME' => spark_home
      }

      cmd = [SBT]
      cmd << SBT_EXT
      cmd << SBT_DEPS unless only_ext
      cmd << SBT_CLEAN unless $debug

      Dir.chdir(Spark.spark_ext_dir) do
        unless Kernel.system(env, cmd.join(' '))
          raise Spark::BuildError, 'Spark cannot be assembled.'
        end
      end
    end

  end
end
```
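`Spark::Build.build` only reads a handful of attributes from the options object Commander passes in, so the build can also be triggered programmatically. A hedged sketch, assuming the gem is installed and `require 'ruby-spark'` makes `Spark::Build` available, with `OpenStruct` standing in for Commander's options object (any attribute left unset returns nil and falls back to the `DEFAULT_*` constants above):

```ruby
require 'ostruct'
require 'ruby-spark'

# Hypothetical programmatic build; normally reached via the `build` command in
# data/lib/spark/cli.rb below.
options = OpenStruct.new(
  spark_version: '1.3.0',
  only_ext: false          # false => also build the dependency assembly
)

# Runs `sbt/sbt package assemblyPackageDependency clean` inside ext/spark.
Spark::Build.build(options)
```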
data/lib/spark/cli.rb
ADDED
@@ -0,0 +1,169 @@
```ruby
require 'commander'

module Commander
  module UI
    # Disable paging
    # for 'classic' help
    def self.enable_paging
    end
  end
end

module Spark
  class CLI
    include Commander::Methods

    IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
    IRB_HISTORY_SIZE = 100

    def run
      program :name, 'RubySpark'
      program :version, Spark::VERSION
      program :description, 'Ruby wrapper for Spark'

      global_option('-d', '--debug', 'Logging message to stdout'){ $debug = true }
      default_command :help


      # Build ---------------------------------------------------------------
      command :build do |c|
        c.syntax = 'build [options]'
        c.description = 'Build spark and gem extensions'
        c.option '--hadoop-version STRING', String, 'Version of hadoop which will stored with the SPARK'
        c.option '--spark-home STRING', String, 'Directory where SPARK will be stored'
        c.option '--spark-core STRING', String, 'Version of SPARK core'
        c.option '--spark-version STRING', String, 'Version of SPARK'
        c.option '--scala-version STRING', String, 'Version of Scala'
        c.option '--only-ext', 'Start SPARK immediately'

        c.action do |args, options|
          options.default hadoop_version: Spark::Build::DEFAULT_HADOOP_VERSION,
                          spark_home: Spark.target_dir,
                          spark_core: Spark::Build::DEFAULT_CORE_VERSION,
                          spark_version: Spark::Build::DEFAULT_SPARK_VERSION,
                          scala_version: Spark::Build::DEFAULT_SCALA_VERSION,
                          only_ext: false

          Spark::Build.build(options)
          puts
          puts 'Everything is OK'
        end
      end
      alias_command :install, :build


      # Pry -------------------------------------------------------------------
      command :pry do |c|
        c.syntax = 'pry [options]'
        c.description = 'Start ruby shell for spark'
        c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
        c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
        c.option '--[no-]start', 'Start SPARK immediately'
        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'

        c.action do |args, options|
          options.default start: true, logger: true

          Spark.load_lib(options.spark_home)
          Spark::Logger.disable unless options.logger

          Spark.config do
            set_app_name 'Pry RubySpark'
          end

          Spark.config.from_file(options.properties_file)

          if options.start
            # Load Java and Spark
            Spark.start
            $sc = Spark.context

            Spark.print_logo('Spark context is loaded as $sc')
          else
            Spark.print_logo('You can start Spark with Spark.start')
          end

          # Load Pry
          require 'pry'
          Pry.start
        end
      end
      alias_command :shell, :pry


      # IRB -------------------------------------------------------------------
      command :irb do |c|
        c.syntax = 'irb [options]'
        c.description = 'Start ruby shell for spark'
        c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
        c.option '--[no-]start', 'Start SPARK immediately'
        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'

        c.action do |args, options|
          options.default start: true, logger: true

          Spark.load_lib(options.spark_home)
          Spark::Logger.disable unless options.logger

          Spark.config do
            set_app_name 'Pry RubySpark'
          end

          if options.start
            # Load Java and Spark
            Spark.start
            $sc = Spark.context

            Spark.print_logo('Spark context is loaded as $sc')
          else
            Spark.print_logo('You can start Spark with Spark.start')
          end

          # Load IRB
          require 'irb'
          require 'irb/completion'
          require 'irb/ext/save-history'

          begin
            file = File.expand_path(IRB_HISTORY_FILE)
            if File.exists?(file)
              lines = IO.readlines(file).collect { |line| line.chomp }
              Readline::HISTORY.push(*lines)
            end
            Kernel.at_exit do
              lines = Readline::HISTORY.to_a.reverse.uniq.reverse
              lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
              File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
            end
          rescue
          end

          ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
          ARGV.concat ['--readline', '--prompt-mode', 'simple']
          IRB.start
        end
      end


      # Home ------------------------------------------------------------------
      command :home do |c|
        c.action do |args, options|
          puts Spark.home
          exit(0)
        end
      end


      # Ruby spark jar --------------------------------------------------------
      command :ruby_spark_jar do |c|
        c.action do |args, options|
          puts Spark.ruby_spark_jar
          exit(0)
        end
      end

      run!
    end

  end
end
```
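The gem also ships an 8-line `data/bin/ruby-spark` executable that is not included in this excerpt. A hypothetical entry point wired to the class above could look like the following; the actual script may differ:

```ruby
#!/usr/bin/env ruby
# Hypothetical launcher for the Commander-based CLI above; the real
# data/bin/ruby-spark (+8 lines) is not shown in this diff excerpt.
require 'ruby-spark'
require 'spark/cli'

Spark::CLI.new.run   # parses ARGV and dispatches to :build, :pry, :irb, :home, :ruby_spark_jar
```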