ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/lib/spark/accumulator.rb ADDED
@@ -0,0 +1,260 @@
+ module Spark
+   ##
+   # A shared variable that can be accumulated, i.e., has a commutative and associative "add"
+   # operation. Worker tasks on a Spark cluster can add values to an Accumulator with the `+=`
+   # operator, but only the driver program is allowed to access its value, using `value`.
+   # Updates from the workers get propagated automatically to the driver program.
+   #
+   # == Arguments:
+   # value::
+   #   Initial value for the accumulator. This value is stored only on the driver process.
+   #
+   # accum_param::
+   #   How to merge two values on the worker or driver process.
+   #   Symbol or Proc (or String)
+   #
+   # zero_value::
+   #   Initial value for each worker process
+   #
+   #
+   # == Examples:
+   #
+   #   accum1 = $sc.accumulator(1)
+   #   accum2 = $sc.accumulator(2, :*, 1)
+   #   accum3 = $sc.accumulator(3, lambda{|max, val| val > max ? val : max})
+   #
+   #   accum1 += 1
+   #
+   #   accum2.add(2)
+   #   accum2.add(2)
+   #   accum2.add(2)
+   #
+   #   accum3.add(9)
+   #   accum3.add(6)
+   #   accum3.add(7)
+   #
+   #   accum1.value # => 2
+   #   accum2.value # => 16
+   #   accum3.value # => 9
+   #
+   #   func = Proc.new do |_, index|
+   #     accum1.add(1)
+   #     accum2.add(2)
+   #     accum3.add(index * 10)
+   #   end
+   #
+   #   rdd = $sc.parallelize(0..4, 4)
+   #   rdd = rdd.bind(accum1: accum1, accum2: accum2, accum3: accum3)
+   #   rdd = rdd.map_partitions_with_index(func)
+   #   rdd.collect
+   #
+   #   accum1.value # => 6
+   #   accum2.value # => 256
+   #   accum3.value # => 30
+   #
+   class Accumulator
+
+     attr_reader :id, :value, :accum_param, :zero_value
+
+     @@instances = {}
+     @@changed = []
+
+     SUPPORTED_SYMBOLS = [:+, :-, :*, :/, :**]
+
+
+     # =========================================================================
+     # Creating and selecting Spark::Accumulator
+
+     def initialize(value, accum_param=:+, zero_value=0)
+       @id = object_id
+       @value = value
+       @accum_param = accum_param
+       @zero_value = zero_value
+       @driver = true
+
+       valid_accum_param
+
+       @@instances[@id] = self
+     end
+
+     def self.changed
+       @@changed
+     end
+
+     def self.instances
+       @@instances
+     end
+
+     def valid_accum_param
+       if @accum_param.is_a?(Symbol)
+         raise Spark::AccumulatorError, "Unsupported symbol #{@accum_param}" unless SUPPORTED_SYMBOLS.include?(@accum_param)
+         @serialized_accum_param = @accum_param
+         return
+       end
+
+       if @accum_param.is_a?(Proc)
+         begin
+           @serialized_accum_param = @accum_param.to_source
+           return
+         rescue
+           raise Spark::SerializeError, 'Proc cannot be serialized. Use a String instead.'
+         end
+       end
+
+       if @accum_param.is_a?(String)
+         @serialized_accum_param = @accum_param
+         @accum_param = eval(@accum_param)
+
+         unless @accum_param.is_a?(Proc)
+           raise Spark::SerializeError, 'Your param is not a Proc.'
+         end
+
+         return
+       end
+
+       raise Spark::AccumulatorError, 'Unsupported param. Use Symbol, Proc or String.'
+     end
+
+     # Driver process or worker
+     def driver?
+       @driver
+     end
+
+
+     # =========================================================================
+     # Operations
+
+     def add(term)
+       if !driver? && !@@changed.include?(self)
+         @@changed << self
+       end
+
+       if @accum_param.is_a?(Proc)
+         @value = @accum_param.call(@value, term)
+       else
+         add_by_symbol(term)
+       end
+     end
+
+     def +(term)
+       add(term)
+       self
+     end
+
+     def add_by_symbol(term)
+       case @accum_param
+       when :+
+         @value += term
+       when :-
+         @value -= term
+       when :*
+         @value *= term
+       when :/
+         @value /= term
+       when :**
+         @value **= term
+       end
+     end
+
+
+     # =========================================================================
+     # Dump and load
+
+     def marshal_dump
+       [@id, @zero_value, @serialized_accum_param]
+     end
+
+     def marshal_load(array)
+       @id, @zero_value, @serialized_accum_param = array
+
+       @value = @zero_value
+       @driver = false
+       load_accum_param
+     end
+
+     def load_accum_param
+       if @serialized_accum_param.is_a?(String)
+         @accum_param = eval(@serialized_accum_param)
+       else
+         @accum_param = @serialized_accum_param
+       end
+     end
+
+   end
+ end
+
+ # =============================================================================
+ # Server for handling Accumulator updates
+ #
+ module Spark
+   class Accumulator
+     class Server
+
+       attr_reader :server, :host, :port
+
+       def self.start
+         @instance ||= Spark::Accumulator::Server.new
+       end
+
+       def self.stop
+         @instance && @instance.stop
+       end
+
+       def self.host
+         start
+         @instance.host
+       end
+
+       def self.port
+         start
+         @instance.port
+       end
+
+       def initialize
+         @server = TCPServer.new(0)
+         @host = @server.hostname
+         @port = @server.port
+
+         @threads = []
+         handle_accept
+       end
+
+       def stop
+         @threads.each(&:kill)
+       rescue
+         nil
+       end
+
+       def handle_accept
+         @threads << Thread.new do
+           loop {
+             handle_connection(@server.accept)
+           }
+         end
+
+       end
+
+       def handle_connection(socket)
+         @threads << Thread.new do
+           until socket.closed?
+             count = socket.read_int
+             count.times do
+               data = socket.read_data
+               accum = Spark::Accumulator.instances[data[0]]
+               if accum
+                 accum.add(data[1])
+               else
+                 Spark.logger.warn("Accumulator with id #{data[0]} does not exist.")
+               end
+             end
+
+             # http://stackoverflow.com/questions/28560133/ruby-server-java-scala-client-deadlock
+             # socket.write_int(Spark::Constant::ACCUMULATOR_ACK)
+           end
+
+         end
+       end
+
+     end
+   end
+ end
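Because `valid_accum_param` can only serialize a Proc via `to_source` (which may fail, as the rescue branch above shows), the merge function can also be given as a String that driver and workers both `eval` into a Proc. A minimal sketch of the String form, assuming a SparkContext in `$sc` as in the docstring examples; the names `accum`, `func` and `rdd` are illustrative only:

    # Merge function passed as a String: eval'd into a Proc on the driver and on
    # each worker, so no Proc serialization is needed.
    accum = $sc.accumulator(0, 'lambda{|memo, term| memo > term ? memo : term}', 0)

    func = Proc.new do |_, index|
      accum.add(index * 10)
    end

    rdd = $sc.parallelize(0..4, 4)
    rdd = rdd.bind(accum: accum)
    rdd.map_partitions_with_index(func).collect

    accum.value # => 30 (each partition's value is reported back and merged on the driver)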
data/lib/spark/broadcast.rb ADDED
@@ -0,0 +1,98 @@
+ module Spark
+   ##
+   # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
+   # object for reading it in distributed functions. The variable will
+   # be sent to each node only once.
+   #
+   # == Example:
+   #
+   #   broadcast1 = $sc.broadcast('a')
+   #   broadcast2 = $sc.broadcast('b')
+   #   broadcast3 = $sc.broadcast([1,2,3])
+   #
+   #   func = Proc.new do |part, index|
+   #     [
+   #       broadcast1.value * index,
+   #       broadcast2.value * index,
+   #       broadcast3.value.reduce(:+)
+   #     ]
+   #   end
+   #
+   #   rdd = $sc.parallelize(0..5, 4)
+   #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2, broadcast3: broadcast3)
+   #   rdd = rdd.map_partitions_with_index(func)
+   #   rdd.collect
+   #   # => ["", "", 6, "a", "b", 6, "aa", "bb", 6, "aaa", "bbb", 6]
+   #
+   class Broadcast
+
+     LOADED       = 0 # id, value, path
+     NOT_LOADED   = 1 # id, path
+     WITHOUT_PATH = 2 # id
+
+     attr_reader :id, :state, :path, :jbroadcast
+
+     @@registered = {}
+
+     # =========================================================================
+     # Creating broadcast for SparkContext
+
+     # Create a new Broadcast and dump the value to disk
+     #
+     #   b = $sc.broadcast('a')
+     #
+     #   b.value # => 'a'
+     #   b.path
+     #   b.jbroadcast
+     #
+     def initialize(sc, value)
+       @id = object_id
+       @value = value
+       @state = LOADED
+
+       file = Tempfile.create('broadcast', sc.temp_dir)
+       file.binmode
+       file.write(Marshal.dump(value))
+       file.close
+
+       @path = file.path
+       @jbroadcast = RubyRDD.readBroadcastFromFile(sc.jcontext, @path, Spark.jb.to_long(@id))
+
+       ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
+     end
+
+     def self.register(id, path)
+       @@registered[id] = path
+     end
+
+     def value
+       case state
+       when LOADED
+         @value
+       when NOT_LOADED
+         @value = Marshal.load(File.read(@path))
+         @state = LOADED
+         @value
+       when WITHOUT_PATH
+         @path = @@registered[id]
+
+         if @path
+           @state = NOT_LOADED
+           value
+         else
+           raise Spark::BroadcastError, "Broadcast #{@id} does not have a registered path."
+         end
+       end
+     end
+
+     def marshal_dump
+       @id
+     end
+
+     def marshal_load(id)
+       @id = id
+       @state = WITHOUT_PATH
+     end
+
+   end
+ end
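Note how little of a Broadcast actually travels: `marshal_dump` emits only the id, and the value is re-read lazily from a file whose path must first be registered via `Broadcast.register`. A small sketch of that round trip, assuming a SparkContext in `$sc`; calling `register` by hand here stands in for what the worker side does when it receives a broadcast:

    broadcast = $sc.broadcast([1, 2, 3])

    copy = Marshal.load(Marshal.dump(broadcast))        # only the id survives; state is WITHOUT_PATH
    Spark::Broadcast.register(copy.id, broadcast.path)  # normally done for the worker process

    copy.value  # => [1, 2, 3]  (path looked up, file unmarshalled, state becomes LOADED)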
data/lib/spark/build.rb ADDED
@@ -0,0 +1,43 @@
+ module Spark
+   module Build
+
+     DEFAULT_SCALA_VERSION  = '2.10.4'
+     DEFAULT_CORE_VERSION   = '2.10'
+     DEFAULT_SPARK_VERSION  = '1.3.0'
+     DEFAULT_HADOOP_VERSION = '1.0.4'
+
+     SBT       = 'sbt/sbt'
+     SBT_DEPS  = 'assemblyPackageDependency'
+     SBT_EXT   = 'package'
+     SBT_CLEAN = 'clean'
+
+     def self.build(options)
+       spark_home     = options.spark_home     || Spark.target_dir
+       scala_version  = options.scala_version  || DEFAULT_SCALA_VERSION
+       spark_core     = options.spark_core     || DEFAULT_CORE_VERSION
+       spark_version  = options.spark_version  || DEFAULT_SPARK_VERSION
+       hadoop_version = options.hadoop_version || DEFAULT_HADOOP_VERSION
+       only_ext       = options.only_ext
+
+       env = {
+         'SCALA_VERSION' => scala_version,
+         'SPARK_VERSION' => spark_version,
+         'SPARK_CORE_VERSION' => spark_core,
+         'HADOOP_VERSION' => hadoop_version,
+         'SPARK_HOME' => spark_home
+       }
+
+       cmd = [SBT]
+       cmd << SBT_EXT
+       cmd << SBT_DEPS unless only_ext
+       cmd << SBT_CLEAN unless $debug
+
+       Dir.chdir(Spark.spark_ext_dir) do
+         unless Kernel.system(env, cmd.join(' '))
+           raise Spark::BuildError, 'Spark cannot be assembled.'
+         end
+       end
+     end
+
+   end
+ end
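`Spark::Build.build` only needs an object that responds to the option names used above (normally the Commander options object from cli.rb). A hedged sketch of calling it outside the CLI, with OpenStruct standing in for that object; unset fields fall back to the DEFAULT_* constants:

    require 'ostruct'

    # Runs roughly: sbt/sbt package assemblyPackageDependency clean
    # inside Spark.spark_ext_dir, with SCALA_VERSION, SPARK_VERSION, etc. in the environment.
    Spark::Build.build(OpenStruct.new(spark_version: '1.3.0', hadoop_version: '1.0.4'))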
data/lib/spark/cli.rb ADDED
@@ -0,0 +1,169 @@
+ require 'commander'
+
+ module Commander
+   module UI
+     # Disable paging
+     # for 'classic' help
+     def self.enable_paging
+     end
+   end
+ end
+
+ module Spark
+   class CLI
+     include Commander::Methods
+
+     IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
+     IRB_HISTORY_SIZE = 100
+
+     def run
+       program :name, 'RubySpark'
+       program :version, Spark::VERSION
+       program :description, 'Ruby wrapper for Spark'
+
+       global_option('-d', '--debug', 'Log messages to stdout'){ $debug = true }
+       default_command :help
+
+
+       # Build ---------------------------------------------------------------
+       command :build do |c|
+         c.syntax = 'build [options]'
+         c.description = 'Build spark and gem extensions'
+         c.option '--hadoop-version STRING', String, 'Version of hadoop which will be stored with the SPARK'
+         c.option '--spark-home STRING', String, 'Directory where SPARK will be stored'
+         c.option '--spark-core STRING', String, 'Version of SPARK core'
+         c.option '--spark-version STRING', String, 'Version of SPARK'
+         c.option '--scala-version STRING', String, 'Version of Scala'
+         c.option '--only-ext', 'Build only the extension (skip the Spark assembly)'
+
+         c.action do |args, options|
+           options.default hadoop_version: Spark::Build::DEFAULT_HADOOP_VERSION,
+                           spark_home: Spark.target_dir,
+                           spark_core: Spark::Build::DEFAULT_CORE_VERSION,
+                           spark_version: Spark::Build::DEFAULT_SPARK_VERSION,
+                           scala_version: Spark::Build::DEFAULT_SCALA_VERSION,
+                           only_ext: false
+
+           Spark::Build.build(options)
+           puts
+           puts 'Everything is OK'
+         end
+       end
+       alias_command :install, :build
+
+
+       # Pry -------------------------------------------------------------------
+       command :pry do |c|
+         c.syntax = 'pry [options]'
+         c.description = 'Start ruby shell for spark'
+         c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
+         c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
+         c.option '--[no-]start', 'Start SPARK immediately'
+         c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
+
+         c.action do |args, options|
+           options.default start: true, logger: true
+
+           Spark.load_lib(options.spark_home)
+           Spark::Logger.disable unless options.logger
+
+           Spark.config do
+             set_app_name 'Pry RubySpark'
+           end
+
+           Spark.config.from_file(options.properties_file)
+
+           if options.start
+             # Load Java and Spark
+             Spark.start
+             $sc = Spark.context
+
+             Spark.print_logo('Spark context is loaded as $sc')
+           else
+             Spark.print_logo('You can start Spark with Spark.start')
+           end
+
+           # Load Pry
+           require 'pry'
+           Pry.start
+         end
+       end
+       alias_command :shell, :pry
+
+
+       # IRB -------------------------------------------------------------------
+       command :irb do |c|
+         c.syntax = 'irb [options]'
+         c.description = 'Start ruby shell for spark'
+         c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
+         c.option '--[no-]start', 'Start SPARK immediately'
+         c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
+
+         c.action do |args, options|
+           options.default start: true, logger: true
+
+           Spark.load_lib(options.spark_home)
+           Spark::Logger.disable unless options.logger
+
+           Spark.config do
+             set_app_name 'Pry RubySpark'
+           end
+
+           if options.start
+             # Load Java and Spark
+             Spark.start
+             $sc = Spark.context
+
+             Spark.print_logo('Spark context is loaded as $sc')
+           else
+             Spark.print_logo('You can start Spark with Spark.start')
+           end
+
+           # Load IRB
+           require 'irb'
+           require 'irb/completion'
+           require 'irb/ext/save-history'
+
+           begin
+             file = File.expand_path(IRB_HISTORY_FILE)
+             if File.exists?(file)
+               lines = IO.readlines(file).collect { |line| line.chomp }
+               Readline::HISTORY.push(*lines)
+             end
+             Kernel.at_exit do
+               lines = Readline::HISTORY.to_a.reverse.uniq.reverse
+               lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
+               File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
+             end
+           rescue
+           end
+
+           ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
+           ARGV.concat ['--readline', '--prompt-mode', 'simple']
+           IRB.start
+         end
+       end
+
+
+       # Home ------------------------------------------------------------------
+       command :home do |c|
+         c.action do |args, options|
+           puts Spark.home
+           exit(0)
+         end
+       end
+
+
+       # Ruby spark jar --------------------------------------------------------
+       command :ruby_spark_jar do |c|
+         c.action do |args, options|
+           puts Spark.ruby_spark_jar
+           exit(0)
+         end
+       end
+
+       run!
+     end
+
+   end
+ end
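Stripped of the Commander plumbing, the `pry` and `irb` commands reduce to the same driver bootstrap, which a standalone script can reproduce directly. A sketch; `spark_home` may be nil, exactly as the CLI passes it when --spark-home is omitted:

    require 'ruby-spark'

    spark_home = nil                 # or the directory given to --spark-home
    Spark.load_lib(spark_home)

    Spark.config do
      set_app_name 'My RubySpark app'
    end

    Spark.start
    $sc = Spark.context              # the same context the shell commands expose as $sc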