ruby-spark 1.0.0

Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/ext/spark/src/main/scala/RubyUtils.scala ADDED
@@ -0,0 +1,15 @@
+ package org.apache.spark.api.ruby
+
+ import org.apache.spark.util._
+ import org.apache.spark.{SparkConf, Logging}
+
+ object RubyUtils extends Logging {
+
+   def loadPropertiesFile(conf: SparkConf, path: String): String = {
+     Utils.getPropertiesFromFile(path).foreach {
+       case (key, value) => conf.set(key, value)
+     }
+     path
+   }
+
+ }
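RubyUtils.loadPropertiesFile simply copies every key/value pair from a Spark properties file into the SparkConf and returns the path. For readers on the Ruby side, a rough analog is sketched below; it is not part of the gem, the "key value" file format and the path are assumptions, and only Spark.config.set is taken from data/lib/spark.rb further down.

    # Hypothetical helper: apply "key value" pairs from a Spark-style
    # properties file to the Ruby-side configuration.
    def load_properties_file(path)
      File.readlines(path).each do |line|
        line = line.strip
        next if line.empty? || line.start_with?('#')   # skip blanks and comments
        key, value = line.split(/\s+/, 2)
        Spark.config.set(key, value)
      end
      path
    end

    load_properties_file('conf/spark-defaults.conf')   # hypothetical path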
data/ext/spark/src/main/scala/RubyWorker.scala ADDED
@@ -0,0 +1,257 @@
+ package org.apache.spark.api.ruby
+
+ import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
+ import java.net.{InetAddress, ServerSocket, Socket, SocketException}
+ import java.nio.file.Paths
+
+ import scala.collection.mutable
+ import scala.collection.JavaConversions._
+
+ import org.apache.spark._
+ import org.apache.spark.api.python.PythonRDD
+ import org.apache.spark.util.Utils
+ import org.apache.spark.util.RedirectThread
+
+
+ /* =================================================================================================
+  * Object RubyWorker
+  * =================================================================================================
+  *
+  * Create and store the server used for creating workers.
+  */
+
+ object RubyWorker extends Logging {
+
+   val PROCESS_WAIT_TIMEOUT = 10000
+
+   private var serverSocket: ServerSocket = null
+   private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
+   private var serverPort: Int = 0
+
+   private var master: ExecutedFileCommand = null
+   private var masterSocket: Socket = null
+   private var masterOutputStream: DataOutputStream = null
+   private var masterInputStream: DataInputStream = null
+
+   private var workers = new mutable.WeakHashMap[Socket, Long]()
+
+
+   /* ----------------------------------------------------------------------------------------------
+    * Create a new worker, but first check whether the SocketServer and the master process exist.
+    * If not, create them. A worker gets two chances to be created.
+    */
+
+   def create(env: SparkEnv): (Socket, Long) = {
+     synchronized {
+       // Create the server if it hasn't been started
+       createServer(env)
+
+       // Attempt to connect, restart and retry once if it fails
+       try {
+         createWorker
+       } catch {
+         case exc: SocketException =>
+           logWarning("Worker unexpectedly quit, attempting to restart")
+           createWorker
+       }
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * Create a worker through the master process. Return the new socket and id.
+    * Depending on spark.ruby.worker.type the id will be:
+    *   process: PID
+    *   thread: thread object id
+    */
+
+   def createWorker: (Socket, Long) = {
+     synchronized {
+       masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
+       var socket = serverSocket.accept()
+
+       var id = new DataInputStream(socket.getInputStream).readLong()
+       workers.put(socket, id)
+
+       (socket, id)
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * Create a SocketServer and bind it to localhost. The maximum number of queued connections
+    * is left at the default. If the server is created without an exception -> create the master.
+    */
+
+   private def createServer(env: SparkEnv){
+     synchronized {
+       // Already running?
+       if(serverSocket != null && masterSocket != null) {
+         return
+       }
+
+       try {
+         // Start the SocketServer for communication
+         serverSocket = new ServerSocket(0, 0, serverHost)
+         serverPort = serverSocket.getLocalPort
+
+         // Create a master for worker creations
+         createMaster(env)
+       } catch {
+         case e: Exception =>
+           throw new SparkException("There was a problem with creating a server", e)
+       }
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * At this point the SocketServer must already exist. The master process creates and kills workers.
+    * Creating workers from Java can be an expensive operation because a new process can
+    * get a copy of the address space.
+    */
+
+   private def createMaster(env: SparkEnv){
+     synchronized {
+       val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
+       val executorOptions = env.conf.get("spark.ruby.executor.options", "")
+       val commandTemplate = env.conf.get("spark.ruby.executor.command")
+       val workerType = env.conf.get("spark.ruby.worker.type")
+
+       // Where the root of ruby-spark is
+       var executorLocation = ""
+
+       if(isDriver){
+         // Use the worker from the currently active gem location
+         executorLocation = env.conf.get("spark.ruby.driver_home")
+       }
+       else{
+         // Ruby-spark package uri
+         val uri = env.conf.get("spark.ruby.executor.uri", "")
+
+         if(uri.isEmpty){
+           // Use the gem installed on the system
+           try {
+             val homeCommand = new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))
+
+             executorLocation = homeCommand.run.readLine
+           } catch {
+             case e: java.io.IOException =>
+               throw new SparkException("Ruby-spark gem is not installed.", e)
+           }
+         }
+         else{
+           // Prepare and use the gem from the uri
+         }
+       }
+
+       // Master and worker are located in GEM_ROOT/lib/spark/worker
+       executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
+
+       // Create the master command
+       // -C: change worker dir before execution
+       val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
+       val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
+
+       // Start the master
+       master = masterCommand.run
+
+       // Redirect master stdout and stderr
+       redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
+
+       // Wait for it to connect to our socket
+       serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
+       try {
+         // Use the socket for communication. Keep stdout and stdin for logging
+         masterSocket = serverSocket.accept()
+         masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
+         masterInputStream = new DataInputStream(masterSocket.getInputStream)
+
+         PythonRDD.writeUTF(executorOptions, masterOutputStream)
+       } catch {
+         case e: Exception =>
+           throw new SparkException("Ruby master did not connect back in time", e)
+       }
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * Get all environment variables for the executor
+    */
+
+   def getEnvVars(env: SparkEnv): Map[String, String] = {
+     val prefix = "spark.ruby.executor.env."
+     env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
+                    .map{case (k, v) => (k.substring(prefix.length), v)}
+                    .toMap
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   def kill(workerId: Long){
+     masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
+     masterOutputStream.writeLong(workerId)
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   def killAndWait(workerId: Long){
+     masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
+     masterOutputStream.writeLong(workerId)
+
+     // Wait for the answer
+     masterInputStream.readInt() match {
+       case RubyConstant.SUCCESSFULLY_KILLED =>
+         logInfo(s"Worker $workerId was successfully killed")
+       case RubyConstant.UNSUCCESSFUL_KILLING =>
+         logInfo(s"Worker $workerId cannot be killed (maybe it is already killed)")
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * The workers HashMap is weak, but it avoids keeping a long list of workers which cannot be killed (killAndWait)
+    */
+
+   def remove(worker: Socket, workerId: Long){
+     try {
+       workers.remove(worker)
+     } catch {
+       case e: Exception => logWarning(s"Worker $workerId does not exist (maybe it is already removed)")
+     }
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   def stopServer{
+     synchronized {
+       // Kill workers
+       workers.foreach { case (socket, id) => killAndWait(id) }
+
+       // Kill master
+       master.destroy
+
+       // Stop the SocketServer
+       serverSocket.close()
+
+       // Clean variables
+       serverSocket = null
+       serverPort = 0
+       master = null
+       masterSocket = null
+       masterOutputStream = null
+       masterInputStream = null
+     }
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   private def redirectStreamsToStderr(streams: InputStream*) {
+     try {
+       for(stream <- streams) {
+         new RedirectThread(stream, System.err, "stream reader").start()
+       }
+     } catch {
+       case e: Exception =>
+         logError("Exception in redirecting streams", e)
+     }
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+ }
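The wire protocol between RubyWorker and the Ruby master process is small: RubyWorker writes 4-byte command integers (CREATE_WORKER, KILL_WORKER, KILL_WORKER_AND_WAIT) to the master socket, each new worker connects back to the same ServerSocket and announces itself with an 8-byte id (a PID or a thread object id), and kill requests carry that id as a long. Below is a minimal sketch of the master side of this exchange for process-type workers. It is not the gem's actual master.rb; the Spark::Constant names are assumptions, and only the byte layout follows from the Scala code above (writeInt/readLong are big-endian, and executorOptions is sent first via PythonRDD.writeUTF as a length-prefixed UTF-8 string).

    require 'socket'

    CREATE_WORKER = Spark::Constant::CREATE_WORKER   # assumed constant names
    KILL_WORKER   = Spark::Constant::KILL_WORKER

    server_port = ARGV[1].to_i                       # started as "master.rb $workerType $serverPort"
    master = TCPSocket.open('127.0.0.1', server_port)

    # RubyWorker first sends executorOptions (4-byte length, then UTF-8 bytes).
    options_length   = master.read(4).unpack('l>').first
    executor_options = master.read(options_length)

    loop do
      command = master.read(4).unpack('l>').first    # DataOutputStream#writeInt is big-endian
      case command
      when CREATE_WORKER
        pid = fork do
          # Each worker connects back and reports its id; RubyWorker reads it with readLong.
          socket = TCPSocket.open('127.0.0.1', server_port)
          socket.write([Process.pid].pack('q>'))
          # ... the actual worker loop would run here ...
        end
        Process.detach(pid)
      when KILL_WORKER
        worker_id = master.read(8).unpack('q>').first
        Process.kill('TERM', worker_id) rescue nil
      end
    end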
data/ext/spark/src/test/scala/MarshalSpec.scala ADDED
@@ -0,0 +1,84 @@
+ package org.apache.spark.api.ruby.marshal
+
+ import org.scalatest._
+
+
+ import org.apache.spark.api.ruby.marshal._
+
+ class MarshalSpec extends FunSpec with Matchers {
+
+   // ====================================================================================
+   // Load
+
+   describe("Marshal.load"){
+     describe("single value"){
+       it("int"){
+         val data = 1
+         val serialized = Array[Byte](4, 8, 105, 6)
+
+         Marshal.load(serialized) should equal(data)
+       }
+
+       it("double"){
+         val data = 1.2
+         val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
+
+         Marshal.load(serialized) should equal(data)
+       }
+     }
+
+     describe("array"){
+       it("ints"){
+         val data = Array(1, 2, 3, 4, 5)
+         val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+         Marshal.load(serialized) should equal(data)
+       }
+
+       it("doubles"){
+         val data = Array(1.1, 2.2, 3.3)
+         val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+         Marshal.load(serialized) should equal(data)
+       }
+     }
+   }
+
+   // ====================================================================================
+   // Dump
+
+   describe("Marshal.dump"){
+     describe("single value"){
+       it("int"){
+         val data = 1
+         val serialized = Array(4, 8, 105, 6)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+
+       it("double"){
+         val data = 1.2
+         val serialized = Array(4, 8, 102, 8, 49, 46, 50)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+     }
+
+     describe("array"){
+       it("ints"){
+         val data = Array(1, 2, 3, 4, 5)
+         val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+
+       it("doubles"){
+         val data = Array(1.1, 2.2, 3.3)
+         val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+     }
+   }
+
+ }
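The byte arrays used as fixtures in this spec are ordinary Ruby Marshal output (format version 4.8), so they can be reproduced directly in irb:

    Marshal.dump(1).bytes                  # => [4, 8, 105, 6]
    Marshal.dump(1.2).bytes                # => [4, 8, 102, 8, 49, 46, 50]
    Marshal.dump([1, 2, 3, 4, 5]).bytes    # => [4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10]
    # 4, 8 = Marshal version 4.8; 105 = 'i' (integer), 102 = 'f' (float), 91 = '[' (array)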
data/lib/ruby-spark.rb ADDED
@@ -0,0 +1 @@
+ require_relative 'spark'
data/lib/spark.rb ADDED
@@ -0,0 +1,198 @@
+ # Gems and libraries
+ require 'method_source'
+ require 'forwardable'
+ require 'sourcify'
+ require 'socket'
+ require 'tempfile'
+ require 'tmpdir'
+
+ module Spark
+   autoload :Context,        'spark/context'
+   autoload :Config,         'spark/config'
+   autoload :RDD,            'spark/rdd'
+   autoload :CLI,            'spark/cli'
+   autoload :Build,          'spark/build'
+   autoload :Serializer,     'spark/serializer'
+   autoload :Helper,         'spark/helper'
+   autoload :StorageLevel,   'spark/storage_level'
+   autoload :Command,        'spark/command'
+   autoload :CommandBuilder, 'spark/command_builder'
+   autoload :Sampler,        'spark/sampler'
+   autoload :Logger,         'spark/logger'
+   autoload :JavaBridge,     'spark/java_bridge'
+   autoload :ExternalSorter, 'spark/sort'
+   autoload :Constant,       'spark/constant'
+   autoload :Broadcast,      'spark/broadcast'
+   autoload :Accumulator,    'spark/accumulator'
+   autoload :StatCounter,    'spark/stat_counter'
+   autoload :Mllib,          'spark/mllib'
+
+   include Helper::System
+
+   def self.print_logo(message=nil)
+     puts <<-STRING
+
+     Welcome to
+     __ ____ __
+     ______ __/ / __ __ / __/__ ___ _____/ /__
+     / __/ // / _ \\/ // / _\\ \\/ _ \\/ _ `/ __/ '_/
+     /_/ \\_,_/_.__/\\_, / /___/ .__/\\_,_/_/ /_/\\_\\ version #{Spark::VERSION}
+     /___/ /_/
+
+     #{message}
+
+     STRING
+   end
+
+   # Returns the current configuration. The configuration can be changed until
+   # the context is initialized; after that the config is locked for reading only.
+   #
+   # == Configuration can be changed with:
+   #
+   #   Spark.config.set('spark.app.name', 'RubySpark')
+   #
+   #   Spark.config['spark.app.name'] = 'RubySpark'
+   #
+   #   Spark.config do
+   #     set 'spark.app.name', 'RubySpark'
+   #   end
+   #
+   def self.config(&block)
+     @config ||= Spark::Config.new
+
+     if block_given?
+       @config.instance_eval(&block)
+     else
+       @config
+     end
+   end
+
+   # Destroy the current configuration. This can be useful for resetting the
+   # config before setting a new one. It has no effect if the context is already started.
+   def self.clear_config
+     @config = nil
+   end
+
+   # Return the current active context, or nil.
+   #
+   # TODO: Run `start` if context is nil?
+   #
+   def self.context
+     @context
+   end
+
+   # Initialize the Spark context if it is not running already. The config is
+   # loaded automatically in the constructor. From that point on `config` uses the
+   # configuration of the running Spark and is locked for reading only.
+   def self.start
+     if started?
+       # Already started
+     else
+       @context ||= Spark::Context.new
+     end
+   end
+
+   def self.stop
+     @context.stop
+     RubyWorker.stopServer
+     logger.info('Workers were stopped')
+   rescue
+     nil
+   ensure
+     @context = nil
+     clear_config
+   end
+
+   def self.started?
+     !!@context
+   end
+
+   def self.logger
+     @logger ||= Spark::Logger.new
+   end
+
+   # Root of the gem
+   def self.root
+     @root ||= File.expand_path('..', File.dirname(__FILE__))
+   end
+
+   def self.home
+     root
+   end
+
+   # Default directory for Java extensions
+   def self.target_dir
+     @target_dir ||= File.join(root, 'target')
+   end
+
+   # Directory containing worker.rb
+   def self.worker_dir
+     @worker_dir ||= File.join(root, 'lib', 'spark', 'worker')
+   end
+
+   def self.ruby_spark_jar
+     @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar')
+   end
+
+   def self.spark_ext_dir
+     @spark_ext_dir ||= File.join(root, 'ext', 'spark')
+   end
+
+
+   # ===============================================================================
+   # Load JVM and jars
+
+   # Load dependent libraries; can be used only once.
+   # Cannot be loaded before CLI::install.
+   #
+   # == Parameters:
+   # spark_home::
+   #   path to the directory containing Spark's .jar files, or a single Spark jar
+   #
+   def self.load_lib(spark_home=nil)
+     return if @java_bridge
+
+     spark_home ||= Spark.target_dir
+
+     bridge = JavaBridge.get
+     @java_bridge = bridge.new(spark_home)
+     @java_bridge.load
+     nil
+   end
+
+   def self.java_bridge
+     @java_bridge
+   end
+
+
+   # Aliases
+   class << self
+     alias_method :sc, :context
+     alias_method :jb, :java_bridge
+   end
+
+ end
+
+ # C/Java extensions
+ require 'ruby_spark_ext'
+
+ # Ruby core extensions
+ require 'spark/ext/module'
+ require 'spark/ext/object'
+ require 'spark/ext/hash'
+ require 'spark/ext/string'
+ require 'spark/ext/integer'
+ require 'spark/ext/ip_socket'
+ require 'spark/ext/io'
+
+ # Other requirements
+ require 'spark/version'
+ require 'spark/error'
+
+ # Make sure that Spark is always stopped
+ Kernel::at_exit do
+   begin
+     Spark.stop
+   rescue
+   end
+ end
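Taken together, spark.rb defines the gem's lifecycle: configure before starting, Spark.start creates the Context, Spark.sc and Spark.jb are shorthands for the context and the Java bridge, and Spark.stop (also wired into at_exit) shuts the Ruby workers down. A minimal usage sketch follows; the parallelize/map/collect calls are assumptions based on the RDD API defined elsewhere in the gem (data/lib/spark/context.rb, data/lib/spark/rdd.rb), not on this hunk.

    require 'ruby-spark'

    Spark.config do
      set 'spark.app.name', 'RubySpark'
      set 'spark.master',   'local[*]'
    end

    Spark.start

    rdd = Spark.sc.parallelize(1..10)                  # assumed Context#parallelize
    puts rdd.map(lambda { |x| x * 2 }).collect.inspect

    Spark.stop                                         # also triggered by the at_exit hook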