ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/ext/spark/src/main/scala/RubyUtils.scala ADDED
@@ -0,0 +1,15 @@
+ package org.apache.spark.api.ruby
+
+ import org.apache.spark.util._
+ import org.apache.spark.{SparkConf, Logging}
+
+ object RubyUtils extends Logging {
+
+   def loadPropertiesFile(conf: SparkConf, path: String): String = {
+     Utils.getPropertiesFromFile(path).foreach {
+       case (key, value) => conf.set(key, value)
+     }
+     path
+   }
+
+ }
data/ext/spark/src/main/scala/RubyWorker.scala ADDED
@@ -0,0 +1,257 @@
+ package org.apache.spark.api.ruby
+
+ import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
+ import java.net.{InetAddress, ServerSocket, Socket, SocketException}
+ import java.nio.file.Paths
+
+ import scala.collection.mutable
+ import scala.collection.JavaConversions._
+
+ import org.apache.spark._
+ import org.apache.spark.api.python.PythonRDD
+ import org.apache.spark.util.Utils
+ import org.apache.spark.util.RedirectThread
+
+
+ /* =================================================================================================
+  * Object RubyWorker
+  * =================================================================================================
+  *
+  * Create and store the server for creating workers.
+  */
+
+ object RubyWorker extends Logging {
+
+   val PROCESS_WAIT_TIMEOUT = 10000
+
+   private var serverSocket: ServerSocket = null
+   private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
+   private var serverPort: Int = 0
+
+   private var master: ExecutedFileCommand = null
+   private var masterSocket: Socket = null
+   private var masterOutputStream: DataOutputStream = null
+   private var masterInputStream: DataInputStream = null
+
+   private var workers = new mutable.WeakHashMap[Socket, Long]()
+
+
+   /* ----------------------------------------------------------------------------------------------
+    * Create a new worker, but first check whether the SocketServer and the master process exist.
+    * If not, they will be created. The worker gets two chances to be created.
+    */
+
+   def create(env: SparkEnv): (Socket, Long) = {
+     synchronized {
+       // Create the server if it hasn't been started
+       createServer(env)
+
+       // Attempt to connect, restart and retry once if it fails
+       try {
+         createWorker
+       } catch {
+         case exc: SocketException =>
+           logWarning("Worker unexpectedly quit, attempting to restart")
+           createWorker
+       }
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * Create a worker through the master process. Return the new socket and id.
+    * Depending on spark.ruby.worker.type the id will be:
+    *   process: PID
+    *   thread: thread object id
+    */
+
+   def createWorker: (Socket, Long) = {
+     synchronized {
+       masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
+       var socket = serverSocket.accept()
+
+       var id = new DataInputStream(socket.getInputStream).readLong()
+       workers.put(socket, id)
+
+       (socket, id)
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * Create the SocketServer and bind it to localhost. The maximum number of queued connections
+    * is left at the default. If the server is created without exception -> create the master.
+    */
+
+   private def createServer(env: SparkEnv){
+     synchronized {
+       // Already running?
+       if(serverSocket != null && masterSocket != null) {
+         return
+       }
+
+       try {
+         // Start the SocketServer for communication
+         serverSocket = new ServerSocket(0, 0, serverHost)
+         serverPort = serverSocket.getLocalPort
+
+         // Create a master for worker creation
+         createMaster(env)
+       } catch {
+         case e: Exception =>
+           throw new SparkException("There was a problem with creating a server", e)
+       }
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * At this point the SocketServer must already be created. The master process creates and
+    * kills workers. Creating workers from Java can be an expensive operation because a new
+    * process can get a copy of the address space.
+    */
+
+   private def createMaster(env: SparkEnv){
+     synchronized {
+       val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
+       val executorOptions = env.conf.get("spark.ruby.executor.options", "")
+       val commandTemplate = env.conf.get("spark.ruby.executor.command")
+       val workerType = env.conf.get("spark.ruby.worker.type")
+
+       // Root of ruby-spark
+       var executorLocation = ""
+
+       if(isDriver){
+         // Use worker from current active gem location
+         executorLocation = env.conf.get("spark.ruby.driver_home")
+       }
+       else{
+         // Ruby-spark package uri
+         val uri = env.conf.get("spark.ruby.executor.uri", "")
+
+         if(uri.isEmpty){
+           // Use the gem installed on the system
+           try {
+             val homeCommand = new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))
+
+             executorLocation = homeCommand.run.readLine
+           } catch {
+             case e: java.io.IOException =>
+               throw new SparkException("Ruby-spark gem is not installed.", e)
+           }
+         }
+         else{
+           // Prepare and use the gem from the uri
+         }
+       }
+
+       // Master and worker are saved in GEM_ROOT/lib/spark/worker
+       executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
+
+       // Create the master command
+       // -C: change the worker dir before execution
+       val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
+       val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
+
+       // Start master
+       master = masterCommand.run
+
+       // Redirect master stdout and stderr
+       redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
+
+       // Wait for it to connect to our socket
+       serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
+       try {
+         // Use the socket for communication. Keep stdout and stderr for logging
+         masterSocket = serverSocket.accept()
+         masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
+         masterInputStream = new DataInputStream(masterSocket.getInputStream)
+
+         PythonRDD.writeUTF(executorOptions, masterOutputStream)
+       } catch {
+         case e: Exception =>
+           throw new SparkException("Ruby master did not connect back in time", e)
+       }
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * Get all environment variables for the executor
+    */
+
+   def getEnvVars(env: SparkEnv): Map[String, String] = {
+     val prefix = "spark.ruby.executor.env."
+     env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
+                    .map{case (k, v) => (k.substring(prefix.length), v)}
+                    .toMap
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   def kill(workerId: Long){
+     masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
+     masterOutputStream.writeLong(workerId)
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   def killAndWait(workerId: Long){
+     masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
+     masterOutputStream.writeLong(workerId)
+
+     // Wait for the answer
+     masterInputStream.readInt() match {
+       case RubyConstant.SUCCESSFULLY_KILLED =>
+         logInfo(s"Worker $workerId was successfully killed")
+       case RubyConstant.UNSUCCESSFUL_KILLING =>
+         logInfo(s"Worker $workerId cannot be killed (maybe is already killed)")
+     }
+   }
+
+   /* ----------------------------------------------------------------------------------------------
+    * The workers HashMap is weak, but it avoids a long list of workers which cannot be killed (killAndWait)
+    */
+
+   def remove(worker: Socket, workerId: Long){
+     try {
+       workers.remove(worker)
+     } catch {
+       case e: Exception => logWarning(s"Worker $workerId does not exist (maybe is already removed)")
+     }
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   def stopServer{
+     synchronized {
+       // Kill workers
+       workers.foreach { case (socket, id) => killAndWait(id) }
+
+       // Kill master
+       master.destroy
+
+       // Stop SocketServer
+       serverSocket.close()
+
+       // Clean variables
+       serverSocket = null
+       serverPort = 0
+       master = null
+       masterSocket = null
+       masterOutputStream = null
+       masterInputStream = null
+     }
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+
+   private def redirectStreamsToStderr(streams: InputStream*) {
+     try {
+       for(stream <- streams) {
+         new RedirectThread(stream, System.err, "stream reader").start()
+       }
+     } catch {
+       case e: Exception =>
+         logError("Exception in redirecting streams", e)
+     }
+   }
+
+   /* ------------------------------------------------------------------------------------------- */
+ }
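
RubyWorker drives a Ruby "master" process over a local TCP socket: it writes a command int to the master, and each newly created worker connects back to the server socket and reports its id as a long. The following is a minimal Ruby-side sketch of that handshake, for orientation only; the command constants, the fork-per-worker strategy, and the error handling are assumptions made for illustration, while the gem's real implementation lives in data/lib/spark/worker/master.rb and data/lib/spark/constant.rb.

# Hypothetical sketch of the Ruby master loop that RubyWorker.scala talks to.
# Command values are assumed; the gem defines the real ones in lib/spark/constant.rb.
require 'socket'

CREATE_WORKER = 0 # assumption
KILL_WORKER   = 1 # assumption

server_port = ARGV[1].to_i                        # started as: master.rb <worker type> <port>
master = TCPSocket.open('127.0.0.1', server_port)
# (the Scala side first sends the executor options via writeUTF; reading them is omitted here)

loop do
  case master.read(4).unpack1('l>')               # DataOutputStream#writeInt is big-endian
  when CREATE_WORKER
    pid = fork do
      worker = TCPSocket.open('127.0.0.1', server_port)
      worker.write([Process.pid].pack('q>'))      # read by readLong on the Scala side
      # ... process tasks over the `worker` socket ...
    end
    Process.detach(pid)
  when KILL_WORKER
    worker_id = master.read(8).unpack1('q>')      # sent by writeLong in RubyWorker.kill
    Process.kill('TERM', worker_id) rescue nil
  end
end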
data/ext/spark/src/test/scala/MarshalSpec.scala ADDED
@@ -0,0 +1,84 @@
+ package org.apache.spark.api.ruby.marshal
+
+ import org.scalatest._
+
+
+ import org.apache.spark.api.ruby.marshal._
+
+ class MarshalSpec extends FunSpec with Matchers {
+
+   // ====================================================================================
+   // Load
+
+   describe("Marshal.load"){
+     describe("single value"){
+       it("int"){
+         val data = 1
+         val serialized = Array[Byte](4, 8, 105, 6)
+
+         Marshal.load(serialized) should equal(data)
+       }
+
+       it("double"){
+         val data = 1.2
+         val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
+
+         Marshal.load(serialized) should equal(data)
+       }
+     }
+
+     describe("array"){
+       it("ints"){
+         val data = Array(1, 2, 3, 4, 5)
+         val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+         Marshal.load(serialized) should equal(data)
+       }
+
+       it("doubles"){
+         val data = Array(1.1, 2.2, 3.3)
+         val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+         Marshal.load(serialized) should equal(data)
+       }
+     }
+   }
+
+   // ====================================================================================
+   // Dump
+
+   describe("Marshal.dump"){
+     describe("single value"){
+       it("int"){
+         val data = 1
+         val serialized = Array(4, 8, 105, 6)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+
+       it("double"){
+         val data = 1.2
+         val serialized = Array(4, 8, 102, 8, 49, 46, 50)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+     }
+
+     describe("array"){
+       it("ints"){
+         val data = Array(1, 2, 3, 4, 5)
+         val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+
+       it("doubles"){
+         val data = Array(1.1, 2.2, 3.3)
+         val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+         Marshal.dump(data) should equal(serialized)
+       }
+     }
+   }
+
+ }
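
The expected byte arrays in MarshalSpec are plain Ruby Marshal dumps (4, 8 is format version 4.8; 105 is 'i' for an integer, 102 is 'f' for a float, 91 is '[' for an array). They can be regenerated from any Ruby shell:

# The fixtures above match Ruby's own Marshal output.
Marshal.dump(1).bytes            # => [4, 8, 105, 6]
Marshal.dump(1.2).bytes          # => [4, 8, 102, 8, 49, 46, 50]
Marshal.dump([1, 2, 3, 4, 5]).bytes
# => [4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10]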
data/lib/ruby-spark.rb ADDED
@@ -0,0 +1 @@
+ require_relative 'spark'
data/lib/spark.rb ADDED
@@ -0,0 +1,198 @@
+ # Gems and libraries
+ require 'method_source'
+ require 'forwardable'
+ require 'sourcify'
+ require 'socket'
+ require 'tempfile'
+ require 'tmpdir'
+
+ module Spark
+   autoload :Context, 'spark/context'
+   autoload :Config, 'spark/config'
+   autoload :RDD, 'spark/rdd'
+   autoload :CLI, 'spark/cli'
+   autoload :Build, 'spark/build'
+   autoload :Serializer, 'spark/serializer'
+   autoload :Helper, 'spark/helper'
+   autoload :StorageLevel, 'spark/storage_level'
+   autoload :Command, 'spark/command'
+   autoload :CommandBuilder, 'spark/command_builder'
+   autoload :Sampler, 'spark/sampler'
+   autoload :Logger, 'spark/logger'
+   autoload :JavaBridge, 'spark/java_bridge'
+   autoload :ExternalSorter, 'spark/sort'
+   autoload :Constant, 'spark/constant'
+   autoload :Broadcast, 'spark/broadcast'
+   autoload :Accumulator, 'spark/accumulator'
+   autoload :StatCounter, 'spark/stat_counter'
+   autoload :Mllib, 'spark/mllib'
+
+   include Helper::System
+
+   def self.print_logo(message=nil)
+     puts <<-STRING
+
+     Welcome to
+                   __            ____              __
+        ______ __/ /  __ __     / __/__  ___ _____/ /__
+       / __/ // / _ \\/ // /    _\\ \\/ _ \\/ _ `/ __/ '_/
+      /_/  \\_,_/_.__/\\_, /    /___/ .__/\\_,_/_/ /_/\\_\\   version #{Spark::VERSION}
+                    /___/        /_/
+
+     #{message}
+
+     STRING
+   end
+
+   # Returns the current configuration. The configuration can be changed until
+   # the context is initialized; after that the config is locked for reading only.
+   #
+   # == The configuration can be changed with:
+   #
+   #   Spark.config.set('spark.app.name', 'RubySpark')
+   #
+   #   Spark.config['spark.app.name'] = 'RubySpark'
+   #
+   #   Spark.config do
+   #     set 'spark.app.name', 'RubySpark'
+   #   end
+   #
+   def self.config(&block)
+     @config ||= Spark::Config.new
+
+     if block_given?
+       @config.instance_eval(&block)
+     else
+       @config
+     end
+   end
+
+   # Destroy the current configuration. This can be useful for resetting the config
+   # before setting a new one. It has no effect if the context is already started.
+   def self.clear_config
+     @config = nil
+   end
+
+   # Return the currently active context or nil.
+   #
+   # TODO: Run `start` if context is nil?
+   #
+   def self.context
+     @context
+   end
+
+   # Initialize the Spark context if it is not already started. The config is loaded
+   # automatically in the constructor. From that point `config` uses the configuration
+   # of the running Spark and is locked for reading only.
+   def self.start
+     if started?
+       # Already started
+     else
+       @context ||= Spark::Context.new
+     end
+   end
+
+   def self.stop
+     @context.stop
+     RubyWorker.stopServer
+     logger.info('Workers were stopped')
+   rescue
+     nil
+   ensure
+     @context = nil
+     clear_config
+   end
+
+   def self.started?
+     !!@context
+   end
+
+   def self.logger
+     @logger ||= Spark::Logger.new
+   end
+
+   # Root of the gem
+   def self.root
+     @root ||= File.expand_path('..', File.dirname(__FILE__))
+   end
+
+   def self.home
+     root
+   end
+
+   # Default directory for Java extensions
+   def self.target_dir
+     @target_dir ||= File.join(root, 'target')
+   end
+
+   # Directory containing worker.rb
+   def self.worker_dir
+     @worker_dir ||= File.join(root, 'lib', 'spark', 'worker')
+   end
+
+   def self.ruby_spark_jar
+     @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar')
+   end
+
+   def self.spark_ext_dir
+     @spark_ext_dir ||= File.join(root, 'ext', 'spark')
+   end
+
+
+   # ===============================================================================
+   # Load JVM and jars
+
+   # Load dependent libraries; can be used only once.
+   # Cannot be loaded before CLI::install.
+   #
+   # == Parameters:
+   # spark_home::
+   #   path to the directory containing Spark's .jar files, or a single Spark jar
+   #
+   def self.load_lib(spark_home=nil)
+     return if @java_bridge
+
+     spark_home ||= Spark.target_dir
+
+     bridge = JavaBridge.get
+     @java_bridge = bridge.new(spark_home)
+     @java_bridge.load
+     nil
+   end
+
+   def self.java_bridge
+     @java_bridge
+   end
+
+
+   # Aliases
+   class << self
+     alias_method :sc, :context
+     alias_method :jb, :java_bridge
+   end
+
+ end
+
+ # C/Java extensions
+ require 'ruby_spark_ext'
+
+ # Ruby core extensions
+ require 'spark/ext/module'
+ require 'spark/ext/object'
+ require 'spark/ext/hash'
+ require 'spark/ext/string'
+ require 'spark/ext/integer'
+ require 'spark/ext/ip_socket'
+ require 'spark/ext/io'
+
+ # Other requirements
+ require 'spark/version'
+ require 'spark/error'
+
+ # Make sure that Spark is always stopped
+ Kernel::at_exit do
+   begin
+     Spark.stop
+   rescue
+   end
+ end
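
For orientation, a minimal end-to-end usage sketch of the API defined in spark.rb follows. It assumes that `parallelize` on Spark::Context and `map`/`sum` on Spark::RDD behave as their names suggest (their definitions live in data/lib/spark/context.rb and data/lib/spark/rdd.rb from the file list), and the option keys are illustrative.

# Minimal usage sketch, assuming the Context/RDD API from context.rb and rdd.rb.
require 'ruby-spark'

Spark.config do
  set 'spark.app.name', 'RubySpark'
  set 'spark.master',   'local[*]'
end

Spark.start

rdd = Spark.sc.parallelize(1..10)          # Spark.sc is aliased to Spark.context above
puts rdd.map(lambda { |x| x * 2 }).sum     # => 110

Spark.stop                                 # also triggered by the at_exit hook above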