ruby-spark 1.0.0
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/ext/spark/src/main/scala/RubyUtils.scala
ADDED
@@ -0,0 +1,15 @@
+package org.apache.spark.api.ruby
+
+import org.apache.spark.util._
+import org.apache.spark.{SparkConf, Logging}
+
+object RubyUtils extends Logging {
+
+  def loadPropertiesFile(conf: SparkConf, path: String): String = {
+    Utils.getPropertiesFromFile(path).foreach {
+      case (key, value) => conf.set(key, value)
+    }
+    path
+  }
+
+}
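For context, a Spark properties file is plain text with whitespace-separated `key value` pairs, and loadPropertiesFile copies every pair into the SparkConf. A minimal Ruby sketch of the same copy-into-config behavior (the file name and `config` hash are illustrative, not part of the gem):

    config = {}
    File.readlines('spark-defaults.conf').each do |line|
      line = line.strip
      next if line.empty? || line.start_with?('#')
      key, value = line.split(/\s+/, 2)
      config[key] = value
    end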
data/ext/spark/src/main/scala/RubyWorker.scala
ADDED
@@ -0,0 +1,257 @@
+package org.apache.spark.api.ruby
+
+import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
+import java.net.{InetAddress, ServerSocket, Socket, SocketException}
+import java.nio.file.Paths
+
+import scala.collection.mutable
+import scala.collection.JavaConversions._
+
+import org.apache.spark._
+import org.apache.spark.api.python.PythonRDD
+import org.apache.spark.util.Utils
+import org.apache.spark.util.RedirectThread
+
+
+/* =================================================================================================
+ * Object RubyWorker
+ * =================================================================================================
+ *
+ * Creates and stores the server used for creating workers.
+ */
+
+object RubyWorker extends Logging {
+
+  val PROCESS_WAIT_TIMEOUT = 10000
+
+  private var serverSocket: ServerSocket = null
+  private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
+  private var serverPort: Int = 0
+
+  private var master: ExecutedFileCommand = null
+  private var masterSocket: Socket = null
+  private var masterOutputStream: DataOutputStream = null
+  private var masterInputStream: DataInputStream = null
+
+  private var workers = new mutable.WeakHashMap[Socket, Long]()
+
+
+  /* ----------------------------------------------------------------------------------------------
+   * Create a new worker, but first check that the SocketServer and master process exist;
+   * if not, create them. Worker creation gets two attempts.
+   */
+
+  def create(env: SparkEnv): (Socket, Long) = {
+    synchronized {
+      // Create the server if it hasn't been started
+      createServer(env)
+
+      // Attempt to connect, restart and retry once if it fails
+      try {
+        createWorker
+      } catch {
+        case exc: SocketException =>
+          logWarning("Worker unexpectedly quit, attempting to restart")
+          createWorker
+      }
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * Create a worker through the master process. Returns the new socket and id.
+   * Depending on spark.ruby.worker.type, the id will be:
+   *   process: PID
+   *   thread: thread object id
+   */
+
+  def createWorker: (Socket, Long) = {
+    synchronized {
+      masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
+      var socket = serverSocket.accept()
+
+      var id = new DataInputStream(socket.getInputStream).readLong()
+      workers.put(socket, id)
+
+      (socket, id)
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * Create a SocketServer and bind it to localhost. The maximum number of queued connections
+   * is left at the default. If the server is created without exception -> create the master.
+   */
+
+  private def createServer(env: SparkEnv){
+    synchronized {
+      // Already running?
+      if(serverSocket != null && masterSocket != null) {
+        return
+      }
+
+      try {
+        // Start the socket server for communication
+        serverSocket = new ServerSocket(0, 0, serverHost)
+        serverPort = serverSocket.getLocalPort
+
+        // Create a master for worker creations
+        createMaster(env)
+      } catch {
+        case e: Exception =>
+          throw new SparkException("There was a problem with creating a server", e)
+      }
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * At this point the SocketServer must already exist. The master process creates and kills
+   * workers. Creating workers directly from Java can be expensive because the new process
+   * can get a copy of the address space.
+   */
+
+  private def createMaster(env: SparkEnv){
+    synchronized {
+      val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
+      val executorOptions = env.conf.get("spark.ruby.executor.options", "")
+      val commandTemplate = env.conf.get("spark.ruby.executor.command")
+      val workerType = env.conf.get("spark.ruby.worker.type")
+
+      // Root of ruby-spark
+      var executorLocation = ""
+
+      if(isDriver){
+        // Use worker from the currently active gem location
+        executorLocation = env.conf.get("spark.ruby.driver_home")
+      }
+      else{
+        // Ruby-spark package uri
+        val uri = env.conf.get("spark.ruby.executor.uri", "")
+
+        if(uri.isEmpty){
+          // Use the gem installed on the system
+          try {
+            val homeCommand = new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))
+
+            executorLocation = homeCommand.run.readLine
+          } catch {
+            case e: java.io.IOException =>
+              throw new SparkException("Ruby-spark gem is not installed.", e)
+          }
+        }
+        else{
+          // Prepare and use the gem from the uri
+        }
+      }
+
+      // Master and worker are saved in GEM_ROOT/lib/spark/worker
+      executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
+
+      // Create the master command
+      // -C: change worker dir before execution
+      val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
+      val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
+
+      // Start the master
+      master = masterCommand.run
+
+      // Redirect master stdout and stderr
+      redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
+
+      // Wait for it to connect to our socket
+      serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
+      try {
+        // Use the socket for communication. Keep stdout and stderr for logs
+        masterSocket = serverSocket.accept()
+        masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
+        masterInputStream = new DataInputStream(masterSocket.getInputStream)
+
+        PythonRDD.writeUTF(executorOptions, masterOutputStream)
+      } catch {
+        case e: Exception =>
+          throw new SparkException("Ruby master did not connect back in time", e)
+      }
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * Get all environment variables for the executor
+   */
+
+  def getEnvVars(env: SparkEnv): Map[String, String] = {
+    val prefix = "spark.ruby.executor.env."
+    env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
+                   .map{case (k, v) => (k.substring(prefix.length), v)}
+                   .toMap
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  def kill(workerId: Long){
+    masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
+    masterOutputStream.writeLong(workerId)
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  def killAndWait(workerId: Long){
+    masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
+    masterOutputStream.writeLong(workerId)
+
+    // Wait for the answer
+    masterInputStream.readInt() match {
+      case RubyConstant.SUCCESSFULLY_KILLED =>
+        logInfo(s"Worker $workerId was successfully killed")
+      case RubyConstant.UNSUCCESSFUL_KILLING =>
+        logInfo(s"Worker $workerId cannot be killed (it may already be dead)")
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * The workers HashMap is weak, which avoids a long list of workers that cannot be killed (killAndWait)
+   */
+
+  def remove(worker: Socket, workerId: Long){
+    try {
+      workers.remove(worker)
+    } catch {
+      case e: Exception => logWarning(s"Worker $workerId does not exist (it may already be removed)")
+    }
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  def stopServer{
+    synchronized {
+      // Kill workers
+      workers.foreach { case (socket, id) => killAndWait(id) }
+
+      // Kill master
+      master.destroy
+
+      // Stop SocketServer
+      serverSocket.close()
+
+      // Clean variables
+      serverSocket = null
+      serverPort = 0
+      master = null
+      masterSocket = null
+      masterOutputStream = null
+      masterInputStream = null
+    }
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  private def redirectStreamsToStderr(streams: InputStream*) {
+    try {
+      for(stream <- streams) {
+        new RedirectThread(stream, System.err, "stream reader").start()
+      }
+    } catch {
+      case e: Exception =>
+        logError("Exception in redirecting streams", e)
+    }
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+}
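The Ruby counterpart of this protocol lives in data/lib/spark/worker/master.rb, which is not shown in this diff. Inferred from the Scala side above, a heavily simplified sketch of the master loop follows — the constant value and the worker body are illustrative, and only the wire format (4-byte big-endian command ints, 8-byte big-endian worker ids, ids announced by the worker after connecting back) comes from the code above:

    require 'socket'

    # Hypothetical command code; the real values live in RubyConstant.
    CREATE_WORKER = 1

    # Connect back to RubyWorker's ServerSocket (the port is passed on the command line).
    port   = ARGV[1].to_i
    master = TCPSocket.open('127.0.0.1', port)

    loop do
      case master.read(4).unpack1('l>')   # commands are 4-byte big-endian ints
      when CREATE_WORKER
        pid = fork do
          worker = TCPSocket.open('127.0.0.1', port)
          worker.write([Process.pid].pack('q>'))   # announce id as an 8-byte long
          # ... run the worker loop ...
        end
        Process.detach(pid)
      end
    end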
data/ext/spark/src/test/scala/MarshalSpec.scala
ADDED
@@ -0,0 +1,84 @@
+package org.apache.spark.api.ruby.marshal
+
+import org.scalatest._
+
+
+import org.apache.spark.api.ruby.marshal._
+
+class MarshalSpec extends FunSpec with Matchers {
+
+  // ====================================================================================
+  // Load
+
+  describe("Marshal.load"){
+    describe("single value"){
+      it("int"){
+        val data = 1
+        val serialized = Array[Byte](4, 8, 105, 6)
+
+        Marshal.load(serialized) should equal(data)
+      }
+
+      it("double"){
+        val data = 1.2
+        val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
+
+        Marshal.load(serialized) should equal(data)
+      }
+    }
+
+    describe("array"){
+      it("ints"){
+        val data = Array(1, 2, 3, 4, 5)
+        val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+        Marshal.load(serialized) should equal(data)
+      }
+
+      it("doubles"){
+        val data = Array(1.1, 2.2, 3.3)
+        val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+        Marshal.load(serialized) should equal(data)
+      }
+    }
+  }
+
+  // ====================================================================================
+  // Dump
+
+  describe("Marshal.dump"){
+    describe("single value"){
+      it("int"){
+        val data = 1
+        val serialized = Array(4, 8, 105, 6)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+
+      it("double"){
+        val data = 1.2
+        val serialized = Array(4, 8, 102, 8, 49, 46, 50)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+    }
+
+    describe("array"){
+      it("ints"){
+        val data = Array(1, 2, 3, 4, 5)
+        val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+
+      it("doubles"){
+        val data = Array(1.1, 2.2, 3.3)
+        val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+    }
+  }
+
+}
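The byte fixtures above are raw output of Ruby's Marshal format: the leading 4 and 8 are the format version (4.8); 105 is 'i' for an integer, with small values stored at an offset of 5 (so 6 decodes to 1); 102 is 'f' for a float serialized as a length-prefixed decimal string; and 91 is '[' for an array followed by its encoded length. The fixtures can be regenerated from any Ruby console:

    Marshal.dump(1).bytes          # => [4, 8, 105, 6]
    Marshal.dump(1.2).bytes        # => [4, 8, 102, 8, 49, 46, 50]
    Marshal.dump([1, 2, 3]).bytes  # => [4, 8, 91, 8, 105, 6, 105, 7, 105, 8]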
data/lib/ruby-spark.rb
ADDED
@@ -0,0 +1 @@
+require_relative 'spark'
data/lib/spark.rb
ADDED
@@ -0,0 +1,198 @@
+# Gems and libraries
+require 'method_source'
+require 'forwardable'
+require 'sourcify'
+require 'socket'
+require 'tempfile'
+require 'tmpdir'
+
+module Spark
+  autoload :Context,        'spark/context'
+  autoload :Config,         'spark/config'
+  autoload :RDD,            'spark/rdd'
+  autoload :CLI,            'spark/cli'
+  autoload :Build,          'spark/build'
+  autoload :Serializer,     'spark/serializer'
+  autoload :Helper,         'spark/helper'
+  autoload :StorageLevel,   'spark/storage_level'
+  autoload :Command,        'spark/command'
+  autoload :CommandBuilder, 'spark/command_builder'
+  autoload :Sampler,        'spark/sampler'
+  autoload :Logger,         'spark/logger'
+  autoload :JavaBridge,     'spark/java_bridge'
+  autoload :ExternalSorter, 'spark/sort'
+  autoload :Constant,       'spark/constant'
+  autoload :Broadcast,      'spark/broadcast'
+  autoload :Accumulator,    'spark/accumulator'
+  autoload :StatCounter,    'spark/stat_counter'
+  autoload :Mllib,          'spark/mllib'
+
+  include Helper::System
+
+  def self.print_logo(message=nil)
+    puts <<-STRING
+
+    Welcome to
+                  __            ____              __
+       ______ __/ /  __ __     / __/__  ___ _____/ /__
+      / __/ // / _ \\/ // /    _\\ \\/ _ \\/ _ `/ __/ '_/
+     /_/  \\_,_/_.__/\\_, /   /___/ .__/\\_,_/_/ /_/\\_\\   version #{Spark::VERSION}
+                   /___/        /_/
+
+    #{message}
+
+    STRING
+  end
+
+  # Returns the current configuration. The configuration can be changed until
+  # the context is initialized; after that the config is locked for reading only.
+  #
+  # == Configuration can be changed:
+  #
+  #   Spark.config.set('spark.app.name', 'RubySpark')
+  #
+  #   Spark.config['spark.app.name'] = 'RubySpark'
+  #
+  #   Spark.config do
+  #     set 'spark.app.name', 'RubySpark'
+  #   end
+  #
+  def self.config(&block)
+    @config ||= Spark::Config.new
+
+    if block_given?
+      @config.instance_eval(&block)
+    else
+      @config
+    end
+  end
+
+  # Destroys the current configuration, which is useful for resetting it so a
+  # new one can be set. It has no effect if the context is already started.
+  def self.clear_config
+    @config = nil
+  end
+
+  # Returns the current active context or nil.
+  #
+  # TODO: Run `start` if context is nil?
+  #
+  def self.context
+    @context
+  end
+
+  # Initializes the Spark context if not already started. Config is automatically
+  # loaded by the constructor. From that point `config` uses the configuration
+  # from the running Spark and is locked for reading only.
+  def self.start
+    if started?
+      # Already started
+    else
+      @context ||= Spark::Context.new
+    end
+  end
+
+  def self.stop
+    @context.stop
+    RubyWorker.stopServer
+    logger.info('Workers were stopped')
+  rescue
+    nil
+  ensure
+    @context = nil
+    clear_config
+  end
+
+  def self.started?
+    !!@context
+  end
+
+  def self.logger
+    @logger ||= Spark::Logger.new
+  end
+
+  # Root of the gem
+  def self.root
+    @root ||= File.expand_path('..', File.dirname(__FILE__))
+  end
+
+  def self.home
+    root
+  end
+
+  # Default directory for java extensions
+  def self.target_dir
+    @target_dir ||= File.join(root, 'target')
+  end
+
+  # Directory containing worker.rb
+  def self.worker_dir
+    @worker_dir ||= File.join(root, 'lib', 'spark', 'worker')
+  end
+
+  def self.ruby_spark_jar
+    @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar')
+  end
+
+  def self.spark_ext_dir
+    @spark_ext_dir ||= File.join(root, 'ext', 'spark')
+  end
+
+
+  # ===============================================================================
+  # Load JVM and jars
+
+  # Load dependent libraries; can be used only once.
+  # Cannot be loaded before CLI::install.
+  #
+  # == Parameters:
+  # spark_home::
+  #   path to the directory containing Spark's .jar files, or a single Spark jar
+  #
+  def self.load_lib(spark_home=nil)
+    return if @java_bridge
+
+    spark_home ||= Spark.target_dir
+
+    bridge = JavaBridge.get
+    @java_bridge = bridge.new(spark_home)
+    @java_bridge.load
+    nil
+  end
+
+  def self.java_bridge
+    @java_bridge
+  end
+
+
+  # Aliases
+  class << self
+    alias_method :sc, :context
+    alias_method :jb, :java_bridge
+  end
+
+end
+
+# C/Java extensions
+require 'ruby_spark_ext'
+
+# Ruby core extensions
+require 'spark/ext/module'
+require 'spark/ext/object'
+require 'spark/ext/hash'
+require 'spark/ext/string'
+require 'spark/ext/integer'
+require 'spark/ext/ip_socket'
+require 'spark/ext/io'
+
+# Other requirements
+require 'spark/version'
+require 'spark/error'
+
+# Make sure that Spark is always stopped
+Kernel::at_exit do
+  begin
+    Spark.stop
+  rescue
+  end
+end
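The module above defines the whole driver-side lifecycle. A short usage sketch based only on the methods defined in this file:

    require 'ruby-spark'

    # Configure before the context starts (config locks once Spark.start runs)
    Spark.config do
      set 'spark.app.name', 'RubySpark'
    end

    Spark.start
    context = Spark.sc   # `sc` is the alias for Spark.context defined above

    # ... build and run RDD operations via the context ...

    # Stop explicitly, or rely on the Kernel::at_exit hook registered above
    Spark.stop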