ruby-spark 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
data/ext/spark/src/main/scala/RubyUtils.scala
ADDED
@@ -0,0 +1,15 @@
+package org.apache.spark.api.ruby
+
+import org.apache.spark.util._
+import org.apache.spark.{SparkConf, Logging}
+
+object RubyUtils extends Logging {
+
+  def loadPropertiesFile(conf: SparkConf, path: String): String = {
+    Utils.getPropertiesFromFile(path).foreach {
+      case (key, value) => conf.set(key, value)
+    }
+    path
+  }
+
+}
data/ext/spark/src/main/scala/RubyWorker.scala
ADDED
@@ -0,0 +1,257 @@
+package org.apache.spark.api.ruby
+
+import java.io.{File, DataInputStream, InputStream, DataOutputStream, FileOutputStream}
+import java.net.{InetAddress, ServerSocket, Socket, SocketException}
+import java.nio.file.Paths
+
+import scala.collection.mutable
+import scala.collection.JavaConversions._
+
+import org.apache.spark._
+import org.apache.spark.api.python.PythonRDD
+import org.apache.spark.util.Utils
+import org.apache.spark.util.RedirectThread
+
+
+/* =================================================================================================
+ * Object RubyWorker
+ * =================================================================================================
+ *
+ * Create and store the server used for creating workers.
+ */
+
+object RubyWorker extends Logging {
+
+  val PROCESS_WAIT_TIMEOUT = 10000
+
+  private var serverSocket: ServerSocket = null
+  private val serverHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
+  private var serverPort: Int = 0
+
+  private var master: ExecutedFileCommand = null
+  private var masterSocket: Socket = null
+  private var masterOutputStream: DataOutputStream = null
+  private var masterInputStream: DataInputStream = null
+
+  private var workers = new mutable.WeakHashMap[Socket, Long]()
+
+
+  /* ----------------------------------------------------------------------------------------------
+   * Create a new worker, but first check whether the SocketServer and the master process exist.
+   * If not, create them. Worker creation is attempted twice.
+   */
+
+  def create(env: SparkEnv): (Socket, Long) = {
+    synchronized {
+      // Create the server if it hasn't been started
+      createServer(env)
+
+      // Attempt to connect, restart and retry once if it fails
+      try {
+        createWorker
+      } catch {
+        case exc: SocketException =>
+          logWarning("Worker unexpectedly quit, attempting to restart")
+          createWorker
+      }
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * Create a worker through the master process. Return the new socket and id.
+   * Depending on spark.ruby.worker.type the id will be:
+   *   process: PID
+   *   thread: thread object id
+   */
+
+  def createWorker: (Socket, Long) = {
+    synchronized {
+      masterOutputStream.writeInt(RubyConstant.CREATE_WORKER)
+      var socket = serverSocket.accept()
+
+      var id = new DataInputStream(socket.getInputStream).readLong()
+      workers.put(socket, id)
+
+      (socket, id)
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * Create the SocketServer and bind it to localhost. The maximum number of queued connections
+   * is left at the default. If the server is created without an exception -> create the master.
+   */
+
+  private def createServer(env: SparkEnv){
+    synchronized {
+      // Already running?
+      if(serverSocket != null && masterSocket != null) {
+        return
+      }
+
+      try {
+        // Start the socket server for communication
+        serverSocket = new ServerSocket(0, 0, serverHost)
+        serverPort = serverSocket.getLocalPort
+
+        // Create a master for worker creations
+        createMaster(env)
+      } catch {
+        case e: Exception =>
+          throw new SparkException("There was a problem with creating a server", e)
+      }
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * At this point the SocketServer must already exist. The master process creates and kills
+   * workers. Creating workers from Java can be an expensive operation because a new process can
+   * get a copy of the address space.
+   */
+
+  private def createMaster(env: SparkEnv){
+    synchronized {
+      val isDriver = env.executorId == SparkContext.DRIVER_IDENTIFIER
+      val executorOptions = env.conf.get("spark.ruby.executor.options", "")
+      val commandTemplate = env.conf.get("spark.ruby.executor.command")
+      val workerType = env.conf.get("spark.ruby.worker.type")
+
+      // Where the root of ruby-spark is
+      var executorLocation = ""
+
+      if(isDriver){
+        // Use worker from current active gem location
+        executorLocation = env.conf.get("spark.ruby.driver_home")
+      }
+      else{
+        // Ruby-spark package uri
+        val uri = env.conf.get("spark.ruby.executor.uri", "")
+
+        if(uri.isEmpty){
+          // Use gem installed on the system
+          try {
+            val homeCommand = new FileCommand(commandTemplate, "ruby-spark home", env, getEnvVars(env))
+
+            executorLocation = homeCommand.run.readLine
+          } catch {
+            case e: java.io.IOException =>
+              throw new SparkException("Ruby-spark gem is not installed.", e)
+          }
+        }
+        else{
+          // Prepare and use gem from uri
+        }
+      }
+
+      // Master and worker are saved in GEM_ROOT/lib/spark/worker
+      executorLocation = Paths.get(executorLocation, "lib", "spark", "worker").toString
+
+      // Create master command
+      // -C: change worker dir before execution
+      val masterRb = s"ruby $executorOptions -C $executorLocation master.rb $workerType $serverPort"
+      val masterCommand = new FileCommand(commandTemplate, masterRb, env, getEnvVars(env))
+
+      // Start master
+      master = masterCommand.run
+
+      // Redirect master stdout and stderr
+      redirectStreamsToStderr(master.getInputStream, master.getErrorStream)
+
+      // Wait for it to connect to our socket
+      serverSocket.setSoTimeout(PROCESS_WAIT_TIMEOUT)
+      try {
+        // Use the socket for communication. Keep stdout and stdin for logging.
+        masterSocket = serverSocket.accept()
+        masterOutputStream = new DataOutputStream(masterSocket.getOutputStream)
+        masterInputStream = new DataInputStream(masterSocket.getInputStream)
+
+        PythonRDD.writeUTF(executorOptions, masterOutputStream)
+      } catch {
+        case e: Exception =>
+          throw new SparkException("Ruby master did not connect back in time", e)
+      }
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * Get all environment variables for the executor
+   */
+
+  def getEnvVars(env: SparkEnv): Map[String, String] = {
+    val prefix = "spark.ruby.executor.env."
+    env.conf.getAll.filter{case (k, _) => k.startsWith(prefix)}
+                   .map{case (k, v) => (k.substring(prefix.length), v)}
+                   .toMap
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  def kill(workerId: Long){
+    masterOutputStream.writeInt(RubyConstant.KILL_WORKER)
+    masterOutputStream.writeLong(workerId)
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  def killAndWait(workerId: Long){
+    masterOutputStream.writeInt(RubyConstant.KILL_WORKER_AND_WAIT)
+    masterOutputStream.writeLong(workerId)
+
+    // Wait for answer
+    masterInputStream.readInt() match {
+      case RubyConstant.SUCCESSFULLY_KILLED =>
+        logInfo(s"Worker $workerId was successfully killed")
+      case RubyConstant.UNSUCCESSFUL_KILLING =>
+        logInfo(s"Worker $workerId cannot be killed (maybe is already killed)")
+    }
+  }
+
+  /* ----------------------------------------------------------------------------------------------
+   * The workers HashMap is weak, but it avoids keeping a long list of workers which cannot be killed (killAndWait)
+   */
+
+  def remove(worker: Socket, workerId: Long){
+    try {
+      workers.remove(worker)
+    } catch {
+      case e: Exception => logWarning(s"Worker $workerId does not exist (maybe is already removed)")
+    }
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  def stopServer{
+    synchronized {
+      // Kill workers
+      workers.foreach { case (socket, id) => killAndWait(id) }
+
+      // Kill master
+      master.destroy
+
+      // Stop SocketServer
+      serverSocket.close()
+
+      // Clean variables
+      serverSocket = null
+      serverPort = 0
+      master = null
+      masterSocket = null
+      masterOutputStream = null
+      masterInputStream = null
+    }
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+
+  private def redirectStreamsToStderr(streams: InputStream*) {
+    try {
+      for(stream <- streams) {
+        new RedirectThread(stream, System.err, "stream reader").start()
+      }
+    } catch {
+      case e: Exception =>
+        logError("Exception in redirecting streams", e)
+    }
+  }
+
+  /* ------------------------------------------------------------------------------------------- */
+}
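Note: the worker-creation path above is driven entirely by `spark.ruby.*` configuration keys (`spark.ruby.worker.type`, `spark.ruby.executor.command`, `spark.ruby.executor.options`, and the `spark.ruby.executor.env.*` prefix stripped by `getEnvVars`). As a rough sketch only, and assuming the Ruby-side `Spark.config` block documented in `data/lib/spark.rb` below, these keys could be set like this; the values are illustrative placeholders, not defaults taken from the gem:

    require 'ruby-spark'

    Spark.config do
      # "process" or "thread", per the createWorker comment above:
      # the Scala side records the PID or the thread object id as the worker id.
      set 'spark.ruby.worker.type', 'process'

      # Extra options interpolated into the `ruby ... master.rb` command line
      # built by createMaster (illustrative value).
      set 'spark.ruby.executor.options', '--disable-gems'

      # Any key under spark.ruby.executor.env.* is passed to the executor
      # environment with the prefix stripped (see getEnvVars above).
      set 'spark.ruby.executor.env.GEM_HOME', '/opt/gems'
    end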
data/ext/spark/src/test/scala/MarshalSpec.scala
ADDED
@@ -0,0 +1,84 @@
+package org.apache.spark.api.ruby.marshal
+
+import org.scalatest._
+
+
+import org.apache.spark.api.ruby.marshal._
+
+class MarshalSpec extends FunSpec with Matchers {
+
+  // ====================================================================================
+  // Load
+
+  describe("Marshal.load"){
+    describe("single value"){
+      it("int"){
+        val data = 1
+        val serialized = Array[Byte](4, 8, 105, 6)
+
+        Marshal.load(serialized) should equal(data)
+      }
+
+      it("double"){
+        val data = 1.2
+        val serialized = Array[Byte](4, 8, 102, 8, 49, 46, 50)
+
+        Marshal.load(serialized) should equal(data)
+      }
+    }
+
+    describe("array"){
+      it("ints"){
+        val data = Array(1, 2, 3, 4, 5)
+        val serialized = Array[Byte](4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+        Marshal.load(serialized) should equal(data)
+      }
+
+      it("doubles"){
+        val data = Array(1.1, 2.2, 3.3)
+        val serialized = Array[Byte](4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+        Marshal.load(serialized) should equal(data)
+      }
+    }
+  }
+
+  // ====================================================================================
+  // Dump
+
+  describe("Marshal.dump"){
+    describe("single value"){
+      it("int"){
+        val data = 1
+        val serialized = Array(4, 8, 105, 6)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+
+      it("double"){
+        val data = 1.2
+        val serialized = Array(4, 8, 102, 8, 49, 46, 50)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+    }
+
+    describe("array"){
+      it("ints"){
+        val data = Array(1, 2, 3, 4, 5)
+        val serialized = Array(4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+
+      it("doubles"){
+        val data = Array(1.1, 2.2, 3.3)
+        val serialized = Array(4, 8, 91, 8, 102, 8, 49, 46, 49, 102, 8, 50, 46, 50, 102, 8, 51, 46, 51)
+
+        Marshal.dump(data) should equal(serialized)
+      }
+    }
+  }
+
+}
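The byte fixtures in this spec are ordinary Ruby `Marshal` output (format version 4.8), so they can be cross-checked or regenerated from any Ruby console:

    Marshal.dump(1).bytes                # => [4, 8, 105, 6]
    Marshal.dump(1.2).bytes              # => [4, 8, 102, 8, 49, 46, 50]
    Marshal.dump([1, 2, 3, 4, 5]).bytes  # => [4, 8, 91, 10, 105, 6, 105, 7, 105, 8, 105, 9, 105, 10]

    # And the reverse direction exercised by Marshal.load above:
    Marshal.load([4, 8, 105, 6].pack('C*'))  # => 1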
data/lib/ruby-spark.rb
ADDED
@@ -0,0 +1 @@
+require_relative 'spark'
data/lib/spark.rb
ADDED
@@ -0,0 +1,198 @@
+# Gems and libraries
+require 'method_source'
+require 'forwardable'
+require 'sourcify'
+require 'socket'
+require 'tempfile'
+require 'tmpdir'
+
+module Spark
+  autoload :Context, 'spark/context'
+  autoload :Config, 'spark/config'
+  autoload :RDD, 'spark/rdd'
+  autoload :CLI, 'spark/cli'
+  autoload :Build, 'spark/build'
+  autoload :Serializer, 'spark/serializer'
+  autoload :Helper, 'spark/helper'
+  autoload :StorageLevel, 'spark/storage_level'
+  autoload :Command, 'spark/command'
+  autoload :CommandBuilder, 'spark/command_builder'
+  autoload :Sampler, 'spark/sampler'
+  autoload :Logger, 'spark/logger'
+  autoload :JavaBridge, 'spark/java_bridge'
+  autoload :ExternalSorter, 'spark/sort'
+  autoload :Constant, 'spark/constant'
+  autoload :Broadcast, 'spark/broadcast'
+  autoload :Accumulator, 'spark/accumulator'
+  autoload :StatCounter, 'spark/stat_counter'
+  autoload :Mllib, 'spark/mllib'
+
+  include Helper::System
+
+  def self.print_logo(message=nil)
+    puts <<-STRING
+
+    Welcome to
+                  __           ____              __
+       ______ __/ / __ __     / __/__  ___ _____/ /__
+      / __/ // / _ \\/ // /    _\\ \\/ _ \\/ _ `/ __/ '_/
+     /_/   \\_,_/_.__/\\_, /   /___/ .__/\\_,_/_/ /_/\\_\\   version #{Spark::VERSION}
+                    /___/        /_/
+
+    #{message}
+
+    STRING
+  end
+
+  # Returns the current configuration. The configuration can be changed until
+  # the context is initialized; after that it is locked for reading only.
+  #
+  # == Configuration can be changed:
+  #
+  #   Spark.config.set('spark.app.name', 'RubySpark')
+  #
+  #   Spark.config['spark.app.name'] = 'RubySpark'
+  #
+  #   Spark.config do
+  #     set 'spark.app.name', 'RubySpark'
+  #   end
+  #
+  def self.config(&block)
+    @config ||= Spark::Config.new
+
+    if block_given?
+      @config.instance_eval(&block)
+    else
+      @config
+    end
+  end
+
+  # Destroy the current configuration. This is useful when a new configuration
+  # should be built from scratch. It has no effect if the context is already started.
+  def self.clear_config
+    @config = nil
+  end
+
+  # Return the currently active context or nil.
+  #
+  # TODO: Run `start` if context is nil?
+  #
+  def self.context
+    @context
+  end
+
+  # Initialize the Spark context if it does not exist yet. The config is loaded
+  # automatically by the constructor. From that point `config` uses the configuration
+  # of the running Spark and is locked for reading only.
+  def self.start
+    if started?
+      # Already started
+    else
+      @context ||= Spark::Context.new
+    end
+  end
+
+  def self.stop
+    @context.stop
+    RubyWorker.stopServer
+    logger.info('Workers were stopped')
+  rescue
+    nil
+  ensure
+    @context = nil
+    clear_config
+  end
+
+  def self.started?
+    !!@context
+  end
+
+  def self.logger
+    @logger ||= Spark::Logger.new
+  end
+
+  # Root of the gem
+  def self.root
+    @root ||= File.expand_path('..', File.dirname(__FILE__))
+  end
+
+  def self.home
+    root
+  end
+
+  # Default directory for Java extensions
+  def self.target_dir
+    @target_dir ||= File.join(root, 'target')
+  end
+
+  # Directory where worker.rb is located
+  def self.worker_dir
+    @worker_dir ||= File.join(root, 'lib', 'spark', 'worker')
+  end
+
+  def self.ruby_spark_jar
+    @ruby_spark_jar ||= File.join(target_dir, 'ruby-spark.jar')
+  end
+
+  def self.spark_ext_dir
+    @spark_ext_dir ||= File.join(root, 'ext', 'spark')
+  end
+
+
+  # ===============================================================================
+  # Load JVM and jars
+
+  # Load dependent libraries; can be used only once.
+  # Cannot be loaded before CLI::install.
+  #
+  # == Parameters:
+  # spark_home::
+  #   path to the directory with Spark's .jar files, or to a single Spark jar
+  #
+  def self.load_lib(spark_home=nil)
+    return if @java_bridge
+
+    spark_home ||= Spark.target_dir
+
+    bridge = JavaBridge.get
+    @java_bridge = bridge.new(spark_home)
+    @java_bridge.load
+    nil
+  end
+
+  def self.java_bridge
+    @java_bridge
+  end
+
+
+  # Aliases
+  class << self
+    alias_method :sc, :context
+    alias_method :jb, :java_bridge
+  end
+
+end
+
+# C/Java extensions
+require 'ruby_spark_ext'
+
+# Ruby core extensions
+require 'spark/ext/module'
+require 'spark/ext/object'
+require 'spark/ext/hash'
+require 'spark/ext/string'
+require 'spark/ext/integer'
+require 'spark/ext/ip_socket'
+require 'spark/ext/io'
+
+# Other requirements
+require 'spark/version'
+require 'spark/error'
+
+# Make sure that Spark is always stopped
+Kernel::at_exit do
+  begin
+    Spark.stop
+  rescue
+  end
+end
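Taken together, `spark.rb` defines the gem's top-level lifecycle: configure, start, use the context, stop (stopping is also triggered by the `Kernel::at_exit` hook above). A minimal driver script might look like the sketch below; the `parallelize`/`map`/`collect` calls are assumptions based on the spec files in this release (`map_spec.rb`, `collect_spec.rb`) rather than code shown in this diff excerpt:

    require 'ruby-spark'

    Spark.config do
      set 'spark.app.name', 'RubySpark'   # from the Spark.config documentation above
    end

    Spark.start
    rdd = Spark.sc.parallelize(1..10)     # Spark.sc is the alias for Spark.context
    puts rdd.map(lambda { |x| x * 2 }).collect.inspect

    Spark.stop                            # also called automatically via Kernel::at_exit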