cassandra_model_spark 0.0.1.5-java

Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark.rb +42 -0
  24. data/lib/cassandra_model_spark/build.rb +24 -0
  25. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  26. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  27. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  28. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  29. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  30. data/lib/cassandra_model_spark/launcher.rb +150 -0
  31. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  32. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  33. data/lib/cassandra_model_spark/record.rb +18 -0
  34. data/lib/cassandra_model_spark/spark.rb +33 -0
  35. metadata +113 -0
data/ext/scala_helper/marshal_row_mapping.scala
@@ -0,0 +1,85 @@
+ package org.apache.spark.api.cassandra_model
+
+ import scala.collection.mutable._
+ import org.apache.spark.rdd._
+ import org.apache.spark.sql._
+ import com.datastax.spark.connector._
+ import com.datastax.spark.connector.rdd._
+ import org.apache.spark.sql.types._
+
+ object MapStringStringRowMapping {
+   private def canDecode(blob: Array[Byte]) = {
+     new String(blob.slice(0, 4)) == "MRSH"
+   }
+
+   private def decodeValue(blob: Array[Byte]): AnyRef = {
+     if (canDecode(blob)) {
+       val decoder = new MarshalLoader(blob)
+       val value = decoder.getValue()
+
+       value match {
+         case (m: Map[_, _]) => m map { case (key, value) => (String.valueOf(key), String.valueOf(value)) }
+         case _ => throw new IllegalArgumentException("Unsupported Ruby Type")
+       }
+     } else {
+       blob
+     }
+   }
+
+   private def updatedRow(row: CassandraRow): CassandraRow = {
+     val columns = row.columnNames
+     val values = row.columnValues.map{
+       value => value match {
+         case (blob: Array[Byte]) => decodeValue(blob)
+         case _ => value
+       }
+     }
+
+     new CassandraRow(columns, values)
+   }
+
+   def mappedRDD(rdd: RDD[CassandraRow]): RDD[CassandraRow] = {
+     rdd.map(
+       row => updatedRow(row)
+     )
+   }
+ }
+
+ object SparkRowRowMapping {
+   private def canDecode(blob: Array[Byte]) = {
+     new String(blob.slice(0, 4)) == "MRSH"
+   }
+
+   private def decodeValue(blob: Array[Byte]): AnyRef = {
+     if (canDecode(blob)) {
+       val decoder = new MarshalLoader(blob)
+       val value = decoder.getValue()
+
+       value match {
+         case (m: Map[_, _]) => Row.fromSeq(m.values.toSeq)
+         case (a: Array[_]) => Row.fromSeq(a.toSeq)
+         case _ => throw new IllegalArgumentException("Unsupported Ruby Type")
+       }
+     } else {
+       blob
+     }
+   }
+
+   private def updatedRow(row: CassandraRow): CassandraRow = {
+     val columns = row.columnNames
+     val values = row.columnValues.map{
+       value => value match {
+         case (blob: Array[Byte]) => decodeValue(blob)
+         case _ => value
+       }
+     }
+
+     new CassandraRow(columns, values)
+   }
+
+   def mappedRDD(rdd: RDD[CassandraRow]): RDD[CassandraRow] = {
+     rdd.map(
+       row => updatedRow(row)
+     )
+   }
+ }
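
For context, a minimal usage sketch (not part of the gem's sources): it assumes a SparkContext wired to a Cassandra cluster, and the keyspace/table names below are placeholders.

import org.apache.spark.{SparkConf, SparkContext}
import com.datastax.spark.connector._
import org.apache.spark.api.cassandra_model.MapStringStringRowMapping

object MappedRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("mapped-rdd-example"))

    // "my_keyspace" / "my_table" are hypothetical; cassandraTable comes from the
    // DataStax connector implicits and yields an RDD[CassandraRow].
    val rows = sc.cassandraTable("my_keyspace", "my_table")

    // Blob columns whose value starts with the "MRSH" marker are decoded into
    // string-to-string maps; all other values pass through unchanged.
    val decoded = MapStringStringRowMapping.mappedRDD(rows)
    decoded.take(5).foreach(println)
  }
}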
data/ext/scala_helper/project/plugins.sbt
@@ -0,0 +1,6 @@
+
+ resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
+ resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
+ resolvers += "Spray Repository" at "http://repo.spray.cc/"
+ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
+ addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")
data/ext/scala_helper/sbin/spark-config.sh
@@ -0,0 +1,30 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ # included in all the spark scripts with source command
+ # should not be executable directly
+ # also should not be passed any arguments, since we need original $*
+
+ # symlink and absolute path should rely on SPARK_HOME to resolve
+ if [ -z "${SPARK_HOME}" ]; then
+   export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+ fi
+
+ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}"
+ # Add the PySpark classes to the PYTHONPATH:
+ export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
+ export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:${PYTHONPATH}"
data/ext/scala_helper/sbin/spark-daemon.sh
@@ -0,0 +1,223 @@
+ #!/usr/bin/env bash
+
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ # Runs a Spark command as a daemon.
+ #
+ # Environment Variables
+ #
+ # SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf.
+ # SPARK_LOG_DIR Where log files are stored. ${SPARK_HOME}/logs by default.
+ # SPARK_MASTER host:path where spark code should be rsync'd from
+ # SPARK_PID_DIR The pid files are stored. /tmp by default.
+ # SPARK_IDENT_STRING A string representing this instance of spark. $USER by default
+ # SPARK_NICENESS The scheduling priority for daemons. Defaults to 0.
+ ##
+
+ usage="Usage: spark-daemon.sh [--config <conf-dir>] (start|stop|submit|status) <spark-command> <spark-instance-number> <args...>"
+
+ # if no args specified, show usage
+ if [ $# -le 1 ]; then
+   echo $usage
+   exit 1
+ fi
+
+ if [ -z "${SPARK_HOME}" ]; then
+   export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+ fi
+
+ . "${SPARK_HOME}/sbin/spark-config.sh"
+
+ # get arguments
+
+ # Check if --config is passed as an argument. It is an optional parameter.
+ # Exit if the argument is not a directory.
+
+ if [ "$1" == "--config" ]
+ then
+   shift
+   conf_dir="$1"
+   if [ ! -d "$conf_dir" ]
+   then
+     echo "ERROR : $conf_dir is not a directory"
+     echo $usage
+     exit 1
+   else
+     export SPARK_CONF_DIR="$conf_dir"
+   fi
+   shift
+ fi
+
+ option=$1
+ shift
+ command=$1
+ shift
+ instance=$1
+ shift
+
+ spark_rotate_log ()
+ {
+   log=$1;
+   num=5;
+   if [ -n "$2" ]; then
+     num=$2
+   fi
+   if [ -f "$log" ]; then # rotate logs
+     while [ $num -gt 1 ]; do
+       prev=`expr $num - 1`
+       [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num"
+       num=$prev
+     done
+     mv "$log" "$log.$num";
+   fi
+ }
+
+ . "${SPARK_HOME}/bin/load-spark-env.sh"
+
+ if [ "$SPARK_IDENT_STRING" = "" ]; then
+   export SPARK_IDENT_STRING="$USER"
+ fi
+
+
+ export SPARK_PRINT_LAUNCH_COMMAND="1"
+
+ # get log directory
+ if [ "$SPARK_LOG_DIR" = "" ]; then
+   export SPARK_LOG_DIR="${SPARK_HOME}/logs"
+ fi
+ mkdir -p "$SPARK_LOG_DIR"
+ touch "$SPARK_LOG_DIR"/.spark_test > /dev/null 2>&1
+ TEST_LOG_DIR=$?
+ if [ "${TEST_LOG_DIR}" = "0" ]; then
+   rm -f "$SPARK_LOG_DIR"/.spark_test
+ else
+   chown "$SPARK_IDENT_STRING" "$SPARK_LOG_DIR"
+ fi
+
+ if [ "$SPARK_PID_DIR" = "" ]; then
+   SPARK_PID_DIR=/tmp
+ fi
+
+ # some variables
+ log="$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out"
+ pid="$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid"
+
+ # Set default scheduling priority
+ if [ "$SPARK_NICENESS" = "" ]; then
+   export SPARK_NICENESS=0
+ fi
+
+ run_command() {
+   mode="$1"
+   shift
+
+   mkdir -p "$SPARK_PID_DIR"
+
+   if [ -f "$pid" ]; then
+     TARGET_ID="$(cat "$pid")"
+     if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+       echo "$command running as process $TARGET_ID. Stop it first."
+       exit 1
+     fi
+   fi
+
+   if [ "$SPARK_MASTER" != "" ]; then
+     echo rsync from "$SPARK_MASTER"
+     rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
+   fi
+
+   spark_rotate_log "$log"
+   echo "starting $command, logging to $log"
+
+   case "$mode" in
+     (class)
+       nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null &
+       newpid="$!"
+       ;;
+
+     (submit)
+       nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-submit --class $command "$@" >> "$log" 2>&1 < /dev/null &
+       newpid="$!"
+       ;;
+
+     (*)
+       echo "unknown mode: $mode"
+       exit 1
+       ;;
+   esac
+
+   echo "$newpid" > "$pid"
+   sleep 2
+   # Check if the process has died; in that case we'll tail the log so the user can see
+   if [[ ! $(ps -p "$newpid" -o comm=) =~ "java" ]]; then
+     echo "failed to launch $command:"
+     tail -2 "$log" | sed 's/^/ /'
+     echo "full log in $log"
+   fi
+ }
+
+ case $option in
+
+   (submit)
+     run_command submit "$@"
+     ;;
+
+   (start)
+     run_command class "$@"
+     ;;
+
+   (stop)
+
+     if [ -f $pid ]; then
+       TARGET_ID="$(cat "$pid")"
+       if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+         echo "stopping $command"
+         kill "$TARGET_ID" && rm -f "$pid"
+       else
+         echo "no $command to stop"
+       fi
+     else
+       echo "no $command to stop"
+     fi
+     ;;
+
+   (status)
+
+     if [ -f $pid ]; then
+       TARGET_ID="$(cat "$pid")"
+       if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+         echo $command is running.
+         exit 0
+       else
+         echo $pid file is present but $command not running
+         exit 1
+       fi
+     else
+       echo $command not running.
+       exit 2
+     fi
+     ;;
+
+   (*)
+     echo $usage
+     exit 1
+     ;;
+
+ esac
+
+
data/ext/scala_helper/schema_builder.scala
@@ -0,0 +1,35 @@
+ package org.apache.spark.api.cassandra_model
+
+ import org.apache.spark.rdd._
+ import com.datastax.spark.connector._
+ import com.datastax.spark.connector.rdd._
+ import org.apache.spark.sql._
+ import org.apache.spark.sql.types._
+
+ class SchemaBuilder {
+   var fields = Array[StructField]()
+
+   def build = StructType(fields)
+
+   def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
+     rdd.map(
+       p => Row.fromSeq(
+         p.columnValues.map{
+           p => p match {
+             case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
+             case _ => p
+           }
+         }
+       )
+     )
+   }
+
+   def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
+     val new_rdd = cassandraRDDToRDD(rdd)
+     sqlc.createDataFrame(new_rdd, build)
+   }
+
+   def addColumn(name: String, sql_type: DataType) = {
+     fields :+= StructField(name, sql_type, true)
+   }
+ }
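
A minimal sketch of how SchemaBuilder might be driven (this example is not in the gem; the column names, types, and table identifiers are assumptions): columns are registered in the same order as the CassandraRow values, then the resulting StructType is applied when building the DataFrame.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StringType, TimestampType}
import com.datastax.spark.connector._
import org.apache.spark.api.cassandra_model.SchemaBuilder

object SchemaBuilderExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("schema-builder-example"))
    val sqlc = new SQLContext(sc)

    // Register columns in the same order they appear in the Cassandra rows.
    val builder = new SchemaBuilder
    builder.addColumn("name", StringType)
    builder.addColumn("created_at", TimestampType)

    // "my_keyspace" / "my_table" are placeholders; java.util.Date values are
    // converted to java.sql.Timestamp by cassandraRDDToRDD before the schema is applied.
    val rows = sc.cassandraTable("my_keyspace", "my_table")
    val df = builder.createDataFrame(sqlc, rows)
    df.show()
  }
}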
data/ext/scala_helper/worker.scala
@@ -0,0 +1,13 @@
+ package org.apache.spark.deploy.worker
+
+ import org.apache.spark.SparkConf
+
+ object RubyWorkerStarter {
+   def startWorker(master_url: String, host: String, port: Int, web_ui_port: Int, conf: SparkConf) = {
+     val temp_conf = new SparkConf
+     val argv = Array(master_url)
+     val args = new WorkerArguments(argv, temp_conf)
+
+     Worker.startRpcEnvAndEndpoint(host, port, web_ui_port, args.cores, args.memory, args.masters, args.workDir, conf = conf)
+   }
+ }
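
A rough sketch of invoking this helper (not part of the gem; the master URL, host, and ports are placeholder values): it simply delegates to Worker.startRpcEnvAndEndpoint, with cores, memory, and work directory derived from WorkerArguments.

import org.apache.spark.SparkConf
import org.apache.spark.deploy.worker.RubyWorkerStarter

object RubyWorkerExample {
  def main(args: Array[String]): Unit = {
    // Placeholder endpoints: a standalone master on 7077, worker RPC on 7078, web UI on 8081.
    RubyWorkerStarter.startWorker(
      "spark://127.0.0.1:7077", // master_url
      "127.0.0.1",              // host
      7078,                     // port
      8081,                     // web_ui_port
      new SparkConf
    )
  }
}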
data/lib/cassandra_model_spark.rb
@@ -0,0 +1,42 @@
+ #--
+ # Copyright 2015 Thomas RM Rogers
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #++
+
+ require 'yaml'
+ require 'logger'
+
+ require 'concurrent'
+ require 'cassandra'
+ require 'active_support/all'
+ require 'active_support/core_ext/class/attribute_accessors'
+ require 'thomas_utils'
+ require 'batch_reactor'
+ require 'cassandra_model'
+ require 'rjb' unless RUBY_ENGINE == 'jruby' || CassandraModel.const_defined?('NO_BRIDGE')
+ require 'cassandra_model_spark/spark'
+
+ unless CassandraModel.const_defined?('NO_BRIDGE')
+   require 'cassandra_model_spark/java_bridge'
+   Dir["#{CassandraModel::Spark.classpath}/*.jar"].each { |file| require file }
+   initialize_java_engine
+   require 'cassandra_model_spark/java_classes'
+ end
+
+ require 'cassandra_model_spark/raw_connection'
+ require 'cassandra_model_spark/connection_cache'
+ require 'cassandra_model_spark/record'
+ require 'cassandra_model_spark/query_builder'
+ require 'cassandra_model_spark/data_frame'
+ require 'cassandra_model_spark/column_cast'
data/lib/cassandra_model_spark/build.rb
@@ -0,0 +1,24 @@
+ require 'optparse'
+ require_relative 'spark'
+
+ options = {}
+ OptionParser.new do |opts|
+   opts.banner = 'Usage: build.rb [--only-ext]'
+   opts.on('-e', '--only-ext', 'Build only extension') do
+     options[:only_ext] = true
+   end
+ end.parse!
+
+ ASSEMBLY_PATH = '/ext/scala_helper'
+
+ Dir.chdir("#{CassandraModel::Spark.root}#{ASSEMBLY_PATH}") do
+   puts '=> building extension...'
+   cmd = 'sbt package'
+   cmd << ' assemblyPackageDependency' unless options[:only_ext]
+   system(ENV.to_hash.merge('TARGET_DIR' => CassandraModel::Spark.classpath), cmd)
+   %w(bin sbin).each do |path|
+     puts "=> copying #{path}/ to #{CassandraModel::Spark.home}/"
+     `cp -R #{CassandraModel::Spark.root}#{ASSEMBLY_PATH}/#{path}/ #{CassandraModel::Spark.home}/`
+   end
+   `touch #{CassandraModel::Spark.home}/RELEASE`
+ end