cassandra_model_spark 0.0.1.5

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark/build.rb +24 -0
  24. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  25. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  26. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  27. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  28. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  29. data/lib/cassandra_model_spark/launcher.rb +150 -0
  30. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  31. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  32. data/lib/cassandra_model_spark/record.rb +18 -0
  33. data/lib/cassandra_model_spark/spark.rb +33 -0
  34. data/lib/cassandra_model_spark.rb +42 -0
  35. metadata +127 -0
data/ext/scala_helper/sbin/spark-config.sh
@@ -0,0 +1,30 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ # included in all the spark scripts with source command
+ # should not be executable directly
+ # also should not be passed any arguments, since we need original $*
+
+ # symlink and absolute path should rely on SPARK_HOME to resolve
+ if [ -z "${SPARK_HOME}" ]; then
+   export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+ fi
+
+ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}"
+ # Add the PySpark classes to the PYTHONPATH:
+ export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
+ export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:${PYTHONPATH}"
data/ext/scala_helper/sbin/spark-daemon.sh
@@ -0,0 +1,223 @@
+ #!/usr/bin/env bash
+
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ # Runs a Spark command as a daemon.
+ #
+ # Environment Variables
+ #
+ # SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf.
+ # SPARK_LOG_DIR Where log files are stored. ${SPARK_HOME}/logs by default.
+ # SPARK_MASTER host:path where spark code should be rsync'd from
+ # SPARK_PID_DIR The pid files are stored. /tmp by default.
+ # SPARK_IDENT_STRING A string representing this instance of spark. $USER by default
+ # SPARK_NICENESS The scheduling priority for daemons. Defaults to 0.
+ ##
+
+ usage="Usage: spark-daemon.sh [--config <conf-dir>] (start|stop|submit|status) <spark-command> <spark-instance-number> <args...>"
+
+ # if no args specified, show usage
+ if [ $# -le 1 ]; then
+   echo $usage
+   exit 1
+ fi
+
+ if [ -z "${SPARK_HOME}" ]; then
+   export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+ fi
+
+ . "${SPARK_HOME}/sbin/spark-config.sh"
+
+ # get arguments
+
+ # Check if --config is passed as an argument. It is an optional parameter.
+ # Exit if the argument is not a directory.
+
+ if [ "$1" == "--config" ]
+ then
+   shift
+   conf_dir="$1"
+   if [ ! -d "$conf_dir" ]
+   then
+     echo "ERROR : $conf_dir is not a directory"
+     echo $usage
+     exit 1
+   else
+     export SPARK_CONF_DIR="$conf_dir"
+   fi
+   shift
+ fi
+
+ option=$1
+ shift
+ command=$1
+ shift
+ instance=$1
+ shift
+
+ spark_rotate_log ()
+ {
+   log=$1;
+   num=5;
+   if [ -n "$2" ]; then
+     num=$2
+   fi
+   if [ -f "$log" ]; then # rotate logs
+     while [ $num -gt 1 ]; do
+       prev=`expr $num - 1`
+       [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num"
+       num=$prev
+     done
+     mv "$log" "$log.$num";
+   fi
+ }
+
+ . "${SPARK_HOME}/bin/load-spark-env.sh"
+
+ if [ "$SPARK_IDENT_STRING" = "" ]; then
+   export SPARK_IDENT_STRING="$USER"
+ fi
+
+
+ export SPARK_PRINT_LAUNCH_COMMAND="1"
+
+ # get log directory
+ if [ "$SPARK_LOG_DIR" = "" ]; then
+   export SPARK_LOG_DIR="${SPARK_HOME}/logs"
+ fi
+ mkdir -p "$SPARK_LOG_DIR"
+ touch "$SPARK_LOG_DIR"/.spark_test > /dev/null 2>&1
+ TEST_LOG_DIR=$?
+ if [ "${TEST_LOG_DIR}" = "0" ]; then
+   rm -f "$SPARK_LOG_DIR"/.spark_test
+ else
+   chown "$SPARK_IDENT_STRING" "$SPARK_LOG_DIR"
+ fi
+
+ if [ "$SPARK_PID_DIR" = "" ]; then
+   SPARK_PID_DIR=/tmp
+ fi
+
+ # some variables
+ log="$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out"
+ pid="$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid"
+
+ # Set default scheduling priority
+ if [ "$SPARK_NICENESS" = "" ]; then
+   export SPARK_NICENESS=0
+ fi
+
+ run_command() {
+   mode="$1"
+   shift
+
+   mkdir -p "$SPARK_PID_DIR"
+
+   if [ -f "$pid" ]; then
+     TARGET_ID="$(cat "$pid")"
+     if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+       echo "$command running as process $TARGET_ID. Stop it first."
+       exit 1
+     fi
+   fi
+
+   if [ "$SPARK_MASTER" != "" ]; then
+     echo rsync from "$SPARK_MASTER"
+     rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
+   fi
+
+   spark_rotate_log "$log"
+   echo "starting $command, logging to $log"
+
+   case "$mode" in
+     (class)
+       nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null &
+       newpid="$!"
+       ;;
+
+     (submit)
+       nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-submit --class $command "$@" >> "$log" 2>&1 < /dev/null &
+       newpid="$!"
+       ;;
+
+     (*)
+       echo "unknown mode: $mode"
+       exit 1
+       ;;
+   esac
+
+   echo "$newpid" > "$pid"
+   sleep 2
+   # Check if the process has died; in that case we'll tail the log so the user can see
+   if [[ ! $(ps -p "$newpid" -o comm=) =~ "java" ]]; then
+     echo "failed to launch $command:"
+     tail -2 "$log" | sed 's/^/ /'
+     echo "full log in $log"
+   fi
+ }
+
+ case $option in
+
+   (submit)
+     run_command submit "$@"
+     ;;
+
+   (start)
+     run_command class "$@"
+     ;;
+
+   (stop)
+
+     if [ -f $pid ]; then
+       TARGET_ID="$(cat "$pid")"
+       if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+         echo "stopping $command"
+         kill "$TARGET_ID" && rm -f "$pid"
+       else
+         echo "no $command to stop"
+       fi
+     else
+       echo "no $command to stop"
+     fi
+     ;;
+
+   (status)
+
+     if [ -f $pid ]; then
+       TARGET_ID="$(cat "$pid")"
+       if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+         echo $command is running.
+         exit 0
+       else
+         echo $pid file is present but $command not running
+         exit 1
+       fi
+     else
+       echo $command not running.
+       exit 2
+     fi
+     ;;
+
+   (*)
+     echo $usage
+     exit 1
+     ;;
+
+ esac
+
+
data/ext/scala_helper/schema_builder.scala
@@ -0,0 +1,35 @@
+ package org.apache.spark.api.cassandra_model
+
+ import org.apache.spark.rdd._
+ import com.datastax.spark.connector._
+ import com.datastax.spark.connector.rdd._
+ import org.apache.spark.sql._
+ import org.apache.spark.sql.types._
+
+ class SchemaBuilder {
+   var fields = Array[StructField]()
+
+   def build = StructType(fields)
+
+   def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
+     rdd.map(
+       p => Row.fromSeq(
+         p.columnValues.map{
+           p => p match {
+             case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
+             case _ => p
+           }
+         }
+       )
+     )
+   }
+
+   def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
+     val new_rdd = cassandraRDDToRDD(rdd)
+     sqlc.createDataFrame(new_rdd, build)
+   }
+
+   def addColumn(name: String, sql_type: DataType) = {
+     fields :+= StructField(name, sql_type, true)
+   }
+ }
data/ext/scala_helper/worker.scala
@@ -0,0 +1,13 @@
+ package org.apache.spark.deploy.worker
+
+ import org.apache.spark.SparkConf
+
+ object RubyWorkerStarter {
+   def startWorker(master_url: String, host: String, port: Int, web_ui_port: Int, conf: SparkConf) = {
+     val temp_conf = new SparkConf
+     val argv = Array(master_url)
+     val args = new WorkerArguments(argv, temp_conf)
+
+     Worker.startRpcEnvAndEndpoint(host, port, web_ui_port, args.cores, args.memory, args.masters, args.workDir, conf = conf)
+   }
+ }
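
RubyWorkerStarter is a thin entry point that lets the gem start a Spark worker in-process rather than shelling out to Spark's own scripts. Below is a hedged JRuby sketch of how it could be invoked once the helper jar is on the classpath; the constant names follow JRuby's standard package-to-module mapping, and the master URL, host, and ports are illustrative values only (the gem's launcher.rb presumably supplies real ones):

    require 'java'

    # Hypothetical values for illustration.
    conf = Java::OrgApacheSpark::SparkConf.new
    Java::OrgApacheSparkDeployWorker::RubyWorkerStarter.startWorker(
      'spark://127.0.0.1:7077', # master URL
      '127.0.0.1',              # host to bind the worker to
      7078,                     # worker port
      8081,                     # web UI port
      conf
    )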
data/lib/cassandra_model_spark/build.rb
@@ -0,0 +1,24 @@
+ require 'optparse'
+ require_relative 'spark'
+
+ options = {}
+ OptionParser.new do |opts|
+   opts.banner = 'Usage: build.rb [--only-ext]'
+   opts.on('-e', '--only-ext', 'Build only extension') do
+     options[:only_ext] = true
+   end
+ end.parse!
+
+ ASSEMBLY_PATH = '/ext/scala_helper'
+
+ Dir.chdir("#{CassandraModel::Spark.root}#{ASSEMBLY_PATH}") do
+   puts '=> building extension...'
+   cmd = 'sbt package'
+   cmd << ' assemblyPackageDependency' unless options[:only_ext]
+   system(ENV.to_hash.merge('TARGET_DIR' => CassandraModel::Spark.classpath), cmd)
+   %w(bin sbin).each do |path|
+     puts "=> copying #{path}/ to #{CassandraModel::Spark.home}/"
+     `cp -R #{CassandraModel::Spark.root}#{ASSEMBLY_PATH}/#{path}/ #{CassandraModel::Spark.home}/`
+   end
+   `touch #{CassandraModel::Spark.home}/RELEASE`
+ end
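
build.rb is the gem's out-of-band build step: it runs sbt inside ext/scala_helper (adding the assemblyPackageDependency task unless --only-ext is given), copies the bin/ and sbin/ scripts into the Spark home, and touches a RELEASE marker. A sketch of running it directly, assuming sbt is installed; the bundled cmodel-spark-build executable listed above is presumably the intended entry point:

    # Full build: helper jar plus assembled dependencies
    ruby lib/cassandra_model_spark/build.rb

    # Helper jar only, skipping the dependency assembly
    ruby lib/cassandra_model_spark/build.rb --only-ext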
data/lib/cassandra_model_spark/column_cast.rb
@@ -0,0 +1,44 @@
+ module CassandraModel
+   module Spark
+     class ColumnCast
+       include ThomasUtils::SymbolHelpers
+
+       attr_reader :key
+
+       def initialize(key, type)
+         @key = key
+         @type = type.to_s.upcase
+       end
+
+       def quote(quote)
+         quoted_key = if @key.respond_to?(:quote)
+           @key.quote(quote)
+         else
+           "#{quote}#{@key}#{quote}"
+         end
+         "CAST(#{quoted_key} AS #{@type})"
+       end
+
+       def new_key(key)
+         self.class.new(key, @type)
+       end
+
+     end
+   end
+ end
+
+ module ThomasUtils
+   class KeyChild
+     def cast_as(type)
+       CassandraModel::Spark::ColumnCast.new(self, type)
+     end
+     alias :* :cast_as
+   end
+ end
+
+ class Symbol
+   def cast_as(type)
+     CassandraModel::Spark::ColumnCast.new(self, type)
+   end
+   alias :* :cast_as
+ end
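
column_cast.rb adds a small casting DSL: ColumnCast renders a SQL CAST expression around a (possibly quoted) column key, and the reopened Symbol and ThomasUtils::KeyChild classes expose it as #cast_as, aliased to *. A minimal behavior sketch, assuming the gem and its thomas_utils dependency are installed and loaded:

    require 'cassandra_model_spark'

    cast = :price.cast_as(:double)  # => a CassandraModel::Spark::ColumnCast
    cast.quote('"')                 # => 'CAST("price" AS DOUBLE)'

    # The :* alias reads as an operator; the type symbol is upcased:
    (:created_at * :timestamp).quote('`')  # => 'CAST(`created_at` AS TIMESTAMP)'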
data/lib/cassandra_model_spark/connection_cache.rb
@@ -0,0 +1,9 @@
+ module CassandraModel
+   class ConnectionCache
+     def self.clear
+       @@cache.values.map(&:java_spark_context).map(&:stop)
+       @@cache.values.map(&:shutdown)
+       @@cache.clear
+     end
+   end
+ end
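
connection_cache.rb reopens ConnectionCache so that clear also stops each cached connection's JavaSparkContext before shutting the connections down, preventing leaked JVM-side Spark contexts. Usage is unchanged:

    # At application shutdown: stop Spark contexts, close Cassandra
    # connections, and empty the cache in one call.
    CassandraModel::ConnectionCache.clear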