cassandra_model_spark 0.0.1.5-java

Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark.rb +42 -0
  24. data/lib/cassandra_model_spark/build.rb +24 -0
  25. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  26. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  27. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  28. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  29. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  30. data/lib/cassandra_model_spark/launcher.rb +150 -0
  31. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  32. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  33. data/lib/cassandra_model_spark/record.rb +18 -0
  34. data/lib/cassandra_model_spark/spark.rb +33 -0
  35. metadata +113 -0
data/ext/scala_helper/marshal_row_mapping.scala
@@ -0,0 +1,85 @@
+ package org.apache.spark.api.cassandra_model
+
+ import scala.collection.mutable._
+ import org.apache.spark.rdd._
+ import org.apache.spark.sql._
+ import com.datastax.spark.connector._
+ import com.datastax.spark.connector.rdd._
+ import org.apache.spark.sql.types._
+
+ object MapStringStringRowMapping {
+   private def canDecode(blob: Array[Byte]) = {
+     new String(blob.slice(0, 4)) == "MRSH"
+   }
+
+   private def decodeValue(blob: Array[Byte]): AnyRef = {
+     if (canDecode(blob)) {
+       val decoder = new MarshalLoader(blob)
+       val value = decoder.getValue()
+
+       value match {
+         case (m: Map[_, _]) => m map { case (key, value) => (String.valueOf(key), String.valueOf(value)) }
+         case _ => throw new IllegalArgumentException("Unsupported Ruby Type")
+       }
+     } else {
+       blob
+     }
+   }
+
+   private def updatedRow(row: CassandraRow): CassandraRow = {
+     val columns = row.columnNames
+     val values = row.columnValues.map{
+       value => value match {
+         case (blob: Array[Byte]) => decodeValue(blob)
+         case _ => value
+       }
+     }
+
+     new CassandraRow(columns, values)
+   }
+
+   def mappedRDD(rdd: RDD[CassandraRow]): RDD[CassandraRow] = {
+     rdd.map(
+       row => updatedRow(row)
+     )
+   }
+ }
+
+ object SparkRowRowMapping {
+   private def canDecode(blob: Array[Byte]) = {
+     new String(blob.slice(0, 4)) == "MRSH"
+   }
+
+   private def decodeValue(blob: Array[Byte]): AnyRef = {
+     if (canDecode(blob)) {
+       val decoder = new MarshalLoader(blob)
+       val value = decoder.getValue()
+
+       value match {
+         case (m: Map[_, _]) => Row.fromSeq(m.values.toSeq)
+         case (a: Array[_]) => Row.fromSeq(a.toSeq)
+         case _ => throw new IllegalArgumentException("Unsupported Ruby Type")
+       }
+     } else {
+       blob
+     }
+   }
+
+   private def updatedRow(row: CassandraRow): CassandraRow = {
+     val columns = row.columnNames
+     val values = row.columnValues.map{
+       value => value match {
+         case (blob: Array[Byte]) => decodeValue(blob)
+         case _ => value
+       }
+     }
+
+     new CassandraRow(columns, values)
+   }
+
+   def mappedRDD(rdd: RDD[CassandraRow]): RDD[CassandraRow] = {
+     rdd.map(
+       row => updatedRow(row)
+     )
+   }
+ }
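
For context, a minimal usage sketch (not part of the gem's sources): it assumes a SparkContext wired to a Cassandra cluster, and the keyspace/table names below are placeholders.

import org.apache.spark.{SparkConf, SparkContext}
import com.datastax.spark.connector._
import org.apache.spark.api.cassandra_model.MapStringStringRowMapping

object MappedRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("mapped-rdd-example"))

    // "my_keyspace" / "my_table" are hypothetical; cassandraTable comes from the
    // DataStax connector implicits and yields an RDD[CassandraRow].
    val rows = sc.cassandraTable("my_keyspace", "my_table")

    // Blob columns whose value starts with the "MRSH" marker are decoded into
    // string-to-string maps; all other values pass through unchanged.
    val decoded = MapStringStringRowMapping.mappedRDD(rows)
    decoded.take(5).foreach(println)
  }
}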
data/ext/scala_helper/project/plugins.sbt
@@ -0,0 +1,6 @@
+
+ resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
+ resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
+ resolvers += "Spray Repository" at "http://repo.spray.cc/"
+ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
+ addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")
data/ext/scala_helper/sbin/spark-config.sh
@@ -0,0 +1,30 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ # included in all the spark scripts with source command
+ # should not be executable directly
+ # also should not be passed any arguments, since we need original $*
+
+ # symlink and absolute path should rely on SPARK_HOME to resolve
+ if [ -z "${SPARK_HOME}" ]; then
+   export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+ fi
+
+ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}"
+ # Add the PySpark classes to the PYTHONPATH:
+ export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
+ export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:${PYTHONPATH}"
data/ext/scala_helper/sbin/spark-daemon.sh
@@ -0,0 +1,223 @@
+ #!/usr/bin/env bash
+
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ # Runs a Spark command as a daemon.
+ #
+ # Environment Variables
+ #
+ # SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf.
+ # SPARK_LOG_DIR Where log files are stored. ${SPARK_HOME}/logs by default.
+ # SPARK_MASTER host:path where spark code should be rsync'd from
+ # SPARK_PID_DIR The pid files are stored. /tmp by default.
+ # SPARK_IDENT_STRING A string representing this instance of spark. $USER by default
+ # SPARK_NICENESS The scheduling priority for daemons. Defaults to 0.
+ ##
+
+ usage="Usage: spark-daemon.sh [--config <conf-dir>] (start|stop|submit|status) <spark-command> <spark-instance-number> <args...>"
+
+ # if no args specified, show usage
+ if [ $# -le 1 ]; then
+   echo $usage
+   exit 1
+ fi
+
+ if [ -z "${SPARK_HOME}" ]; then
+   export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+ fi
+
+ . "${SPARK_HOME}/sbin/spark-config.sh"
+
+ # get arguments
+
+ # Check if --config is passed as an argument. It is an optional parameter.
+ # Exit if the argument is not a directory.
+
+ if [ "$1" == "--config" ]
+ then
+   shift
+   conf_dir="$1"
+   if [ ! -d "$conf_dir" ]
+   then
+     echo "ERROR : $conf_dir is not a directory"
+     echo $usage
+     exit 1
+   else
+     export SPARK_CONF_DIR="$conf_dir"
+   fi
+   shift
+ fi
+
+ option=$1
+ shift
+ command=$1
+ shift
+ instance=$1
+ shift
+
+ spark_rotate_log ()
+ {
+   log=$1;
+   num=5;
+   if [ -n "$2" ]; then
+     num=$2
+   fi
+   if [ -f "$log" ]; then # rotate logs
+     while [ $num -gt 1 ]; do
+       prev=`expr $num - 1`
+       [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num"
+       num=$prev
+     done
+     mv "$log" "$log.$num";
+   fi
+ }
+
+ . "${SPARK_HOME}/bin/load-spark-env.sh"
+
+ if [ "$SPARK_IDENT_STRING" = "" ]; then
+   export SPARK_IDENT_STRING="$USER"
+ fi
+
+
+ export SPARK_PRINT_LAUNCH_COMMAND="1"
+
+ # get log directory
+ if [ "$SPARK_LOG_DIR" = "" ]; then
+   export SPARK_LOG_DIR="${SPARK_HOME}/logs"
+ fi
+ mkdir -p "$SPARK_LOG_DIR"
+ touch "$SPARK_LOG_DIR"/.spark_test > /dev/null 2>&1
+ TEST_LOG_DIR=$?
+ if [ "${TEST_LOG_DIR}" = "0" ]; then
+   rm -f "$SPARK_LOG_DIR"/.spark_test
+ else
+   chown "$SPARK_IDENT_STRING" "$SPARK_LOG_DIR"
+ fi
+
+ if [ "$SPARK_PID_DIR" = "" ]; then
+   SPARK_PID_DIR=/tmp
+ fi
+
+ # some variables
+ log="$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out"
+ pid="$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid"
+
+ # Set default scheduling priority
+ if [ "$SPARK_NICENESS" = "" ]; then
+   export SPARK_NICENESS=0
+ fi
+
+ run_command() {
+   mode="$1"
+   shift
+
+   mkdir -p "$SPARK_PID_DIR"
+
+   if [ -f "$pid" ]; then
+     TARGET_ID="$(cat "$pid")"
+     if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+       echo "$command running as process $TARGET_ID. Stop it first."
+       exit 1
+     fi
+   fi
+
+   if [ "$SPARK_MASTER" != "" ]; then
+     echo rsync from "$SPARK_MASTER"
+     rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
+   fi
+
+   spark_rotate_log "$log"
+   echo "starting $command, logging to $log"
+
+   case "$mode" in
+     (class)
+       nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null &
+       newpid="$!"
+       ;;
+
+     (submit)
+       nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-submit --class $command "$@" >> "$log" 2>&1 < /dev/null &
+       newpid="$!"
+       ;;
+
+     (*)
+       echo "unknown mode: $mode"
+       exit 1
+       ;;
+   esac
+
+   echo "$newpid" > "$pid"
+   sleep 2
+   # Check if the process has died; in that case we'll tail the log so the user can see
+   if [[ ! $(ps -p "$newpid" -o comm=) =~ "java" ]]; then
+     echo "failed to launch $command:"
+     tail -2 "$log" | sed 's/^/ /'
+     echo "full log in $log"
+   fi
+ }
+
+ case $option in
+
+   (submit)
+     run_command submit "$@"
+     ;;
+
+   (start)
+     run_command class "$@"
+     ;;
+
+   (stop)
+
+     if [ -f $pid ]; then
+       TARGET_ID="$(cat "$pid")"
+       if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+         echo "stopping $command"
+         kill "$TARGET_ID" && rm -f "$pid"
+       else
+         echo "no $command to stop"
+       fi
+     else
+       echo "no $command to stop"
+     fi
+     ;;
+
+   (status)
+
+     if [ -f $pid ]; then
+       TARGET_ID="$(cat "$pid")"
+       if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then
+         echo $command is running.
+         exit 0
+       else
+         echo $pid file is present but $command not running
+         exit 1
+       fi
+     else
+       echo $command not running.
+       exit 2
+     fi
+     ;;
+
+   (*)
+     echo $usage
+     exit 1
+     ;;
+
+ esac
+
+
data/ext/scala_helper/schema_builder.scala
@@ -0,0 +1,35 @@
+ package org.apache.spark.api.cassandra_model
+
+ import org.apache.spark.rdd._
+ import com.datastax.spark.connector._
+ import com.datastax.spark.connector.rdd._
+ import org.apache.spark.sql._
+ import org.apache.spark.sql.types._
+
+ class SchemaBuilder {
+   var fields = Array[StructField]()
+
+   def build = StructType(fields)
+
+   def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
+     rdd.map(
+       p => Row.fromSeq(
+         p.columnValues.map{
+           p => p match {
+             case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
+             case _ => p
+           }
+         }
+       )
+     )
+   }
+
+   def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
+     val new_rdd = cassandraRDDToRDD(rdd)
+     sqlc.createDataFrame(new_rdd, build)
+   }
+
+   def addColumn(name: String, sql_type: DataType) = {
+     fields :+= StructField(name, sql_type, true)
+   }
+ }
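
A minimal sketch of how SchemaBuilder might be driven (this example is not in the gem; the column names, types, and table identifiers are assumptions): columns are registered in the same order as the CassandraRow values, then the resulting StructType is applied when building the DataFrame.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StringType, TimestampType}
import com.datastax.spark.connector._
import org.apache.spark.api.cassandra_model.SchemaBuilder

object SchemaBuilderExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("schema-builder-example"))
    val sqlc = new SQLContext(sc)

    // Register columns in the same order they appear in the Cassandra rows.
    val builder = new SchemaBuilder
    builder.addColumn("name", StringType)
    builder.addColumn("created_at", TimestampType)

    // "my_keyspace" / "my_table" are placeholders; java.util.Date values are
    // converted to java.sql.Timestamp by cassandraRDDToRDD before the schema is applied.
    val rows = sc.cassandraTable("my_keyspace", "my_table")
    val df = builder.createDataFrame(sqlc, rows)
    df.show()
  }
}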
data/ext/scala_helper/worker.scala
@@ -0,0 +1,13 @@
+ package org.apache.spark.deploy.worker
+
+ import org.apache.spark.SparkConf
+
+ object RubyWorkerStarter {
+   def startWorker(master_url: String, host: String, port: Int, web_ui_port: Int, conf: SparkConf) = {
+     val temp_conf = new SparkConf
+     val argv = Array(master_url)
+     val args = new WorkerArguments(argv, temp_conf)
+
+     Worker.startRpcEnvAndEndpoint(host, port, web_ui_port, args.cores, args.memory, args.masters, args.workDir, conf = conf)
+   }
+ }
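
A rough sketch of invoking this helper (not part of the gem; the master URL, host, and ports are placeholder values): it simply delegates to Worker.startRpcEnvAndEndpoint, with cores, memory, and work directory derived from WorkerArguments.

import org.apache.spark.SparkConf
import org.apache.spark.deploy.worker.RubyWorkerStarter

object RubyWorkerExample {
  def main(args: Array[String]): Unit = {
    // Placeholder endpoints: a standalone master on 7077, worker RPC on 7078, web UI on 8081.
    RubyWorkerStarter.startWorker(
      "spark://127.0.0.1:7077", // master_url
      "127.0.0.1",              // host
      7078,                     // port
      8081,                     // web_ui_port
      new SparkConf
    )
  }
}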
data/lib/cassandra_model_spark.rb
@@ -0,0 +1,42 @@
+ #--
+ # Copyright 2015 Thomas RM Rogers
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #++
+
+ require 'yaml'
+ require 'logger'
+
+ require 'concurrent'
+ require 'cassandra'
+ require 'active_support/all'
+ require 'active_support/core_ext/class/attribute_accessors'
+ require 'thomas_utils'
+ require 'batch_reactor'
+ require 'cassandra_model'
+ require 'rjb' unless RUBY_ENGINE == 'jruby' || CassandraModel.const_defined?('NO_BRIDGE')
+ require 'cassandra_model_spark/spark'
+
+ unless CassandraModel.const_defined?('NO_BRIDGE')
+   require 'cassandra_model_spark/java_bridge'
+   Dir["#{CassandraModel::Spark.classpath}/*.jar"].each { |file| require file }
+   initialize_java_engine
+   require 'cassandra_model_spark/java_classes'
+ end
+
+ require 'cassandra_model_spark/raw_connection'
+ require 'cassandra_model_spark/connection_cache'
+ require 'cassandra_model_spark/record'
+ require 'cassandra_model_spark/query_builder'
+ require 'cassandra_model_spark/data_frame'
+ require 'cassandra_model_spark/column_cast'
data/lib/cassandra_model_spark/build.rb
@@ -0,0 +1,24 @@
+ require 'optparse'
+ require_relative 'spark'
+
+ options = {}
+ OptionParser.new do |opts|
+   opts.banner = 'Usage: build.rb [--only-ext]'
+   opts.on('-e', '--only-ext', 'Build only extension') do
+     options[:only_ext] = true
+   end
+ end.parse!
+
+ ASSEMBLY_PATH = '/ext/scala_helper'
+
+ Dir.chdir("#{CassandraModel::Spark.root}#{ASSEMBLY_PATH}") do
+   puts '=> building extension...'
+   cmd = 'sbt package'
+   cmd << ' assemblyPackageDependency' unless options[:only_ext]
+   system(ENV.to_hash.merge('TARGET_DIR' => CassandraModel::Spark.classpath), cmd)
+   %w(bin sbin).each do |path|
+     puts "=> copying #{path}/ to #{CassandraModel::Spark.home}/"
+     `cp -R #{CassandraModel::Spark.root}#{ASSEMBLY_PATH}/#{path}/ #{CassandraModel::Spark.home}/`
+   end
+   `touch #{CassandraModel::Spark.home}/RELEASE`
+ end