cassandra_model_spark 0.0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +17 -0
- data/bin/cmodel-spark-build +7 -0
- data/bin/cmodel-spark-env.rb +11 -0
- data/bin/cmodel-spark-master +22 -0
- data/bin/cmodel-spark-run-master +4 -0
- data/bin/cmodel-spark-run-master.sh +8 -0
- data/bin/cmodel-spark-run-slave +4 -0
- data/bin/cmodel-spark-run-slave.sh +8 -0
- data/bin/cmodel-spark-slaves +22 -0
- data/ext/scala_helper/bin/load-spark-env.sh +63 -0
- data/ext/scala_helper/bin/spark-class +87 -0
- data/ext/scala_helper/build.sbt +62 -0
- data/ext/scala_helper/cassandra_helper.scala +23 -0
- data/ext/scala_helper/data_type_helper.scala +27 -0
- data/ext/scala_helper/marshal_loader.scala +204 -0
- data/ext/scala_helper/marshal_row_mapping.scala +85 -0
- data/ext/scala_helper/project/plugins.sbt +6 -0
- data/ext/scala_helper/sbin/spark-config.sh +30 -0
- data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
- data/ext/scala_helper/schema_builder.scala +35 -0
- data/ext/scala_helper/worker.scala +13 -0
- data/lib/cassandra_model_spark/build.rb +24 -0
- data/lib/cassandra_model_spark/column_cast.rb +44 -0
- data/lib/cassandra_model_spark/connection_cache.rb +9 -0
- data/lib/cassandra_model_spark/data_frame.rb +374 -0
- data/lib/cassandra_model_spark/java_bridge.rb +91 -0
- data/lib/cassandra_model_spark/java_classes.rb +36 -0
- data/lib/cassandra_model_spark/launcher.rb +150 -0
- data/lib/cassandra_model_spark/query_builder.rb +37 -0
- data/lib/cassandra_model_spark/raw_connection.rb +47 -0
- data/lib/cassandra_model_spark/record.rb +18 -0
- data/lib/cassandra_model_spark/spark.rb +33 -0
- data/lib/cassandra_model_spark.rb +42 -0
- metadata +127 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 27a73c07406303279c26f7e8f112be77e17c4d1b
  data.tar.gz: 946084377df9d9ed40cc37e6feb9a9fedc15fa79
SHA512:
  metadata.gz: 06d57399fb89e4d0252ab1abbbe086e53904332afd25c66fd59ab18bd4d0e2a24a8eb14a7cc65f8c302817414e8afd8999818a74523f4e42c6d55b794b938e6c
  data.tar.gz: 5462ff873fbb73f82d7424a989e7377b5e1826a54a8e234fe3ee732bff4936d3ed3646f256ce71f3bd189878c4941f38d3ff1c5845bc7b75ba318a992301cedf
data/README.md
ADDED
@@ -0,0 +1,17 @@
# cassandra_model_spark

## Copyright

Copyright 2015 Thomas Rogers.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

[http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0)

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
data/bin/cmodel-spark-env.rb
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env ruby

lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)

require 'bundler/setup'
require 'cassandra_model_spark/spark'

print "export SPARK_HOME=#{CassandraModel::Spark.home}"
data/bin/cmodel-spark-master
ADDED
@@ -0,0 +1,22 @@
#!/usr/bin/env ruby

lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)

require 'bundler/setup'
require 'cassandra_model_spark'
require 'cassandra_model_spark/launcher'

command = ARGV.shift.downcase.to_sym
launcher = CassandraModel::Spark::Launcher.new
case command
  when :start
    launcher.start_master
  when :run
    launcher.run_master
  when :stop
    launcher.stop_master
  else
    puts '=> only supports start, run, or stop'
end
data/bin/cmodel-spark-slaves
ADDED
@@ -0,0 +1,22 @@
#!/usr/bin/env ruby

lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)

require 'bundler/setup'
require 'cassandra_model_spark'
require 'cassandra_model_spark/launcher'

command = ARGV.shift.downcase.to_sym
launcher = CassandraModel::Spark::Launcher.new
case command
  when :start
    launcher.start_slaves
  when :run
    launcher.run_slave
  when :stop
    launcher.stop_slaves
  else
    puts '=> only supports start, run, or stop'
end
data/ext/scala_helper/bin/load-spark-env.sh
ADDED
@@ -0,0 +1,63 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script loads spark-env.sh if it exists, and ensures it is only loaded once.
# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's
# conf/ subdirectory.

# Figure out where Spark is installed
if [ -z "${SPARK_HOME}" ]; then
  export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

if [ -z "$SPARK_ENV_LOADED" ]; then
  export SPARK_ENV_LOADED=1

  # Returns the parent of the directory this script lives in.
  parent_dir="${SPARK_HOME}"

  user_conf_dir="${SPARK_CONF_DIR:-"$parent_dir"/conf}"

  if [ -f "${user_conf_dir}/spark-env.sh" ]; then
    # Promote all variable declarations to environment (exported) variables
    set -a
    . "${user_conf_dir}/spark-env.sh"
    set +a
  fi
fi

# Setting SPARK_SCALA_VERSION if not already set.

if [ -z "$SPARK_SCALA_VERSION" ]; then

  ASSEMBLY_DIR2="${SPARK_HOME}/assembly/target/scala-2.11"
  ASSEMBLY_DIR1="${SPARK_HOME}/assembly/target/scala-2.10"

  if [[ -d "$ASSEMBLY_DIR2" && -d "$ASSEMBLY_DIR1" ]]; then
    echo -e "Presence of build for both scala versions(SCALA 2.10 and SCALA 2.11) detected." 1>&2
    echo -e 'Either clean one of them or, export SPARK_SCALA_VERSION=2.11 in spark-env.sh.' 1>&2
    exit 1
  fi

  if [ -d "$ASSEMBLY_DIR2" ]; then
    export SPARK_SCALA_VERSION="2.11"
  else
    export SPARK_SCALA_VERSION="2.10"
  fi
fi
data/ext/scala_helper/bin/spark-class
ADDED
@@ -0,0 +1,87 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

if [ -z "${SPARK_HOME}" ]; then
  export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

. "${SPARK_HOME}"/bin/load-spark-env.sh

# Find the java binary
if [ -n "${JAVA_HOME}" ]; then
  RUNNER="${JAVA_HOME}/bin/java"
else
  if [ `command -v java` ]; then
    RUNNER="java"
  else
    echo "JAVA_HOME is not set" >&2
    exit 1
  fi
fi

# Find assembly jar
SPARK_ASSEMBLY_JAR=
if [ -f "${SPARK_HOME}/RELEASE" ]; then
  ASSEMBLY_DIR="${SPARK_HOME}/lib"
else
  ASSEMBLY_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION"
fi

GREP_OPTIONS=
num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)"
if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" -a "$SPARK_PREPEND_CLASSES" != "1" ]; then
  echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2
  echo "You need to build Spark before running this program." 1>&2
  exit 1
fi
if [ -d "$ASSEMBLY_DIR" ]; then
  ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)"
  if [ "$num_jars" -gt "1" ]; then
    echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2
    echo "$ASSEMBLY_JARS" 1>&2
    echo "Please remove all but one jar." 1>&2
    exit 1
  fi
fi

SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"

LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR"

# Add the launcher build dir to the classpath if requested.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
  LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
fi

export _SPARK_ASSEMBLY="$SPARK_ASSEMBLY_JAR"

# For tests
if [[ -n "$SPARK_TESTING" ]]; then
  unset YARN_CONF_DIR
  unset HADOOP_CONF_DIR
fi

# The launcher library will print arguments separated by a NULL character, to allow arguments with
# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
# an array that will be used to exec the final command.
CMD=()
while IFS= read -d '' -r ARG; do
  CMD+=("$ARG")
done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@")
exec "${CMD[@]}"
data/ext/scala_helper/build.sbt
ADDED
@@ -0,0 +1,62 @@
import AssemblyKeys._

assemblySettings

name := "cmodel_scala_helper"
version := "0.0.1"
scalaVersion := "2.10.4"

val sparkVersion = "1.5.2"
val hadoopClientVersion = "1.0.4"
val cassandraConnectionVersion = "1.5.0-M3"

val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", "target")

javacOptions ++= Seq("-source", "1.7", "-target", "1.7")

artifactPath in Compile in packageBin := file(s"${_targetDir}/cmodel_scala_helper.jar")
outputPath in packageDependency := file(s"${_targetDir}/spark-assembly-${sparkVersion}-cassandra_model-hadoop${hadoopClientVersion}.jar")

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion excludeAll(ExclusionRule(organization = "org.apache.hadoop")),
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-hive" % sparkVersion,
  "org.apache.spark" %% "spark-streaming" % sparkVersion,
  "org.apache.spark" %% "spark-streaming-kafka" % sparkVersion,
  "org.apache.spark" %% "spark-streaming-flume" % sparkVersion,
  "org.apache.spark" %% "spark-graphx" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion,
  "org.apache.hadoop" % "hadoop-client" % hadoopClientVersion,
  "com.datastax.spark" %% "spark-cassandra-connector" % cassandraConnectionVersion,
  "com.datastax.spark" %% "spark-cassandra-connector-java" % cassandraConnectionVersion,
  "com.github.fommil.netlib" % "all" % "1.1.2",
  "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
)

resolvers ++= Seq(
  "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
  "Spray Repository" at "http://repo.spray.cc/",
  "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
  "Akka Repository" at "http://repo.akka.io/releases/",
  "Twitter4J Repository" at "http://twitter4j.org/maven2/",
  "Apache HBase" at "https://repository.apache.org/content/repositories/releases",
  "Twitter Maven Repo" at "http://maven.twttr.com/",
  "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools",
  "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
  "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
  "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven",
  Resolver.sonatypeRepo("public")
)

mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
  {
    case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
    case m if m.startsWith("META-INF") => MergeStrategy.discard
    case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first
    case PathList("org", "apache", xs @ _*) => MergeStrategy.first
    case PathList("org", "jboss", xs @ _*) => MergeStrategy.first
    case "about.html" => MergeStrategy.rename
    case "reference.conf" => MergeStrategy.concat
    case _ => MergeStrategy.first
  }
}
data/ext/scala_helper/cassandra_helper.scala
ADDED
@@ -0,0 +1,23 @@
package org.apache.spark.api.cassandra_model

import org.apache.spark._
import com.datastax.spark.connector._
import com.datastax.spark.connector.rdd._
import java.util._
import scala.collection.JavaConversions.mapAsScalaMap

object CassandraHelper {
  def cassandraTable(sc: SparkContext, keyspace: String, table: String) = {
    sc.cassandraTable(keyspace, table)
  }
  def filterRDD(rdd: CassandraRDD[CassandraRow], restriction: HashMap[String, Any]) = {
    var result = rdd
    for ((k,v) <- restriction) {
      result = v match {
        case (a: Array[Any]) => result.where(k, a : _*)
        case _ => result.where(k, v)
      }
    }
    result
  }
}
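CassandraHelper wraps the connector's implicit `cassandraTable` call and a restriction loop behind plain method calls, presumably so the gem's Ruby side can drive them over a Java bridge. A minimal Scala sketch of calling it directly (the keyspace, table, and `partition_key = ?` restriction are hypothetical, and the connection host assumes a local Cassandra node):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.cassandra_model.CassandraHelper

val conf = new SparkConf()
  .setAppName("cmodel-helper-sketch")
  .setMaster("local[2]")
  .set("spark.cassandra.connection.host", "127.0.0.1") // assumes a local Cassandra node
val sc = new SparkContext(conf)

// Build the table RDD through the helper instead of the connector implicits
val rdd = CassandraHelper.cassandraTable(sc, "my_keyspace", "my_table")

// filterRDD folds each entry into CassandraRDD#where; treating keys as CQL
// predicates is an assumption about how the gem intends it to be called
val restriction = new java.util.HashMap[String, Any]()
restriction.put("partition_key = ?", "some-id")
val filtered = CassandraHelper.filterRDD(rdd, restriction)
println(filtered.count())
```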
data/ext/scala_helper/data_type_helper.scala
ADDED
@@ -0,0 +1,27 @@
package org.apache.spark.api.cassandra_model

import org.apache.spark.sql.types._

object DataTypeHelper {
  def getArrayType(key_type: DataType) = ArrayType(key_type)
  def getBinaryType = BinaryType
  def getBooleanType = BooleanType
  def getByteType = ByteType
  def getDataType = DataType
  def getDateType = DateType
  def getDecimal = Decimal
  def getDecimalType = DecimalType
  def getDoubleType = DoubleType
  def getFloatType = FloatType
  def getIntegerType = IntegerType
  def getLongType = LongType
  def getMapType(key_type: DataType, value_type: DataType) = MapType(key_type, value_type)
  def getMetadata = Metadata
  def getNullType = NullType
  def getPrecisionInfo = PrecisionInfo
  def getShortType = ShortType
  def getStringType = StringType
  def getStructField = StructField
  def getStructType = StructType
  def getTimestampType = TimestampType
}
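The getters above hand back Spark SQL's singleton types and companion objects, which appears to exist so a non-Scala caller can reach them without Scala's object syntax. A small sketch of assembling a schema through the helper (the field names are illustrative):

```scala
import org.apache.spark.sql.types.StructField
import org.apache.spark.api.cassandra_model.DataTypeHelper

val fields = Array(
  StructField("id", DataTypeHelper.getStringType, nullable = false),
  StructField("count", DataTypeHelper.getIntegerType),
  StructField("tags", DataTypeHelper.getArrayType(DataTypeHelper.getStringType))
)
// getStructType returns the StructType companion object, so apply() builds the schema
val schema = DataTypeHelper.getStructType.apply(fields)
println(schema)
```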
data/ext/scala_helper/marshal_loader.scala
ADDED
@@ -0,0 +1,204 @@
package org.apache.spark.api.cassandra_model

import scala.collection.mutable._

class MarshalLoader (dump: Array[Byte]) {
  private val bytes: Array[Byte] = dump
  private var parse_index: Int = 0
  private var symbol_table: List[String] = List()
  private var object_table: List[AnyRef] = List()

  private def getBytes() = {
    bytes
  }

  private def recordObject(obj: AnyRef) = {
    object_table :+= obj
  }

  private def nextBytes(amount: Int): Array[Byte] = {
    val result = bytes.slice(parse_index, parse_index + amount)
    parse_index = parse_index + amount
    result
  }

  private def nextByte(): Byte = {
    val result = bytes(parse_index)

    parse_index = parse_index + 1

    result
  }

  private def scanIntBits(num_bytes: Int) = {
    var bit: Int = 0
    var value: Int = 0

    for (bit <- 0 to num_bytes-1) {
      val next_value = 0xff & nextByte()
      value += (next_value << (bit * 8))
    }
    value
  }

  private def decodeInt(): java.lang.Integer = {
    val first_byte: Int = nextByte()
    var value: Int = 0

    if (first_byte == 0)
      return 0
    if (first_byte >= 6)
      return first_byte - 5
    if (first_byte <= -6)
      return 5 + first_byte

    var num_bytes = first_byte
    if (num_bytes > 0) {
      value = scanIntBits(num_bytes)
    } else {
      num_bytes = -num_bytes
      value = scanIntBits(num_bytes)
      val long_bytes = num_bytes
      val long_value: Long = value - (1L << (long_bytes * 8L))
      value = long_value.toInt
    }

    value
  }

  private def decodeDouble(): java.lang.Double = {
    val length = decodeInt()
    val str_value = new String(nextBytes(length))

    val result: java.lang.Double = str_value.toDouble
    recordObject(result)
    result
  }

  private def decodeASCIIString(): String = {
    val length = decodeInt()

    val result = new String(nextBytes(length))
    recordObject(result)
    result
  }

  private def decodeSymbol(): String = {
    val length = decodeInt()
    val string_bytes = nextBytes(length)
    val result = new String(string_bytes)

    symbol_table :+= result

    result
  }

  private def decodeSymLink(): String = {
    val index = decodeInt()

    symbol_table(index)
  }

  private def decodeString(): String = {
    val string_code = nextByte()
    val length = decodeInt()
    val str_bytes = nextBytes(length)
    val var_count = decodeInt()
    val encoding = decodeAny()

    if (encoding == "E") {
      val is_utf8 = decodeAny()
    }

    val result = new String(str_bytes)
    recordObject(result)
    result
  }

  private def decodeMagic(): String = {
    val magic = new String(nextBytes(4))

    if (magic != "MRSH") {
      throw new IllegalArgumentException("Invalid format header: '" + magic + "'")
    }

    magic
  }

  private def decodeVersion(): Array[Byte] = {
    val version = nextBytes(2)

    if (version(0) != 0x04 || version(1) != 0x08) {
      throw new IllegalArgumentException("Invalid Marshal version: [" + version(0) + "], [" + version(1) + "]")
    }

    version
  }

  private def decodeHashItem(): String = {
    val ivar_code = nextByte()
    decodeString()
  }

  private def decodeHash(): HashMap[AnyRef, AnyRef] = {
    var result = new HashMap[AnyRef, AnyRef]
    val length = decodeInt()

    var item = 0
    for (item <- 0 to length-1) {
      val key = decodeAny()
      val value = decodeAny()
      result(key) = value
    }

    recordObject(result)
    result
  }

  private def decodeArray(): Array[AnyRef] = {
    var list_result: List[AnyRef] = List()
    val length = decodeInt()

    var item = 0
    for (item <- 0 to length-1) {
      val value = decodeAny()
      list_result :+= value
    }

    val result = list_result.toArray
    recordObject(result)
    result
  }

  private def decodeObjectReference(): AnyRef = {
    val index = decodeInt()-1

    object_table(index)
  }

  private def decodeAny(): AnyRef = {
    val code = nextByte()

    code match {
      case 0x30 => null
      case 0x54 => true: java.lang.Boolean
      case 0x46 => false: java.lang.Boolean
      case 0x69 => decodeInt()
      case 0x66 => decodeDouble()
      case 0x3a => decodeSymbol()
      case 0x3b => decodeSymLink()
      case 0x7b => decodeHash()
      case 0x5b => decodeArray()
      case 0x22 => decodeASCIIString()
      case 0x49 => decodeString()
      case 0x40 => decodeObjectReference()
      case _ => throw new IllegalArgumentException("Unsupported code type: " + code)
    }
  }

  private val magic = decodeMagic()
  private val version = decodeVersion()
  private val value = decodeAny()

  def getValue(): AnyRef = value
}
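MarshalLoader expects the gem's "MRSH" magic, the Ruby Marshal 4.8 version bytes, and then one tagged value, as the checks above show. A minimal sketch decoding a hand-built payload containing just Ruby's `nil` (tag 0x30 in the `decodeAny` table):

```scala
import org.apache.spark.api.cassandra_model.MarshalLoader

// "MRSH" magic + Marshal version 4.8 + a single nil value (tag 0x30)
val blob: Array[Byte] = "MRSH".getBytes ++ Array[Byte](0x04, 0x08, 0x30)

val loader = new MarshalLoader(blob)
println(loader.getValue()) // prints "null": Ruby nil decodes to a JVM null
```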
data/ext/scala_helper/marshal_row_mapping.scala
ADDED
@@ -0,0 +1,85 @@
package org.apache.spark.api.cassandra_model

import scala.collection.mutable._
import org.apache.spark.rdd._
import org.apache.spark.sql._
import com.datastax.spark.connector._
import com.datastax.spark.connector.rdd._
import org.apache.spark.sql.types._

object MapStringStringRowMapping {
  private def canDecode(blob: Array[Byte]) = {
    new String(blob.slice(0, 4)) == "MRSH"
  }

  private def decodeValue(blob: Array[Byte]): AnyRef = {
    if (canDecode(blob)) {
      val decoder = new MarshalLoader(blob)
      val value = decoder.getValue()

      value match {
        case (m: Map[_, _]) => m map { case (key, value) => (String.valueOf(key), String.valueOf(value)) }
        case _ => new IllegalArgumentException("Unsupported Ruby Type")
      }
    } else {
      blob
    }
  }

  private def updatedRow(row: CassandraRow): CassandraRow = {
    val columns = row.columnNames
    val values = row.columnValues.map{
      value => value match {
        case (blob: Array[Byte]) => decodeValue(blob)
        case _ => value
      }
    }

    new CassandraRow(columns, values)
  }

  def mappedRDD(rdd: RDD[CassandraRow]): RDD[CassandraRow] = {
    rdd.map(
      row => updatedRow(row)
    )
  }
}

object SparkRowRowMapping {
  private def canDecode(blob: Array[Byte]) = {
    new String(blob.slice(0, 4)) == "MRSH"
  }

  private def decodeValue(blob: Array[Byte]): AnyRef = {
    if (canDecode(blob)) {
      val decoder = new MarshalLoader(blob)
      val value = decoder.getValue()

      value match {
        case (m: Map[_, _]) => Row.fromSeq(m.values.toSeq)
        case (a: Array[_]) => Row.fromSeq(a.toSeq)
        case _ => new IllegalArgumentException("Unsupported Ruby Type")
      }
    } else {
      blob
    }
  }

  private def updatedRow(row: CassandraRow): CassandraRow = {
    val columns = row.columnNames
    val values = row.columnValues.map{
      value => value match {
        case (blob: Array[Byte]) => decodeValue(blob)
        case _ => value
      }
    }

    new CassandraRow(columns, values)
  }

  def mappedRDD(rdd: RDD[CassandraRow]): RDD[CassandraRow] = {
    rdd.map(
      row => updatedRow(row)
    )
  }
}
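Both mappers only rewrite blob columns that begin with the "MRSH" magic and pass every other value through, so they can be applied to a whole table scan. A hedged sketch combining them with CassandraHelper (the keyspace and table names are hypothetical, and the connection host assumes a local Cassandra node):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.cassandra_model.{CassandraHelper, MapStringStringRowMapping}

val sc = new SparkContext(
  new SparkConf()
    .setAppName("decode-sketch")
    .setMaster("local[2]")
    .set("spark.cassandra.connection.host", "127.0.0.1") // assumes a local Cassandra node
)

// Decode Ruby-marshalled blob columns into string maps before further processing
val raw = CassandraHelper.cassandraTable(sc, "my_keyspace", "my_table")
val decoded = MapStringStringRowMapping.mappedRDD(raw)
decoded.take(5).foreach(println)
```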
data/ext/scala_helper/project/plugins.sbt
ADDED
@@ -0,0 +1,6 @@

resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
resolvers += "Spray Repository" at "http://repo.spray.cc/"
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")