cassandra_model_spark 0.0.1.5-java → 0.0.4-java

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 15373fcc369058da2966c722c6b35ac0c164b073
-  data.tar.gz: d840cc0cd750dea5d198f37ced59608340cced97
+  metadata.gz: dc8ac871e123f5a2118c4498e398b4923f81e07e
+  data.tar.gz: f954e055c2741a44c14bef9bd851a3db8d21d476
 SHA512:
-  metadata.gz: c356202b4c8fc59c8936d4af43ad844a0f70895ee09a5453741354cb957c179486fc7491ec28ad8ef5445f742063a9a86897031e0feaae890b9e886d4a0422cf
-  data.tar.gz: a0045c5c883ba0148482c97fbe293b303c30e81289be36c83eca45f3fdebe34429f23d1305f6377b250902186837bf5bfd7d8f2e58d4c5a37edb3dfc220c55ac
+  metadata.gz: 6fd1d8508b7807334f6245e0ee7d7c89476aad9f384e42e02d52661209f851e0a8a7fce5f73448acb6d86017fb96a8b2818cc8b141ec7db685e7101b96696c7f
+  data.tar.gz: a218ec4c41c0cbeff174154498e65028c0858b4dc4d269ab3082103acbb84bdcf8bf138bd67ca0ba822622cddf752c857191d4325cf631872e971e1822333e22
@@ -6,13 +6,14 @@ name := "cmodel_scala_helper"
 version := "0.0.1"
 scalaVersion := "2.10.4"
 
-val sparkVersion = "1.5.2"
+val sparkVersion = "1.6.1"
 val hadoopClientVersion = "1.0.4"
-val cassandraConnectionVersion = "1.5.0-M3"
+val cassandraConnectionVersion = "1.6.0-M1"
 
 val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", "target")
 
 javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
+scalacOptions ++= Seq("-feature")
 
 artifactPath in Compile in packageBin := file(s"${_targetDir}/cmodel_scala_helper.jar")
 outputPath in packageDependency := file(s"${_targetDir}/spark-assembly-${sparkVersion}-cassandra_model-hadoop${hadoopClientVersion}.jar")
@@ -30,6 +31,8 @@ libraryDependencies ++= Seq(
   "com.datastax.spark" %% "spark-cassandra-connector" % cassandraConnectionVersion,
   "com.datastax.spark" %% "spark-cassandra-connector-java" % cassandraConnectionVersion,
   "com.github.fommil.netlib" % "all" % "1.1.2",
+  "com.databricks" % "spark-csv_2.10" % "1.3.0",
+  "org.luaj" % "luaj-jse" % "3.0.1",
   "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
 )
 
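
The two new dependencies back features added in this release: spark-csv for CSV DataFrame support and luaj for the Lua-scripted transforms in the new LuaRDD class further down. As a rough illustration of what pulling in spark-csv 1.3.0 enables on Spark 1.6 (the path and options below are invented, not taken from the gem):

    // hypothetical read via the spark-csv data source, assuming an existing SQLContext `sqlContext`
    val csv_df = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .load("data/example.csv")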
@@ -10,11 +10,12 @@ object CassandraHelper {
   def cassandraTable(sc: SparkContext, keyspace: String, table: String) = {
     sc.cassandraTable(keyspace, table)
   }
+
   def filterRDD(rdd: CassandraRDD[CassandraRow], restriction: HashMap[String, Any]) = {
     var result = rdd
-    for ((k,v) <- restriction) {
+    for ((k, v) <- restriction) {
       result = v match {
-        case (a: Array[Any]) => result.where(k, a : _*)
+        case (a: Array[Any]) => result.where(k, a: _*)
         case _ => result.where(k, v)
       }
     }
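
For context, filterRDD folds a map of CQL fragments and bind values into chained where calls on the connector's CassandraRDD; an Array value is splatted into the varargs, so one fragment can carry several bind parameters. A hedged sketch, assuming the helper's HashMap is scala.collection.mutable.HashMap and using made-up column names and a previously obtained CassandraRDD[CassandraRow] named raw_rows:

    val restriction = HashMap[String, Any](
      "partition_key = ?" -> "some_key",          // single bind value
      "cluster_key IN (?, ?)" -> Array("a", "b")  // Array is expanded into the two placeholders
    )
    val filtered = CassandraHelper.filterRDD(raw_rows, restriction)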
@@ -0,0 +1,28 @@
+package org.apache.spark.api.cassandra_model
+
+import org.apache.spark.rdd._
+import com.datastax.spark.connector._
+import com.datastax.spark.connector.rdd._
+
+object ColumnDeserializer {
+  def mappedRDD(rdd: RDD[CassandraRow], column: Int): RDD[CassandraRow] = {
+    rdd.map(updatedRow(_, column))
+  }
+
+  private def updatedRow(row: CassandraRow, column: Int): CassandraRow =
+  {
+    val columns = row.columnNames
+    val updated_value = getDecodedValue(row, column)
+    val values = row.columnValues.updated(column, updated_value)
+
+    new CassandraRow(columns, values)
+  }
+
+  private def getDecodedValue(row: CassandraRow, column: Int): AnyRef = row.columnValues(column) match {
+    case (blob: Array[Byte]) => decodeValue(blob)
+  }
+
+  private def decodeValue(blob: Array[Byte]): AnyRef = {
+    new MarshalLoader(blob).getValue()
+  }
+}
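
ColumnDeserializer is new in this release. A rough usage sketch (the keyspace, table and column index are invented; MarshalLoader, which performs the actual blob decoding, is defined elsewhere in this package):

    val raw_rows = CassandraHelper.cassandraTable(sc, "my_keyspace", "my_table")
    // column 2 is assumed to hold a serialized blob; every other column passes through untouched
    val decoded_rows = ColumnDeserializer.mappedRDD(raw_rows, 2)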
@@ -4,6 +4,7 @@ import org.apache.spark.sql.types._
 
 object DataTypeHelper {
   def getArrayType(key_type: DataType) = ArrayType(key_type)
+  def getArrayType = ArrayType
   def getBinaryType = BinaryType
   def getBooleanType = BooleanType
   def getByteType = ByteType
@@ -16,6 +17,7 @@ object DataTypeHelper {
   def getIntegerType = IntegerType
   def getLongType = LongType
   def getMapType(key_type: DataType, value_type: DataType) = MapType(key_type, value_type)
+  def getMapType = MapType
   def getMetadata = Metadata
   def getNullType = NullType
   def getPrecisionInfo = PrecisionInfo
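
The new zero-argument variants return the ArrayType and MapType companion objects rather than an already-constructed type, presumably so a caller (for example the Ruby side over JRuby) can parameterise collection types itself. A sketch of what that allows, under that assumption:

    val array_factory: ArrayType.type = DataTypeHelper.getArrayType
    val int_array_type = array_factory(DataTypeHelper.getIntegerType)   // ArrayType(IntegerType, containsNull = true)
    val map_factory: MapType.type = DataTypeHelper.getMapType
    val long_to_int = map_factory(DataTypeHelper.getLongType, DataTypeHelper.getIntegerType)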
@@ -0,0 +1,352 @@
+package org.apache.spark.api.cassandra_model
+
+import org.luaj.vm2.Globals
+import org.luaj.vm2.compiler.LuaC
+import org.luaj.vm2.compiler.DumpState
+import org.luaj.vm2._
+import org.luaj.vm2.lib.jse.JseBaseLib
+import org.luaj.vm2.lib._
+import org.apache.spark.rdd._
+import org.apache.spark.sql._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.catalyst.expressions.GenericRow
+import scala.reflect.ClassTag
+import java.io._
+import java.security.MessageDigest
+
+object LuaRowValue {
+  def luaTableToArray[T](table: LuaTable)(implicit m: ClassTag[T]): Array[T] = {
+    val keys = table.keys()
+    val length = keys.length
+    val result = new Array[T](length)
+    var index = 0
+
+    keys.foreach { table_key =>
+      val value = table.get(table_key)
+      val result_value = value match {
+        case str: LuaString => str.toString()
+        case num: LuaInteger => num.toint()
+        case dfnum: LuaDouble => dfnum.todouble()
+        case inner_table: LuaTable => luaTableToArray[T](inner_table)
+        case inner_row: LuaRowValue => inner_row.row
+      }
+      result(index) = result_value match {
+        case t_value: T => t_value
+      }
+      index += 1
+    }
+    result
+  }
+
+  def luaTableToRow(table: LuaTable): Row = {
+    val row: Array[Any] = luaTableToArray(table)
+    new GenericRow(row)
+  }
+}
+
+class LuaRowValue(val schema: StructType, val row: Row) extends LuaValue {
+  def `type`(): Int = 999
+
+  def typename(): String = "Row"
+
+  override def tostring() = LuaValue.valueOf(row.toString())
+
+  override def toString() = row.toString()
+
+  override def get(key: LuaValue): LuaValue = {
+    val column_index = schema.fieldIndex(key.toString())
+    val field = schema(column_index)
+    valueOf(field.dataType, column_index)
+  }
+
+  override def get(column_index: Int): LuaValue = {
+    val field = schema(column_index)
+    valueOf(field.dataType, column_index)
+  }
+
+  private def valueOf(data_type: DataType, index: Int): LuaValue = {
+    data_type match {
+      case StringType => LuaValue.valueOf(row.getString(index))
+      case IntegerType => LuaValue.valueOf(row.getInt(index))
+      case FloatType => LuaValue.valueOf(row.getFloat(index))
+      case DoubleType => LuaValue.valueOf(row.getDouble(index))
+      case ArrayType(StringType, true) => arrayValueOf[String](index)
+      case ArrayType(IntegerType, true) => arrayValueOf[Int](index)
+      case ArrayType(FloatType, true) => arrayValueOf[Float](index)
+      case ArrayType(DoubleType, true) => arrayValueOf[Double](index)
+      case array_type: ArrayType => objectArrayValueOf(array_type, index)
+      case inner_schema: StructType => new LuaRowValue(inner_schema, row.getAs[Row](index))
+    }
+  }
+
+  private def objectArrayValueOf(array_type: ArrayType, index: Int): LuaValue = {
+    array_type.elementType match {
+      case inner_schema: StructType => rowArrayValueOf(inner_schema, index)
+    }
+  }
+
+  private def rowArrayValueOf(inner_schema: StructType, index: Int): LuaValue = {
+    val values: Array[LuaValue] = row.getAs[Array[Row]](index).map(new LuaRowValue(inner_schema, _)).toArray
+    new LuaTable(null, values, null)
+  }
+
+  private def arrayValueOf[T](index: Int)(implicit m: ClassTag[T]): LuaValue = {
+    val values: Array[LuaValue] = row.getAs[Array[T]](index).map {
+      _ match {
+        case str: String => LuaValue.valueOf(str)
+        case num: Int => LuaValue.valueOf(num)
+        case fnum: Float => LuaValue.valueOf(fnum)
+        case dfnum: Double => LuaValue.valueOf(dfnum)
+      }
+    }.toArray
+    new LuaTable(null, values, null)
+  }
+}
+
+class PartitionableStringArray(val items: Array[String]) extends Serializable {
+  override val hashCode = {
+    val some_prime = 31
+    var result = 1
+
+    for (str <- items) {
+      result = result * some_prime + str.hashCode
+    }
+    result
+  }
+
+  override def equals(rhs: Any) = {
+    rhs match {
+      case string_array: PartitionableStringArray => items.sameElements(string_array.items)
+      case _ => false
+    }
+  }
+}
+
+object LuaRDD {
+  private val thread_local_globals = new ThreadLocal[Globals]
+  private val digest = MessageDigest.getInstance("SHA-1")
+
+  def getGlobals(): Globals = thread_local_globals.get()
+
+  def newGlobals(): Globals = {
+    val globals = new Globals()
+
+    LuaC.install(globals)
+    LoadState.install(globals)
+    globals.load(new JseBaseLib())
+    globals.load(new PackageLib())
+    globals.load(new TableLib())
+    globals.load(new StringLib())
+    globals.load(new LuaRowLib())
+
+    thread_local_globals.set(globals)
+    globals
+  }
+
+  def getGlobalsOrNew(): Globals = {
+    var globals = getGlobals()
+    if (globals == null)
+      globals = newGlobals()
+    globals
+  }
+
+  def getLuaCodeDigest(lua_code: String) = {
+    val hash_bytes = digest.digest(lua_code.getBytes())
+    new String(hash_bytes)
+  }
+}
+
+class LuaRDD(val schema: StructType, val rdd: RDD[Row]) extends Serializable {
+
+  private class LuaMetaData(val name: String, val byte_code: Array[Byte]) extends Serializable
+
+  def map(new_schema: StructType, lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.map(callMapScript(lua_byte_code, _))
+    new LuaRDD(new_schema, new_rdd)
+  }
+
+  def flatMap(new_schema: StructType, lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.flatMap(callFlatMapScript(lua_byte_code, _))
+    new LuaRDD(new_schema, new_rdd)
+  }
+
+  def filter(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.filter(callFilterScript(lua_byte_code, _))
+    new LuaRDD(schema, new_rdd)
+  }
+
+  def reduceByKeys(key_columns: Array[String], lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val field_indices = key_columns.map(schema.fieldIndex(_))
+    val keys_rdd: RDD[Tuple2[Any, Row]] = rdd.map { case row =>
+      val keys: Seq[Any] = field_indices.map(row(_))
+      Tuple2(keys, row)
+    }
+    val reduced_rdd: RDD[Tuple2[Any, Row]] = keys_rdd.reduceByKey { case (lhs, rhs) =>
+      callReduceScript(lua_byte_code, lhs, rhs)
+    }
+    val new_rdd = reduced_rdd.map(_._2)
+    new LuaRDD(schema, new_rdd)
+  }
+
+  def groupByString(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(StringType)
+    val new_rdd = rdd.groupBy(callGroupByStringScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByStringArray(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(ArrayType(StringType))
+    val pre_rdd = rdd.groupBy(callGroupByStringArrayScript(lua_byte_code, _))
+    val new_rdd: RDD[(Array[String], Iterable[Row])] = pre_rdd.map { case (key, values) =>
+      (key.items, values)
+    }
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByInt(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(IntegerType)
+    val new_rdd = rdd.groupBy(callGroupByIntScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByFloat(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(FloatType)
+    val new_rdd = rdd.groupBy(callGroupByFloatScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByDouble(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(DoubleType)
+    val new_rdd = rdd.groupBy(callGroupByDoubleScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def toDF(sql_context: SQLContext) = sql_context.createDataFrame(rdd, schema)
+
+  private def getLuaByteCode(lua_code: String) = {
+    val output_stream = new ByteArrayOutputStream()
+    val name = LuaRDD.getLuaCodeDigest(lua_code)
+    val prototype = LuaC.instance.compile(new ByteArrayInputStream(lua_code.getBytes()), name)
+    val success = DumpState.dump(prototype, output_stream, true)
+
+    output_stream.flush()
+    success match {
+      case 0 => new LuaMetaData(name, output_stream.toByteArray())
+    }
+  }
+
+  private def groupBySchema(data_type: DataType): StructType = {
+    val fields = Array(StructField("key", data_type), StructField("values", ArrayType(schema)))
+    StructType(fields)
+  }
+
+  private def groupedRDD[T](rdd: RDD[(T, Iterable[Row])]): RDD[Row] = {
+    rdd.map { case (key, values) =>
+      val row: Array[Any] = Array(key, values.toArray)
+      new GenericRow(row)
+    }
+  }
+
+  private def callScript(lua_byte_code: LuaMetaData, row: Row): LuaValue = {
+    val globals = LuaRDD.getGlobalsOrNew()
+    globals.set("ROW", new LuaRowValue(schema, row))
+
+    loadAndCallChunk(globals, lua_byte_code)
+  }
+
+  private def callPairScript(lua_byte_code: LuaMetaData, lhs: Row, rhs: Row): LuaValue = {
+    val globals = LuaRDD.getGlobalsOrNew()
+    globals.set("LHS", new LuaRowValue(schema, lhs))
+    globals.set("RHS", new LuaRowValue(schema, rhs))
+
+    loadAndCallChunk(globals, lua_byte_code)
+  }
+
+  private def loadAndCallChunk(globals: Globals, lua_byte_code: LuaMetaData): LuaValue = {
+    val prototype = globals.loadPrototype(new ByteArrayInputStream(lua_byte_code.byte_code), lua_byte_code.name, "b")
+    val chunk = new LuaClosure(prototype, globals)
+    chunk.call()
+  }
+
+  private def callMapScript(lua_byte_code: LuaMetaData, row: Row): Row = {
+    callScript(lua_byte_code, row) match {
+      case row: LuaRowValue => row.row
+      case table: LuaTable => LuaRowValue.luaTableToRow(table)
+    }
+  }
+
+  private def callFlatMapScript(lua_byte_code: LuaMetaData, row: Row) = {
+    callScript(lua_byte_code, row) match {
+      case list: LuaTable => {
+        (1 to list.length).map {
+          index: Int => list.get(index) match {
+            case row: LuaRowValue => row.row
+          }
+        }
+      }
+    }
+  }
+
+  private def callFilterScript(lua_byte_code: LuaMetaData, row: Row): Boolean = {
+    callScript(lua_byte_code, row) match {
+      case bool: LuaBoolean => bool.toboolean()
+    }
+  }
+
+  private def callReduceScript(lua_byte_code: LuaMetaData, lhs: Row, rhs: Row): Row = {
+    callPairScript(lua_byte_code, lhs, rhs) match {
+      case row: LuaRowValue => row.row
+      case table: LuaTable => LuaRowValue.luaTableToRow(table)
+    }
+  }
+
+  private def callGroupByStringScript(lua_byte_code: LuaMetaData, row: Row): String = {
+    callScript(lua_byte_code, row) match {
+      case str: LuaString => str.toString()
+    }
+  }
+
+  private def callGroupByStringArrayScript(lua_byte_code: LuaMetaData, row: Row): PartitionableStringArray = {
+    callScript(lua_byte_code, row) match {
+      case table: LuaTable => new PartitionableStringArray(LuaRowValue.luaTableToArray(table))
+    }
+  }
+
+  private def callGroupByIntScript(lua_byte_code: LuaMetaData, row: Row): Int = {
+    callScript(lua_byte_code, row) match {
+      case num: LuaInteger => num.toint()
+    }
+  }
+
+  private def callGroupByFloatScript(lua_byte_code: LuaMetaData, row: Row): Float = {
+    callScript(lua_byte_code, row) match {
+      case fnum: LuaDouble => fnum.tofloat()
+    }
+  }
+
+  private def callGroupByDoubleScript(lua_byte_code: LuaMetaData, row: Row): Double = {
+    callScript(lua_byte_code, row) match {
+      case fnum: LuaDouble => fnum.todouble()
+    }
+  }
+
+}
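
A minimal end-to-end sketch of the new LuaRDD wrapper (the DataFrame people_df, its column names, sqlContext and the Lua snippets are all invented for illustration; ROW is the global bound for map/filter/group scripts, LHS and RHS for reduce scripts):

    val people = new LuaRDD(people_df.schema, people_df.rdd)

    // filter scripts must return a boolean
    val adults = people.filter("return ROW['age'] >= 18")

    // map scripts may return a Lua table, which is converted back into a Row matching the new schema
    val name_schema = StructType(Array(StructField("name", StringType)))
    val names = adults.map(name_schema, "return {ROW['name']}")

    val names_df = names.toDF(sqlContext)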