cassandra_model_spark 0.0.1.5 → 0.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 27a73c07406303279c26f7e8f112be77e17c4d1b
-  data.tar.gz: 946084377df9d9ed40cc37e6feb9a9fedc15fa79
+  metadata.gz: f87155a3779ae7fa719324eac70433d9b257faa2
+  data.tar.gz: 0a961edb11624f11878e96e3150c23509084be39
 SHA512:
-  metadata.gz: 06d57399fb89e4d0252ab1abbbe086e53904332afd25c66fd59ab18bd4d0e2a24a8eb14a7cc65f8c302817414e8afd8999818a74523f4e42c6d55b794b938e6c
-  data.tar.gz: 5462ff873fbb73f82d7424a989e7377b5e1826a54a8e234fe3ee732bff4936d3ed3646f256ce71f3bd189878c4941f38d3ff1c5845bc7b75ba318a992301cedf
+  metadata.gz: 8212b56b4446b52625c9655bebd43551b24d0d7e43d53b97237fb21ef3355b4f32e8bacd3778f80b11804b9c2b75a73d2462ff780665656da22358f7aa973182
+  data.tar.gz: 40b28f31ae459c321c02c7aaca048ff992901c16097c554ae7de55f8bec5f9f0342e6c98704a5ec34d42851d32961cdb21a2eafb48b7f388ebf0e93df31ee01c
build.sbt CHANGED
@@ -6,13 +6,14 @@ name := "cmodel_scala_helper"
 version := "0.0.1"
 scalaVersion := "2.10.4"
 
-val sparkVersion = "1.5.2"
+val sparkVersion = "1.6.1"
 val hadoopClientVersion = "1.0.4"
-val cassandraConnectionVersion = "1.5.0-M3"
+val cassandraConnectionVersion = "1.6.0-M1"
 
 val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", "target")
 
 javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
+scalacOptions ++= Seq("-feature")
 
 artifactPath in Compile in packageBin := file(s"${_targetDir}/cmodel_scala_helper.jar")
 outputPath in packageDependency := file(s"${_targetDir}/spark-assembly-${sparkVersion}-cassandra_model-hadoop${hadoopClientVersion}.jar")
@@ -30,6 +31,8 @@ libraryDependencies ++= Seq(
   "com.datastax.spark" %% "spark-cassandra-connector" % cassandraConnectionVersion,
   "com.datastax.spark" %% "spark-cassandra-connector-java" % cassandraConnectionVersion,
   "com.github.fommil.netlib" % "all" % "1.1.2",
+  "com.databricks" % "spark-csv_2.10" % "1.3.0",
+  "org.luaj" % "luaj-jse" % "3.0.1",
   "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
 )
 
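Note: besides the Spark 1.5.2 → 1.6.1 and connector 1.5.0-M3 → 1.6.0-M1 bumps, this release pulls in spark-csv and LuaJ (the latter backs the new LuaRDD file below). As a rough sketch of what the spark-csv dependency enables on Spark 1.6 (illustrative only; the input path is hypothetical, not from this gem):

    import org.apache.spark.sql.SQLContext

    val sqlContext = new SQLContext(sc)  // assumes an existing SparkContext `sc`
    // spark-csv registers the "com.databricks.spark.csv" data source
    val df = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .load("events.csv")  // hypothetical input file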
CassandraHelper.scala CHANGED
@@ -10,11 +10,12 @@ object CassandraHelper {
   def cassandraTable(sc: SparkContext, keyspace: String, table: String) = {
     sc.cassandraTable(keyspace, table)
   }
+
   def filterRDD(rdd: CassandraRDD[CassandraRow], restriction: HashMap[String, Any]) = {
     var result = rdd
-    for ((k,v) <- restriction) {
+    for ((k, v) <- restriction) {
       result = v match {
-        case (a: Array[Any]) => result.where(k, a : _*)
+        case (a: Array[Any]) => result.where(k, a: _*)
         case _ => result.where(k, v)
       }
     }
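Note: filterRDD folds each restriction entry into a where() call on the CassandraRDD, spreading Array values as positional bind parameters. A minimal usage sketch (keyspace, table, and clauses are illustrative; it also assumes the HashMap imported by CassandraHelper is scala.collection.mutable.HashMap):

    import scala.collection.mutable.HashMap

    val rdd = CassandraHelper.cassandraTable(sc, "my_keyspace", "events")
    val restriction = HashMap[String, Any](
      "bucket = ?" -> 3,                    // single value -> where("bucket = ?", 3)
      "id IN (?, ?, ?)" -> Array(1, 2, 3)   // array -> where("id IN (?, ?, ?)", 1, 2, 3)
    )
    val filtered = CassandraHelper.filterRDD(rdd, restriction)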
ColumnDeserializer.scala ADDED
@@ -0,0 +1,28 @@
+package org.apache.spark.api.cassandra_model
+
+import org.apache.spark.rdd._
+import com.datastax.spark.connector._
+import com.datastax.spark.connector.rdd._
+
+object ColumnDeserializer {
+  def mappedRDD(rdd: RDD[CassandraRow], column: Int): RDD[CassandraRow] = {
+    rdd.map(updatedRow(_, column))
+  }
+
+  private def updatedRow(row: CassandraRow, column: Int): CassandraRow =
+  {
+    val columns = row.columnNames
+    val updated_value = getDecodedValue(row, column)
+    val values = row.columnValues.updated(column, updated_value)
+
+    new CassandraRow(columns, values)
+  }
+
+  private def getDecodedValue(row: CassandraRow, column: Int): AnyRef = row.columnValues(column) match {
+    case (blob: Array[Byte]) => decodeValue(blob)
+  }
+
+  private def decodeValue(blob: Array[Byte]): AnyRef = {
+    new MarshalLoader(blob).getValue()
+  }
+}
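Note: ColumnDeserializer rewrites a single blob column of each CassandraRow through MarshalLoader (defined elsewhere in the gem, presumably its Ruby Marshal decoder), and throws a MatchError if the column is not an Array[Byte]. An illustrative call (column index and table names are hypothetical):

    // Decode the marshalled blob stored at column index 2 of every row.
    val raw = CassandraHelper.cassandraTable(sc, "my_keyspace", "events")
    val decoded = ColumnDeserializer.mappedRDD(raw, 2)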
DataTypeHelper.scala CHANGED
@@ -4,6 +4,7 @@ import org.apache.spark.sql.types._
 
 object DataTypeHelper {
   def getArrayType(key_type: DataType) = ArrayType(key_type)
+  def getArrayType = ArrayType
   def getBinaryType = BinaryType
   def getBooleanType = BooleanType
   def getByteType = ByteType
@@ -16,6 +17,7 @@ object DataTypeHelper {
   def getIntegerType = IntegerType
   def getLongType = LongType
   def getMapType(key_type: DataType, value_type: DataType) = MapType(key_type, value_type)
+  def getMapType = MapType
   def getMetadata = Metadata
   def getNullType = NullType
   def getPrecisionInfo = PrecisionInfo
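Note: the new parameterless overloads return the ArrayType and MapType companion objects themselves rather than a constructed type, so a caller (e.g. the Ruby side via JRuby) can apply them with whichever arity it needs. A sketch of the difference (illustrative only):

    import org.apache.spark.sql.types._

    // Parameterized helper: builds the type directly.
    val strings = DataTypeHelper.getArrayType(StringType)   // ArrayType(StringType, true)

    // Parameterless overload: hands back the companion object,
    // leaving the choice of apply() arity to the caller.
    val companion: ArrayType.type = DataTypeHelper.getArrayType
    val nonNullStrings = companion(StringType, false)       // ArrayType(StringType, false)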
LuaRDD.scala ADDED
@@ -0,0 +1,352 @@
+package org.apache.spark.api.cassandra_model
+
+import org.luaj.vm2.Globals
+import org.luaj.vm2.compiler.LuaC
+import org.luaj.vm2.compiler.DumpState
+import org.luaj.vm2._
+import org.luaj.vm2.lib.jse.JseBaseLib
+import org.luaj.vm2.lib._
+import org.apache.spark.rdd._
+import org.apache.spark.sql._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.catalyst.expressions.GenericRow
+import scala.reflect.ClassTag
+import java.io._
+import java.security.MessageDigest
+
+object LuaRowValue {
+  def luaTableToArray[T](table: LuaTable)(implicit m: ClassTag[T]): Array[T] = {
+    val keys = table.keys()
+    val length = keys.length
+    val result = new Array[T](length)
+    var index = 0
+
+    keys.foreach { table_key =>
+      val value = table.get(table_key)
+      val result_value = value match {
+        case str: LuaString => str.toString()
+        case num: LuaInteger => num.toint()
+        case dfnum: LuaDouble => dfnum.todouble()
+        case inner_table: LuaTable => luaTableToArray[T](inner_table)
+        case inner_row: LuaRowValue => inner_row.row
+      }
+      result(index) = result_value match {
+        case t_value: T => t_value
+      }
+      index += 1
+    }
+    result
+  }
+
+  def luaTableToRow(table: LuaTable): Row = {
+    val row: Array[Any] = luaTableToArray(table)
+    new GenericRow(row)
+  }
+}
+
+class LuaRowValue(val schema: StructType, val row: Row) extends LuaValue {
+  def `type`(): Int = 999
+
+  def typename(): String = "Row"
+
+  override def tostring() = LuaValue.valueOf(row.toString())
+
+  override def toString() = row.toString()
+
+  override def get(key: LuaValue): LuaValue = {
+    val column_index = schema.fieldIndex(key.toString())
+    val field = schema(column_index)
+    valueOf(field.dataType, column_index)
+  }
+
+  override def get(column_index: Int): LuaValue = {
+    val field = schema(column_index)
+    valueOf(field.dataType, column_index)
+  }
+
+  private def valueOf(data_type: DataType, index: Int): LuaValue = {
+    data_type match {
+      case StringType => LuaValue.valueOf(row.getString(index))
+      case IntegerType => LuaValue.valueOf(row.getInt(index))
+      case FloatType => LuaValue.valueOf(row.getFloat(index))
+      case DoubleType => LuaValue.valueOf(row.getDouble(index))
+      case ArrayType(StringType, true) => arrayValueOf[String](index)
+      case ArrayType(IntegerType, true) => arrayValueOf[Int](index)
+      case ArrayType(FloatType, true) => arrayValueOf[Float](index)
+      case ArrayType(DoubleType, true) => arrayValueOf[Double](index)
+      case array_type: ArrayType => objectArrayValueOf(array_type, index)
+      case inner_schema: StructType => new LuaRowValue(inner_schema, row.getAs[Row](index))
+    }
+  }
+
+  private def objectArrayValueOf(array_type: ArrayType, index: Int): LuaValue = {
+    array_type.elementType match {
+      case inner_schema: StructType => rowArrayValueOf(inner_schema, index)
+    }
+  }
+
+  private def rowArrayValueOf(inner_schema: StructType, index: Int): LuaValue = {
+    val values: Array[LuaValue] = row.getAs[Array[Row]](index).map(new LuaRowValue(inner_schema, _)).toArray
+    new LuaTable(null, values, null)
+  }
+
+  private def arrayValueOf[T](index: Int)(implicit m: ClassTag[T]): LuaValue = {
+    val values: Array[LuaValue] = row.getAs[Array[T]](index).map {
+      _ match {
+        case str: String => LuaValue.valueOf(str)
+        case num: Int => LuaValue.valueOf(num)
+        case fnum: Float => LuaValue.valueOf(fnum)
+        case dfnum: Double => LuaValue.valueOf(dfnum)
+      }
+    }.toArray
+    new LuaTable(null, values, null)
+  }
+}
+
+class PartitionableStringArray(val items: Array[String]) extends Serializable {
+  override val hashCode = {
+    val some_prime = 31
+    var result = 1
+
+    for (str <- items) {
+      result = result * some_prime + str.hashCode
+    }
+    result
+  }
+
+  override def equals(rhs: Any) = {
+    rhs match {
+      case string_array: PartitionableStringArray => items.sameElements(string_array.items)
+      case _ => false
+    }
+  }
+}
+
+object LuaRDD {
+  private val thread_local_globals = new ThreadLocal[Globals]
+  private val digest = MessageDigest.getInstance("SHA-1")
+
+  def getGlobals(): Globals = thread_local_globals.get()
+
+  def newGlobals(): Globals = {
+    val globals = new Globals()
+
+    LuaC.install(globals)
+    LoadState.install(globals)
+    globals.load(new JseBaseLib())
+    globals.load(new PackageLib())
+    globals.load(new TableLib())
+    globals.load(new StringLib())
+    globals.load(new LuaRowLib())
+
+    thread_local_globals.set(globals)
+    globals
+  }
+
+  def getGlobalsOrNew(): Globals = {
+    var globals = getGlobals()
+    if (globals == null)
+      globals = newGlobals()
+    globals
+  }
+
+  def getLuaCodeDigest(lua_code: String) = {
+    val hash_bytes = digest.digest(lua_code.getBytes())
+    new String(hash_bytes)
+  }
+}
+
+class LuaRDD(val schema: StructType, val rdd: RDD[Row]) extends Serializable {
+
+  private class LuaMetaData(val name: String, val byte_code: Array[Byte]) extends Serializable
+
+  def map(new_schema: StructType, lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.map(callMapScript(lua_byte_code, _))
+    new LuaRDD(new_schema, new_rdd)
+  }
+
+  def flatMap(new_schema: StructType, lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.flatMap(callFlatMapScript(lua_byte_code, _))
+    new LuaRDD(new_schema, new_rdd)
+  }
+
+  def filter(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.filter(callFilterScript(lua_byte_code, _))
+    new LuaRDD(schema, new_rdd)
+  }
+
+  def reduceByKeys(key_columns: Array[String], lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val field_indices = key_columns.map(schema.fieldIndex(_))
+    val keys_rdd: RDD[Tuple2[Any, Row]] = rdd.map { case row =>
+      val keys: Seq[Any] = field_indices.map(row(_))
+      Tuple2(keys, row)
+    }
+    val reduced_rdd: RDD[Tuple2[Any, Row]] = keys_rdd.reduceByKey { case (lhs, rhs) =>
+      callReduceScript(lua_byte_code, lhs, rhs)
+    }
+    val new_rdd = reduced_rdd.map(_._2)
+    new LuaRDD(schema, new_rdd)
+  }
+
+  def groupByString(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(StringType)
+    val new_rdd = rdd.groupBy(callGroupByStringScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByStringArray(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(ArrayType(StringType))
+    val pre_rdd = rdd.groupBy(callGroupByStringArrayScript(lua_byte_code, _))
+    val new_rdd: RDD[(Array[String], Iterable[Row])] = pre_rdd.map { case (key, values) =>
+      (key.items, values)
+    }
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByInt(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(IntegerType)
+    val new_rdd = rdd.groupBy(callGroupByIntScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByFloat(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(FloatType)
+    val new_rdd = rdd.groupBy(callGroupByFloatScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByDouble(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(DoubleType)
+    val new_rdd = rdd.groupBy(callGroupByDoubleScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def toDF(sql_context: SQLContext) = sql_context.createDataFrame(rdd, schema)
+
+  private def getLuaByteCode(lua_code: String) = {
+    val output_stream = new ByteArrayOutputStream()
+    val name = LuaRDD.getLuaCodeDigest(lua_code)
+    val prototype = LuaC.instance.compile(new ByteArrayInputStream(lua_code.getBytes()), name)
+    val success = DumpState.dump(prototype, output_stream, true)
+
+    output_stream.flush()
+    success match {
+      case 0 => new LuaMetaData(name, output_stream.toByteArray())
+    }
+  }
+
+  private def groupBySchema(data_type: DataType): StructType = {
+    val fields = Array(StructField("key", data_type), StructField("values", ArrayType(schema)))
+    StructType(fields)
+  }
+
+  private def groupedRDD[T](rdd: RDD[(T, Iterable[Row])]): RDD[Row] = {
+    rdd.map { case (key, values) =>
+      val row: Array[Any] = Array(key, values.toArray)
+      new GenericRow(row)
+    }
+  }
+
+  private def callScript(lua_byte_code: LuaMetaData, row: Row): LuaValue = {
+    val globals = LuaRDD.getGlobalsOrNew()
+    globals.set("ROW", new LuaRowValue(schema, row))
+
+    loadAndCallChunk(globals, lua_byte_code)
+  }
+
+  private def callPairScript(lua_byte_code: LuaMetaData, lhs: Row, rhs: Row): LuaValue = {
+    val globals = LuaRDD.getGlobalsOrNew()
+    globals.set("LHS", new LuaRowValue(schema, lhs))
+    globals.set("RHS", new LuaRowValue(schema, rhs))
+
+    loadAndCallChunk(globals, lua_byte_code)
+  }
+
+  private def loadAndCallChunk(globals: Globals, lua_byte_code: LuaMetaData): LuaValue = {
+    val prototype = globals.loadPrototype(new ByteArrayInputStream(lua_byte_code.byte_code), lua_byte_code.name, "b")
+    val chunk = new LuaClosure(prototype, globals)
+    chunk.call()
+  }
+
+  private def callMapScript(lua_byte_code: LuaMetaData, row: Row): Row = {
+    callScript(lua_byte_code, row) match {
+      case row: LuaRowValue => row.row
+      case table: LuaTable => LuaRowValue.luaTableToRow(table)
+    }
+  }
+
+  private def callFlatMapScript(lua_byte_code: LuaMetaData, row: Row) = {
+    callScript(lua_byte_code, row) match {
+      case list: LuaTable => {
+        (1 to list.length).map {
+          index: Int => list.get(index) match {
+            case row: LuaRowValue => row.row
+          }
+        }
+      }
+    }
+  }
+
+  private def callFilterScript(lua_byte_code: LuaMetaData, row: Row): Boolean = {
+    callScript(lua_byte_code, row) match {
+      case bool: LuaBoolean => bool.toboolean()
+    }
+  }
+
+  private def callReduceScript(lua_byte_code: LuaMetaData, lhs: Row, rhs: Row): Row = {
+    callPairScript(lua_byte_code, lhs, rhs) match {
+      case row: LuaRowValue => row.row
+      case table: LuaTable => LuaRowValue.luaTableToRow(table)
+    }
+  }
+
+  private def callGroupByStringScript(lua_byte_code: LuaMetaData, row: Row): String = {
+    callScript(lua_byte_code, row) match {
+      case str: LuaString => str.toString()
+    }
+  }
+
+  private def callGroupByStringArrayScript(lua_byte_code: LuaMetaData, row: Row): PartitionableStringArray = {
+    callScript(lua_byte_code, row) match {
+      case table: LuaTable => new PartitionableStringArray(LuaRowValue.luaTableToArray(table))
+    }
+  }
+
+  private def callGroupByIntScript(lua_byte_code: LuaMetaData, row: Row): Int = {
+    callScript(lua_byte_code, row) match {
+      case num: LuaInteger => num.toint()
+    }
+  }
+
+  private def callGroupByFloatScript(lua_byte_code: LuaMetaData, row: Row): Float = {
+    callScript(lua_byte_code, row) match {
+      case fnum: LuaDouble => fnum.tofloat()
+    }
+  }
+
+  private def callGroupByDoubleScript(lua_byte_code: LuaMetaData, row: Row): Double = {
+    callScript(lua_byte_code, row) match {
+      case fnum: LuaDouble => fnum.todouble()
+    }
+  }
+
+}
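Note: LuaRDD compiles each Lua snippet once to bytecode (named by a SHA-1 digest of the source), ships the bytes with the closure, and re-hydrates them in a per-thread Globals on the executors; a script sees the current row as the global ROW (or LHS/RHS for reduce) and returns a row, table, boolean, or scalar depending on the operation. A usage sketch (schema, data, and scripts are illustrative; assumes a SparkContext `sc` and the gem's LuaRowLib, defined elsewhere, on the classpath):

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types._
    import org.apache.spark.api.cassandra_model.LuaRDD

    val schema = StructType(Array(
      StructField("name", StringType),
      StructField("score", IntegerType)
    ))
    val rows = sc.parallelize(Seq[Row](Row("a", 1), Row("b", 5)))
    val lua_rdd = new LuaRDD(schema, rows)

    // filter: the script must return a Lua boolean
    val high = lua_rdd.filter("return ROW['score'] > 3")
    // map: a returned Lua table is converted back into a Row
    val doubled = lua_rdd.map(schema, "return {ROW['name'], ROW['score'] * 2}")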