cassandra_model_spark 0.0.1.5-java → 0.0.4-java

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 15373fcc369058da2966c722c6b35ac0c164b073
-  data.tar.gz: d840cc0cd750dea5d198f37ced59608340cced97
+  metadata.gz: dc8ac871e123f5a2118c4498e398b4923f81e07e
+  data.tar.gz: f954e055c2741a44c14bef9bd851a3db8d21d476
 SHA512:
-  metadata.gz: c356202b4c8fc59c8936d4af43ad844a0f70895ee09a5453741354cb957c179486fc7491ec28ad8ef5445f742063a9a86897031e0feaae890b9e886d4a0422cf
-  data.tar.gz: a0045c5c883ba0148482c97fbe293b303c30e81289be36c83eca45f3fdebe34429f23d1305f6377b250902186837bf5bfd7d8f2e58d4c5a37edb3dfc220c55ac
+  metadata.gz: 6fd1d8508b7807334f6245e0ee7d7c89476aad9f384e42e02d52661209f851e0a8a7fce5f73448acb6d86017fb96a8b2818cc8b141ec7db685e7101b96696c7f
+  data.tar.gz: a218ec4c41c0cbeff174154498e65028c0858b4dc4d269ab3082103acbb84bdcf8bf138bd67ca0ba822622cddf752c857191d4325cf631872e971e1822333e22
@@ -6,13 +6,14 @@ name := "cmodel_scala_helper"
 version := "0.0.1"
 scalaVersion := "2.10.4"
 
-val sparkVersion = "1.5.2"
+val sparkVersion = "1.6.1"
 val hadoopClientVersion = "1.0.4"
-val cassandraConnectionVersion = "1.5.0-M3"
+val cassandraConnectionVersion = "1.6.0-M1"
 
 val _targetDir = scala.util.Properties.envOrElse("TARGET_DIR", "target")
 
 javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
+scalacOptions ++= Seq("-feature")
 
 artifactPath in Compile in packageBin := file(s"${_targetDir}/cmodel_scala_helper.jar")
 outputPath in packageDependency := file(s"${_targetDir}/spark-assembly-${sparkVersion}-cassandra_model-hadoop${hadoopClientVersion}.jar")
@@ -30,6 +31,8 @@ libraryDependencies ++= Seq(
   "com.datastax.spark" %% "spark-cassandra-connector" % cassandraConnectionVersion,
   "com.datastax.spark" %% "spark-cassandra-connector-java" % cassandraConnectionVersion,
   "com.github.fommil.netlib" % "all" % "1.1.2",
+  "com.databricks" % "spark-csv_2.10" % "1.3.0",
+  "org.luaj" % "luaj-jse" % "3.0.1",
   "org.scalatest" % "scalatest_2.10" % "2.2.1" % "test"
 )
 
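
The two new dependencies back features added in this release: spark-csv for CSV DataFrame support and luaj for the Lua-scripted transforms in the new LuaRDD class further down. As a rough illustration of what pulling in spark-csv 1.3.0 enables on Spark 1.6 (the path and options below are invented, not taken from the gem):

    // hypothetical read via the spark-csv data source, assuming an existing SQLContext `sqlContext`
    val csv_df = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .load("data/example.csv")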
@@ -10,11 +10,12 @@ object CassandraHelper {
   def cassandraTable(sc: SparkContext, keyspace: String, table: String) = {
     sc.cassandraTable(keyspace, table)
   }
+
   def filterRDD(rdd: CassandraRDD[CassandraRow], restriction: HashMap[String, Any]) = {
     var result = rdd
-    for ((k,v) <- restriction) {
+    for ((k, v) <- restriction) {
       result = v match {
-        case (a: Array[Any]) => result.where(k, a : _*)
+        case (a: Array[Any]) => result.where(k, a: _*)
         case _ => result.where(k, v)
       }
     }
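
For context, filterRDD folds a map of CQL fragments and bind values into chained where calls on the connector's CassandraRDD; an Array value is splatted into the varargs, so one fragment can carry several bind parameters. A hedged sketch, assuming the helper's HashMap is scala.collection.mutable.HashMap and using made-up column names and a previously obtained CassandraRDD[CassandraRow] named raw_rows:

    val restriction = HashMap[String, Any](
      "partition_key = ?" -> "some_key",          // single bind value
      "cluster_key IN (?, ?)" -> Array("a", "b")  // Array is expanded into the two placeholders
    )
    val filtered = CassandraHelper.filterRDD(raw_rows, restriction)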
@@ -0,0 +1,28 @@
+package org.apache.spark.api.cassandra_model
+
+import org.apache.spark.rdd._
+import com.datastax.spark.connector._
+import com.datastax.spark.connector.rdd._
+
+object ColumnDeserializer {
+  def mappedRDD(rdd: RDD[CassandraRow], column: Int): RDD[CassandraRow] = {
+    rdd.map(updatedRow(_, column))
+  }
+
+  private def updatedRow(row: CassandraRow, column: Int): CassandraRow =
+  {
+    val columns = row.columnNames
+    val updated_value = getDecodedValue(row, column)
+    val values = row.columnValues.updated(column, updated_value)
+
+    new CassandraRow(columns, values)
+  }
+
+  private def getDecodedValue(row: CassandraRow, column: Int): AnyRef = row.columnValues(column) match {
+    case (blob: Array[Byte]) => decodeValue(blob)
+  }
+
+  private def decodeValue(blob: Array[Byte]): AnyRef = {
+    new MarshalLoader(blob).getValue()
+  }
+}
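
ColumnDeserializer is new in this release. A rough usage sketch (the keyspace, table and column index are invented; MarshalLoader, which performs the actual blob decoding, is defined elsewhere in this package):

    val raw_rows = CassandraHelper.cassandraTable(sc, "my_keyspace", "my_table")
    // column 2 is assumed to hold a serialized blob; every other column passes through untouched
    val decoded_rows = ColumnDeserializer.mappedRDD(raw_rows, 2)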
@@ -4,6 +4,7 @@ import org.apache.spark.sql.types._
 
 object DataTypeHelper {
   def getArrayType(key_type: DataType) = ArrayType(key_type)
+  def getArrayType = ArrayType
   def getBinaryType = BinaryType
   def getBooleanType = BooleanType
   def getByteType = ByteType
@@ -16,6 +17,7 @@ object DataTypeHelper {
   def getIntegerType = IntegerType
   def getLongType = LongType
   def getMapType(key_type: DataType, value_type: DataType) = MapType(key_type, value_type)
+  def getMapType = MapType
   def getMetadata = Metadata
   def getNullType = NullType
   def getPrecisionInfo = PrecisionInfo
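
The new zero-argument variants return the ArrayType and MapType companion objects rather than an already-constructed type, presumably so a caller (for example the Ruby side over JRuby) can parameterise collection types itself. A sketch of what that allows, under that assumption:

    val array_factory: ArrayType.type = DataTypeHelper.getArrayType
    val int_array_type = array_factory(DataTypeHelper.getIntegerType)   // ArrayType(IntegerType, containsNull = true)
    val map_factory: MapType.type = DataTypeHelper.getMapType
    val long_to_int = map_factory(DataTypeHelper.getLongType, DataTypeHelper.getIntegerType)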
@@ -0,0 +1,352 @@
+package org.apache.spark.api.cassandra_model
+
+import org.luaj.vm2.Globals
+import org.luaj.vm2.compiler.LuaC
+import org.luaj.vm2.compiler.DumpState
+import org.luaj.vm2._
+import org.luaj.vm2.lib.jse.JseBaseLib
+import org.luaj.vm2.lib._
+import org.apache.spark.rdd._
+import org.apache.spark.sql._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.catalyst.expressions.GenericRow
+import scala.reflect.ClassTag
+import java.io._
+import java.security.MessageDigest
+
+object LuaRowValue {
+  def luaTableToArray[T](table: LuaTable)(implicit m: ClassTag[T]): Array[T] = {
+    val keys = table.keys()
+    val length = keys.length
+    val result = new Array[T](length)
+    var index = 0
+
+    keys.foreach { table_key =>
+      val value = table.get(table_key)
+      val result_value = value match {
+        case str: LuaString => str.toString()
+        case num: LuaInteger => num.toint()
+        case dfnum: LuaDouble => dfnum.todouble()
+        case inner_table: LuaTable => luaTableToArray[T](inner_table)
+        case inner_row: LuaRowValue => inner_row.row
+      }
+      result(index) = result_value match {
+        case t_value: T => t_value
+      }
+      index += 1
+    }
+    result
+  }
+
+  def luaTableToRow(table: LuaTable): Row = {
+    val row: Array[Any] = luaTableToArray(table)
+    new GenericRow(row)
+  }
+}
+
+class LuaRowValue(val schema: StructType, val row: Row) extends LuaValue {
+  def `type`(): Int = 999
+
+  def typename(): String = "Row"
+
+  override def tostring() = LuaValue.valueOf(row.toString())
+
+  override def toString() = row.toString()
+
+  override def get(key: LuaValue): LuaValue = {
+    val column_index = schema.fieldIndex(key.toString())
+    val field = schema(column_index)
+    valueOf(field.dataType, column_index)
+  }
+
+  override def get(column_index: Int): LuaValue = {
+    val field = schema(column_index)
+    valueOf(field.dataType, column_index)
+  }
+
+  private def valueOf(data_type: DataType, index: Int): LuaValue = {
+    data_type match {
+      case StringType => LuaValue.valueOf(row.getString(index))
+      case IntegerType => LuaValue.valueOf(row.getInt(index))
+      case FloatType => LuaValue.valueOf(row.getFloat(index))
+      case DoubleType => LuaValue.valueOf(row.getDouble(index))
+      case ArrayType(StringType, true) => arrayValueOf[String](index)
+      case ArrayType(IntegerType, true) => arrayValueOf[Int](index)
+      case ArrayType(FloatType, true) => arrayValueOf[Float](index)
+      case ArrayType(DoubleType, true) => arrayValueOf[Double](index)
+      case array_type: ArrayType => objectArrayValueOf(array_type, index)
+      case inner_schema: StructType => new LuaRowValue(inner_schema, row.getAs[Row](index))
+    }
+  }
+
+  private def objectArrayValueOf(array_type: ArrayType, index: Int): LuaValue = {
+    array_type.elementType match {
+      case inner_schema: StructType => rowArrayValueOf(inner_schema, index)
+    }
+  }
+
+  private def rowArrayValueOf(inner_schema: StructType, index: Int): LuaValue = {
+    val values: Array[LuaValue] = row.getAs[Array[Row]](index).map(new LuaRowValue(inner_schema, _)).toArray
+    new LuaTable(null, values, null)
+  }
+
+  private def arrayValueOf[T](index: Int)(implicit m: ClassTag[T]): LuaValue = {
+    val values: Array[LuaValue] = row.getAs[Array[T]](index).map {
+      _ match {
+        case str: String => LuaValue.valueOf(str)
+        case num: Int => LuaValue.valueOf(num)
+        case fnum: Float => LuaValue.valueOf(fnum)
+        case dfnum: Double => LuaValue.valueOf(dfnum)
+      }
+    }.toArray
+    new LuaTable(null, values, null)
+  }
+}
+
+class PartitionableStringArray(val items: Array[String]) extends Serializable {
+  override val hashCode = {
+    val some_prime = 31
+    var result = 1
+
+    for (str <- items) {
+      result = result * some_prime + str.hashCode
+    }
+    result
+  }
+
+  override def equals(rhs: Any) = {
+    rhs match {
+      case string_array: PartitionableStringArray => items.sameElements(string_array.items)
+      case _ => false
+    }
+  }
+}
+
+object LuaRDD {
+  private val thread_local_globals = new ThreadLocal[Globals]
+  private val digest = MessageDigest.getInstance("SHA-1")
+
+  def getGlobals(): Globals = thread_local_globals.get()
+
+  def newGlobals(): Globals = {
+    val globals = new Globals()
+
+    LuaC.install(globals)
+    LoadState.install(globals)
+    globals.load(new JseBaseLib())
+    globals.load(new PackageLib())
+    globals.load(new TableLib())
+    globals.load(new StringLib())
+    globals.load(new LuaRowLib())
+
+    thread_local_globals.set(globals)
+    globals
+  }
+
+  def getGlobalsOrNew(): Globals = {
+    var globals = getGlobals()
+    if (globals == null)
+      globals = newGlobals()
+    globals
+  }
+
+  def getLuaCodeDigest(lua_code: String) = {
+    val hash_bytes = digest.digest(lua_code.getBytes())
+    new String(hash_bytes)
+  }
+}
+
+class LuaRDD(val schema: StructType, val rdd: RDD[Row]) extends Serializable {
+
+  private class LuaMetaData(val name: String, val byte_code: Array[Byte]) extends Serializable
+
+  def map(new_schema: StructType, lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.map(callMapScript(lua_byte_code, _))
+    new LuaRDD(new_schema, new_rdd)
+  }
+
+  def flatMap(new_schema: StructType, lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.flatMap(callFlatMapScript(lua_byte_code, _))
+    new LuaRDD(new_schema, new_rdd)
+  }
+
+  def filter(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_rdd = rdd.filter(callFilterScript(lua_byte_code, _))
+    new LuaRDD(schema, new_rdd)
+  }
+
+  def reduceByKeys(key_columns: Array[String], lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val field_indices = key_columns.map(schema.fieldIndex(_))
+    val keys_rdd: RDD[Tuple2[Any, Row]] = rdd.map { case row =>
+      val keys: Seq[Any] = field_indices.map(row(_))
+      Tuple2(keys, row)
+    }
+    val reduced_rdd: RDD[Tuple2[Any, Row]] = keys_rdd.reduceByKey { case (lhs, rhs) =>
+      callReduceScript(lua_byte_code, lhs, rhs)
+    }
+    val new_rdd = reduced_rdd.map(_._2)
+    new LuaRDD(schema, new_rdd)
+  }
+
+  def groupByString(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(StringType)
+    val new_rdd = rdd.groupBy(callGroupByStringScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByStringArray(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(ArrayType(StringType))
+    val pre_rdd = rdd.groupBy(callGroupByStringArrayScript(lua_byte_code, _))
+    val new_rdd: RDD[(Array[String], Iterable[Row])] = pre_rdd.map { case (key, values) =>
+      (key.items, values)
+    }
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByInt(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(IntegerType)
+    val new_rdd = rdd.groupBy(callGroupByIntScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByFloat(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(FloatType)
+    val new_rdd = rdd.groupBy(callGroupByFloatScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def groupByDouble(lua_code: String): LuaRDD = {
+    val lua_byte_code = getLuaByteCode(lua_code)
+    val new_schema = groupBySchema(DoubleType)
+    val new_rdd = rdd.groupBy(callGroupByDoubleScript(lua_byte_code, _))
+    val grouped_rdd = groupedRDD(new_rdd)
+
+    new LuaRDD(new_schema, grouped_rdd)
+  }
+
+  def toDF(sql_context: SQLContext) = sql_context.createDataFrame(rdd, schema)
+
+  private def getLuaByteCode(lua_code: String) = {
+    val output_stream = new ByteArrayOutputStream()
+    val name = LuaRDD.getLuaCodeDigest(lua_code)
+    val prototype = LuaC.instance.compile(new ByteArrayInputStream(lua_code.getBytes()), name)
+    val success = DumpState.dump(prototype, output_stream, true)
+
+    output_stream.flush()
+    success match {
+      case 0 => new LuaMetaData(name, output_stream.toByteArray())
+    }
+  }
+
+  private def groupBySchema(data_type: DataType): StructType = {
+    val fields = Array(StructField("key", data_type), StructField("values", ArrayType(schema)))
+    StructType(fields)
+  }
+
+  private def groupedRDD[T](rdd: RDD[(T, Iterable[Row])]): RDD[Row] = {
+    rdd.map { case (key, values) =>
+      val row: Array[Any] = Array(key, values.toArray)
+      new GenericRow(row)
+    }
+  }
+
+  private def callScript(lua_byte_code: LuaMetaData, row: Row): LuaValue = {
+    val globals = LuaRDD.getGlobalsOrNew()
+    globals.set("ROW", new LuaRowValue(schema, row))
+
+    loadAndCallChunk(globals, lua_byte_code)
+  }
+
+  private def callPairScript(lua_byte_code: LuaMetaData, lhs: Row, rhs: Row): LuaValue = {
+    val globals = LuaRDD.getGlobalsOrNew()
+    globals.set("LHS", new LuaRowValue(schema, lhs))
+    globals.set("RHS", new LuaRowValue(schema, rhs))
+
+    loadAndCallChunk(globals, lua_byte_code)
+  }
+
+  private def loadAndCallChunk(globals: Globals, lua_byte_code: LuaMetaData): LuaValue = {
+    val prototype = globals.loadPrototype(new ByteArrayInputStream(lua_byte_code.byte_code), lua_byte_code.name, "b")
+    val chunk = new LuaClosure(prototype, globals)
+    chunk.call()
+  }
+
+  private def callMapScript(lua_byte_code: LuaMetaData, row: Row): Row = {
+    callScript(lua_byte_code, row) match {
+      case row: LuaRowValue => row.row
+      case table: LuaTable => LuaRowValue.luaTableToRow(table)
+    }
+  }
+
+  private def callFlatMapScript(lua_byte_code: LuaMetaData, row: Row) = {
+    callScript(lua_byte_code, row) match {
+      case list: LuaTable => {
+        (1 to list.length).map {
+          index: Int => list.get(index) match {
+            case row: LuaRowValue => row.row
+          }
+        }
+      }
+    }
+  }
+
+  private def callFilterScript(lua_byte_code: LuaMetaData, row: Row): Boolean = {
+    callScript(lua_byte_code, row) match {
+      case bool: LuaBoolean => bool.toboolean()
+    }
+  }
+
+  private def callReduceScript(lua_byte_code: LuaMetaData, lhs: Row, rhs: Row): Row = {
+    callPairScript(lua_byte_code, lhs, rhs) match {
+      case row: LuaRowValue => row.row
+      case table: LuaTable => LuaRowValue.luaTableToRow(table)
+    }
+  }
+
+  private def callGroupByStringScript(lua_byte_code: LuaMetaData, row: Row): String = {
+    callScript(lua_byte_code, row) match {
+      case str: LuaString => str.toString()
+    }
+  }
+
+  private def callGroupByStringArrayScript(lua_byte_code: LuaMetaData, row: Row): PartitionableStringArray = {
+    callScript(lua_byte_code, row) match {
+      case table: LuaTable => new PartitionableStringArray(LuaRowValue.luaTableToArray(table))
+    }
+  }
+
+  private def callGroupByIntScript(lua_byte_code: LuaMetaData, row: Row): Int = {
+    callScript(lua_byte_code, row) match {
+      case num: LuaInteger => num.toint()
+    }
+  }
+
+  private def callGroupByFloatScript(lua_byte_code: LuaMetaData, row: Row): Float = {
+    callScript(lua_byte_code, row) match {
+      case fnum: LuaDouble => fnum.tofloat()
+    }
+  }
+
+  private def callGroupByDoubleScript(lua_byte_code: LuaMetaData, row: Row): Double = {
+    callScript(lua_byte_code, row) match {
+      case fnum: LuaDouble => fnum.todouble()
+    }
+  }
+
+}
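
A minimal end-to-end sketch of the new LuaRDD wrapper (the DataFrame people_df, its column names, sqlContext and the Lua snippets are all invented for illustration; ROW is the global bound for map/filter/group scripts, LHS and RHS for reduce scripts):

    val people = new LuaRDD(people_df.schema, people_df.rdd)

    // filter scripts must return a boolean
    val adults = people.filter("return ROW['age'] >= 18")

    // map scripts may return a Lua table, which is converted back into a Row matching the new schema
    val name_schema = StructType(Array(StructField("name", StringType)))
    val names = adults.map(name_schema, "return {ROW['name']}")

    val names_df = names.toDF(sqlContext)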