cassandra_model_spark 0.0.1.5-java → 0.0.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,38 @@
+module CassandraModel
+  module Spark
+    module Lib
+
+    end
+  end
+end
+
 if RUBY_ENGINE == 'jruby'
   class Hash
     def to_java
-      JavaHashMap.new(self)
+      CassandraModel::Spark::Lib::JavaHashMap.new(self)
     end
   end
 
   class Array
+    def self.from_java_array_list(array_list)
+      array_list.to_a
+    end
+
     def to_java_argv
       to_java(:string)
     end
   end
 
+  class String
+    def self.from_java_string(string)
+      string
+    end
+  end
+
 else
   class Hash
     def to_java
-      JavaHashMap.new.tap do |map|
+      CassandraModel::Spark::Lib::JavaHashMap.new.tap do |map|
         each do |key, value|
           map.put(key, value)
         end
@@ -23,6 +41,10 @@ else
   end
 
   class Array
+    def self.from_java_array_list(array_list)
+      array_list.toArray
+    end
+
     def to_java
       self
     end
@@ -31,6 +53,12 @@ else
       self
     end
   end
+
+  class String
+    def self.from_java_string(string)
+      string.toString
+    end
+  end
 end
 
 module JavaBridge
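The conversion helpers added above give callers one API on both engines: on JRuby the Java proxies respond to `to_a`, while on MRI (via Rjb) the proxy exposes `toArray`/`toString`. A minimal usage sketch, with invented values, assuming the gem is loaded and the JVM classes are on the classpath:

```ruby
# Hypothetical sketch, not from the package.
# java.util.ArrayList was imported under the new Lib namespace:
list = CassandraModel::Spark::Lib::ArrayList.new
list.add('first')
list.add('second')

# Same call on either engine; #to_a on JRuby, #toArray on MRI/Rjb:
Array.from_java_array_list(list) # => ['first', 'second']

# Hash#to_java now builds the namespaced JavaHashMap on both engines:
{ 'spark.master' => 'local[*]' }.to_java
```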
@@ -38,7 +66,7 @@ module JavaBridge
     def import_java_object(path, options = {})
       name = options.fetch(:as) { path.split('.').last }.to_sym
       klass = "Java::#{path}"
-      Object.const_set(name, eval(klass))
+      set_import_const(name, eval(klass))
     end
 
     def initialize_java_engine
@@ -47,7 +75,7 @@
   else
     def import_java_object(path, options = {})
       name = options.fetch(:as) { path.split('.').last }.to_sym
-      Object.const_set(name, load_java_class(path))
+      set_import_const(name, load_java_class(path))
     end
 
     def require(path)
@@ -79,6 +107,14 @@ module JavaBridge
     end
   end
 
+  private
+
+  def set_import_const(name, value)
+    CassandraModel::Spark::Lib.const_set(name, value)
+  end
+
+  public
+
   def import_quiet
     prev_verbox = $VERBOSE
     $VERBOSE = nil
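The net effect of `set_import_const` is that imported Java classes no longer leak into the global namespace. A sketch of the lookup change:

```ruby
# Sketch: constant resolution before and after this release.
# Before 0.0.4, import_java_object defined constants on Object:
#   JavaHashMap            # resolvable from anywhere
# From 0.0.4 on, imports are scoped to the new Lib module:
CassandraModel::Spark::Lib::JavaHashMap # the supported lookup
defined?(::JavaHashMap)                 # => nil, no global constant
```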
@@ -1,36 +1,51 @@
 import_java_object 'java.util.ArrayList'
 import_java_object 'org.apache.spark.SparkConf'
 import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+import_java_object 'org.apache.spark.streaming.api.java.JavaStreamingContext', as: 'JavaSparkStreamingContext'
+import_java_object 'org.apache.spark.streaming.Duration', as: 'SparkDuration'
 import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
 import_java_object 'java.util.HashMap', as: 'JavaHashMap'
 import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
 import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+import_java_object 'org.apache.spark.sql.catalyst.expressions.GenericRow', as: 'SqlGenericRow'
 import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
 import_java_object 'org.apache.log4j.Level', as: 'JLevel'
 import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
 import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
 import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+
+import_java_object 'org.apache.spark.api.cassandra_model.ColumnDeserializer', as: 'SparkColumnDeserializer'
+import_java_object 'org.apache.spark.api.cassandra_model.RowConversions', as: 'SqlRowConversions'
 import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
-import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
 import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
 import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
 import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
 import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+import_java_object 'org.apache.spark.api.cassandra_model.LuaRDD'
+
 import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
 import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
 
+if CassandraModel.const_defined?('TESTING_SCALA')
+  import_java_object 'com.datastax.spark.connector.CassandraRow', as: 'SparkCassandraRow'
+  import_java_object 'org.apache.spark.api.cassandra_model.LuaRowValue'
+  import_java_object 'org.apache.spark.api.cassandra_model.LuaRowLib'
+end
+
 %w(BinaryType BooleanType ByteType DataType
    DateType Decimal DecimalType DoubleType FloatType IntegerType
    LongType Metadata NullType PrecisionInfo ShortType
+   ArrayType MapType
    StringType StructField StructType TimestampType).each do |sql_type|
-  Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+  type = import_quiet { CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") }
+  CassandraModel::Spark::Lib.const_set(:"Sql#{sql_type}", type)
 end
 
 #noinspection RubyConstantNamingConvention
-SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+SqlStringArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlStringType)
 
 #noinspection RubyConstantNamingConvention
-SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+SqlIntegerArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlIntegerType)
 
 #noinspection RubyConstantNamingConvention
-SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
+SqlStringStringMapType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getMapType(CassandraModel::Spark::Lib::SqlStringType, CassandraModel::Spark::Lib::SqlStringType)
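The same scoping applies to the generated `Sql*` type constants, so any caller that referenced them as top-level constants must be updated. A short sketch of the migrated lookups:

```ruby
# Sketch of migrated constant lookups (scoping change only):
CassandraModel::Spark::Lib::SqlStringType  # was top-level ::SqlStringType
CassandraModel::Spark::Lib::SqlArrayType   # new, from the ArrayType entry above
CassandraModel::Spark::Lib::SqlStringStringMapType
```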
@@ -16,7 +16,7 @@ module CassandraModel
 
     def data_frame_from_frame(options)
       query_frame = @record_klass.query(@params, @options)
-      Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+      Spark::DataFrame.new(options.delete(:class) || @record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
     end
 
     def data_frame_from_model(options)
@@ -30,8 +30,8 @@
         end
         memo.merge!(updated_key => value)
       end.stringify_keys.to_java
-      rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
-      Spark::DataFrame.new(@record_klass, rdd, options)
+      rdd = Spark::Lib::SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+      Spark::DataFrame.new(options.delete(:class) || @record_klass, rdd, options)
     end
   end
 end
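Both builder paths now honour an optional `:class` entry in `options`, letting the caller type the resulting DataFrame to a different record class than the one queried. A hedged sketch; `MyRecord` and `MyView` are placeholder models, and the public entry point (`as_data_frame` here) is an assumption that may differ from the gem's actual API:

```ruby
# Hypothetical sketch of the new :class override.
builder = CassandraModel::QueryBuilder.new(MyRecord)

# Without :class the frame is typed to MyRecord, as before;
# with it, rows come back wrapped as MyView instances:
frame = builder.as_data_frame(class: MyView)
```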
@@ -2,7 +2,7 @@ module CassandraModel
   class RawConnection
     def java_spark_context
       @spark_context ||= begin
-        JavaSparkContext.new(spark_conf).tap do |java_spark_context|
+        Spark::Lib::JavaSparkContext.new(spark_conf).tap do |java_spark_context|
           java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
         end
       end
@@ -12,10 +12,19 @@ module CassandraModel
       java_spark_context.sc
     end
 
+    def has_spark_context?
+      !!@spark_context
+    end
+
+    #noinspection RubyInstanceMethodNamingConvention
+    def create_java_spark_streaming_context
+      Spark::Lib::JavaSparkStreamingContext.new(java_spark_context, Spark::Lib::SparkDuration.new(2000))
+    end
+
     private
 
     def spark_conf
-      @spark_conf ||= SparkConf.new(true).tap do |conf|
+      @spark_conf ||= Spark::Lib::SparkConf.new(true).tap do |conf|
         conf.set('spark.app.name', 'cassandra_model_spark')
         conf.set('spark.master', 'local[*]')
         conf.set('spark.cassandra.connection.host', config[:hosts].first)
@@ -44,4 +53,4 @@ module CassandraModel
       config.slice(:spark)
     end
   end
-end
+end
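`create_java_spark_streaming_context` wraps the lazily built Spark context in a `JavaStreamingContext` with a hard-coded 2000 ms batch duration, and `has_spark_context?` lets callers check whether that expensive context already exists. A sketch; how `connection` gets constructed is elided, since it depends on cassandra_model's configuration:

```ruby
# Hypothetical sketch; `connection` is an already-configured RawConnection.
connection.has_spark_context? # => false until something touches the context

ssc = connection.create_java_spark_streaming_context
# Building the streaming context forces java_spark_context into existence:
connection.has_spark_context? # => true
```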
@@ -1,7 +1,7 @@
 module CassandraModel
   class Record
     def self.rdd
-      @spark_rdd ||= SparkCassandraHelper.cassandra_table(
+      @spark_rdd ||= Spark::Lib::SparkCassandraHelper.cassandra_table(
         table.connection.spark_context,
         table.connection.config[:keyspace],
         table_name)
@@ -14,5 +14,9 @@ module CassandraModel
     def self.count
       rdd.count
     end
+
+    def self.sql_schema
+      Spark::SqlSchema.new(cassandra_columns)
+    end
   end
 end
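`Record.sql_schema` bridges a model's Cassandra column definitions into a Spark SQL `StructType` via the new `SqlSchema` class added later in this diff. A sketch with an invented column layout:

```ruby
# Hypothetical sketch: assuming MyRecord's cassandra_columns are
# { partition: :text, hits: :int }, sql_schema yields a SqlSchema whose
# underlying StructType has a StringType field and an IntegerType field.
MyRecord.sql_schema # => #<CassandraModel::Spark::SqlSchema ...>
```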
@@ -0,0 +1,47 @@
+module CassandraModel
+  module Spark
+    class Schema
+      attr_reader :schema
+
+      def initialize(sql_schema)
+        @schema = sql_schema.fields.inject({}) do |memo, field|
+          column = field.name
+          type = field.dataType
+          memo.merge!(column.to_sym => sql_type(type))
+        end
+      end
+
+      def ==(rhs)
+        rhs.is_a?(Schema) && rhs.schema == schema
+      end
+
+      private
+
+      def sql_type(type)
+        case sql_type_name(type)
+        when 'ArrayType'
+          [:list, sql_type(type.elementType)]
+        when 'MapType'
+          [:map, sql_type(type.keyType), sql_type(type.valueType)]
+        when 'IntegerType'
+          :int
+        when 'BooleanType'
+          :boolean
+        when 'DoubleType'
+          :double
+        when 'BinaryType'
+          :blob
+        when 'TimestampType'
+          :timestamp
+        else
+          :text
+        end
+      end
+
+      def sql_type_name(data_type)
+        data_type.getClass.getSimpleName
+      end
+
+    end
+  end
+end
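The new `Schema` class reduces a Spark SQL `StructType` to a plain hash of column symbol to type symbol, recursing through `ArrayType` and `MapType` and falling back to `:text` for anything it does not recognize. A sketch against an invented struct:

```ruby
# Hypothetical sketch: struct_type stands in for any
# org.apache.spark.sql.types.StructType obtained from Spark.
schema = CassandraModel::Spark::Schema.new(struct_type)
schema.schema
# => e.g. { id: :int, payload: :blob, tags: [:list, :text] }
```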
@@ -0,0 +1,49 @@
+module CassandraModel
+  module Spark
+    class SqlSchema
+      attr_reader :schema
+
+      def initialize(cassandra_schema)
+        fields = cassandra_schema.map do |column, type|
+          Lib::SqlStructField.apply(column.to_s, sql_type(type), true, Lib::SqlMetadata.empty)
+        end
+        if RUBY_ENGINE == 'jruby'
+          fields = fields.to_java('org.apache.spark.sql.types.StructField')
+        end
+        @schema = Lib::SqlStructType.apply(fields)
+      end
+
+      def ==(rhs)
+        rhs.is_a?(SqlSchema) && rhs.schema == schema
+      end
+
+      private
+
+      def sql_type(type)
+        case type
+        when Array
+          base_type, first_type, second_type = type
+          case base_type
+          when :map
+            Lib::SqlMapType.apply(sql_type(first_type), sql_type(second_type), true)
+          else
+            Lib::SqlArrayType.apply(sql_type(first_type))
+          end
+        when :int
+          Lib::SqlIntegerType
+        when :boolean
+          Lib::SqlBooleanType
+        when :double
+          Lib::SqlDoubleType
+        when :blob
+          Lib::SqlBinaryType
+        when :timestamp
+          Lib::SqlTimestampType
+        else
+          Lib::SqlStringType
+        end
+      end
+
+    end
+  end
+end
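Taken together, `SqlSchema` and `Schema` form an approximate round trip between the gem's symbolic column types and Spark's `StructType`. A sketch with an invented schema; the trip is lossy for types that fall back to `:text`:

```ruby
# Hypothetical round-trip sketch:
columns = { name: :text, score: :double, meta: [:map, :text, :text] }

# Symbolic types -> Spark StructType:
sql_schema = CassandraModel::Spark::SqlSchema.new(columns)
# Spark StructType -> symbolic types:
schema = CassandraModel::Spark::Schema.new(sql_schema.schema)

schema.schema == columns # => true for types both sides can represent
```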
metadata CHANGED
@@ -1,43 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: cassandra_model_spark
 version: !ruby/object:Gem::Version
-  version: 0.0.1.5
+  version: 0.0.4
 platform: java
 authors:
 - Thomas RM Rogers
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-12-29 00:00:00.000000000 Z
+date: 2016-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: cassandra_model
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
       - !ruby/object:Gem::Version
-        version: 0.9.16
+        version: 0.10.0
+    - - <=
+      - !ruby/object:Gem::Version
+        version: '1.1'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    - - <=
       - !ruby/object:Gem::Version
-        version: 0.9.16
+        version: '1.1'
 - !ruby/object:Gem::Dependency
   name: thomas_utils
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
      - !ruby/object:Gem::Version
        version: 0.1.16
+    - - <
+      - !ruby/object:Gem::Version
+        version: 0.3.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
       - !ruby/object:Gem::Version
        version: 0.1.16
+    - - <
+      - !ruby/object:Gem::Version
+        version: 0.3.0
 description: |-
   Spark integration for cassandra_model.
   Get high-performance data analytics with the ease of cassandra_model.
@@ -66,13 +78,16 @@ files:
 - ext/scala_helper/bin/spark-class
 - ext/scala_helper/build.sbt
 - ext/scala_helper/cassandra_helper.scala
+- ext/scala_helper/column_deserializer.scala
 - ext/scala_helper/data_type_helper.scala
+- ext/scala_helper/lua_rdd.scala
+- ext/scala_helper/lua_row_lib.scala
 - ext/scala_helper/marshal_loader.scala
 - ext/scala_helper/marshal_row_mapping.scala
 - ext/scala_helper/project/plugins.sbt
+- ext/scala_helper/row_conversions.scala
 - ext/scala_helper/sbin/spark-config.sh
 - ext/scala_helper/sbin/spark-daemon.sh
-- ext/scala_helper/schema_builder.scala
 - ext/scala_helper/worker.scala
 - lib/cassandra_model_spark.rb
 - lib/cassandra_model_spark/build.rb
@@ -85,7 +100,9 @@ files:
 - lib/cassandra_model_spark/query_builder.rb
 - lib/cassandra_model_spark/raw_connection.rb
 - lib/cassandra_model_spark/record.rb
+- lib/cassandra_model_spark/schema.rb
 - lib/cassandra_model_spark/spark.rb
+- lib/cassandra_model_spark/sql_schema.rb
 homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
 licenses:
 - Apache License 2.0
@@ -1,35 +0,0 @@
1
- package org.apache.spark.api.cassandra_model
2
-
3
- import org.apache.spark.rdd._
4
- import com.datastax.spark.connector._
5
- import com.datastax.spark.connector.rdd._
6
- import org.apache.spark.sql._
7
- import org.apache.spark.sql.types._
8
-
9
- class SchemaBuilder {
10
- var fields = Array[StructField]()
11
-
12
- def build = StructType(fields)
13
-
14
- def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
15
- rdd.map(
16
- p => Row.fromSeq(
17
- p.columnValues.map{
18
- p => p match {
19
- case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
20
- case _ => p
21
- }
22
- }
23
- )
24
- )
25
- }
26
-
27
- def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
28
- val new_rdd = cassandraRDDToRDD(rdd)
29
- sqlc.createDataFrame(new_rdd, build)
30
- }
31
-
32
- def addColumn(name: String, sql_type: DataType) = {
33
- fields :+= StructField(name, sql_type, true)
34
- }
35
- }