cassandra_model_spark 0.0.1.5 → 0.0.4

@@ -1,20 +1,38 @@
+module CassandraModel
+  module Spark
+    module Lib
+
+    end
+  end
+end
+
 if RUBY_ENGINE == 'jruby'
   class Hash
     def to_java
-      JavaHashMap.new(self)
+      CassandraModel::Spark::Lib::JavaHashMap.new(self)
     end
   end
 
   class Array
+    def self.from_java_array_list(array_list)
+      array_list.to_a
+    end
+
     def to_java_argv
       to_java(:string)
     end
   end
 
+  class String
+    def self.from_java_string(string)
+      string
+    end
+  end
+
 else
   class Hash
     def to_java
-      JavaHashMap.new.tap do |map|
+      CassandraModel::Spark::Lib::JavaHashMap.new.tap do |map|
         each do |key, value|
           map.put(key, value)
         end
@@ -23,6 +41,10 @@ else
   end
 
   class Array
+    def self.from_java_array_list(array_list)
+      array_list.toArray
+    end
+
     def to_java
       self
     end
@@ -31,6 +53,12 @@ else
       self
     end
   end
+
+  class String
+    def self.from_java_string(string)
+      string.toString
+    end
+  end
 end
 
 module JavaBridge
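Taken together, the branches above give JRuby and MRI (via Rjb) a uniform conversion surface, so calling code never has to branch on RUBY_ENGINE itself. A minimal usage sketch, where java_list and java_string are placeholders for whatever objects the Java bridge hands back:

    # Same calls work on either engine:
    java_map  = { 'spark.master' => 'local[*]' }.to_java   # Lib::JavaHashMap
    ruby_list = Array.from_java_array_list(java_list)      # plain Ruby Array
    ruby_str  = String.from_java_string(java_string)       # plain Ruby String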
@@ -38,7 +66,7 @@ module JavaBridge
    def import_java_object(path, options = {})
      name = options.fetch(:as) { path.split('.').last }.to_sym
      klass = "Java::#{path}"
-      Object.const_set(name, eval(klass))
+      set_import_const(name, eval(klass))
    end
 
    def initialize_java_engine
@@ -47,7 +75,7 @@ module JavaBridge
  else
    def import_java_object(path, options = {})
      name = options.fetch(:as) { path.split('.').last }.to_sym
-      Object.const_set(name, load_java_class(path))
+      set_import_const(name, load_java_class(path))
    end
 
    def require(path)
@@ -79,6 +107,14 @@ module JavaBridge
    end
  end
 
+  private
+
+  def set_import_const(name, value)
+    CassandraModel::Spark::Lib.const_set(name, value)
+  end
+
+  public
+
  def import_quiet
    prev_verbox = $VERBOSE
    $VERBOSE = nil
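The net effect of this file's changes: constants imported from the JVM now live under CassandraModel::Spark::Lib instead of being defined globally on Object. A before/after sketch, using an import taken from this diff:

    import_java_object 'java.util.HashMap', as: 'JavaHashMap'

    # 0.0.1.5: Object.const_set placed the constant at top level
    ::JavaHashMap                             # worked, but polluted the global namespace

    # 0.0.4: set_import_const scopes it to the library
    CassandraModel::Spark::Lib::JavaHashMap   # the only place it is now defined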
@@ -1,36 +1,51 @@
 import_java_object 'java.util.ArrayList'
 import_java_object 'org.apache.spark.SparkConf'
 import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+import_java_object 'org.apache.spark.streaming.api.java.JavaStreamingContext', as: 'JavaSparkStreamingContext'
+import_java_object 'org.apache.spark.streaming.Duration', as: 'SparkDuration'
 import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
 import_java_object 'java.util.HashMap', as: 'JavaHashMap'
 import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
 import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+import_java_object 'org.apache.spark.sql.catalyst.expressions.GenericRow', as: 'SqlGenericRow'
 import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
 import_java_object 'org.apache.log4j.Level', as: 'JLevel'
 import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
 import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
 import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+
+import_java_object 'org.apache.spark.api.cassandra_model.ColumnDeserializer', as: 'SparkColumnDeserializer'
+import_java_object 'org.apache.spark.api.cassandra_model.RowConversions', as: 'SqlRowConversions'
 import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
-import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
 import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
 import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
 import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
 import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+import_java_object 'org.apache.spark.api.cassandra_model.LuaRDD'
+
 import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
 import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
 
+if CassandraModel.const_defined?('TESTING_SCALA')
+  import_java_object 'com.datastax.spark.connector.CassandraRow', as: 'SparkCassandraRow'
+  import_java_object 'org.apache.spark.api.cassandra_model.LuaRowValue'
+  import_java_object 'org.apache.spark.api.cassandra_model.LuaRowLib'
+end
+
 %w(BinaryType BooleanType ByteType DataType
    DateType Decimal DecimalType DoubleType FloatType IntegerType
    LongType Metadata NullType PrecisionInfo ShortType
+   ArrayType MapType
    StringType StructField StructType TimestampType).each do |sql_type|
-  Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+  type = import_quiet { CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") }
+  CassandraModel::Spark::Lib.const_set(:"Sql#{sql_type}", type)
 end
 
 #noinspection RubyConstantNamingConvention
-SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+SqlStringArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlStringType)
 
 #noinspection RubyConstantNamingConvention
-SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+SqlIntegerArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlIntegerType)
 
 #noinspection RubyConstantNamingConvention
-SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
+SqlStringStringMapType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getMapType(CassandraModel::Spark::Lib::SqlStringType, CassandraModel::Spark::Lib::SqlStringType)
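With the Sql* type constants now scoped under Lib, hand-building a Spark SQL schema piece looks like the following sketch (same StructField.apply signature that sql_schema.rb uses further down; the column name is illustrative):

    Lib = CassandraModel::Spark::Lib   # alias for brevity in this sketch
    field = Lib::SqlStructField.apply(
      'description',        # column name
      Lib::SqlStringType,   # type constant created by the loop above
      true,                 # nullable
      Lib::SqlMetadata.empty
    )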
@@ -16,7 +16,7 @@ module CassandraModel
 
     def data_frame_from_frame(options)
       query_frame = @record_klass.query(@params, @options)
-      Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+      Spark::DataFrame.new(options.delete(:class) || @record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
     end
 
     def data_frame_from_model(options)
@@ -30,8 +30,8 @@ module CassandraModel
         end
         memo.merge!(updated_key => value)
       end.stringify_keys.to_java
-      rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
-      Spark::DataFrame.new(@record_klass, rdd, options)
+      rdd = Spark::Lib::SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+      Spark::DataFrame.new(options.delete(:class) || @record_klass, rdd, options)
     end
   end
 end
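Both builders now honor an optional :class entry in the options hash, substituting the record class that the resulting Spark::DataFrame deserializes rows into; using options.delete also keeps the key from leaking into the DataFrame's remaining options. A sketch of the intent, assuming a public query-builder entry point (called as_data_frame here) that forwards its options into these methods:

    # Hypothetical models: rows come back wrapped as ReportView
    # instances instead of Report records.
    Report.where(date: today).as_data_frame(class: ReportView)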
@@ -2,7 +2,7 @@ module CassandraModel
   class RawConnection
     def java_spark_context
       @spark_context ||= begin
-        JavaSparkContext.new(spark_conf).tap do |java_spark_context|
+        Spark::Lib::JavaSparkContext.new(spark_conf).tap do |java_spark_context|
           java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
         end
       end
@@ -12,10 +12,19 @@ module CassandraModel
       java_spark_context.sc
     end
 
+    def has_spark_context?
+      !!@spark_context
+    end
+
+    #noinspection RubyInstanceMethodNamingConvention
+    def create_java_spark_streaming_context
+      Spark::Lib::JavaSparkStreamingContext.new(java_spark_context, Spark::Lib::SparkDuration.new(2000))
+    end
+
     private
 
     def spark_conf
-      @spark_conf ||= SparkConf.new(true).tap do |conf|
+      @spark_conf ||= Spark::Lib::SparkConf.new(true).tap do |conf|
         conf.set('spark.app.name', 'cassandra_model_spark')
         conf.set('spark.master', 'local[*]')
         conf.set('spark.cassandra.connection.host', config[:hosts].first)
@@ -44,4 +53,4 @@ module CassandraModel
       config.slice(:spark)
     end
   end
-end
+end
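The two new public methods let callers check for an already-initialized context without forcing one into existence, and spin up a JVM-side streaming context with a fixed 2000 ms batch duration. A usage sketch, where connection is a RawConnection:

    if connection.has_spark_context?
      ssc = connection.create_java_spark_streaming_context
      # ssc wraps a JavaStreamingContext batching every SparkDuration.new(2000)
    end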
@@ -1,7 +1,7 @@
 module CassandraModel
   class Record
     def self.rdd
-      @spark_rdd ||= SparkCassandraHelper.cassandra_table(
+      @spark_rdd ||= Spark::Lib::SparkCassandraHelper.cassandra_table(
         table.connection.spark_context,
         table.connection.config[:keyspace],
         table_name)
@@ -14,5 +14,9 @@ module CassandraModel
     def self.count
       rdd.count
     end
+
+    def self.sql_schema
+      Spark::SqlSchema.new(cassandra_columns)
+    end
   end
 end
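Record.sql_schema ties the model layer to the new SqlSchema class below: it feeds the record's Cassandra column map into SqlSchema, which builds the corresponding Spark SQL StructType. Sketch, where MyRecord stands in for any CassandraModel::Record subclass:

    MyRecord.sql_schema          # => CassandraModel::Spark::SqlSchema
    MyRecord.sql_schema.schema   # => org.apache.spark.sql.types.StructType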
@@ -0,0 +1,47 @@
+module CassandraModel
+  module Spark
+    class Schema
+      attr_reader :schema
+
+      def initialize(sql_schema)
+        @schema = sql_schema.fields.inject({}) do |memo, field|
+          column = field.name
+          type = field.dataType
+          memo.merge!(column.to_sym => sql_type(type))
+        end
+      end
+
+      def ==(rhs)
+        rhs.is_a?(Schema) && rhs.schema == schema
+      end
+
+      private
+
+      def sql_type(type)
+        case sql_type_name(type)
+        when 'ArrayType'
+          [:list, sql_type(type.elementType)]
+        when 'MapType'
+          [:map, sql_type(type.keyType), sql_type(type.valueType)]
+        when 'IntegerType'
+          :int
+        when 'BooleanType'
+          :boolean
+        when 'DoubleType'
+          :double
+        when 'BinaryType'
+          :blob
+        when 'TimestampType'
+          :timestamp
+        else
+          :text
+        end
+      end
+
+      def sql_type_name(data_type)
+        data_type.getClass.getSimpleName
+      end
+
+    end
+  end
+end
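Schema reduces a Spark SQL StructType to the gem's symbolic column map, recursing into ArrayType and MapType elements and defaulting any unrecognized scalar type to :text. An illustrative result, where data_frame.schema stands in for any StructType:

    Spark::Schema.new(data_frame.schema).schema
    # => { partition: :text,
    #      tags:      [:list, :text],
    #      meta:      [:map, :text, :text] }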
@@ -0,0 +1,49 @@
+module CassandraModel
+  module Spark
+    class SqlSchema
+      attr_reader :schema
+
+      def initialize(cassandra_schema)
+        fields = cassandra_schema.map do |column, type|
+          Lib::SqlStructField.apply(column.to_s, sql_type(type), true, Lib::SqlMetadata.empty)
+        end
+        if RUBY_ENGINE == 'jruby'
+          fields = fields.to_java('org.apache.spark.sql.types.StructField')
+        end
+        @schema = Lib::SqlStructType.apply(fields)
+      end
+
+      def ==(rhs)
+        rhs.is_a?(SqlSchema) && rhs.schema == schema
+      end
+
+      private
+
+      def sql_type(type)
+        case type
+        when Array
+          base_type, first_type, second_type = type
+          case base_type
+          when :map
+            Lib::SqlMapType.apply(sql_type(first_type), sql_type(second_type), true)
+          else
+            Lib::SqlArrayType.apply(sql_type(first_type))
+          end
+        when :int
+          Lib::SqlIntegerType
+        when :boolean
+          Lib::SqlBooleanType
+        when :double
+          Lib::SqlDoubleType
+        when :blob
+          Lib::SqlBinaryType
+        when :timestamp
+          Lib::SqlTimestampType
+        else
+          Lib::SqlStringType
+        end
+      end
+
+    end
+  end
+end
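SqlSchema is the inverse mapping: symbolic column types in, Spark StructType out, with the JRuby branch converting the Ruby array of fields into a typed Java array first. Since both directions preserve :text plus the list and map shapes, those round-trip cleanly; a sketch:

    columns    = { name: :text, tags: [:list, :text], meta: [:map, :text, :text] }
    sql_schema = CassandraModel::Spark::SqlSchema.new(columns)
    CassandraModel::Spark::Schema.new(sql_schema.schema).schema == columns # => true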
metadata CHANGED
@@ -1,43 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: cassandra_model_spark
 version: !ruby/object:Gem::Version
-  version: 0.0.1.5
+  version: 0.0.4
 platform: ruby
 authors:
 - Thomas RM Rogers
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-12-29 00:00:00.000000000 Z
+date: 2016-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: cassandra_model
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    - - <=
       - !ruby/object:Gem::Version
-        version: 0.9.16
+        version: '1.1'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    - - <=
       - !ruby/object:Gem::Version
-        version: 0.9.16
+        version: '1.1'
 - !ruby/object:Gem::Dependency
   name: thomas_utils
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
      - !ruby/object:Gem::Version
        version: 0.1.16
+    - - <
+      - !ruby/object:Gem::Version
+        version: 0.3.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - '>='
      - !ruby/object:Gem::Version
        version: 0.1.16
+    - - <
+      - !ruby/object:Gem::Version
+        version: 0.3.0
 - !ruby/object:Gem::Dependency
   name: rjb
   requirement: !ruby/object:Gem::Requirement
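Translated into Gemfile terms, the dependency changes replace pessimistic (~>) pins with explicit ranges:

    gem 'cassandra_model', '>= 0.10.0', '<= 1.1'   # was '~> 0.9.16'
    gem 'thomas_utils', '>= 0.1.16', '< 0.3.0'     # was '~> 0.1.16'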
@@ -80,13 +92,16 @@ files:
 - ext/scala_helper/bin/spark-class
 - ext/scala_helper/build.sbt
 - ext/scala_helper/cassandra_helper.scala
+- ext/scala_helper/column_deserializer.scala
 - ext/scala_helper/data_type_helper.scala
+- ext/scala_helper/lua_rdd.scala
+- ext/scala_helper/lua_row_lib.scala
 - ext/scala_helper/marshal_loader.scala
 - ext/scala_helper/marshal_row_mapping.scala
 - ext/scala_helper/project/plugins.sbt
+- ext/scala_helper/row_conversions.scala
 - ext/scala_helper/sbin/spark-config.sh
 - ext/scala_helper/sbin/spark-daemon.sh
-- ext/scala_helper/schema_builder.scala
 - ext/scala_helper/worker.scala
 - lib/cassandra_model_spark.rb
 - lib/cassandra_model_spark/build.rb
@@ -99,7 +114,9 @@ files:
 - lib/cassandra_model_spark/query_builder.rb
 - lib/cassandra_model_spark/raw_connection.rb
 - lib/cassandra_model_spark/record.rb
+- lib/cassandra_model_spark/schema.rb
 - lib/cassandra_model_spark/spark.rb
+- lib/cassandra_model_spark/sql_schema.rb
 homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
 licenses:
 - Apache License 2.0
@@ -1,35 +0,0 @@
-package org.apache.spark.api.cassandra_model
-
-import org.apache.spark.rdd._
-import com.datastax.spark.connector._
-import com.datastax.spark.connector.rdd._
-import org.apache.spark.sql._
-import org.apache.spark.sql.types._
-
-class SchemaBuilder {
-  var fields = Array[StructField]()
-
-  def build = StructType(fields)
-
-  def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
-    rdd.map(
-      p => Row.fromSeq(
-        p.columnValues.map{
-          p => p match {
-            case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
-            case _ => p
-          }
-        }
-      )
-    )
-  }
-
-  def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
-    val new_rdd = cassandraRDDToRDD(rdd)
-    sqlc.createDataFrame(new_rdd, build)
-  }
-
-  def addColumn(name: String, sql_type: DataType) = {
-    fields :+= StructField(name, sql_type, true)
-  }
-}