cassandra_model_spark 0.0.1.5-java → 0.0.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/scala_helper/build.sbt +5 -2
- data/ext/scala_helper/cassandra_helper.scala +3 -2
- data/ext/scala_helper/column_deserializer.scala +28 -0
- data/ext/scala_helper/data_type_helper.scala +2 -0
- data/ext/scala_helper/lua_rdd.scala +352 -0
- data/ext/scala_helper/lua_row_lib.scala +108 -0
- data/ext/scala_helper/marshal_loader.scala +6 -6
- data/ext/scala_helper/marshal_row_mapping.scala +11 -3
- data/ext/scala_helper/row_conversions.scala +20 -0
- data/lib/cassandra_model_spark.rb +2 -0
- data/lib/cassandra_model_spark/connection_cache.rb +2 -2
- data/lib/cassandra_model_spark/data_frame.rb +121 -35
- data/lib/cassandra_model_spark/java_bridge.rb +40 -4
- data/lib/cassandra_model_spark/java_classes.rb +20 -5
- data/lib/cassandra_model_spark/query_builder.rb +3 -3
- data/lib/cassandra_model_spark/raw_connection.rb +12 -3
- data/lib/cassandra_model_spark/record.rb +5 -1
- data/lib/cassandra_model_spark/schema.rb +47 -0
- data/lib/cassandra_model_spark/sql_schema.rb +49 -0
- metadata +26 -9
- data/ext/scala_helper/schema_builder.scala +0 -35
data/lib/cassandra_model_spark/java_bridge.rb CHANGED

@@ -1,20 +1,38 @@
+module CassandraModel
+  module Spark
+    module Lib
+
+    end
+  end
+end
+
 if RUBY_ENGINE == 'jruby'
   class Hash
     def to_java
-      JavaHashMap.new(self)
+      CassandraModel::Spark::Lib::JavaHashMap.new(self)
     end
   end
 
   class Array
+    def self.from_java_array_list(array_list)
+      array_list.to_a
+    end
+
     def to_java_argv
       to_java(:string)
     end
   end
 
+  class String
+    def self.from_java_string(string)
+      string
+    end
+  end
+
 else
   class Hash
     def to_java
-      JavaHashMap.new.tap do |map|
+      CassandraModel::Spark::Lib::JavaHashMap.new.tap do |map|
         each do |key, value|
           map.put(key, value)
         end
@@ -23,6 +41,10 @@ else
   end
 
   class Array
+    def self.from_java_array_list(array_list)
+      array_list.toArray
+    end
+
     def to_java
       self
     end
@@ -31,6 +53,12 @@ else
       self
     end
   end
+
+  class String
+    def self.from_java_string(string)
+      string.toString
+    end
+  end
 end
 
 module JavaBridge
@@ -38,7 +66,7 @@ module JavaBridge
     def import_java_object(path, options = {})
       name = options.fetch(:as) { path.split('.').last }.to_sym
       klass = "Java::#{path}"
-
+      set_import_const(name, eval(klass))
     end
 
     def initialize_java_engine
@@ -47,7 +75,7 @@ module JavaBridge
   else
     def import_java_object(path, options = {})
       name = options.fetch(:as) { path.split('.').last }.to_sym
-
+      set_import_const(name, load_java_class(path))
     end
 
     def require(path)
@@ -79,6 +107,14 @@ module JavaBridge
     end
   end
 
+  private
+
+  def set_import_const(name, value)
+    CassandraModel::Spark::Lib.const_set(name, value)
+  end
+
+  public
+
   def import_quiet
     prev_verbox = $VERBOSE
     $VERBOSE = nil
data/lib/cassandra_model_spark/java_classes.rb CHANGED

@@ -1,36 +1,51 @@
 import_java_object 'java.util.ArrayList'
 import_java_object 'org.apache.spark.SparkConf'
 import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+import_java_object 'org.apache.spark.streaming.api.java.JavaStreamingContext', as: 'JavaSparkStreamingContext'
+import_java_object 'org.apache.spark.streaming.Duration', as: 'SparkDuration'
 import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
 import_java_object 'java.util.HashMap', as: 'JavaHashMap'
 import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
 import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+import_java_object 'org.apache.spark.sql.catalyst.expressions.GenericRow', as: 'SqlGenericRow'
 import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
 import_java_object 'org.apache.log4j.Level', as: 'JLevel'
 import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
 import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
 import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+
+import_java_object 'org.apache.spark.api.cassandra_model.ColumnDeserializer', as: 'SparkColumnDeserializer'
+import_java_object 'org.apache.spark.api.cassandra_model.RowConversions', as: 'SqlRowConversions'
 import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
-import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
 import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
 import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
 import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
 import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+import_java_object 'org.apache.spark.api.cassandra_model.LuaRDD'
+
 import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
 import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
 
+if CassandraModel.const_defined?('TESTING_SCALA')
+  import_java_object 'com.datastax.spark.connector.CassandraRow', as: 'SparkCassandraRow'
+  import_java_object 'org.apache.spark.api.cassandra_model.LuaRowValue'
+  import_java_object 'org.apache.spark.api.cassandra_model.LuaRowLib'
+end
+
 %w(BinaryType BooleanType ByteType DataType
    DateType Decimal DecimalType DoubleType FloatType IntegerType
    LongType Metadata NullType PrecisionInfo ShortType
+   ArrayType MapType
    StringType StructField StructType TimestampType).each do |sql_type|
-
+  type = import_quiet { CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") }
+  CassandraModel::Spark::Lib.const_set(:"Sql#{sql_type}", type)
 end
 
 #noinspection RubyConstantNamingConvention
-SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+SqlStringArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlStringType)
 
 #noinspection RubyConstantNamingConvention
-SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+SqlIntegerArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlIntegerType)
 
 #noinspection RubyConstantNamingConvention
-SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
+SqlStringStringMapType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getMapType(CassandraModel::Spark::Lib::SqlStringType, CassandraModel::Spark::Lib::SqlStringType)
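With the SchemaBuilder import gone and the Sql* constants moved under Lib, the %w loop now defines those constants itself. A self-contained sketch of the same metaprogramming (FakeTypeHelper is an illustrative stand-in for the real SparkSqlDataTypeHelper, which returns Spark SQL DataType objects):

    module CassandraModel
      module Spark
        module Lib
        end
      end
    end

    # Stand-in responding to getStringType, getIntegerType, ...
    class FakeTypeHelper
      def self.method_missing(name, *args)
        name.to_s.sub(/\Aget/, '').to_sym
      end

      def self.respond_to_missing?(_name, _include_private = false)
        true
      end
    end

    %w(StringType IntegerType).each do |sql_type|
      type = FakeTypeHelper.public_send(:"get#{sql_type}")
      CassandraModel::Spark::Lib.const_set(:"Sql#{sql_type}", type)
    end

    CassandraModel::Spark::Lib::SqlStringType # => :StringType here; a DataType in the gem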
data/lib/cassandra_model_spark/query_builder.rb CHANGED

@@ -16,7 +16,7 @@ module CassandraModel
 
     def data_frame_from_frame(options)
       query_frame = @record_klass.query(@params, @options)
-      Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+      Spark::DataFrame.new(options.delete(:class) || @record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
     end
 
     def data_frame_from_model(options)
@@ -30,8 +30,8 @@ module CassandraModel
         end
         memo.merge!(updated_key => value)
       end.stringify_keys.to_java
-      rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
-      Spark::DataFrame.new(@record_klass, rdd, options)
+      rdd = Spark::Lib::SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+      Spark::DataFrame.new(options.delete(:class) || @record_klass, rdd, options)
     end
   end
 end
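Both construction paths now consume an optional :class entry from the options hash, so a caller can substitute the record class that backs the resulting DataFrame. A hedged usage sketch (MyRecord and MyView are hypothetical models, and the public entry point that forwards these options to data_frame_from_model is assumed rather than shown in this hunk):

    # Hypothetical call chain; the :class override replaces @record_klass
    # as the first argument to Spark::DataFrame.new.
    data_frame = MyRecord.where(partition: 'some-key').as_data_frame(class: MyView)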
data/lib/cassandra_model_spark/raw_connection.rb CHANGED

@@ -2,7 +2,7 @@ module CassandraModel
   class RawConnection
     def java_spark_context
       @spark_context ||= begin
-        JavaSparkContext.new(spark_conf).tap do |java_spark_context|
+        Spark::Lib::JavaSparkContext.new(spark_conf).tap do |java_spark_context|
           java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
         end
       end
@@ -12,10 +12,19 @@ module CassandraModel
       java_spark_context.sc
     end
 
+    def has_spark_context?
+      !!@spark_context
+    end
+
+    #noinspection RubyInstanceMethodNamingConvention
+    def create_java_spark_streaming_context
+      Spark::Lib::JavaSparkStreamingContext.new(java_spark_context, Spark::Lib::SparkDuration.new(2000))
+    end
+
     private
 
     def spark_conf
-      @spark_conf ||= SparkConf.new(true).tap do |conf|
+      @spark_conf ||= Spark::Lib::SparkConf.new(true).tap do |conf|
         conf.set('spark.app.name', 'cassandra_model_spark')
         conf.set('spark.master', 'local[*]')
         conf.set('spark.cassandra.connection.host', config[:hosts].first)
@@ -44,4 +53,4 @@ module CassandraModel
       config.slice(:spark)
     end
   end
-end
+end
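The additions make the lazily built context observable and add a streaming entry point with a fixed 2000 ms batch duration. A hedged sketch (requires JRuby with the Spark jars; how the RawConnection instance is obtained is elided, since its constructor is not part of this diff):

    connection.has_spark_context?  # => false until something touches Spark
    connection.spark_context       # forces the JavaSparkContext into existence
    connection.has_spark_context?  # => true
    ssc = connection.create_java_spark_streaming_context
    # a JavaStreamingContext with a 2000 ms batch Duration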
data/lib/cassandra_model_spark/record.rb CHANGED

@@ -1,7 +1,7 @@
 module CassandraModel
   class Record
     def self.rdd
-      @spark_rdd ||= SparkCassandraHelper.cassandra_table(
+      @spark_rdd ||= Spark::Lib::SparkCassandraHelper.cassandra_table(
         table.connection.spark_context,
         table.connection.config[:keyspace],
         table_name)
@@ -14,5 +14,9 @@ module CassandraModel
     def self.count
       rdd.count
     end
+
+    def self.sql_schema
+      Spark::SqlSchema.new(cassandra_columns)
+    end
   end
 end
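Record.sql_schema wires the model's Cassandra column map into the new SqlSchema class. A sketch of the intended mapping (MyRecord and its column layout are hypothetical):

    # Suppose MyRecord.cassandra_columns returns
    #   { id: :text, count: :int, tags: [:list, :text] }
    MyRecord.sql_schema  # => a CassandraModel::Spark::SqlSchema whose #schema
                         #    is a StructType of StringType, IntegerType and
                         #    ArrayType(StringType)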
data/lib/cassandra_model_spark/schema.rb ADDED

@@ -0,0 +1,47 @@
+module CassandraModel
+  module Spark
+    class Schema
+      attr_reader :schema
+
+      def initialize(sql_schema)
+        @schema = sql_schema.fields.inject({}) do |memo, field|
+          column = field.name
+          type = field.dataType
+          memo.merge!(column.to_sym => sql_type(type))
+        end
+      end
+
+      def ==(rhs)
+        rhs.is_a?(Schema) && rhs.schema == schema
+      end
+
+      private
+
+      def sql_type(type)
+        case sql_type_name(type)
+          when 'ArrayType'
+            [:list, sql_type(type.elementType)]
+          when 'MapType'
+            [:map, sql_type(type.keyType), sql_type(type.valueType) ]
+          when 'IntegerType'
+            :int
+          when 'BooleanType'
+            :boolean
+          when 'DoubleType'
+            :double
+          when 'BinaryType'
+            :blob
+          when 'TimestampType'
+            :timestamp
+          else
+            :text
+        end
+      end
+
+      def sql_type_name(data_type)
+        data_type.getClass.getSimpleName
+      end
+
+    end
+  end
+end
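Schema reduces a Spark SQL StructType to the flat symbol map used on the Cassandra side, recursing into array and map element types. A runnable sketch with duck-typed stand-ins, assuming the Schema class above is loaded (FakeType, FakeField and FakeStruct are illustrative; Schema only calls fields, name, dataType and getClass.getSimpleName):

    FakeType = Struct.new(:simple_name) do
      # Schema asks each data type for getClass.getSimpleName
      def getClass
        self
      end

      def getSimpleName
        simple_name
      end
    end
    FakeField  = Struct.new(:name, :dataType)
    FakeStruct = Struct.new(:fields)

    sql_schema = FakeStruct.new([
      FakeField.new('id', FakeType.new('StringType')),
      FakeField.new('count', FakeType.new('IntegerType')),
      FakeField.new('created_at', FakeType.new('TimestampType')),
    ])

    CassandraModel::Spark::Schema.new(sql_schema).schema
    # => { id: :text, count: :int, created_at: :timestamp }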
data/lib/cassandra_model_spark/sql_schema.rb ADDED

@@ -0,0 +1,49 @@
+module CassandraModel
+  module Spark
+    class SqlSchema
+      attr_reader :schema
+
+      def initialize(cassandra_schema)
+        fields = cassandra_schema.map do |column, type|
+          Lib::SqlStructField.apply(column.to_s, sql_type(type), true, Lib::SqlMetadata.empty)
+        end
+        if RUBY_ENGINE == 'jruby'
+          fields = fields.to_java('org.apache.spark.sql.types.StructField')
+        end
+        @schema = Lib::SqlStructType.apply(fields)
+      end
+
+      def ==(rhs)
+        rhs.is_a?(SqlSchema) && rhs.schema == schema
+      end
+
+      private
+
+      def sql_type(type)
+        case type
+          when Array
+            base_type, first_type, second_type = type
+            case base_type
+              when :map
+                Lib::SqlMapType.apply(sql_type(first_type), sql_type(second_type), true)
+              else
+                Lib::SqlArrayType.apply(sql_type(first_type))
+            end
+          when :int
+            Lib::SqlIntegerType
+          when :boolean
+            Lib::SqlBooleanType
+          when :double
+            Lib::SqlDoubleType
+          when :blob
+            Lib::SqlBinaryType
+          when :timestamp
+            Lib::SqlTimestampType
+          else
+            Lib::SqlStringType
+        end
+      end
+
+    end
+  end
+end
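SqlSchema is the inverse mapping, turning the gem's column symbols into a Spark SQL StructType, so the two classes should round-trip. A hedged sketch (JRuby with the Spark jars on the classpath, since the Lib::Sql* constants wrap real Spark SQL types):

    sql_schema = CassandraModel::Spark::SqlSchema.new(
      id: :text,
      count: :int,
      tags: [:list, :text],
      properties: [:map, :text, :text]
    )
    sql_schema.schema  # => StructType of StringType, IntegerType,
                       #    ArrayType(StringType), MapType(StringType, StringType)

    # Round-trip expectation implied by the two #== implementations:
    CassandraModel::Spark::Schema.new(sql_schema.schema).schema
    # => { id: :text, count: :int, tags: [:list, :text],
    #      properties: [:map, :text, :text] }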
metadata CHANGED

@@ -1,43 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: cassandra_model_spark
 version: !ruby/object:Gem::Version
-  version: 0.0.1.5
+  version: 0.0.4
 platform: java
 authors:
 - Thomas RM Rogers
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: cassandra_model
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: 0.
+        version: 0.10.0
+    - - <=
+      - !ruby/object:Gem::Version
+        version: '1.1'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    - - <=
       - !ruby/object:Gem::Version
-        version:
+        version: '1.1'
 - !ruby/object:Gem::Dependency
   name: thomas_utils
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: 0.1.16
+    - - <
+      - !ruby/object:Gem::Version
+        version: 0.3.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: 0.1.16
+    - - <
+      - !ruby/object:Gem::Version
+        version: 0.3.0
 description: |-
   Spark integration for cassandra_model.
   Get high-performance data analytics with the ease of cassandra_model.
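Both runtime dependencies move from open-ended requirements to bounded ranges; expressed as Gemfile lines, the new constraints read:

    gem 'cassandra_model', '>= 0.10.0', '<= 1.1'
    gem 'thomas_utils',    '>= 0.1.16', '< 0.3.0'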
@@ -66,13 +78,16 @@ files:
 - ext/scala_helper/bin/spark-class
 - ext/scala_helper/build.sbt
 - ext/scala_helper/cassandra_helper.scala
+- ext/scala_helper/column_deserializer.scala
 - ext/scala_helper/data_type_helper.scala
+- ext/scala_helper/lua_rdd.scala
+- ext/scala_helper/lua_row_lib.scala
 - ext/scala_helper/marshal_loader.scala
 - ext/scala_helper/marshal_row_mapping.scala
 - ext/scala_helper/project/plugins.sbt
+- ext/scala_helper/row_conversions.scala
 - ext/scala_helper/sbin/spark-config.sh
 - ext/scala_helper/sbin/spark-daemon.sh
-- ext/scala_helper/schema_builder.scala
 - ext/scala_helper/worker.scala
 - lib/cassandra_model_spark.rb
 - lib/cassandra_model_spark/build.rb
@@ -85,7 +100,9 @@ files:
 - lib/cassandra_model_spark/query_builder.rb
 - lib/cassandra_model_spark/raw_connection.rb
 - lib/cassandra_model_spark/record.rb
+- lib/cassandra_model_spark/schema.rb
 - lib/cassandra_model_spark/spark.rb
+- lib/cassandra_model_spark/sql_schema.rb
 homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
 licenses:
 - Apache License 2.0
data/ext/scala_helper/schema_builder.scala DELETED

@@ -1,35 +0,0 @@
-package org.apache.spark.api.cassandra_model
-
-import org.apache.spark.rdd._
-import com.datastax.spark.connector._
-import com.datastax.spark.connector.rdd._
-import org.apache.spark.sql._
-import org.apache.spark.sql.types._
-
-class SchemaBuilder {
-  var fields = Array[StructField]()
-
-  def build = StructType(fields)
-
-  def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
-    rdd.map(
-      p => Row.fromSeq(
-        p.columnValues.map{
-          p => p match {
-            case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
-            case _ => p
-          }
-        }
-      )
-    )
-  }
-
-  def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
-    val new_rdd = cassandraRDDToRDD(rdd)
-    sqlc.createDataFrame(new_rdd, build)
-  }
-
-  def addColumn(name: String, sql_type: DataType) = {
-    fields :+= StructField(name, sql_type, true)
-  }
-}