cassandra_model_spark 0.0.1.5 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/scala_helper/build.sbt +5 -2
- data/ext/scala_helper/cassandra_helper.scala +3 -2
- data/ext/scala_helper/column_deserializer.scala +28 -0
- data/ext/scala_helper/data_type_helper.scala +2 -0
- data/ext/scala_helper/lua_rdd.scala +352 -0
- data/ext/scala_helper/lua_row_lib.scala +108 -0
- data/ext/scala_helper/marshal_loader.scala +6 -6
- data/ext/scala_helper/marshal_row_mapping.scala +11 -3
- data/ext/scala_helper/row_conversions.scala +20 -0
- data/lib/cassandra_model_spark.rb +2 -0
- data/lib/cassandra_model_spark/connection_cache.rb +2 -2
- data/lib/cassandra_model_spark/data_frame.rb +121 -35
- data/lib/cassandra_model_spark/java_bridge.rb +40 -4
- data/lib/cassandra_model_spark/java_classes.rb +20 -5
- data/lib/cassandra_model_spark/query_builder.rb +3 -3
- data/lib/cassandra_model_spark/raw_connection.rb +12 -3
- data/lib/cassandra_model_spark/record.rb +5 -1
- data/lib/cassandra_model_spark/schema.rb +47 -0
- data/lib/cassandra_model_spark/sql_schema.rb +49 -0
- metadata +26 -9
- data/ext/scala_helper/schema_builder.scala +0 -35
@@ -1,20 +1,38 @@
|
|
1
|
+
module CassandraModel
|
2
|
+
module Spark
|
3
|
+
module Lib
|
4
|
+
|
5
|
+
end
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
1
9
|
if RUBY_ENGINE == 'jruby'
|
2
10
|
class Hash
|
3
11
|
def to_java
|
4
|
-
JavaHashMap.new(self)
|
12
|
+
CassandraModel::Spark::Lib::JavaHashMap.new(self)
|
5
13
|
end
|
6
14
|
end
|
7
15
|
|
8
16
|
class Array
|
17
|
+
def self.from_java_array_list(array_list)
|
18
|
+
array_list.to_a
|
19
|
+
end
|
20
|
+
|
9
21
|
def to_java_argv
|
10
22
|
to_java(:string)
|
11
23
|
end
|
12
24
|
end
|
13
25
|
|
26
|
+
class String
|
27
|
+
def self.from_java_string(string)
|
28
|
+
string
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
14
32
|
else
|
15
33
|
class Hash
|
16
34
|
def to_java
|
17
|
-
JavaHashMap.new.tap do |map|
|
35
|
+
CassandraModel::Spark::Lib::JavaHashMap.new.tap do |map|
|
18
36
|
each do |key, value|
|
19
37
|
map.put(key, value)
|
20
38
|
end
|
@@ -23,6 +41,10 @@ else
|
|
23
41
|
end
|
24
42
|
|
25
43
|
class Array
|
44
|
+
def self.from_java_array_list(array_list)
|
45
|
+
array_list.toArray
|
46
|
+
end
|
47
|
+
|
26
48
|
def to_java
|
27
49
|
self
|
28
50
|
end
|
@@ -31,6 +53,12 @@ else
|
|
31
53
|
self
|
32
54
|
end
|
33
55
|
end
|
56
|
+
|
57
|
+
class String
|
58
|
+
def self.from_java_string(string)
|
59
|
+
string.toString
|
60
|
+
end
|
61
|
+
end
|
34
62
|
end
|
35
63
|
|
36
64
|
module JavaBridge
|
@@ -38,7 +66,7 @@ module JavaBridge
|
|
38
66
|
def import_java_object(path, options = {})
|
39
67
|
name = options.fetch(:as) { path.split('.').last }.to_sym
|
40
68
|
klass = "Java::#{path}"
|
41
|
-
|
69
|
+
set_import_const(name, eval(klass))
|
42
70
|
end
|
43
71
|
|
44
72
|
def initialize_java_engine
|
@@ -47,7 +75,7 @@ module JavaBridge
|
|
47
75
|
else
|
48
76
|
def import_java_object(path, options = {})
|
49
77
|
name = options.fetch(:as) { path.split('.').last }.to_sym
|
50
|
-
|
78
|
+
set_import_const(name, load_java_class(path))
|
51
79
|
end
|
52
80
|
|
53
81
|
def require(path)
|
@@ -79,6 +107,14 @@ module JavaBridge
|
|
79
107
|
end
|
80
108
|
end
|
81
109
|
|
110
|
+
private
|
111
|
+
|
112
|
+
def set_import_const(name, value)
|
113
|
+
CassandraModel::Spark::Lib.const_set(name, value)
|
114
|
+
end
|
115
|
+
|
116
|
+
public
|
117
|
+
|
82
118
|
def import_quiet
|
83
119
|
prev_verbox = $VERBOSE
|
84
120
|
$VERBOSE = nil
|
@@ -1,36 +1,51 @@
|
|
1
1
|
import_java_object 'java.util.ArrayList'
|
2
2
|
import_java_object 'org.apache.spark.SparkConf'
|
3
3
|
import_java_object 'org.apache.spark.api.java.JavaSparkContext'
|
4
|
+
import_java_object 'org.apache.spark.streaming.api.java.JavaStreamingContext', as: 'JavaSparkStreamingContext'
|
5
|
+
import_java_object 'org.apache.spark.streaming.Duration', as: 'SparkDuration'
|
4
6
|
import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
|
5
7
|
import_java_object 'java.util.HashMap', as: 'JavaHashMap'
|
6
8
|
import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
|
7
9
|
import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
|
10
|
+
import_java_object 'org.apache.spark.sql.catalyst.expressions.GenericRow', as: 'SqlGenericRow'
|
8
11
|
import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
|
9
12
|
import_java_object 'org.apache.log4j.Level', as: 'JLevel'
|
10
13
|
import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
|
11
14
|
import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
|
12
15
|
import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
|
16
|
+
|
17
|
+
import_java_object 'org.apache.spark.api.cassandra_model.ColumnDeserializer', as: 'SparkColumnDeserializer'
|
18
|
+
import_java_object 'org.apache.spark.api.cassandra_model.RowConversions', as: 'SqlRowConversions'
|
13
19
|
import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
|
14
|
-
import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
|
15
20
|
import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
|
16
21
|
import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
|
17
22
|
import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
|
18
23
|
import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
|
24
|
+
import_java_object 'org.apache.spark.api.cassandra_model.LuaRDD'
|
25
|
+
|
19
26
|
import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
|
20
27
|
import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
|
21
28
|
|
29
|
+
if CassandraModel.const_defined?('TESTING_SCALA')
|
30
|
+
import_java_object 'com.datastax.spark.connector.CassandraRow', as: 'SparkCassandraRow'
|
31
|
+
import_java_object 'org.apache.spark.api.cassandra_model.LuaRowValue'
|
32
|
+
import_java_object 'org.apache.spark.api.cassandra_model.LuaRowLib'
|
33
|
+
end
|
34
|
+
|
22
35
|
%w(BinaryType BooleanType ByteType DataType
|
23
36
|
DateType Decimal DecimalType DoubleType FloatType IntegerType
|
24
37
|
LongType Metadata NullType PrecisionInfo ShortType
|
38
|
+
ArrayType MapType
|
25
39
|
StringType StructField StructType TimestampType).each do |sql_type|
|
26
|
-
|
40
|
+
type = import_quiet { CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") }
|
41
|
+
CassandraModel::Spark::Lib.const_set(:"Sql#{sql_type}", type)
|
27
42
|
end
|
28
43
|
|
29
44
|
#noinspection RubyConstantNamingConvention
|
30
|
-
SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
|
45
|
+
SqlStringArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlStringType)
|
31
46
|
|
32
47
|
#noinspection RubyConstantNamingConvention
|
33
|
-
SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
|
48
|
+
SqlIntegerArrayType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getArrayType(CassandraModel::Spark::Lib::SqlIntegerType)
|
34
49
|
|
35
50
|
#noinspection RubyConstantNamingConvention
|
36
|
-
SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
|
51
|
+
SqlStringStringMapType = CassandraModel::Spark::Lib::SparkSqlDataTypeHelper.getMapType(CassandraModel::Spark::Lib::SqlStringType, CassandraModel::Spark::Lib::SqlStringType)
|
@@ -16,7 +16,7 @@ module CassandraModel
|
|
16
16
|
|
17
17
|
def data_frame_from_frame(options)
|
18
18
|
query_frame = @record_klass.query(@params, @options)
|
19
|
-
Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
|
19
|
+
Spark::DataFrame.new(options.delete(:class) || @record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
|
20
20
|
end
|
21
21
|
|
22
22
|
def data_frame_from_model(options)
|
@@ -30,8 +30,8 @@ module CassandraModel
|
|
30
30
|
end
|
31
31
|
memo.merge!(updated_key => value)
|
32
32
|
end.stringify_keys.to_java
|
33
|
-
rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
|
34
|
-
Spark::DataFrame.new(@record_klass, rdd, options)
|
33
|
+
rdd = Spark::Lib::SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
|
34
|
+
Spark::DataFrame.new(options.delete(:class) || @record_klass, rdd, options)
|
35
35
|
end
|
36
36
|
end
|
37
37
|
end
|
@@ -2,7 +2,7 @@ module CassandraModel
|
|
2
2
|
class RawConnection
|
3
3
|
def java_spark_context
|
4
4
|
@spark_context ||= begin
|
5
|
-
JavaSparkContext.new(spark_conf).tap do |java_spark_context|
|
5
|
+
Spark::Lib::JavaSparkContext.new(spark_conf).tap do |java_spark_context|
|
6
6
|
java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
|
7
7
|
end
|
8
8
|
end
|
@@ -12,10 +12,19 @@ module CassandraModel
|
|
12
12
|
java_spark_context.sc
|
13
13
|
end
|
14
14
|
|
15
|
+
def has_spark_context?
|
16
|
+
!!@spark_context
|
17
|
+
end
|
18
|
+
|
19
|
+
#noinspection RubyInstanceMethodNamingConvention
|
20
|
+
def create_java_spark_streaming_context
|
21
|
+
Spark::Lib::JavaSparkStreamingContext.new(java_spark_context, Spark::Lib::SparkDuration.new(2000))
|
22
|
+
end
|
23
|
+
|
15
24
|
private
|
16
25
|
|
17
26
|
def spark_conf
|
18
|
-
@spark_conf ||= SparkConf.new(true).tap do |conf|
|
27
|
+
@spark_conf ||= Spark::Lib::SparkConf.new(true).tap do |conf|
|
19
28
|
conf.set('spark.app.name', 'cassandra_model_spark')
|
20
29
|
conf.set('spark.master', 'local[*]')
|
21
30
|
conf.set('spark.cassandra.connection.host', config[:hosts].first)
|
@@ -44,4 +53,4 @@ module CassandraModel
|
|
44
53
|
config.slice(:spark)
|
45
54
|
end
|
46
55
|
end
|
47
|
-
end
|
56
|
+
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module CassandraModel
|
2
2
|
class Record
|
3
3
|
def self.rdd
|
4
|
-
@spark_rdd ||= SparkCassandraHelper.cassandra_table(
|
4
|
+
@spark_rdd ||= Spark::Lib::SparkCassandraHelper.cassandra_table(
|
5
5
|
table.connection.spark_context,
|
6
6
|
table.connection.config[:keyspace],
|
7
7
|
table_name)
|
@@ -14,5 +14,9 @@ module CassandraModel
|
|
14
14
|
def self.count
|
15
15
|
rdd.count
|
16
16
|
end
|
17
|
+
|
18
|
+
def self.sql_schema
|
19
|
+
Spark::SqlSchema.new(cassandra_columns)
|
20
|
+
end
|
17
21
|
end
|
18
22
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module CassandraModel
  module Spark
    # Converts a Spark SQL struct schema into a plain Ruby Hash that maps each
    # column name (as a Symbol) to a Cassandra-style type descriptor.
    #
    # Descriptors are either a Symbol (:int, :boolean, :double, :blob,
    # :timestamp, :text) or a nested Array for collections:
    # [:list, element] and [:map, key, value]. Any type name that is not
    # explicitly recognised falls back to :text.
    class Schema
      # @return [Hash{Symbol => Symbol, Array}] column name => type descriptor
      attr_reader :schema

      # @param sql_schema an object responding to #fields, where each field
      #   responds to #name and #dataType
      def initialize(sql_schema)
        @schema = sql_schema.fields.inject({}) do |memo, field|
          column = field.name
          type = field.dataType
          memo.merge!(column.to_sym => sql_type(type))
        end
      end

      # Two schemas are equal when both are Schema instances with equal
      # column => type mappings.
      def ==(rhs)
        rhs.is_a?(Schema) && rhs.schema == schema
      end

      # Keep eql?/hash consistent with ==, so that equal schemas collapse to a
      # single Hash key and are de-duplicated by Array#uniq.
      alias_method :eql?, :==

      def hash
        schema.hash
      end

      private

      # Recursively maps a SQL data type object onto a type descriptor.
      def sql_type(type)
        case sql_type_name(type)
          when 'ArrayType'
            [:list, sql_type(type.elementType)]
          when 'MapType'
            [:map, sql_type(type.keyType), sql_type(type.valueType)]
          when 'IntegerType'
            :int
          when 'BooleanType'
            :boolean
          when 'DoubleType'
            :double
          when 'BinaryType'
            :blob
          when 'TimestampType'
            :timestamp
          else
            :text
        end
      end

      # Simple class name of the data type, normalised for matching.
      #
      # Scala singleton objects compile to a JVM class whose name ends in "$"
      # (for example "IntegerType$"); the suffix is stripped so those
      # instances match the same branch as a plain "IntegerType".
      def sql_type_name(data_type)
        data_type.getClass.getSimpleName.to_s.sub(/\$\z/, '')
      end

    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module CassandraModel
  module Spark
    # Builds a SQL struct type (via Lib::SqlStructType) from a mapping of
    # column name => Cassandra-style type descriptor.
    #
    # A descriptor is either a Symbol (:int, :boolean, :double, :blob,
    # :timestamp; anything else is treated as a string column) or an Array
    # whose first element names the collection kind: [:map, key, value] builds
    # a map type, any other Array builds an array type of its second element.
    class SqlSchema
      # The struct type produced by Lib::SqlStructType.apply.
      attr_reader :schema

      # @param cassandra_schema an enumerable yielding (column, type) pairs
      def initialize(cassandra_schema)
        struct_fields = cassandra_schema.map { |column, type| struct_field(column, type) }
        # Under JRuby the field list is converted to a typed Java array first.
        struct_fields = struct_fields.to_java('org.apache.spark.sql.types.StructField') if RUBY_ENGINE == 'jruby'
        @schema = Lib::SqlStructType.apply(struct_fields)
      end

      # Equal when the other object is a SqlSchema wrapping an equal struct type.
      def ==(rhs)
        rhs.is_a?(SqlSchema) && rhs.schema == schema
      end

      private

      # One struct field for the column; the third argument is always true and
      # the metadata is always Lib::SqlMetadata.empty.
      def struct_field(column, type)
        Lib::SqlStructField.apply(column.to_s, sql_type(type), true, Lib::SqlMetadata.empty)
      end

      # Resolves a type descriptor (Symbol or nested Array) to a SQL data type.
      def sql_type(type)
        return collection_type(type) if Array === type

        primitive_type(type)
      end

      # [:map, key, value] becomes a map type; every other Array is treated as
      # an array type of its second element.
      def collection_type(descriptor)
        kind, inner, value_inner = descriptor
        return Lib::SqlArrayType.apply(sql_type(inner)) unless :map === kind

        Lib::SqlMapType.apply(sql_type(inner), sql_type(value_inner), true)
      end

      # Scalar descriptors; unrecognised values fall back to the string type.
      def primitive_type(name)
        case name
        when :int then Lib::SqlIntegerType
        when :boolean then Lib::SqlBooleanType
        when :double then Lib::SqlDoubleType
        when :blob then Lib::SqlBinaryType
        when :timestamp then Lib::SqlTimestampType
        else Lib::SqlStringType
        end
      end

    end
  end
end
|
metadata
CHANGED
@@ -1,43 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassandra_model_spark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1.5
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Thomas RM Rogers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cassandra_model
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.10.0
|
20
|
+
- - <=
|
18
21
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
22
|
+
version: '1.1'
|
20
23
|
type: :runtime
|
21
24
|
prerelease: false
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
23
26
|
requirements:
|
24
|
-
- -
|
27
|
+
- - '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.10.0
|
30
|
+
- - <=
|
25
31
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
32
|
+
version: '1.1'
|
27
33
|
- !ruby/object:Gem::Dependency
|
28
34
|
name: thomas_utils
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|
30
36
|
requirements:
|
31
|
-
- -
|
37
|
+
- - '>='
|
32
38
|
- !ruby/object:Gem::Version
|
33
39
|
version: 0.1.16
|
40
|
+
- - <
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.3.0
|
34
43
|
type: :runtime
|
35
44
|
prerelease: false
|
36
45
|
version_requirements: !ruby/object:Gem::Requirement
|
37
46
|
requirements:
|
38
|
-
- -
|
47
|
+
- - '>='
|
39
48
|
- !ruby/object:Gem::Version
|
40
49
|
version: 0.1.16
|
50
|
+
- - <
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.3.0
|
41
53
|
- !ruby/object:Gem::Dependency
|
42
54
|
name: rjb
|
43
55
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,13 +92,16 @@ files:
|
|
80
92
|
- ext/scala_helper/bin/spark-class
|
81
93
|
- ext/scala_helper/build.sbt
|
82
94
|
- ext/scala_helper/cassandra_helper.scala
|
95
|
+
- ext/scala_helper/column_deserializer.scala
|
83
96
|
- ext/scala_helper/data_type_helper.scala
|
97
|
+
- ext/scala_helper/lua_rdd.scala
|
98
|
+
- ext/scala_helper/lua_row_lib.scala
|
84
99
|
- ext/scala_helper/marshal_loader.scala
|
85
100
|
- ext/scala_helper/marshal_row_mapping.scala
|
86
101
|
- ext/scala_helper/project/plugins.sbt
|
102
|
+
- ext/scala_helper/row_conversions.scala
|
87
103
|
- ext/scala_helper/sbin/spark-config.sh
|
88
104
|
- ext/scala_helper/sbin/spark-daemon.sh
|
89
|
-
- ext/scala_helper/schema_builder.scala
|
90
105
|
- ext/scala_helper/worker.scala
|
91
106
|
- lib/cassandra_model_spark.rb
|
92
107
|
- lib/cassandra_model_spark/build.rb
|
@@ -99,7 +114,9 @@ files:
|
|
99
114
|
- lib/cassandra_model_spark/query_builder.rb
|
100
115
|
- lib/cassandra_model_spark/raw_connection.rb
|
101
116
|
- lib/cassandra_model_spark/record.rb
|
117
|
+
- lib/cassandra_model_spark/schema.rb
|
102
118
|
- lib/cassandra_model_spark/spark.rb
|
119
|
+
- lib/cassandra_model_spark/sql_schema.rb
|
103
120
|
homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
|
104
121
|
licenses:
|
105
122
|
- Apache License 2.0
|
@@ -1,35 +0,0 @@
|
|
1
|
-
package org.apache.spark.api.cassandra_model
|
2
|
-
|
3
|
-
import org.apache.spark.rdd._
|
4
|
-
import com.datastax.spark.connector._
|
5
|
-
import com.datastax.spark.connector.rdd._
|
6
|
-
import org.apache.spark.sql._
|
7
|
-
import org.apache.spark.sql.types._
|
8
|
-
|
9
|
-
class SchemaBuilder {
|
10
|
-
var fields = Array[StructField]()
|
11
|
-
|
12
|
-
def build = StructType(fields)
|
13
|
-
|
14
|
-
def cassandraRDDToRDD(rdd: RDD[CassandraRow]) = {
|
15
|
-
rdd.map(
|
16
|
-
p => Row.fromSeq(
|
17
|
-
p.columnValues.map{
|
18
|
-
p => p match {
|
19
|
-
case (d: java.util.Date) => new java.sql.Timestamp(d.getTime())
|
20
|
-
case _ => p
|
21
|
-
}
|
22
|
-
}
|
23
|
-
)
|
24
|
-
)
|
25
|
-
}
|
26
|
-
|
27
|
-
def createDataFrame(sqlc: SQLContext, rdd: RDD[CassandraRow]) = {
|
28
|
-
val new_rdd = cassandraRDDToRDD(rdd)
|
29
|
-
sqlc.createDataFrame(new_rdd, build)
|
30
|
-
}
|
31
|
-
|
32
|
-
def addColumn(name: String, sql_type: DataType) = {
|
33
|
-
fields :+= StructField(name, sql_type, true)
|
34
|
-
}
|
35
|
-
}
|