cassandra_model_spark 0.0.1.5-java
- checksums.yaml +7 -0
- data/README.md +17 -0
- data/bin/cmodel-spark-build +7 -0
- data/bin/cmodel-spark-env.rb +11 -0
- data/bin/cmodel-spark-master +22 -0
- data/bin/cmodel-spark-run-master +4 -0
- data/bin/cmodel-spark-run-master.sh +8 -0
- data/bin/cmodel-spark-run-slave +4 -0
- data/bin/cmodel-spark-run-slave.sh +8 -0
- data/bin/cmodel-spark-slaves +22 -0
- data/ext/scala_helper/bin/load-spark-env.sh +63 -0
- data/ext/scala_helper/bin/spark-class +87 -0
- data/ext/scala_helper/build.sbt +62 -0
- data/ext/scala_helper/cassandra_helper.scala +23 -0
- data/ext/scala_helper/data_type_helper.scala +27 -0
- data/ext/scala_helper/marshal_loader.scala +204 -0
- data/ext/scala_helper/marshal_row_mapping.scala +85 -0
- data/ext/scala_helper/project/plugins.sbt +6 -0
- data/ext/scala_helper/sbin/spark-config.sh +30 -0
- data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
- data/ext/scala_helper/schema_builder.scala +35 -0
- data/ext/scala_helper/worker.scala +13 -0
- data/lib/cassandra_model_spark.rb +42 -0
- data/lib/cassandra_model_spark/build.rb +24 -0
- data/lib/cassandra_model_spark/column_cast.rb +44 -0
- data/lib/cassandra_model_spark/connection_cache.rb +9 -0
- data/lib/cassandra_model_spark/data_frame.rb +374 -0
- data/lib/cassandra_model_spark/java_bridge.rb +91 -0
- data/lib/cassandra_model_spark/java_classes.rb +36 -0
- data/lib/cassandra_model_spark/launcher.rb +150 -0
- data/lib/cassandra_model_spark/query_builder.rb +37 -0
- data/lib/cassandra_model_spark/raw_connection.rb +47 -0
- data/lib/cassandra_model_spark/record.rb +18 -0
- data/lib/cassandra_model_spark/spark.rb +33 -0
- metadata +113 -0
data/lib/cassandra_model_spark/java_classes.rb
ADDED
@@ -0,0 +1,36 @@
+import_java_object 'java.util.ArrayList'
+import_java_object 'org.apache.spark.SparkConf'
+import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
+import_java_object 'java.util.HashMap', as: 'JavaHashMap'
+import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
+import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
+import_java_object 'org.apache.log4j.Level', as: 'JLevel'
+import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
+import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
+import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
+import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
+import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
+import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
+import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
+import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
+import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
+
+%w(BinaryType BooleanType ByteType DataType
+   DateType Decimal DecimalType DoubleType FloatType IntegerType
+   LongType Metadata NullType PrecisionInfo ShortType
+   StringType StructField StructType TimestampType).each do |sql_type|
+  Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+end
+
+#noinspection RubyConstantNamingConvention
+SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+
+#noinspection RubyConstantNamingConvention
+SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+
+#noinspection RubyConstantNamingConvention
+SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
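The same helper can mint further composite types beyond the three defined here. A minimal sketch, following the pattern above (SqlDoubleArrayType is hypothetical, not part of this diff):

#noinspection RubyConstantNamingConvention
# Hypothetical constant, built the same way as SqlStringArrayType above
SqlDoubleArrayType = SparkSqlDataTypeHelper.getArrayType(SqlDoubleType)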
data/lib/cassandra_model_spark/launcher.rb
ADDED
@@ -0,0 +1,150 @@
+require 'socket'
+
+module CassandraModel
+  module Spark
+    class Launcher
+      def start_master
+        system(env, "#{spark_daemon} start #{start_master_args}")
+        add_master_jars
+      end
+
+      def run_master
+        validate_env!
+
+        result = SparkMaster.startRpcEnvAndEndpoint(master_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)._1
+        wait_for_shutdown do
+          result.shutdown
+          result.awaitTermination
+        end
+      end
+
+      def start_slaves
+        workers.map do |worker|
+          system(env, "#{spark_daemon} start #{start_slave_args(worker)}")
+        end
+      end
+
+      def run_slave
+        validate_env!
+
+        result = SparkWorkerStarter.startWorker(master_url, slave_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)
+        wait_for_shutdown do
+          result.shutdown
+          result.awaitTermination
+        end
+      end
+
+      def stop_master
+        system(env, "#{spark_daemon} stop #{master_args}")
+      end
+
+      def stop_slaves
+        workers.map do |worker|
+          system(env, "#{spark_daemon} stop #{slave_args(worker)}")
+        end
+      end
+
+      private
+
+      def spark_conf
+        @spark_conf ||= ConnectionCache[nil].send(:spark_conf)
+      end
+
+      def wait_for_shutdown
+        begin
+          loop { sleep 0.2 }
+        rescue Interrupt
+          yield
+        end
+      end
+
+      def to_argv(args)
+        args.split.to_java_argv
+      end
+
+      def validate_env!
+        unless ENV['SPARK_HOME'] && File.expand_path(ENV['SPARK_HOME']) == Spark.home
+          raise 'Spark environment not set correctly'
+        end
+      end
+
+      def add_master_jars
+        ConnectionCache[nil].tap do |connection|
+          connection.config = {spark: {master: master_url}}
+          connection.spark_context.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+        end
+        ConnectionCache.clear
+      end
+
+      def workers
+        slave_config[:worker_count].to_i.times.map { |index| index + 1 }
+      end
+
+      def start_master_args
+        "#{master_args} #{run_master_args}"
+      end
+
+      def run_master_args
+        "--ip #{Socket.gethostname} --port #{master_config[:master_port]} --webui-port #{master_config[:ui_port]} -h #{master_config[:host]}"
+      end
+
+      def start_slave_args(id)
+        "#{slave_args(id)} #{run_slave_args}"
+      end
+
+      def run_slave_args
+        "--webui-port #{slave_config[:ui_port]} #{master_url}"
+      end
+
+      def master_args
+        "org.apache.spark.deploy.master.Master #{master_config[:id]}"
+      end
+
+      def slave_args(id)
+        "org.apache.spark.deploy.worker.Worker #{id}"
+      end
+
+      def spark_daemon
+        "#{Spark.home}/sbin/spark-daemon.sh"
+      end
+
+      def master_url
+        "spark://#{master_config[:host]}:#{master_config[:master_port]}"
+      end
+
+      def master_config
+        config.merge(config.fetch(:master) { {} })
+      end
+
+      def slave_config
+        config.merge(config.fetch(:slave) { {} })
+      end
+
+      def config
+        @config ||= begin
+          override_config = ConnectionCache[nil].config.fetch(:spark_daemon) { {} }
+          {
+              id: 1,
+              ui_port: 8180,
+              master_port: 7077,
+              worker_count: 1,
+              host: Socket.gethostname,
+          }.merge(override_config)
+        end
+      end
+
+      def env
+        @env ||= spark_env.merge(ENV.to_hash)
+      end
+
+      def spark_env
+        @spark_env ||= {
+            'SPARK_HOME' => Spark.home,
+            'SPARK_CLASSPATH' => Spark.classpath,
+            'SPARK_JARS' => Dir["#{Spark.classpath}/*.jar"] * ',',
+        }
+      end
+
+    end
+  end
+end
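The public methods above pair up into a simple lifecycle. A usage sketch, assuming ConnectionCache is already configured and SPARK_HOME matches Spark.home (which validate_env! enforces for the run_* variants):

launcher = CassandraModel::Spark::Launcher.new
launcher.start_master   # daemonizes via sbin/spark-daemon.sh, then ships cmodel_scala_helper.jar
launcher.start_slaves   # one Worker per slave_config[:worker_count]
# ... run Spark jobs against spark://<host>:7077 ...
launcher.stop_slaves
launcher.stop_master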
data/lib/cassandra_model_spark/query_builder.rb
ADDED
@@ -0,0 +1,37 @@
+module CassandraModel
+  class QueryBuilder
+    def group(*columns)
+      append_option(columns, :group)
+    end
+
+    def as_data_frame(options = {})
+      if @record_klass.is_a?(Spark::DataFrame)
+        data_frame_from_frame(options)
+      else
+        data_frame_from_model(options)
+      end
+    end
+
+    private
+
+    def data_frame_from_frame(options)
+      query_frame = @record_klass.query(@params, @options)
+      Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+    end
+
+    def data_frame_from_model(options)
+      updated_restriction = @record_klass.restriction_attributes(@params).inject({}) do |memo, (key, value)|
+        updated_key = if value.is_a?(Array)
+                        value = value.to_java
+                        updated_key = key.is_a?(ThomasUtils::KeyComparer) ? key.to_s : "#{key} IN"
+                        "#{updated_key} (#{(%w(?)*value.count)*','})"
+                      else
+                        key.is_a?(ThomasUtils::KeyComparer) ? "#{key} ?" : "#{key} = ?"
+                      end
+        memo.merge!(updated_key => value)
+      end.stringify_keys.to_java
+      rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+      Spark::DataFrame.new(@record_klass, rdd, options)
+    end
+  end
+end
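Note how data_frame_from_model rewrites each restriction into a CQL-style predicate string: array values become an IN clause with one placeholder per element. A sketch, assuming a hypothetical Posts record class and cassandra_model's usual where entry point:

# The restriction {topic: %w(spark cassandra)} is rewritten to
#   { "topic IN (?,?)" => ["spark", "cassandra"] }
# before being handed to SparkCassandraHelper.filterRDD:
frame = Posts.where(topic: %w(spark cassandra)).as_data_frame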
data/lib/cassandra_model_spark/raw_connection.rb
ADDED
@@ -0,0 +1,47 @@
+module CassandraModel
+  class RawConnection
+    def java_spark_context
+      @spark_context ||= begin
+        JavaSparkContext.new(spark_conf).tap do |java_spark_context|
+          java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+        end
+      end
+    end
+
+    def spark_context
+      java_spark_context.sc
+    end
+
+    private
+
+    def spark_conf
+      @spark_conf ||= SparkConf.new(true).tap do |conf|
+        conf.set('spark.app.name', 'cassandra_model_spark')
+        conf.set('spark.master', 'local[*]')
+        conf.set('spark.cassandra.connection.host', config[:hosts].first)
+        flat_spark_config.each { |key, value| conf.set(key, value) }
+      end
+    end
+
+    def flat_spark_config(config = spark_config)
+      config.inject({}) do |memo, (key, value)|
+        if value.is_a?(Hash)
+          memo.merge!(child_spark_conf(key, value))
+        else
+          memo.merge!(key.to_s => value)
+        end
+      end
+    end
+
+    def child_spark_conf(key, value)
+      child_conf = flat_spark_config(value)
+      child_conf.inject({}) do |child_memo, (child_key, child_value)|
+        child_memo.merge!("#{key}.#{child_key}" => child_value)
+      end
+    end
+
+    def spark_config
+      config.slice(:spark)
+    end
+  end
+end
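flat_spark_config and child_spark_conf together dot-join nested hashes into flat Spark property names. A standalone rendition of the same flattening, runnable outside the class (flatten_conf is a hypothetical helper name):

def flatten_conf(hash, prefix = nil)
  hash.each_with_object({}) do |(key, value), memo|
    dotted = [prefix, key].compact.join('.')
    value.is_a?(Hash) ? memo.merge!(flatten_conf(value, dotted)) : memo[dotted] = value
  end
end

# Mirrors what spark_conf feeds into conf.set:
flatten_conf(spark: { executor: { memory: '2g' }, master: 'local[*]' })
# => { 'spark.executor.memory' => '2g', 'spark.master' => 'local[*]' }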
data/lib/cassandra_model_spark/record.rb
ADDED
@@ -0,0 +1,18 @@
+module CassandraModel
+  class Record
+    def self.rdd
+      @spark_rdd ||= SparkCassandraHelper.cassandra_table(
+          table.connection.spark_context,
+          table.connection.config[:keyspace],
+          table_name)
+    end
+
+    def self.rdd_row_mapping
+      nil
+    end
+
+    def self.count
+      rdd.count
+    end
+  end
+end
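Any Record subclass therefore gets an RDD view of its table for free. A minimal sketch with a hypothetical model:

class Posts < CassandraModel::Record
end

# Memoizes a Cassandra table RDD on first use, then counts in Spark
# rather than issuing a CQL COUNT(*):
Posts.count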
data/lib/cassandra_model_spark/spark.rb
ADDED
@@ -0,0 +1,33 @@
+require 'fileutils'
+
+module CassandraModel
+  module Spark
+    class << self
+      def root
+        @gem_root ||= File.expand_path('../../..', __FILE__)
+      end
+
+      def home
+        @home ||= (ENV['SPARK_HOME'] || default_home)
+      end
+
+      def classpath
+        @classpath ||= (ENV['SPARK_CLASSPATH'] || default_classpath)
+      end
+
+      private
+
+      def default_classpath
+        File.expand_path('./lib/', home).tap do |path|
+          FileUtils.mkdir_p(path)
+        end
+      end
+
+      def default_home
+        File.expand_path('~/.cassandra_model_spark').tap do |path|
+          FileUtils.mkdir_p(path)
+        end
+      end
+    end
+  end
+end
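Both lookups are environment-first and memoized, so overrides must be in place before first access. A sketch (paths are hypothetical):

ENV['SPARK_HOME'] = '/opt/spark'
CassandraModel::Spark.home      # => '/opt/spark'
CassandraModel::Spark.classpath # => ENV['SPARK_CLASSPATH'] || '/opt/spark/lib' (created if missing)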
metadata
ADDED
@@ -0,0 +1,113 @@
+--- !ruby/object:Gem::Specification
+name: cassandra_model_spark
+version: !ruby/object:Gem::Version
+  version: 0.0.1.5
+platform: java
+authors:
+- Thomas RM Rogers
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-12-29 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: cassandra_model
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.9.16
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.9.16
+- !ruby/object:Gem::Dependency
+  name: thomas_utils
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.1.16
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.1.16
+description: |-
+  Spark integration for cassandra_model.
+  Get high-performance data analytics with the ease of cassandra_model.
+  Inspired by the ruby-spark gem.
+email: thomasrogers03@gmail.com
+executables:
+- cmodel-spark-build
+- cmodel-spark-env.rb
+- cmodel-spark-master
+- cmodel-spark-slaves
+- cmodel-spark-run-master
+- cmodel-spark-run-slave
+extensions: []
+extra_rdoc_files: []
+files:
+- README.md
+- bin/cmodel-spark-build
+- bin/cmodel-spark-env.rb
+- bin/cmodel-spark-master
+- bin/cmodel-spark-run-master
+- bin/cmodel-spark-run-master.sh
+- bin/cmodel-spark-run-slave
+- bin/cmodel-spark-run-slave.sh
+- bin/cmodel-spark-slaves
+- ext/scala_helper/bin/load-spark-env.sh
+- ext/scala_helper/bin/spark-class
+- ext/scala_helper/build.sbt
+- ext/scala_helper/cassandra_helper.scala
+- ext/scala_helper/data_type_helper.scala
+- ext/scala_helper/marshal_loader.scala
+- ext/scala_helper/marshal_row_mapping.scala
+- ext/scala_helper/project/plugins.sbt
+- ext/scala_helper/sbin/spark-config.sh
+- ext/scala_helper/sbin/spark-daemon.sh
+- ext/scala_helper/schema_builder.scala
+- ext/scala_helper/worker.scala
+- lib/cassandra_model_spark.rb
+- lib/cassandra_model_spark/build.rb
+- lib/cassandra_model_spark/column_cast.rb
+- lib/cassandra_model_spark/connection_cache.rb
+- lib/cassandra_model_spark/data_frame.rb
+- lib/cassandra_model_spark/java_bridge.rb
+- lib/cassandra_model_spark/java_classes.rb
+- lib/cassandra_model_spark/launcher.rb
+- lib/cassandra_model_spark/query_builder.rb
+- lib/cassandra_model_spark/raw_connection.rb
+- lib/cassandra_model_spark/record.rb
+- lib/cassandra_model_spark/spark.rb
+homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
+licenses:
+- Apache License 2.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: Spark integration for cassandra_model
+test_files: []