cassandra_model_spark 0.0.1.5-java

Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark.rb +42 -0
  24. data/lib/cassandra_model_spark/build.rb +24 -0
  25. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  26. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  27. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  28. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  29. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  30. data/lib/cassandra_model_spark/launcher.rb +150 -0
  31. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  32. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  33. data/lib/cassandra_model_spark/record.rb +18 -0
  34. data/lib/cassandra_model_spark/spark.rb +33 -0
  35. metadata +113 -0
data/lib/cassandra_model_spark/java_classes.rb ADDED
@@ -0,0 +1,36 @@
+ import_java_object 'java.util.ArrayList'
+ import_java_object 'org.apache.spark.SparkConf'
+ import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+ import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
+ import_java_object 'java.util.HashMap', as: 'JavaHashMap'
+ import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
+ import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+ import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
+ import_java_object 'org.apache.log4j.Level', as: 'JLevel'
+ import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
+ import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
+ import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+ import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
+ import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
+ import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
+ import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+ import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
+ import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
+
+ %w(BinaryType BooleanType ByteType DataType
+    DateType Decimal DecimalType DoubleType FloatType IntegerType
+    LongType Metadata NullType PrecisionInfo ShortType
+    StringType StructField StructType TimestampType).each do |sql_type|
+   Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+ end
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
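
This file registers each Spark SQL type as a top-level Sql* Ruby constant via the JVM bridge. A minimal sketch of what that enables in a JRuby session with the gem loaded; the return values shown are assumptions based on the standard Spark SQL DataType API, not output captured from this project:

    SqlStringType.typeName           # => "string" (DataType#typeName)
    SqlIntegerArrayType.elementType  # the ArrayType's element type, i.e. SqlIntegerType
    SqlStringStringMapType.keyType   # the MapType's key type, i.e. SqlStringType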
data/lib/cassandra_model_spark/launcher.rb ADDED
@@ -0,0 +1,150 @@
+ require 'socket'
+
+ module CassandraModel
+   module Spark
+     class Launcher
+       def start_master
+         system(env, "#{spark_daemon} start #{start_master_args}")
+         add_master_jars
+       end
+
+       def run_master
+         validate_env!
+
+         result = SparkMaster.startRpcEnvAndEndpoint(master_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)._1
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def start_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} start #{start_slave_args(worker)}")
+         end
+       end
+
+       def run_slave
+         validate_env!
+
+         result = SparkWorkerStarter.startWorker(master_url, slave_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def stop_master
+         system(env, "#{spark_daemon} stop #{master_args}")
+       end
+
+       def stop_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} stop #{slave_args(worker)}")
+         end
+       end
+
+       private
+
+       def spark_conf
+         @spark_conf ||= ConnectionCache[nil].send(:spark_conf)
+       end
+
+       def wait_for_shutdown
+         begin
+           loop { sleep 0.2 }
+         rescue Interrupt
+           yield
+         end
+       end
+
+       def to_argv(args)
+         args.split.to_java_argv
+       end
+
+       def validate_env!
+         unless ENV['SPARK_HOME'] && File.expand_path(ENV['SPARK_HOME']) == Spark.home
+           raise 'Spark environment not set correctly'
+         end
+       end
+
+       def add_master_jars
+         ConnectionCache[nil].tap do |connection|
+           connection.config = {spark: {master: master_url}}
+           connection.spark_context.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+         end
+         ConnectionCache.clear
+       end
+
+       def workers
+         slave_config[:worker_count].to_i.times.map { |index| index + 1 }
+       end
+
+       def start_master_args
+         "#{master_args} #{run_master_args}"
+       end
+
+       def run_master_args
+         "--ip #{Socket.gethostname} --port #{master_config[:master_port]} --webui-port #{master_config[:ui_port]} -h #{master_config[:host]}"
+       end
+
+       def start_slave_args(id)
+         "#{slave_args(id)} #{run_slave_args}"
+       end
+
+       def run_slave_args
+         "--webui-port #{slave_config[:ui_port]} #{master_url}"
+       end
+
+       def master_args
+         "org.apache.spark.deploy.master.Master #{master_config[:id]}"
+       end
+
+       def slave_args(id)
+         "org.apache.spark.deploy.worker.Worker #{id}"
+       end
+
+       def spark_daemon
+         "#{Spark.home}/sbin/spark-daemon.sh"
+       end
+
+       def master_url
+         "spark://#{master_config[:host]}:#{master_config[:master_port]}"
+       end
+
+       def master_config
+         config.merge(config.fetch(:master) { {} })
+       end
+
+       def slave_config
+         config.merge(config.fetch(:slave) { {} })
+       end
+
+       def config
+         @config ||= begin
+           override_config = ConnectionCache[nil].config.fetch(:spark_daemon) { {} }
+           {
+             id: 1,
+             ui_port: 8180,
+             master_port: 7077,
+             worker_count: 1,
+             host: Socket.gethostname,
+           }.merge(override_config)
+         end
+       end
+
+       def env
+         @env ||= spark_env.merge(ENV.to_hash)
+       end
+
+       def spark_env
+         @spark_env ||= {
+           'SPARK_HOME' => Spark.home,
+           'SPARK_CLASSPATH' => Spark.classpath,
+           'SPARK_JARS' => Dir["#{Spark.classpath}/*.jar"] * ',',
+         }
+       end
+     end
+   end
+ end
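
The start_*/stop_* methods shell out to Spark's sbin/spark-daemon.sh, while the run_* methods host the master or worker in-process through the JVM bridge. A hedged usage sketch, assuming SPARK_HOME matches Spark.home and ConnectionCache has already been configured:

    launcher = CassandraModel::Spark::Launcher.new
    launcher.start_master   # daemonize a master, then register cmodel_scala_helper.jar with it
    launcher.start_slaves   # one Worker daemon per slave_config[:worker_count]
    # ... run jobs ...
    launcher.stop_slaves
    launcher.stop_master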
data/lib/cassandra_model_spark/query_builder.rb ADDED
@@ -0,0 +1,37 @@
+ module CassandraModel
+   class QueryBuilder
+     def group(*columns)
+       append_option(columns, :group)
+     end
+
+     def as_data_frame(options = {})
+       if @record_klass.is_a?(Spark::DataFrame)
+         data_frame_from_frame(options)
+       else
+         data_frame_from_model(options)
+       end
+     end
+
+     private
+
+     def data_frame_from_frame(options)
+       query_frame = @record_klass.query(@params, @options)
+       Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+     end
+
+     def data_frame_from_model(options)
+       updated_restriction = @record_klass.restriction_attributes(@params).inject({}) do |memo, (key, value)|
+         updated_key = if value.is_a?(Array)
+           value = value.to_java
+           updated_key = key.is_a?(ThomasUtils::KeyComparer) ? key.to_s : "#{key} IN"
+           "#{updated_key} (#{(%w(?) * value.count) * ','})"
+         else
+           key.is_a?(ThomasUtils::KeyComparer) ? "#{key} ?" : "#{key} = ?"
+         end
+         memo.merge!(updated_key => value)
+       end.stringify_keys.to_java
+       rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+       Spark::DataFrame.new(@record_klass, rdd, options)
+     end
+   end
+ end
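
data_frame_from_model rewrites each restriction into a CQL-style predicate string before handing it to the Scala helper. A sketch of the translation for a hypothetical MyRecord model (the where call is assumed from cassandra_model's QueryBuilder API and is not defined in this diff):

    MyRecord.where(cluster: 'west', shard: [1, 2, 3]).as_data_frame
    # restriction passed to SparkCassandraHelper.filterRDD:
    #   {'cluster = ?' => 'west', 'shard IN (?,?,?)' => [1, 2, 3]}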
data/lib/cassandra_model_spark/raw_connection.rb ADDED
@@ -0,0 +1,47 @@
+ module CassandraModel
+   class RawConnection
+     def java_spark_context
+       @spark_context ||= begin
+         JavaSparkContext.new(spark_conf).tap do |java_spark_context|
+           java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+         end
+       end
+     end
+
+     def spark_context
+       java_spark_context.sc
+     end
+
+     private
+
+     def spark_conf
+       @spark_conf ||= SparkConf.new(true).tap do |conf|
+         conf.set('spark.app.name', 'cassandra_model_spark')
+         conf.set('spark.master', 'local[*]')
+         conf.set('spark.cassandra.connection.host', config[:hosts].first)
+         flat_spark_config.each { |key, value| conf.set(key, value) }
+       end
+     end
+
+     def flat_spark_config(config = spark_config)
+       config.inject({}) do |memo, (key, value)|
+         if value.is_a?(Hash)
+           memo.merge!(child_spark_conf(key, value))
+         else
+           memo.merge!(key.to_s => value)
+         end
+       end
+     end
+
+     def child_spark_conf(key, value)
+       child_conf = flat_spark_config(value)
+       child_conf.inject({}) do |child_memo, (child_key, child_value)|
+         child_memo.merge!("#{key}.#{child_key}" => child_value)
+       end
+     end
+
+     def spark_config
+       config.slice(:spark)
+     end
+   end
+ end
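
flat_spark_config recursively flattens the nested :spark section of the connection config into the dotted property keys SparkConf expects, so the defaults set above can be overridden from configuration. A small sketch of the flattening with a hypothetical input:

    flat_spark_config(spark: {executor: {memory: '2g'}, master: 'local[2]'})
    # => {'spark.executor.memory' => '2g', 'spark.master' => 'local[2]'}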
data/lib/cassandra_model_spark/record.rb ADDED
@@ -0,0 +1,18 @@
+ module CassandraModel
+   class Record
+     def self.rdd
+       @spark_rdd ||= SparkCassandraHelper.cassandra_table(
+         table.connection.spark_context,
+         table.connection.config[:keyspace],
+         table_name)
+     end
+
+     def self.rdd_row_mapping
+       nil
+     end
+
+     def self.count
+       rdd.count
+     end
+   end
+ end
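
Record.rdd memoizes one Cassandra-backed RDD per model class, so Record.count is computed by Spark rather than by a CQL count. For a hypothetical MyRecord subclass:

    MyRecord.count  # delegates to the cached RDD's count, evaluated on the Spark workers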
data/lib/cassandra_model_spark/spark.rb ADDED
@@ -0,0 +1,33 @@
+ require 'fileutils'
+
+ module CassandraModel
+   module Spark
+     class << self
+       def root
+         @gem_root ||= File.expand_path('../../..', __FILE__)
+       end
+
+       def home
+         @home ||= (ENV['SPARK_HOME'] || default_home)
+       end
+
+       def classpath
+         @classpath ||= (ENV['SPARK_CLASSPATH'] || default_classpath)
+       end
+
+       private
+
+       def default_classpath
+         File.expand_path('./lib/', home).tap do |path|
+           FileUtils.mkdir_p(path)
+         end
+       end
+
+       def default_home
+         File.expand_path('~/.cassandra_model_spark').tap do |path|
+           FileUtils.mkdir_p(path)
+         end
+       end
+     end
+   end
+ end
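
Both lookups prefer environment variables and fall back to directories created on demand via mkdir_p. A sketch of the resulting paths when neither variable is set (home directory assumed):

    CassandraModel::Spark.home       # => "/home/user/.cassandra_model_spark"
    CassandraModel::Spark.classpath  # => "/home/user/.cassandra_model_spark/lib"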
metadata ADDED
@@ -0,0 +1,113 @@
+ --- !ruby/object:Gem::Specification
+ name: cassandra_model_spark
+ version: !ruby/object:Gem::Version
+   version: 0.0.1.5
+ platform: java
+ authors:
+ - Thomas RM Rogers
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-12-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: cassandra_model
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.9.16
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.9.16
+ - !ruby/object:Gem::Dependency
+   name: thomas_utils
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.1.16
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.1.16
+ description: |-
+   Spark integration for cassandra_model.
+   Get high-performance data analytics with the ease of cassandra_model.
+   Inspired by the ruby-spark gem.
+ email: thomasrogers03@gmail.com
+ executables:
+ - cmodel-spark-build
+ - cmodel-spark-env.rb
+ - cmodel-spark-master
+ - cmodel-spark-slaves
+ - cmodel-spark-run-master
+ - cmodel-spark-run-slave
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.md
+ - bin/cmodel-spark-build
+ - bin/cmodel-spark-env.rb
+ - bin/cmodel-spark-master
+ - bin/cmodel-spark-run-master
+ - bin/cmodel-spark-run-master.sh
+ - bin/cmodel-spark-run-slave
+ - bin/cmodel-spark-run-slave.sh
+ - bin/cmodel-spark-slaves
+ - ext/scala_helper/bin/load-spark-env.sh
+ - ext/scala_helper/bin/spark-class
+ - ext/scala_helper/build.sbt
+ - ext/scala_helper/cassandra_helper.scala
+ - ext/scala_helper/data_type_helper.scala
+ - ext/scala_helper/marshal_loader.scala
+ - ext/scala_helper/marshal_row_mapping.scala
+ - ext/scala_helper/project/plugins.sbt
+ - ext/scala_helper/sbin/spark-config.sh
+ - ext/scala_helper/sbin/spark-daemon.sh
+ - ext/scala_helper/schema_builder.scala
+ - ext/scala_helper/worker.scala
+ - lib/cassandra_model_spark.rb
+ - lib/cassandra_model_spark/build.rb
+ - lib/cassandra_model_spark/column_cast.rb
+ - lib/cassandra_model_spark/connection_cache.rb
+ - lib/cassandra_model_spark/data_frame.rb
+ - lib/cassandra_model_spark/java_bridge.rb
+ - lib/cassandra_model_spark/java_classes.rb
+ - lib/cassandra_model_spark/launcher.rb
+ - lib/cassandra_model_spark/query_builder.rb
+ - lib/cassandra_model_spark/raw_connection.rb
+ - lib/cassandra_model_spark/record.rb
+ - lib/cassandra_model_spark/spark.rb
+ homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
+ licenses:
+ - Apache License 2.0
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.8
+ signing_key:
+ specification_version: 4
+ summary: Spark integration for cassandra_model
+ test_files: []
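
Since the gem is published for the java platform and bridges into the JVM, it is intended for JRuby. A minimal Gemfile sketch (the jruby platform constraint is an assumption, not taken from the spec above):

    source 'https://rubygems.org'
    gem 'cassandra_model_spark', '0.0.1.5', platforms: :jruby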