cassandra_model_spark 0.0.1.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark.rb +42 -0
  24. data/lib/cassandra_model_spark/build.rb +24 -0
  25. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  26. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  27. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  28. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  29. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  30. data/lib/cassandra_model_spark/launcher.rb +150 -0
  31. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  32. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  33. data/lib/cassandra_model_spark/record.rb +18 -0
  34. data/lib/cassandra_model_spark/spark.rb +33 -0
  35. metadata +113 -0
data/lib/cassandra_model_spark/java_classes.rb ADDED
@@ -0,0 +1,36 @@
+ import_java_object 'java.util.ArrayList'
+ import_java_object 'org.apache.spark.SparkConf'
+ import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+ import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
+ import_java_object 'java.util.HashMap', as: 'JavaHashMap'
+ import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
+ import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+ import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
+ import_java_object 'org.apache.log4j.Level', as: 'JLevel'
+ import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
+ import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
+ import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+ import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
+ import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
+ import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
+ import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+ import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
+ import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
+
+ %w(BinaryType BooleanType ByteType DataType
+    DateType Decimal DecimalType DoubleType FloatType IntegerType
+    LongType Metadata NullType PrecisionInfo ShortType
+    StringType StructField StructType TimestampType).each do |sql_type|
+   Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+ end
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
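
Editor's note: the %w(...) loop above defines one Ruby constant per Spark SQL type by calling the matching get* factory on the Scala DataTypeHelper. A minimal standalone sketch of the same metaprogramming pattern, using a plain Ruby stand-in for the Java helper (the stand-in module and its return values are illustrative, not the real API):

    # Illustrative stand-in for SparkSqlDataTypeHelper; returns dummy values.
    module FakeDataTypeHelper
      def self.getStringType; :string_type; end
      def self.getIntegerType; :integer_type; end
    end

    %w(StringType IntegerType).each do |sql_type|
      # Same pattern as the gem: Sql<TypeName> => helper.get<TypeName>
      Object.const_set(:"Sql#{sql_type}", FakeDataTypeHelper.public_send(:"get#{sql_type}"))
    end

    SqlStringType  # => :string_type
    SqlIntegerType # => :integer_type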
data/lib/cassandra_model_spark/launcher.rb ADDED
@@ -0,0 +1,150 @@
+ require 'socket'
+
+ module CassandraModel
+   module Spark
+     class Launcher
+       def start_master
+         system(env, "#{spark_daemon} start #{start_master_args}")
+         add_master_jars
+       end
+
+       def run_master
+         validate_env!
+
+         result = SparkMaster.startRpcEnvAndEndpoint(master_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)._1
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def start_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} start #{start_slave_args(worker)}")
+         end
+       end
+
+       def run_slave
+         validate_env!
+
+         result = SparkWorkerStarter.startWorker(master_url, slave_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def stop_master
+         system(env, "#{spark_daemon} stop #{master_args}")
+       end
+
+       def stop_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} stop #{slave_args(worker)}")
+         end
+       end
+
+       private
+
+       def spark_conf
+         @spark_conf ||= ConnectionCache[nil].send(:spark_conf)
+       end
+
+       def wait_for_shutdown
+         begin
+           loop { sleep 0.2 }
+         rescue Interrupt
+           yield
+         end
+       end
+
+       def to_argv(args)
+         args.split.to_java_argv
+       end
+
+       def validate_env!
+         unless ENV['SPARK_HOME'] && File.expand_path(ENV['SPARK_HOME']) == Spark.home
+           raise 'Spark environment not set correctly'
+         end
+       end
+
+       def add_master_jars
+         ConnectionCache[nil].tap do |connection|
+           connection.config = {spark: {master: master_url}}
+           connection.spark_context.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+         end
+         ConnectionCache.clear
+       end
+
+       def workers
+         slave_config[:worker_count].to_i.times.map { |index| index + 1 }
+       end
+
+       def start_master_args
+         "#{master_args} #{run_master_args}"
+       end
+
+       def run_master_args
+         "--ip #{Socket.gethostname} --port #{master_config[:master_port]} --webui-port #{master_config[:ui_port]} -h #{master_config[:host]}"
+       end
+
+       def start_slave_args(id)
+         "#{slave_args(id)} #{run_slave_args}"
+       end
+
+       def run_slave_args
+         "--webui-port #{slave_config[:ui_port]} #{master_url}"
+       end
+
+       def master_args
+         "org.apache.spark.deploy.master.Master #{master_config[:id]}"
+       end
+
+       def slave_args(id)
+         "org.apache.spark.deploy.worker.Worker #{id}"
+       end
+
+       def spark_daemon
+         "#{Spark.home}/sbin/spark-daemon.sh"
+       end
+
+       def master_url
+         "spark://#{master_config[:host]}:#{master_config[:master_port]}"
+       end
+
+       def master_config
+         config.merge(config.fetch(:master) { {} })
+       end
+
+       def slave_config
+         config.merge(config.fetch(:slave) { {} })
+       end
+
+       def config
+         @config ||= begin
+           override_config = ConnectionCache[nil].config.fetch(:spark_daemon) { {} }
+           {
+             id: 1,
+             ui_port: 8180,
+             master_port: 7077,
+             worker_count: 1,
+             host: Socket.gethostname,
+           }.merge(override_config)
+         end
+       end
+
+       def env
+         @env ||= spark_env.merge(ENV.to_hash)
+       end
+
+       def spark_env
+         @spark_env ||= {
+           'SPARK_HOME' => Spark.home,
+           'SPARK_CLASSPATH' => Spark.classpath,
+           'SPARK_JARS' => Dir["#{Spark.classpath}/*.jar"] * ',',
+         }
+       end
+
+     end
+   end
+ end
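
Editor's note: start_master/start_slaves shell out to Spark's spark-daemon.sh to daemonize the processes, while the run_* variants keep them in the foreground until an Interrupt triggers shutdown. A minimal usage sketch, assuming the gem is loaded, a ConnectionCache is configured, and Spark is installed under SPARK_HOME:

    launcher = CassandraModel::Spark::Launcher.new

    # Daemonized cluster: spark-daemon.sh start for the Master,
    # then one Worker per configured :worker_count.
    launcher.start_master
    launcher.start_slaves

    # ... run Spark jobs ...

    launcher.stop_slaves
    launcher.stop_master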
data/lib/cassandra_model_spark/query_builder.rb ADDED
@@ -0,0 +1,37 @@
+ module CassandraModel
+   class QueryBuilder
+     def group(*columns)
+       append_option(columns, :group)
+     end
+
+     def as_data_frame(options = {})
+       if @record_klass.is_a?(Spark::DataFrame)
+         data_frame_from_frame(options)
+       else
+         data_frame_from_model(options)
+       end
+     end
+
+     private
+
+     def data_frame_from_frame(options)
+       query_frame = @record_klass.query(@params, @options)
+       Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+     end
+
+     def data_frame_from_model(options)
+       updated_restriction = @record_klass.restriction_attributes(@params).inject({}) do |memo, (key, value)|
+         updated_key = if value.is_a?(Array)
+           value = value.to_java
+           updated_key = key.is_a?(ThomasUtils::KeyComparer) ? key.to_s : "#{key} IN"
+           "#{updated_key} (#{(%w(?)*value.count)*','})"
+         else
+           key.is_a?(ThomasUtils::KeyComparer) ? "#{key} ?" : "#{key} = ?"
+         end
+         memo.merge!(updated_key => value)
+       end.stringify_keys.to_java
+       rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+       Spark::DataFrame.new(@record_klass, rdd, options)
+     end
+   end
+ end
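
Editor's note: the inject in data_frame_from_model rewrites each restriction attribute into a CQL-style predicate string keyed by its placeholder form, which is then handed to the Scala filterRDD helper. A simplified, self-contained version of that rewrite (dropping the ThomasUtils::KeyComparer and to_java branches for illustration):

    params = { name: 'Alice', tags: %w(a b c) }

    restriction = params.inject({}) do |memo, (key, value)|
      updated_key = if value.is_a?(Array)
        # One '?' placeholder per array element: "tags IN (?,?,?)"
        "#{key} IN (#{(%w(?) * value.count) * ','})"
      else
        "#{key} = ?"
      end
      memo.merge!(updated_key => value)
    end

    restriction
    # => { 'name = ?' => 'Alice', 'tags IN (?,?,?)' => ['a', 'b', 'c'] }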
data/lib/cassandra_model_spark/raw_connection.rb ADDED
@@ -0,0 +1,47 @@
+ module CassandraModel
+   class RawConnection
+     def java_spark_context
+       @spark_context ||= begin
+         JavaSparkContext.new(spark_conf).tap do |java_spark_context|
+           java_spark_context.sc.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+         end
+       end
+     end
+
+     def spark_context
+       java_spark_context.sc
+     end
+
+     private
+
+     def spark_conf
+       @spark_conf ||= SparkConf.new(true).tap do |conf|
+         conf.set('spark.app.name', 'cassandra_model_spark')
+         conf.set('spark.master', 'local[*]')
+         conf.set('spark.cassandra.connection.host', config[:hosts].first)
+         flat_spark_config.each { |key, value| conf.set(key, value) }
+       end
+     end
+
+     def flat_spark_config(config = spark_config)
+       config.inject({}) do |memo, (key, value)|
+         if value.is_a?(Hash)
+           memo.merge!(child_spark_conf(key, value))
+         else
+           memo.merge!(key.to_s => value)
+         end
+       end
+     end
+
+     def child_spark_conf(key, value)
+       child_conf = flat_spark_config(value)
+       child_conf.inject({}) do |child_memo, (child_key, child_value)|
+         child_memo.merge!("#{key}.#{child_key}" => child_value)
+       end
+     end
+
+     def spark_config
+       config.slice(:spark)
+     end
+   end
+ end
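
Editor's note: flat_spark_config and child_spark_conf together recursively flatten the nested :spark section of the connection config into the dotted property names SparkConf#set expects. The same flattening, condensed into a single recursive method for illustration:

    # Flattens nested hashes into dotted Spark property keys.
    def flatten_spark_config(config)
      config.inject({}) do |memo, (key, value)|
        if value.is_a?(Hash)
          flatten_spark_config(value).each do |child_key, child_value|
            memo["#{key}.#{child_key}"] = child_value
          end
          memo
        else
          memo.merge!(key.to_s => value)
        end
      end
    end

    flatten_spark_config(spark: { master: 'local[*]', executor: { memory: '2g' } })
    # => { 'spark.master' => 'local[*]', 'spark.executor.memory' => '2g' }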
data/lib/cassandra_model_spark/record.rb ADDED
@@ -0,0 +1,18 @@
+ module CassandraModel
+   class Record
+     def self.rdd
+       @spark_rdd ||= SparkCassandraHelper.cassandra_table(
+           table.connection.spark_context,
+           table.connection.config[:keyspace],
+           table_name)
+     end
+
+     def self.rdd_row_mapping
+       nil
+     end
+
+     def self.count
+       rdd.count
+     end
+   end
+ end
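
Editor's note: with this patch, Record.count runs as a distributed count over the memoized Cassandra RDD instead of issuing a CQL query. A usage sketch with a hypothetical model (the Visit class and its backing table are illustrative, and assume cassandra_model is already configured with a keyspace):

    # Hypothetical model backed by a 'visits' table.
    class Visit < CassandraModel::Record
    end

    Visit.rdd    # memoized CassandraRDD over the table, built once per class
    Visit.count  # Spark-side distributed count via rdd.count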
data/lib/cassandra_model_spark/spark.rb ADDED
@@ -0,0 +1,33 @@
+ require 'fileutils'
+
+ module CassandraModel
+   module Spark
+     class << self
+       def root
+         @gem_root ||= File.expand_path('../../..', __FILE__)
+       end
+
+       def home
+         @home ||= (ENV['SPARK_HOME'] || default_home)
+       end
+
+       def classpath
+         @classpath ||= (ENV['SPARK_CLASSPATH'] || default_classpath)
+       end
+
+       private
+
+       def default_classpath
+         File.expand_path('./lib/', home).tap do |path|
+           FileUtils.mkdir_p(path)
+         end
+       end
+
+       def default_home
+         File.expand_path('~/.cassandra_model_spark').tap do |path|
+           FileUtils.mkdir_p(path)
+         end
+       end
+     end
+   end
+ end
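
Editor's note: home and classpath prefer the SPARK_HOME/SPARK_CLASSPATH environment variables and otherwise fall back to directories created on demand; both results are memoized, so the environment is only consulted on the first call in a process. A sketch of the resolution order (paths shown are illustrative):

    # With SPARK_HOME unset, falls back to (and creates) ~/.cassandra_model_spark:
    CassandraModel::Spark.home       # => "/home/user/.cassandra_model_spark"
    CassandraModel::Spark.classpath  # => "/home/user/.cassandra_model_spark/lib"

    # With SPARK_HOME=/opt/spark exported before the first call:
    CassandraModel::Spark.home       # => "/opt/spark"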
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cassandra_model_spark
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1.5
5
+ platform: java
6
+ authors:
7
+ - Thomas RM Rogers
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: cassandra_model
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.16
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.16
27
+ - !ruby/object:Gem::Dependency
28
+ name: thomas_utils
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.16
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.16
41
+ description: |-
42
+ Spark integration for cassandra_model.
43
+ Get high-performance data analytics with the ease of cassandra_model.
44
+ Inspired by the ruby-spark gem.
45
+ email: thomasrogers03@gmail.com
46
+ executables:
47
+ - cmodel-spark-build
48
+ - cmodel-spark-env.rb
49
+ - cmodel-spark-master
50
+ - cmodel-spark-slaves
51
+ - cmodel-spark-run-master
52
+ - cmodel-spark-run-slave
53
+ extensions: []
54
+ extra_rdoc_files: []
55
+ files:
56
+ - README.md
57
+ - bin/cmodel-spark-build
58
+ - bin/cmodel-spark-env.rb
59
+ - bin/cmodel-spark-master
60
+ - bin/cmodel-spark-run-master
61
+ - bin/cmodel-spark-run-master.sh
62
+ - bin/cmodel-spark-run-slave
63
+ - bin/cmodel-spark-run-slave.sh
64
+ - bin/cmodel-spark-slaves
65
+ - ext/scala_helper/bin/load-spark-env.sh
66
+ - ext/scala_helper/bin/spark-class
67
+ - ext/scala_helper/build.sbt
68
+ - ext/scala_helper/cassandra_helper.scala
69
+ - ext/scala_helper/data_type_helper.scala
70
+ - ext/scala_helper/marshal_loader.scala
71
+ - ext/scala_helper/marshal_row_mapping.scala
72
+ - ext/scala_helper/project/plugins.sbt
73
+ - ext/scala_helper/sbin/spark-config.sh
74
+ - ext/scala_helper/sbin/spark-daemon.sh
75
+ - ext/scala_helper/schema_builder.scala
76
+ - ext/scala_helper/worker.scala
77
+ - lib/cassandra_model_spark.rb
78
+ - lib/cassandra_model_spark/build.rb
79
+ - lib/cassandra_model_spark/column_cast.rb
80
+ - lib/cassandra_model_spark/connection_cache.rb
81
+ - lib/cassandra_model_spark/data_frame.rb
82
+ - lib/cassandra_model_spark/java_bridge.rb
83
+ - lib/cassandra_model_spark/java_classes.rb
84
+ - lib/cassandra_model_spark/launcher.rb
85
+ - lib/cassandra_model_spark/query_builder.rb
86
+ - lib/cassandra_model_spark/raw_connection.rb
87
+ - lib/cassandra_model_spark/record.rb
88
+ - lib/cassandra_model_spark/spark.rb
89
+ homepage: https://www.github.com/thomasrogers03/cassandra_model_spark
90
+ licenses:
91
+ - Apache License 2.0
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.4.8
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Spark integration for cassandra_model
113
+ test_files: []