cassandra_model_spark 0.0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark/build.rb +24 -0
  24. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  25. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  26. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  27. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  28. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  29. data/lib/cassandra_model_spark/launcher.rb +150 -0
  30. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  31. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  32. data/lib/cassandra_model_spark/record.rb +18 -0
  33. data/lib/cassandra_model_spark/spark.rb +33 -0
  34. data/lib/cassandra_model_spark.rb +42 -0
  35. metadata +127 -0

data/lib/cassandra_model_spark/data_frame.rb
@@ -0,0 +1,374 @@
+ module CassandraModel
+   module Spark
+     class DataFrame
+       include QueryHelper
+
+       SQL_TYPE_MAP = {
+         int: SqlIntegerType,
+         text: SqlStringType,
+         double: SqlDoubleType,
+         timestamp: SqlTimestampType,
+       }.freeze
+       #noinspection RubyStringKeysInHashInspection
+       SQL_RUBY_TYPE_FUNCTIONS = {
+         'IntegerType' => :getInt,
+         'LongType' => :getLong,
+         'StringType' => :getString,
+         'DoubleType' => :getDouble,
+         'TimestampType' => :getTimestamp,
+         'MapType(StringType,StringType,true)' => :getMap,
+       }
+
+       attr_reader :table_name, :record_klass
+
+       def initialize(record_klass, rdd, options = {})
+         @table_name = options.fetch(:alias) { record_klass.table_name }
+         @sql_context = options[:sql_context]
+         initialize_frame_from_existing(options)
+         @record_klass = record_klass
+
+         initialize_row_mapping(options)
+         initialize_rdd(rdd)
+       end
+
+       def derived?
+         !!@derived
+       end
+
+       def sql_context
+         @sql_context ||= create_sql_context
+       end
+
+       def union(rhs)
+         unless record_klass == rhs.record_klass
+           raise ArgumentError, 'Cannot union DataFrames with different Record types!'
+         end
+         DataFrame.new(record_klass, rdd.union(rhs.rdd))
+       end
+
+       def spark_data_frame
+         @frame ||= SparkSchemaBuilder.new.tap do |builder|
+           record_klass.cassandra_columns.each do |name, type|
+             select_name = record_klass.normalized_column(name)
+             mapped_type = row_type_mapping[select_name]
+             type = if mapped_type
+                      name = mapped_type[:name]
+                      mapped_type[:type]
+                    else
+                      SQL_TYPE_MAP.fetch(type) { SqlStringType }
+                    end
+             builder.add_column(name.to_s, type)
+           end
+         end.create_data_frame(sql_context, rdd).tap { |frame| frame.register_temp_table(table_name.to_s) }
+       end
+
+       def cache
+         spark_data_frame.cache
+       end
+
+       def uncache
+         spark_data_frame.unpersist
+       end
+
+       def cached(&block)
+         spark_data_frame.cache
+         instance_eval(&block)
+         spark_data_frame.unpersist
+       end
+
+       def normalized(alias_table_name = nil)
+         return self unless rdd
+
+         select_options = record_klass.columns.inject({}) do |memo, column|
+           row_mapped_column = row_type_mapping.fetch(column) { {name: column} }[:name]
+           memo.merge!(row_mapped_column => {as: row_mapped_column})
+         end
+         alias_name = alias_table_name || :"normalized_#{table_name}"
+         select(select_options).as_data_frame(alias: alias_name)
+       end
+
+       def request_async(*_)
+         ResultPaginator.new(first_async) {}
+       end
+
+       def first_async(*_)
+         Cassandra::Future.error(NotImplementedError.new)
+       end
+
+       def sql(query)
+         spark_data_frame
+         query = sql_context.sql(query)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+
+       end
+
+       def query(restriction, options)
+         spark_data_frame
+         select_clause = select_columns(options)
+         group_clause = group_clause(:group, 'GROUP BY', options)
+         order_clause = group_clause(:order_by, 'ORDER BY', options)
+         limit_clause = if options[:limit]
+                          " LIMIT #{options[:limit]}"
+                        end
+         where_clause = query_where_clause(restriction)
+         sql_context.sql("SELECT #{select_clause} FROM #{table_name}#{where_clause}#{group_clause}#{order_clause}#{limit_clause}")
+       end
+
+       def request(restriction = {}, options = {})
+         query = query(restriction, options)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+       end
+
+       def first(restriction = {}, options = {})
+         query = query(restriction, options)
+         row = query.first
+         row_to_record(query.schema, row)
+       end
+
+       def ==(rhs)
+         rhs.is_a?(DataFrame) &&
+             record_klass == rhs.record_klass &&
+             ((rdd && rdd == rhs.rdd) || (!rdd && spark_data_frame == rhs.spark_data_frame))
+       end
+
+       protected
+
+       attr_reader :rdd
+
+       private
+
+       def initialize_frame_from_existing(options)
+         @frame = options[:spark_data_frame]
+         if @frame
+           raise ArgumentError, 'DataFrames created from Spark DataFrames require aliases!' unless options[:alias]
+           @frame.register_temp_table(options[:alias].to_s)
+           @sql_context = @frame.sql_context
+         end
+       end
+
+       def initialize_rdd(rdd)
+         if rdd
+           @rdd = if @row_mapping[:mapper]
+                    @row_mapping[:mapper].mappedRDD(rdd)
+                  else
+                    rdd
+                  end
+         else
+           @derived = true
+         end
+       end
+
+       def initialize_row_mapping(options)
+         @row_mapping = options.fetch(:row_mapping) do
+           @record_klass.rdd_row_mapping || {}
+         end
+       end
+
+       def row_type_mapping
+         @row_mapping[:type_map] ||= {}
+       end
+
+       def create_sql_context
+         CassandraSQLContext.new(record_klass.table.connection.spark_context).tap do |context|
+           context.setKeyspace(record_klass.table.connection.config[:keyspace])
+         end
+       end
+
+       def row_to_record(schema, row)
+         attributes = row_attributes(row, schema)
+
+         if valid_record?(attributes)
+           record_klass.new(attributes)
+         else
+           attributes
+         end
+       end
+
+       def row_attributes(row, schema)
+         attributes = {}
+         schema.fields.each_with_index do |field, index|
+           value = field_value(field, index, row)
+           column = field.name
+           attributes.merge!(column => value)
+         end
+         record_klass.normalized_attributes(attributes)
+       end
+
+       def valid_record?(attributes)
+         available_columns = record_klass.columns + record_klass.deferred_columns
+         attributes.keys.all? { |column| available_columns.include?(column) }
+       end
+
+       def field_value(field, index, row)
+         data_type = field.data_type
+         if column_is_struct?(data_type)
+           row_attributes(row.get(index), data_type)
+         else
+           decode_column_value(data_type, index, row)
+         end
+       end
+
+       def decode_column_value(data_type, index, row)
+         sql_type = data_type.to_string
+         converter = SQL_RUBY_TYPE_FUNCTIONS.fetch(sql_type) { :getString }
+         value = row.public_send(converter, index)
+
+         value = decode_hash(value) if column_is_string_map?(sql_type)
+         value
+       end
+
+       def decode_hash(value)
+         Hash[value.toSeq.array.to_a.map! { |pair| [pair._1.to_string, pair._2.to_string] }]
+       end
+
+       def column_is_string_map?(sql_type)
+         sql_type == 'MapType(StringType,StringType,true)'
+       end
+
+       def column_is_struct?(data_type)
+         data_type.getClass.getSimpleName == 'StructType'
+       end
+
+       def select_columns(options)
+         options[:select] ? clean_select_columns(options) * ', ' : '*'
+       end
+
+       def group_clause(type, prefix, options)
+         if options[type]
+           updated_clause = options[type].map do |column|
+             if column.is_a?(Hash)
+               column, direction = column.first
+               updated_column = quoted_column(column)
+               "#{updated_column} #{direction.upcase}"
+             else
+               quoted_column(column)
+             end
+           end * ', '
+           " #{prefix} #{updated_clause}"
+         end
+       end
+
+       def group_child_clause(child, updated_column)
+         child, direction = if child.is_a?(Hash)
+                              child.first
+                            else
+                              [child]
+                            end
+         direction_clause = (" #{direction.upcase}" if direction)
+         "#{updated_column}.`#{child}`#{direction_clause}"
+       end
+
+       def clean_select_columns(options)
+         options[:select].map do |column|
+           if column.is_a?(Hash)
+             updated_column(column)
+           else
+             quoted_column(column)
+           end
+         end
+       end
+
+       def updated_column(column)
+         column, options = column.first
+
+         if options.is_a?(Symbol)
+           updated_column = if column.is_a?(ThomasUtils::KeyChild)
+                              "#{column}".gsub(/\./, '_')
+                            else
+                              column
+                            end
+           options = {aggregate: options, as: :"#{updated_column}_#{options}"}
+         end
+
+         column = quoted_column(column)
+         column = aggregate_column(column, options) if options[:aggregate]
+         column = "#{column} AS #{options[:as]}" if options[:as]
+         column
+       end
+
+       def quoted_column(column)
+         return column.map { |child_column| quoted_column(child_column) } * ', ' if column.is_a?(Array)
+
+         if column == :*
+           '*'
+         elsif column.respond_to?(:quote)
+           column.quote('`')
+         else
+           "`#{select_column(column)}`"
+         end
+       end
+
+       def aggregate_column(column, options)
+         case options[:aggregate]
+         when :count_distinct
+           "COUNT(#{distinct_aggregate(column)})"
+         when :distinct
+           distinct_aggregate(column)
+         when :variance
+           variance_column(column)
+         when :stddev
+           "POW(#{variance_column(column)},0.5)"
+         else
+           if options[:aggregate] =~ /^cast_/
+             type = options[:aggregate].to_s.match(/^cast_(.+)$/)[1]
+             "CAST(#{column} AS #{type.upcase})"
+           else
+             "#{options[:aggregate].to_s.upcase}(#{column})"
+           end
+         end
+       end
+
+       def distinct_aggregate(column)
+         "DISTINCT #{column}"
+       end
+
+       def variance_column(column)
+         "AVG(POW(#{column},2)) - POW(AVG(#{column}),2)"
+       end
+
+       def query_where_clause(restriction)
+         if restriction.present?
+           restriction_clause = restriction.map do |key, value|
+             updated_key = if key.is_a?(ThomasUtils::KeyComparer)
+                             select_key = if key.key.respond_to?(:new_key)
+                                            select_key = select_column(key.key.key)
+                                            key.key.new_key(select_key)
+                                          else
+                                            select_column(key.key)
+                                          end
+                             key.new_key(select_key).quote('`')
+                           elsif key.is_a?(ThomasUtils::KeyChild)
+                             new_key = select_column(key.key)
+                             updated_key = key.new_key(new_key)
+                             quoted_restriction(updated_key)
+                           else
+                             select_key = select_column(key)
+                             quoted_restriction(select_key)
+                           end
+             value = "'#{value}'" if value.is_a?(String) || value.is_a?(Time)
+             "#{updated_key} #{value}"
+           end * ' AND '
+           " WHERE #{restriction_clause}"
+         end
+       end
+
+       def select_column(key)
+         new_key = record_klass.select_column(key)
+         available_columns.include?(new_key) ? new_key : key
+       end
+
+       def available_columns
+         @available_columns ||= spark_data_frame.schema.fields.map(&:name).map(&:to_sym)
+       end
+
+       def quoted_restriction(updated_key)
+         ThomasUtils::KeyComparer.new(updated_key, '=').quote('`')
+       end
+
+     end
+   end
+ end
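
For orientation, here is a brief usage sketch of the DataFrame above (not part of the package diff). The Inventory record class, the inventory table name, and its columns are hypothetical; the restriction and option hashes follow the request/query methods shown in this hunk.

    # Hypothetical record class and RDD; in practice the RDD comes from the
    # record's Cassandra table (see query_builder.rb further down).
    frame = CassandraModel::Spark::DataFrame.new(Inventory, inventory_rdd)

    # Assembles and runs roughly:
    #   SELECT `make`, COUNT(DISTINCT `model`) AS model_count
    #   FROM inventory WHERE `year` = 2015 GROUP BY `make` LIMIT 10
    results = frame.request(
        {year: 2015},
        select: [:make, {model: {aggregate: :count_distinct, as: :model_count}}],
        group: [:make],
        limit: 10
    )

    # Raw Spark SQL against the registered temp table is also available:
    rows = frame.sql("SELECT * FROM #{frame.table_name} LIMIT 5")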

data/lib/cassandra_model_spark/java_bridge.rb
@@ -0,0 +1,91 @@
+ if RUBY_ENGINE == 'jruby'
+   class Hash
+     def to_java
+       JavaHashMap.new(self)
+     end
+   end
+
+   class Array
+     def to_java_argv
+       to_java(:string)
+     end
+   end
+
+ else
+   class Hash
+     def to_java
+       JavaHashMap.new.tap do |map|
+         each do |key, value|
+           map.put(key, value)
+         end
+       end
+     end
+   end
+
+   class Array
+     def to_java
+       self
+     end
+
+     def to_java_argv
+       self
+     end
+   end
+ end
+
+ module JavaBridge
+   if RUBY_ENGINE == 'jruby'
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       klass = "Java::#{path}"
+       Object.const_set(name, eval(klass))
+     end
+
+     def initialize_java_engine
+       # nothing to do here
+     end
+   else
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       Object.const_set(name, load_java_class(path))
+     end
+
+     def require(path)
+       # hack to make importing jars work like jruby
+       if path =~ /\.jar$/i
+         java_jar_list << path
+       else
+         super
+       end
+     end
+
+     def initialize_java_engine
+       # have to load everything in one go here
+       Rjb.load(java_jar_list * platform_path_separator)
+     end
+
+     private
+
+     def platform_path_separator
+       @platform_separator ||= RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ ? ';' : ':'
+     end
+
+     def java_jar_list
+       @java_jar_list ||= []
+     end
+
+     def load_java_class(path)
+       import_quiet { Rjb.import(path) }
+     end
+   end
+
+   def import_quiet
+     prev_verbox = $VERBOSE
+     $VERBOSE = nil
+     yield
+   ensure
+     $VERBOSE = prev_verbox
+   end
+ end
+
+ include JavaBridge
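
A short illustrative sketch (not in the diff) of what the engine shim above provides: the same conversion calls work under JRuby and under MRI with Rjb, and import_java_object exposes a Java class as a Ruby constant (the real imports live in java_classes.rb below). It assumes the required jars have been loaded and, on MRI, that initialize_java_engine has already run.

    import_java_object 'java.util.HashMap', as: 'JavaHashMap'

    conf = {'spark.master' => 'local[2]'}.to_java  # JavaHashMap on either engine
    argv = %w(--webui-port 8180).to_java_argv      # Java String[] on JRuby, plain Array on MRI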

data/lib/cassandra_model_spark/java_classes.rb
@@ -0,0 +1,36 @@
+ import_java_object 'java.util.ArrayList'
+ import_java_object 'org.apache.spark.SparkConf'
+ import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+ import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
+ import_java_object 'java.util.HashMap', as: 'JavaHashMap'
+ import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
+ import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+ import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
+ import_java_object 'org.apache.log4j.Level', as: 'JLevel'
+ import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
+ import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
+ import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+ import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
+ import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
+ import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
+ import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+ import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
+ import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
+
+ %w(BinaryType BooleanType ByteType DataType
+    DateType Decimal DecimalType DoubleType FloatType IntegerType
+    LongType Metadata NullType PrecisionInfo ShortType
+    StringType StructField StructType TimestampType).each do |sql_type|
+   Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+ end
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)

data/lib/cassandra_model_spark/launcher.rb
@@ -0,0 +1,150 @@
+ require 'socket'
+
+ module CassandraModel
+   module Spark
+     class Launcher
+       def start_master
+         system(env, "#{spark_daemon} start #{start_master_args}")
+         add_master_jars
+       end
+
+       def run_master
+         validate_env!
+
+         result = SparkMaster.startRpcEnvAndEndpoint(master_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)._1
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def start_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} start #{start_slave_args(worker)}")
+         end
+       end
+
+       def run_slave
+         validate_env!
+
+         result = SparkWorkerStarter.startWorker(master_url, slave_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def stop_master
+         system(env, "#{spark_daemon} stop #{master_args}")
+       end
+
+       def stop_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} stop #{slave_args(worker)}")
+         end
+       end
+
+       private
+
+       def spark_conf
+         @spark_conf ||= ConnectionCache[nil].send(:spark_conf)
+       end
+
+       def wait_for_shutdown
+         begin
+           loop { sleep 0.2 }
+         rescue Interrupt
+           yield
+         end
+       end
+
+       def to_argv(args)
+         args.split.to_java_argv
+       end
+
+       def validate_env!
+         unless ENV['SPARK_HOME'] && File.expand_path(ENV['SPARK_HOME']) == Spark.home
+           raise 'Spark enviroment not set correctly'
+         end
+       end
+
+       def add_master_jars
+         ConnectionCache[nil].tap do |connection|
+           connection.config = {spark: {master: master_url}}
+           connection.spark_context.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+         end
+         ConnectionCache.clear
+       end
+
+       def workers
+         slave_config[:worker_count].to_i.times.map { |index| index + 1 }
+       end
+
+       def start_master_args
+         "#{master_args} #{run_master_args}"
+       end
+
+       def run_master_args
+         "--ip #{Socket.gethostname} --port #{master_config[:master_port]} --webui-port #{master_config[:ui_port]} -h #{master_config[:host]}"
+       end
+
+       def start_slave_args(id)
+         "#{slave_args(id)} #{run_slave_args}"
+       end
+
+       def run_slave_args
+         "--webui-port #{slave_config[:ui_port]} #{master_url}"
+       end
+
+       def master_args
+         "org.apache.spark.deploy.master.Master #{master_config[:id]}"
+       end
+
+       def slave_args(id)
+         "org.apache.spark.deploy.worker.Worker #{id}"
+       end
+
+       def spark_daemon
+         "#{Spark.home}/sbin/spark-daemon.sh"
+       end
+
+       def master_url
+         "spark://#{master_config[:host]}:#{master_config[:master_port]}"
+       end
+
+       def master_config
+         config.merge(config.fetch(:master) { {} })
+       end
+
+       def slave_config
+         config.merge(config.fetch(:slave) { {} })
+       end
+
+       def config
+         @config ||= begin
+           override_config = ConnectionCache[nil].config.fetch(:spark_daemon) { {} }
+           {
+             id: 1,
+             ui_port: 8180,
+             master_port: 7077,
+             worker_count: 1,
+             host: Socket.gethostname,
+           }.merge(override_config)
+         end
+       end
+
+       def env
+         @env ||= spark_env.merge(ENV.to_hash)
+       end
+
+       def spark_env
+         @spark_env ||= {
+           'SPARK_HOME' => Spark.home,
+           'SPARK_CLASSPATH' => Spark.classpath,
+           'SPARK_JARS' => Dir["#{Spark.classpath}/*.jar"] * ',',
+         }
+       end
+
+     end
+   end
+ end
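
As a hedged illustration (not part of the diff), the daemon settings above are read from the connection's :spark_daemon config section and merged over the defaults in config (id: 1, ui_port: 8180, master_port: 7077, worker_count: 1, host: Socket.gethostname); nested :master and :slave hashes override per-role values through master_config and slave_config. The exact way the connection config is set is assumed from its use in add_master_jars.

    # Assumed configuration shape before launching the daemons.
    CassandraModel::ConnectionCache[nil].config = {
        spark_daemon: {
            worker_count: 2,
            master_port: 7177,
            slave: {ui_port: 8281},
        }
    }

    launcher = CassandraModel::Spark::Launcher.new
    launcher.start_master
    launcher.start_slaves
    # ... and later
    launcher.stop_slaves
    launcher.stop_master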

data/lib/cassandra_model_spark/query_builder.rb
@@ -0,0 +1,37 @@
+ module CassandraModel
+   class QueryBuilder
+     def group(*columns)
+       append_option(columns, :group)
+     end
+
+     def as_data_frame(options = {})
+       if @record_klass.is_a?(Spark::DataFrame)
+         data_frame_from_frame(options)
+       else
+         data_frame_from_model(options)
+       end
+     end
+
+     private
+
+     def data_frame_from_frame(options)
+       query_frame = @record_klass.query(@params, @options)
+       Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+     end
+
+     def data_frame_from_model(options)
+       updated_restriction = @record_klass.restriction_attributes(@params).inject({}) do |memo, (key, value)|
+         updated_key = if value.is_a?(Array)
+                         value = value.to_java
+                         updated_key = key.is_a?(ThomasUtils::KeyComparer) ? key.to_s : "#{key} IN"
+                         "#{updated_key} (#{(%w(?)*value.count)*','})"
+                       else
+                         key.is_a?(ThomasUtils::KeyComparer) ? "#{key} ?" : "#{key} = ?"
+                       end
+         memo.merge!(updated_key => value)
+       end.stringify_keys.to_java
+       rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+       Spark::DataFrame.new(@record_klass, rdd, options)
+     end
+   end
+ end
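
Finally, a minimal sketch of how as_data_frame above might be driven (not shown in the diff). It assumes the base cassandra_model QueryBuilder and the DataFrame's QueryHelper both expose where, so a record-backed builder takes the data_frame_from_model branch and a frame-backed builder takes data_frame_from_frame; Inventory and its columns remain hypothetical.

    # Record-backed builder: filters the record's RDD via SparkCassandraHelper
    # and wraps it in a Spark::DataFrame (data_frame_from_model).
    frame = Inventory.where(year: 2015).as_data_frame

    # Frame-backed builder: runs a Spark SQL query over the frame and re-wraps
    # the resulting Spark DataFrame under a new alias (data_frame_from_frame).
    hondas = frame.where(make: 'Honda').as_data_frame(alias: :honda_2015)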