cassandra_model_spark 0.0.1.5

Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark/build.rb +24 -0
  24. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  25. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  26. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  27. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  28. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  29. data/lib/cassandra_model_spark/launcher.rb +150 -0
  30. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  31. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  32. data/lib/cassandra_model_spark/record.rb +18 -0
  33. data/lib/cassandra_model_spark/spark.rb +33 -0
  34. data/lib/cassandra_model_spark.rb +42 -0
  35. metadata +127 -0
@@ -0,0 +1,374 @@
+ module CassandraModel
+   module Spark
+     class DataFrame
+       include QueryHelper
+
+       SQL_TYPE_MAP = {
+         int: SqlIntegerType,
+         text: SqlStringType,
+         double: SqlDoubleType,
+         timestamp: SqlTimestampType,
+       }.freeze
+       #noinspection RubyStringKeysInHashInspection
+       SQL_RUBY_TYPE_FUNCTIONS = {
+         'IntegerType' => :getInt,
+         'LongType' => :getLong,
+         'StringType' => :getString,
+         'DoubleType' => :getDouble,
+         'TimestampType' => :getTimestamp,
+         'MapType(StringType,StringType,true)' => :getMap,
+       }
+
+       attr_reader :table_name, :record_klass
+
+       def initialize(record_klass, rdd, options = {})
+         @table_name = options.fetch(:alias) { record_klass.table_name }
+         @sql_context = options[:sql_context]
+         initialize_frame_from_existing(options)
+         @record_klass = record_klass
+
+         initialize_row_mapping(options)
+         initialize_rdd(rdd)
+       end
+
+       def derived?
+         !!@derived
+       end
+
+       def sql_context
+         @sql_context ||= create_sql_context
+       end
+
+       def union(rhs)
+         unless record_klass == rhs.record_klass
+           raise ArgumentError, 'Cannot union DataFrames with different Record types!'
+         end
+         DataFrame.new(record_klass, rdd.union(rhs.rdd))
+       end
+
+       def spark_data_frame
+         @frame ||= SparkSchemaBuilder.new.tap do |builder|
+           record_klass.cassandra_columns.each do |name, type|
+             select_name = record_klass.normalized_column(name)
+             mapped_type = row_type_mapping[select_name]
+             type = if mapped_type
+                      name = mapped_type[:name]
+                      mapped_type[:type]
+                    else
+                      SQL_TYPE_MAP.fetch(type) { SqlStringType }
+                    end
+             builder.add_column(name.to_s, type)
+           end
+         end.create_data_frame(sql_context, rdd).tap { |frame| frame.register_temp_table(table_name.to_s) }
+       end
+
+       def cache
+         spark_data_frame.cache
+       end
+
+       def uncache
+         spark_data_frame.unpersist
+       end
+
+       def cached(&block)
+         spark_data_frame.cache
+         instance_eval(&block)
+         spark_data_frame.unpersist
+       end
+
+       def normalized(alias_table_name = nil)
+         return self unless rdd
+
+         select_options = record_klass.columns.inject({}) do |memo, column|
+           row_mapped_column = row_type_mapping.fetch(column) { {name: column} }[:name]
+           memo.merge!(row_mapped_column => {as: row_mapped_column})
+         end
+         alias_name = alias_table_name || :"normalized_#{table_name}"
+         select(select_options).as_data_frame(alias: alias_name)
+       end
+
+       def request_async(*_)
+         ResultPaginator.new(first_async) {}
+       end
+
+       def first_async(*_)
+         Cassandra::Future.error(NotImplementedError.new)
+       end
+
+       def sql(query)
+         spark_data_frame
+         query = sql_context.sql(query)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+
+       end
+
+       def query(restriction, options)
+         spark_data_frame
+         select_clause = select_columns(options)
+         group_clause = group_clause(:group, 'GROUP BY', options)
+         order_clause = group_clause(:order_by, 'ORDER BY', options)
+         limit_clause = if options[:limit]
+                          " LIMIT #{options[:limit]}"
+                        end
+         where_clause = query_where_clause(restriction)
+         sql_context.sql("SELECT #{select_clause} FROM #{table_name}#{where_clause}#{group_clause}#{order_clause}#{limit_clause}")
+       end
+
+       def request(restriction = {}, options = {})
+         query = query(restriction, options)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+       end
+
+       def first(restriction = {}, options = {})
+         query = query(restriction, options)
+         row = query.first
+         row_to_record(query.schema, row)
+       end
+
+       def ==(rhs)
+         rhs.is_a?(DataFrame) &&
+             record_klass == rhs.record_klass &&
+             ((rdd && rdd == rhs.rdd) || (!rdd && spark_data_frame == rhs.spark_data_frame))
+       end
+
+       protected
+
+       attr_reader :rdd
+
+       private
+
+       def initialize_frame_from_existing(options)
+         @frame = options[:spark_data_frame]
+         if @frame
+           raise ArgumentError, 'DataFrames created from Spark DataFrames require aliases!' unless options[:alias]
+           @frame.register_temp_table(options[:alias].to_s)
+           @sql_context = @frame.sql_context
+         end
+       end
+
+       def initialize_rdd(rdd)
+         if rdd
+           @rdd = if @row_mapping[:mapper]
+                    @row_mapping[:mapper].mappedRDD(rdd)
+                  else
+                    rdd
+                  end
+         else
+           @derived = true
+         end
+       end
+
+       def initialize_row_mapping(options)
+         @row_mapping = options.fetch(:row_mapping) do
+           @record_klass.rdd_row_mapping || {}
+         end
+       end
+
+       def row_type_mapping
+         @row_mapping[:type_map] ||= {}
+       end
+
+       def create_sql_context
+         CassandraSQLContext.new(record_klass.table.connection.spark_context).tap do |context|
+           context.setKeyspace(record_klass.table.connection.config[:keyspace])
+         end
+       end
+
+       def row_to_record(schema, row)
+         attributes = row_attributes(row, schema)
+
+         if valid_record?(attributes)
+           record_klass.new(attributes)
+         else
+           attributes
+         end
+       end
+
+       def row_attributes(row, schema)
+         attributes = {}
+         schema.fields.each_with_index do |field, index|
+           value = field_value(field, index, row)
+           column = field.name
+           attributes.merge!(column => value)
+         end
+         record_klass.normalized_attributes(attributes)
+       end
+
+       def valid_record?(attributes)
+         available_columns = record_klass.columns + record_klass.deferred_columns
+         attributes.keys.all? { |column| available_columns.include?(column) }
+       end
+
+       def field_value(field, index, row)
+         data_type = field.data_type
+         if column_is_struct?(data_type)
+           row_attributes(row.get(index), data_type)
+         else
+           decode_column_value(data_type, index, row)
+         end
+       end
+
+       def decode_column_value(data_type, index, row)
+         sql_type = data_type.to_string
+         converter = SQL_RUBY_TYPE_FUNCTIONS.fetch(sql_type) { :getString }
+         value = row.public_send(converter, index)
+
+         value = decode_hash(value) if column_is_string_map?(sql_type)
+         value
+       end
+
+       def decode_hash(value)
+         Hash[value.toSeq.array.to_a.map! { |pair| [pair._1.to_string, pair._2.to_string] }]
+       end
+
+       def column_is_string_map?(sql_type)
+         sql_type == 'MapType(StringType,StringType,true)'
+       end
+
+       def column_is_struct?(data_type)
+         data_type.getClass.getSimpleName == 'StructType'
+       end
+
+       def select_columns(options)
+         options[:select] ? clean_select_columns(options) * ', ' : '*'
+       end
+
+       def group_clause(type, prefix, options)
+         if options[type]
+           updated_clause = options[type].map do |column|
+             if column.is_a?(Hash)
+               column, direction = column.first
+               updated_column = quoted_column(column)
+               "#{updated_column} #{direction.upcase}"
+             else
+               quoted_column(column)
+             end
+           end * ', '
+           " #{prefix} #{updated_clause}"
+         end
+       end
+
+       def group_child_clause(child, updated_column)
+         child, direction = if child.is_a?(Hash)
+                              child.first
+                            else
+                              [child]
+                            end
+         direction_clause = (" #{direction.upcase}" if direction)
+         "#{updated_column}.`#{child}`#{direction_clause}"
+       end
+
+       def clean_select_columns(options)
+         options[:select].map do |column|
+           if column.is_a?(Hash)
+             updated_column(column)
+           else
+             quoted_column(column)
+           end
+         end
+       end
+
+       def updated_column(column)
+         column, options = column.first
+
+         if options.is_a?(Symbol)
+           updated_column = if column.is_a?(ThomasUtils::KeyChild)
+                              "#{column}".gsub(/\./, '_')
+                            else
+                              column
+                            end
+           options = {aggregate: options, as: :"#{updated_column}_#{options}"}
+         end
+
+         column = quoted_column(column)
+         column = aggregate_column(column, options) if options[:aggregate]
+         column = "#{column} AS #{options[:as]}" if options[:as]
+         column
+       end
+
+       def quoted_column(column)
+         return column.map { |child_column| quoted_column(child_column) } * ', ' if column.is_a?(Array)
+
+         if column == :*
+           '*'
+         elsif column.respond_to?(:quote)
+           column.quote('`')
+         else
+           "`#{select_column(column)}`"
+         end
+       end
+
+       def aggregate_column(column, options)
+         case options[:aggregate]
+           when :count_distinct
+             "COUNT(#{distinct_aggregate(column)})"
+           when :distinct
+             distinct_aggregate(column)
+           when :variance
+             variance_column(column)
+           when :stddev
+             "POW(#{variance_column(column)},0.5)"
+           else
+             if options[:aggregate] =~ /^cast_/
+               type = options[:aggregate].to_s.match(/^cast_(.+)$/)[1]
+               "CAST(#{column} AS #{type.upcase})"
+             else
+               "#{options[:aggregate].to_s.upcase}(#{column})"
+             end
+         end
+       end
+
+       def distinct_aggregate(column)
+         "DISTINCT #{column}"
+       end
+
+       def variance_column(column)
+         "AVG(POW(#{column},2)) - POW(AVG(#{column}),2)"
+       end
+
+       def query_where_clause(restriction)
+         if restriction.present?
+           restriction_clause = restriction.map do |key, value|
+             updated_key = if key.is_a?(ThomasUtils::KeyComparer)
+                             select_key = if key.key.respond_to?(:new_key)
+                                            select_key = select_column(key.key.key)
+                                            key.key.new_key(select_key)
+                                          else
+                                            select_column(key.key)
+                                          end
+                             key.new_key(select_key).quote('`')
+                           elsif key.is_a?(ThomasUtils::KeyChild)
+                             new_key = select_column(key.key)
+                             updated_key = key.new_key(new_key)
+                             quoted_restriction(updated_key)
+                           else
+                             select_key = select_column(key)
+                             quoted_restriction(select_key)
+                           end
+             value = "'#{value}'" if value.is_a?(String) || value.is_a?(Time)
+             "#{updated_key} #{value}"
+           end * ' AND '
+           " WHERE #{restriction_clause}"
+         end
+       end
+
+       def select_column(key)
+         new_key = record_klass.select_column(key)
+         available_columns.include?(new_key) ? new_key : key
+       end
+
+       def available_columns
+         @available_columns ||= spark_data_frame.schema.fields.map(&:name).map(&:to_sym)
+       end
+
+       def quoted_restriction(updated_key)
+         ThomasUtils::KeyComparer.new(updated_key, '=').quote('`')
+       end
+
+     end
+   end
+ end
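
A rough usage sketch of the DataFrame API above, assuming a hypothetical Users record class that has been extended with this gem's Record additions (so it exposes an rdd) and a connection whose Spark context is already configured; column names are illustrative only:

  # Wrap a record class and its Cassandra-backed RDD.
  users = CassandraModel::Spark::DataFrame.new(Users, Users.rdd)

  # Run raw Spark SQL against the temp table registered under the record's table name;
  # rows come back as Users records, or raw attribute hashes when columns don't line up.
  rows = users.sql("SELECT * FROM #{users.table_name} LIMIT 10")

  # Or let #first/#request build the query from a restriction and options hash.
  first_admin = users.first({role: 'admin'}, select: [:id, :name])

  # Cache the underlying Spark DataFrame for the duration of a block
  # (the block is instance_eval'd, so #request is called on the frame itself).
  users.cached do
    request({country: 'DE'}, limit: 100)
  end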
@@ -0,0 +1,91 @@
+ if RUBY_ENGINE == 'jruby'
+   class Hash
+     def to_java
+       JavaHashMap.new(self)
+     end
+   end
+
+   class Array
+     def to_java_argv
+       to_java(:string)
+     end
+   end
+
+ else
+   class Hash
+     def to_java
+       JavaHashMap.new.tap do |map|
+         each do |key, value|
+           map.put(key, value)
+         end
+       end
+     end
+   end
+
+   class Array
+     def to_java
+       self
+     end
+
+     def to_java_argv
+       self
+     end
+   end
+ end
+
+ module JavaBridge
+   if RUBY_ENGINE == 'jruby'
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       klass = "Java::#{path}"
+       Object.const_set(name, eval(klass))
+     end
+
+     def initialize_java_engine
+       # nothing to do here
+     end
+   else
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       Object.const_set(name, load_java_class(path))
+     end
+
+     def require(path)
+       # hack to make importing jars work like jruby
+       if path =~ /\.jar$/i
+         java_jar_list << path
+       else
+         super
+       end
+     end
+
+     def initialize_java_engine
+       # have to load everything in one go here
+       Rjb.load(java_jar_list * platform_path_separator)
+     end
+
+     private
+
+     def platform_path_separator
+       @platform_separator ||= RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ ? ';' : ':'
+     end
+
+     def java_jar_list
+       @java_jar_list ||= []
+     end
+
+     def load_java_class(path)
+       import_quiet { Rjb.import(path) }
+     end
+   end
+
+   def import_quiet
+     prev_verbose = $VERBOSE
+     $VERBOSE = nil
+     yield
+   ensure
+     $VERBOSE = prev_verbose
+   end
+ end
+
+ include JavaBridge
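
A minimal sketch of driving the bridge; the class path below is only an example, and on MRI the imported constants are backed by Rjb proxies rather than JRuby's native integration:

  # On MRI this loads every jar collected via the overridden #require through Rjb
  # in one go; on JRuby it is a no-op.
  initialize_java_engine

  # Import a JVM class under a friendlier constant name (JavaHashMap is also the
  # constant the Hash#to_java shim above relies on).
  import_java_object 'java.util.HashMap', as: 'JavaHashMap'

  {'keyspace' => 'test'}.to_java   # => a java.util.HashMap on either engine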
@@ -0,0 +1,36 @@
+ import_java_object 'java.util.ArrayList'
+ import_java_object 'org.apache.spark.SparkConf'
+ import_java_object 'org.apache.spark.api.java.JavaSparkContext'
+ import_java_object 'org.apache.spark.sql.cassandra.CassandraSQLContext'
+ import_java_object 'java.util.HashMap', as: 'JavaHashMap'
+ import_java_object 'org.apache.spark.sql.SQLContext', as: 'SparkSQLContext'
+ import_java_object 'org.apache.spark.sql.RowFactory', as: 'SparkRowFactory'
+ import_java_object 'org.apache.log4j.Logger', as: 'JLogger'
+ import_java_object 'org.apache.log4j.Level', as: 'JLevel'
+ import_java_object 'org.apache.log4j.Priority', as: 'JPriority'
+ import_java_object 'org.apache.spark.util.Utils', as: 'SparkUtils'
+ import_java_object 'org.apache.spark.storage.StorageLevel', as: 'JStorageLevel'
+ import_java_object 'org.apache.spark.api.cassandra_model.CassandraHelper', as: 'SparkCassandraHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.SchemaBuilder', as: 'SparkSchemaBuilder'
+ import_java_object 'org.apache.spark.api.cassandra_model.DataTypeHelper', as: 'SparkSqlDataTypeHelper'
+ import_java_object 'org.apache.spark.api.cassandra_model.MarshalLoader', as: 'ScalaMarshalLoader'
+ import_java_object 'org.apache.spark.api.cassandra_model.MapStringStringRowMapping', as: 'SparkMapStringStringRowMapping'
+ import_java_object 'org.apache.spark.api.cassandra_model.SparkRowRowMapping', as: 'SparkSparkRowRowMapping'
+ import_java_object 'org.apache.spark.deploy.master.Master', as: 'SparkMaster'
+ import_java_object 'org.apache.spark.deploy.worker.RubyWorkerStarter', as: 'SparkWorkerStarter'
+
+ %w(BinaryType BooleanType ByteType DataType
+    DateType Decimal DecimalType DoubleType FloatType IntegerType
+    LongType Metadata NullType PrecisionInfo ShortType
+    StringType StructField StructType TimestampType).each do |sql_type|
+   Object.const_set(:"Sql#{sql_type}", import_quiet { SparkSqlDataTypeHelper.public_send(:"get#{sql_type}") })
+ end
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringArrayType = SparkSqlDataTypeHelper.getArrayType(SqlStringType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlIntegerArrayType = SparkSqlDataTypeHelper.getArrayType(SqlIntegerType)
+
+ #noinspection RubyConstantNamingConvention
+ SqlStringStringMapType = SparkSqlDataTypeHelper.getMapType(SqlStringType, SqlStringType)
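
The generated Sql* constants are what DataFrame's SQL_TYPE_MAP refers to. A short sketch of building a schema by hand with them, using the same SparkSchemaBuilder calls that DataFrame#spark_data_frame makes above (the column names here are made up):

  builder = SparkSchemaBuilder.new
  builder.add_column('id', SqlIntegerType)
  builder.add_column('name', SqlStringType)
  builder.add_column('tags', SqlStringArrayType)
  builder.add_column('properties', SqlStringStringMapType)
  # builder.create_data_frame(sql_context, rdd) would then yield a typed Spark DataFrame
  # registered against whatever RDD you hand it.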
@@ -0,0 +1,150 @@
+ require 'socket'
+
+ module CassandraModel
+   module Spark
+     class Launcher
+       def start_master
+         system(env, "#{spark_daemon} start #{start_master_args}")
+         add_master_jars
+       end
+
+       def run_master
+         validate_env!
+
+         result = SparkMaster.startRpcEnvAndEndpoint(master_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)._1
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def start_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} start #{start_slave_args(worker)}")
+         end
+       end
+
+       def run_slave
+         validate_env!
+
+         result = SparkWorkerStarter.startWorker(master_url, slave_config[:host], master_config[:master_port], master_config[:ui_port], spark_conf)
+         wait_for_shutdown do
+           result.shutdown
+           result.awaitTermination
+         end
+       end
+
+       def stop_master
+         system(env, "#{spark_daemon} stop #{master_args}")
+       end
+
+       def stop_slaves
+         workers.map do |worker|
+           system(env, "#{spark_daemon} stop #{slave_args(worker)}")
+         end
+       end
+
+       private
+
+       def spark_conf
+         @spark_conf ||= ConnectionCache[nil].send(:spark_conf)
+       end
+
+       def wait_for_shutdown
+         begin
+           loop { sleep 0.2 }
+         rescue Interrupt
+           yield
+         end
+       end
+
+       def to_argv(args)
+         args.split.to_java_argv
+       end
+
+       def validate_env!
+         unless ENV['SPARK_HOME'] && File.expand_path(ENV['SPARK_HOME']) == Spark.home
+           raise 'Spark environment not set correctly'
+         end
+       end
+
+       def add_master_jars
+         ConnectionCache[nil].tap do |connection|
+           connection.config = {spark: {master: master_url}}
+           connection.spark_context.addJar("#{Spark.classpath}/cmodel_scala_helper.jar")
+         end
+         ConnectionCache.clear
+       end
+
+       def workers
+         slave_config[:worker_count].to_i.times.map { |index| index + 1 }
+       end
+
+       def start_master_args
+         "#{master_args} #{run_master_args}"
+       end
+
+       def run_master_args
+         "--ip #{Socket.gethostname} --port #{master_config[:master_port]} --webui-port #{master_config[:ui_port]} -h #{master_config[:host]}"
+       end
+
+       def start_slave_args(id)
+         "#{slave_args(id)} #{run_slave_args}"
+       end
+
+       def run_slave_args
+         "--webui-port #{slave_config[:ui_port]} #{master_url}"
+       end
+
+       def master_args
+         "org.apache.spark.deploy.master.Master #{master_config[:id]}"
+       end
+
+       def slave_args(id)
+         "org.apache.spark.deploy.worker.Worker #{id}"
+       end
+
+       def spark_daemon
+         "#{Spark.home}/sbin/spark-daemon.sh"
+       end
+
+       def master_url
+         "spark://#{master_config[:host]}:#{master_config[:master_port]}"
+       end
+
+       def master_config
+         config.merge(config.fetch(:master) { {} })
+       end
+
+       def slave_config
+         config.merge(config.fetch(:slave) { {} })
+       end
+
+       def config
+         @config ||= begin
+           override_config = ConnectionCache[nil].config.fetch(:spark_daemon) { {} }
+           {
+             id: 1,
+             ui_port: 8180,
+             master_port: 7077,
+             worker_count: 1,
+             host: Socket.gethostname,
+           }.merge(override_config)
+         end
+       end
+
+       def env
+         @env ||= spark_env.merge(ENV.to_hash)
+       end
+
+       def spark_env
+         @spark_env ||= {
+           'SPARK_HOME' => Spark.home,
+           'SPARK_CLASSPATH' => Spark.classpath,
+           'SPARK_JARS' => Dir["#{Spark.classpath}/*.jar"] * ',',
+         }
+       end
+
+     end
+   end
+ end
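
A hedged sketch of driving the launcher, assuming SPARK_HOME points at Spark.home and that any overrides live under the spark_daemon key of the connection config as read by #config above:

  launcher = CassandraModel::Spark::Launcher.new

  # Daemonized cluster via $SPARK_HOME/sbin/spark-daemon.sh
  launcher.start_master
  launcher.start_slaves
  # ... run jobs ...
  launcher.stop_slaves
  launcher.stop_master

  # Or run a master in the foreground; this blocks until Interrupt (Ctrl-C),
  # then shuts down the Spark RpcEnv it started.
  # launcher.run_master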
@@ -0,0 +1,37 @@
+ module CassandraModel
+   class QueryBuilder
+     def group(*columns)
+       append_option(columns, :group)
+     end
+
+     def as_data_frame(options = {})
+       if @record_klass.is_a?(Spark::DataFrame)
+         data_frame_from_frame(options)
+       else
+         data_frame_from_model(options)
+       end
+     end
+
+     private
+
+     def data_frame_from_frame(options)
+       query_frame = @record_klass.query(@params, @options)
+       Spark::DataFrame.new(@record_klass.record_klass, nil, options.merge(spark_data_frame: query_frame))
+     end
+
+     def data_frame_from_model(options)
+       updated_restriction = @record_klass.restriction_attributes(@params).inject({}) do |memo, (key, value)|
+         updated_key = if value.is_a?(Array)
+                         value = value.to_java
+                         updated_key = key.is_a?(ThomasUtils::KeyComparer) ? key.to_s : "#{key} IN"
+                         "#{updated_key} (#{(%w(?)*value.count)*','})"
+                       else
+                         key.is_a?(ThomasUtils::KeyComparer) ? "#{key} ?" : "#{key} = ?"
+                       end
+         memo.merge!(updated_key => value)
+       end.stringify_keys.to_java
+       rdd = SparkCassandraHelper.filterRDD(@record_klass.rdd, updated_restriction)
+       Spark::DataFrame.new(@record_klass, rdd, options)
+     end
+   end
+ end
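
A sketch of how #as_data_frame ties the query builder into the Spark DataFrame defined earlier; Users and its columns are hypothetical, and the where/select helpers are assumed to come from the base CassandraModel::QueryBuilder:

  # Push a Cassandra restriction down into the RDD, then continue as a DataFrame.
  frame = Users.where(country: 'DE').as_data_frame

  # Chaining from an existing frame goes through data_frame_from_frame, producing a
  # derived frame backed by a Spark SQL query, which is why an alias is required here.
  names_frame = frame.select(:name).as_data_frame(alias: :users_by_name)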