cassandra_model_spark 0.0.1.5-java

Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark.rb +42 -0
  24. data/lib/cassandra_model_spark/build.rb +24 -0
  25. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  26. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  27. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  28. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  29. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  30. data/lib/cassandra_model_spark/launcher.rb +150 -0
  31. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  32. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  33. data/lib/cassandra_model_spark/record.rb +18 -0
  34. data/lib/cassandra_model_spark/spark.rb +33 -0
  35. metadata +113 -0
data/lib/cassandra_model_spark/column_cast.rb
@@ -0,0 +1,44 @@
+ module CassandraModel
+   module Spark
+     class ColumnCast
+       include ThomasUtils::SymbolHelpers
+
+       attr_reader :key
+
+       def initialize(key, type)
+         @key = key
+         @type = type.to_s.upcase
+       end
+
+       def quote(quote)
+         quoted_key = if @key.respond_to?(:quote)
+                        @key.quote(quote)
+                      else
+                        "#{quote}#{@key}#{quote}"
+                      end
+         "CAST(#{quoted_key} AS #{@type})"
+       end
+
+       def new_key(key)
+         self.class.new(key, @type)
+       end
+
+     end
+   end
+ end
+
+ module ThomasUtils
+   class KeyChild
+     def cast_as(type)
+       CassandraModel::Spark::ColumnCast.new(self, type)
+     end
+     alias :* :cast_as
+   end
+ end
+
+ class Symbol
+   def cast_as(type)
+     CassandraModel::Spark::ColumnCast.new(self, type)
+   end
+   alias :* :cast_as
+ end
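For illustration, a minimal sketch of the cast DSL this file adds (assuming the gem and `thomas_utils` are loaded; the `:price` column is hypothetical):

```ruby
require 'cassandra_model_spark'

# Symbol#cast_as wraps the column in a ColumnCast; #quote renders the SQL.
cast = :price.cast_as(:double)
cast.quote('`')                 # => "CAST(`price` AS DOUBLE)"

# The :* alias reads more tersely inside select/where options:
(:price * :double).quote('`')   # => "CAST(`price` AS DOUBLE)"
```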
data/lib/cassandra_model_spark/connection_cache.rb
@@ -0,0 +1,9 @@
+ module CassandraModel
+   class ConnectionCache
+     def self.clear
+       @@cache.values.map(&:java_spark_context).map(&:stop)
+       @@cache.values.map(&:shutdown)
+       @@cache.clear
+     end
+   end
+ end
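This patch makes `ConnectionCache.clear` Spark-aware: it stops each cached connection's Java Spark context before shutting the connection down. A hedged usage sketch (the teardown hook is illustrative, not part of the gem):

```ruby
# Hypothetical teardown at the end of a Spark job run;
# assumes connections were opened through CassandraModel.
at_exit { CassandraModel::ConnectionCache.clear }
```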
data/lib/cassandra_model_spark/data_frame.rb
@@ -0,0 +1,374 @@
+ module CassandraModel
+   module Spark
+     class DataFrame
+       include QueryHelper
+
+       SQL_TYPE_MAP = {
+         int: SqlIntegerType,
+         text: SqlStringType,
+         double: SqlDoubleType,
+         timestamp: SqlTimestampType,
+       }.freeze
+       #noinspection RubyStringKeysInHashInspection
+       SQL_RUBY_TYPE_FUNCTIONS = {
+         'IntegerType' => :getInt,
+         'LongType' => :getLong,
+         'StringType' => :getString,
+         'DoubleType' => :getDouble,
+         'TimestampType' => :getTimestamp,
+         'MapType(StringType,StringType,true)' => :getMap,
+       }
+
+       attr_reader :table_name, :record_klass
+
+       def initialize(record_klass, rdd, options = {})
+         @table_name = options.fetch(:alias) { record_klass.table_name }
+         @sql_context = options[:sql_context]
+         initialize_frame_from_existing(options)
+         @record_klass = record_klass
+
+         initialize_row_mapping(options)
+         initialize_rdd(rdd)
+       end
+
+       def derived?
+         !!@derived
+       end
+
+       def sql_context
+         @sql_context ||= create_sql_context
+       end
+
+       def union(rhs)
+         unless record_klass == rhs.record_klass
+           raise ArgumentError, 'Cannot union DataFrames with different Record types!'
+         end
+         DataFrame.new(record_klass, rdd.union(rhs.rdd))
+       end
+
+       def spark_data_frame
+         @frame ||= SparkSchemaBuilder.new.tap do |builder|
+           record_klass.cassandra_columns.each do |name, type|
+             select_name = record_klass.normalized_column(name)
+             mapped_type = row_type_mapping[select_name]
+             type = if mapped_type
+                      name = mapped_type[:name]
+                      mapped_type[:type]
+                    else
+                      SQL_TYPE_MAP.fetch(type) { SqlStringType }
+                    end
+             builder.add_column(name.to_s, type)
+           end
+         end.create_data_frame(sql_context, rdd).tap { |frame| frame.register_temp_table(table_name.to_s) }
+       end
+
+       def cache
+         spark_data_frame.cache
+       end
+
+       def uncache
+         spark_data_frame.unpersist
+       end
+
+       def cached(&block)
+         spark_data_frame.cache
+         instance_eval(&block)
+         spark_data_frame.unpersist
+       end
+
+       def normalized(alias_table_name = nil)
+         return self unless rdd
+
+         select_options = record_klass.columns.inject({}) do |memo, column|
+           row_mapped_column = row_type_mapping.fetch(column) { {name: column} }[:name]
+           memo.merge!(row_mapped_column => {as: row_mapped_column})
+         end
+         alias_name = alias_table_name || :"normalized_#{table_name}"
+         select(select_options).as_data_frame(alias: alias_name)
+       end
+
+       def request_async(*_)
+         ResultPaginator.new(first_async) {}
+       end
+
+       def first_async(*_)
+         Cassandra::Future.error(NotImplementedError.new)
+       end
+
+       def sql(query)
+         spark_data_frame
+         query = sql_context.sql(query)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+       end
+
+       def query(restriction, options)
+         spark_data_frame
+         select_clause = select_columns(options)
+         group_clause = group_clause(:group, 'GROUP BY', options)
+         order_clause = group_clause(:order_by, 'ORDER BY', options)
+         limit_clause = if options[:limit]
+                          " LIMIT #{options[:limit]}"
+                        end
+         where_clause = query_where_clause(restriction)
+         sql_context.sql("SELECT #{select_clause} FROM #{table_name}#{where_clause}#{group_clause}#{order_clause}#{limit_clause}")
+       end
+
+       def request(restriction = {}, options = {})
+         query = query(restriction, options)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+       end
+
+       def first(restriction = {}, options = {})
+         query = query(restriction, options)
+         row = query.first
+         row_to_record(query.schema, row)
+       end
+
+       def ==(rhs)
+         rhs.is_a?(DataFrame) &&
+             record_klass == rhs.record_klass &&
+             ((rdd && rdd == rhs.rdd) || (!rdd && spark_data_frame == rhs.spark_data_frame))
+       end
+
+       protected
+
+       attr_reader :rdd
+
+       private
+
+       def initialize_frame_from_existing(options)
+         @frame = options[:spark_data_frame]
+         if @frame
+           raise ArgumentError, 'DataFrames created from Spark DataFrames require aliases!' unless options[:alias]
+           @frame.register_temp_table(options[:alias].to_s)
+           @sql_context = @frame.sql_context
+         end
+       end
+
+       def initialize_rdd(rdd)
+         if rdd
+           @rdd = if @row_mapping[:mapper]
+                    @row_mapping[:mapper].mappedRDD(rdd)
+                  else
+                    rdd
+                  end
+         else
+           @derived = true
+         end
+       end
+
+       def initialize_row_mapping(options)
+         @row_mapping = options.fetch(:row_mapping) do
+           @record_klass.rdd_row_mapping || {}
+         end
+       end
+
+       def row_type_mapping
+         @row_mapping[:type_map] ||= {}
+       end
+
+       def create_sql_context
+         CassandraSQLContext.new(record_klass.table.connection.spark_context).tap do |context|
+           context.setKeyspace(record_klass.table.connection.config[:keyspace])
+         end
+       end
+
+       def row_to_record(schema, row)
+         attributes = row_attributes(row, schema)
+
+         if valid_record?(attributes)
+           record_klass.new(attributes)
+         else
+           attributes
+         end
+       end
+
+       def row_attributes(row, schema)
+         attributes = {}
+         schema.fields.each_with_index do |field, index|
+           value = field_value(field, index, row)
+           column = field.name
+           attributes.merge!(column => value)
+         end
+         record_klass.normalized_attributes(attributes)
+       end
+
+       def valid_record?(attributes)
+         available_columns = record_klass.columns + record_klass.deferred_columns
+         attributes.keys.all? { |column| available_columns.include?(column) }
+       end
+
+       def field_value(field, index, row)
+         data_type = field.data_type
+         if column_is_struct?(data_type)
+           row_attributes(row.get(index), data_type)
+         else
+           decode_column_value(data_type, index, row)
+         end
+       end
+
+       def decode_column_value(data_type, index, row)
+         sql_type = data_type.to_string
+         converter = SQL_RUBY_TYPE_FUNCTIONS.fetch(sql_type) { :getString }
+         value = row.public_send(converter, index)
+
+         value = decode_hash(value) if column_is_string_map?(sql_type)
+         value
+       end
+
+       def decode_hash(value)
+         Hash[value.toSeq.array.to_a.map! { |pair| [pair._1.to_string, pair._2.to_string] }]
+       end
+
+       def column_is_string_map?(sql_type)
+         sql_type == 'MapType(StringType,StringType,true)'
+       end
+
+       def column_is_struct?(data_type)
+         data_type.getClass.getSimpleName == 'StructType'
+       end
+
+       def select_columns(options)
+         options[:select] ? clean_select_columns(options) * ', ' : '*'
+       end
+
+       def group_clause(type, prefix, options)
+         if options[type]
+           updated_clause = options[type].map do |column|
+             if column.is_a?(Hash)
+               column, direction = column.first
+               updated_column = quoted_column(column)
+               "#{updated_column} #{direction.upcase}"
+             else
+               quoted_column(column)
+             end
+           end * ', '
+           " #{prefix} #{updated_clause}"
+         end
+       end
+
+       def group_child_clause(child, updated_column)
+         child, direction = if child.is_a?(Hash)
+                              child.first
+                            else
+                              [child]
+                            end
+         direction_clause = (" #{direction.upcase}" if direction)
+         "#{updated_column}.`#{child}`#{direction_clause}"
+       end
+
+       def clean_select_columns(options)
+         options[:select].map do |column|
+           if column.is_a?(Hash)
+             updated_column(column)
+           else
+             quoted_column(column)
+           end
+         end
+       end
+
+       def updated_column(column)
+         column, options = column.first
+
+         if options.is_a?(Symbol)
+           updated_column = if column.is_a?(ThomasUtils::KeyChild)
+                              "#{column}".gsub(/\./, '_')
+                            else
+                              column
+                            end
+           options = {aggregate: options, as: :"#{updated_column}_#{options}"}
+         end
+
+         column = quoted_column(column)
+         column = aggregate_column(column, options) if options[:aggregate]
+         column = "#{column} AS #{options[:as]}" if options[:as]
+         column
+       end
+
+       def quoted_column(column)
+         return column.map { |child_column| quoted_column(child_column) } * ', ' if column.is_a?(Array)
+
+         if column == :*
+           '*'
+         elsif column.respond_to?(:quote)
+           column.quote('`')
+         else
+           "`#{select_column(column)}`"
+         end
+       end
+
+       def aggregate_column(column, options)
+         case options[:aggregate]
+         when :count_distinct
+           "COUNT(#{distinct_aggregate(column)})"
+         when :distinct
+           distinct_aggregate(column)
+         when :variance
+           variance_column(column)
+         when :stddev
+           "POW(#{variance_column(column)},0.5)"
+         else
+           if options[:aggregate] =~ /^cast_/
+             type = options[:aggregate].to_s.match(/^cast_(.+)$/)[1]
+             "CAST(#{column} AS #{type.upcase})"
+           else
+             "#{options[:aggregate].to_s.upcase}(#{column})"
+           end
+         end
+       end
+
+       def distinct_aggregate(column)
+         "DISTINCT #{column}"
+       end
+
+       def variance_column(column)
+         "AVG(POW(#{column},2)) - POW(AVG(#{column}),2)"
+       end
+
+       def query_where_clause(restriction)
+         if restriction.present?
+           restriction_clause = restriction.map do |key, value|
+             updated_key = if key.is_a?(ThomasUtils::KeyComparer)
+                             select_key = if key.key.respond_to?(:new_key)
+                                            select_key = select_column(key.key.key)
+                                            key.key.new_key(select_key)
+                                          else
+                                            select_column(key.key)
+                                          end
+                             key.new_key(select_key).quote('`')
+                           elsif key.is_a?(ThomasUtils::KeyChild)
+                             new_key = select_column(key.key)
+                             updated_key = key.new_key(new_key)
+                             quoted_restriction(updated_key)
+                           else
+                             select_key = select_column(key)
+                             quoted_restriction(select_key)
+                           end
+             value = "'#{value}'" if value.is_a?(String) || value.is_a?(Time)
+             "#{updated_key} #{value}"
+           end * ' AND '
+           " WHERE #{restriction_clause}"
+         end
+       end
+
+       def select_column(key)
+         new_key = record_klass.select_column(key)
+         available_columns.include?(new_key) ? new_key : key
+       end
+
+       def available_columns
+         @available_columns ||= spark_data_frame.schema.fields.map(&:name).map(&:to_sym)
+       end
+
+       def quoted_restriction(updated_key)
+         ThomasUtils::KeyComparer.new(updated_key, '=').quote('`')
+       end
+
+     end
+   end
+ end
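To make the query-building path concrete, here is a hedged sketch of how `request` assembles Spark SQL. The `Store` record class and its columns are hypothetical, and `as_data_frame` is assumed to be the query-builder hook added elsewhere in this gem (see `query_builder.rb` in the file list):

```ruby
# Hypothetical record class backed by a Cassandra table.
class Store < CassandraModel::Record
end

frame = Store.all.as_data_frame  # a CassandraModel::Spark::DataFrame

# Restriction and options map onto the generated SQL roughly as:
#   SELECT `city`, AVG(`revenue`) AS revenue_avg FROM stores
#   WHERE `city` = 'Tokyo' GROUP BY `city` LIMIT 10
records = frame.request(
  {city: 'Tokyo'},
  select: [:city, {revenue: :avg}],
  group: [:city],
  limit: 10
)

# Arbitrary Spark SQL against the registered temp table also works:
frame.sql("SELECT COUNT(*) FROM #{frame.table_name}")
```

Note how `{revenue: :avg}` flows through `updated_column`: the symbol option expands to `{aggregate: :avg, as: :revenue_avg}`, which `aggregate_column` renders as `AVG(`revenue`) AS revenue_avg`.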
data/lib/cassandra_model_spark/java_bridge.rb
@@ -0,0 +1,91 @@
+ if RUBY_ENGINE == 'jruby'
+   class Hash
+     def to_java
+       JavaHashMap.new(self)
+     end
+   end
+
+   class Array
+     def to_java_argv
+       to_java(:string)
+     end
+   end
+
+ else
+   class Hash
+     def to_java
+       JavaHashMap.new.tap do |map|
+         each do |key, value|
+           map.put(key, value)
+         end
+       end
+     end
+   end
+
+   class Array
+     def to_java
+       self
+     end
+
+     def to_java_argv
+       self
+     end
+   end
+ end
+
+ module JavaBridge
+   if RUBY_ENGINE == 'jruby'
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       klass = "Java::#{path}"
+       Object.const_set(name, eval(klass))
+     end
+
+     def initialize_java_engine
+       # nothing to do here
+     end
+   else
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       Object.const_set(name, load_java_class(path))
+     end
+
+     def require(path)
+       # hack to make importing jars work like jruby
+       if path =~ /\.jar$/i
+         java_jar_list << path
+       else
+         super
+       end
+     end
+
+     def initialize_java_engine
+       # have to load everything in one go here
+       Rjb.load(java_jar_list * platform_path_separator)
+     end
+
+     private
+
+     def platform_path_separator
+       @platform_separator ||= RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ ? ';' : ':'
+     end
+
+     def java_jar_list
+       @java_jar_list ||= []
+     end
+
+     def load_java_class(path)
+       import_quiet { Rjb.import(path) }
+     end
+   end
+
+   def import_quiet
+     prev_verbose = $VERBOSE
+     $VERBOSE = nil
+     yield
+   ensure
+     $VERBOSE = prev_verbose
+   end
+ end
+
+ include JavaBridge
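A hedged sketch of how this bridge is meant to be driven on either engine. The jar path is illustrative, and `java.util.HashMap` is a stock JDK class (binding it as `JavaHashMap` matches the constant the `Hash#to_java` patch above expects):

```ruby
# `require` of a .jar on MRI only queues the path (see the override above);
# on JRuby it loads the jar into the JVM immediately.
require '/path/to/some-dependency.jar'

# On MRI this feeds every queued jar to Rjb in one go; on JRuby it is a no-op.
initialize_java_engine

# Binds ::JavaHashMap on both engines: via Java:: constant lookup on JRuby,
# via Rjb.import on MRI.
import_java_object 'java.util.HashMap', as: :JavaHashMap

{'key' => 'value'}.to_java  # => a java.util.HashMap on either engine
```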