cassandra_model_spark 0.0.1.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +17 -0
  3. data/bin/cmodel-spark-build +7 -0
  4. data/bin/cmodel-spark-env.rb +11 -0
  5. data/bin/cmodel-spark-master +22 -0
  6. data/bin/cmodel-spark-run-master +4 -0
  7. data/bin/cmodel-spark-run-master.sh +8 -0
  8. data/bin/cmodel-spark-run-slave +4 -0
  9. data/bin/cmodel-spark-run-slave.sh +8 -0
  10. data/bin/cmodel-spark-slaves +22 -0
  11. data/ext/scala_helper/bin/load-spark-env.sh +63 -0
  12. data/ext/scala_helper/bin/spark-class +87 -0
  13. data/ext/scala_helper/build.sbt +62 -0
  14. data/ext/scala_helper/cassandra_helper.scala +23 -0
  15. data/ext/scala_helper/data_type_helper.scala +27 -0
  16. data/ext/scala_helper/marshal_loader.scala +204 -0
  17. data/ext/scala_helper/marshal_row_mapping.scala +85 -0
  18. data/ext/scala_helper/project/plugins.sbt +6 -0
  19. data/ext/scala_helper/sbin/spark-config.sh +30 -0
  20. data/ext/scala_helper/sbin/spark-daemon.sh +223 -0
  21. data/ext/scala_helper/schema_builder.scala +35 -0
  22. data/ext/scala_helper/worker.scala +13 -0
  23. data/lib/cassandra_model_spark.rb +42 -0
  24. data/lib/cassandra_model_spark/build.rb +24 -0
  25. data/lib/cassandra_model_spark/column_cast.rb +44 -0
  26. data/lib/cassandra_model_spark/connection_cache.rb +9 -0
  27. data/lib/cassandra_model_spark/data_frame.rb +374 -0
  28. data/lib/cassandra_model_spark/java_bridge.rb +91 -0
  29. data/lib/cassandra_model_spark/java_classes.rb +36 -0
  30. data/lib/cassandra_model_spark/launcher.rb +150 -0
  31. data/lib/cassandra_model_spark/query_builder.rb +37 -0
  32. data/lib/cassandra_model_spark/raw_connection.rb +47 -0
  33. data/lib/cassandra_model_spark/record.rb +18 -0
  34. data/lib/cassandra_model_spark/spark.rb +33 -0
  35. metadata +113 -0
data/lib/cassandra_model_spark/column_cast.rb
@@ -0,0 +1,44 @@
+ module CassandraModel
+   module Spark
+     class ColumnCast
+       include ThomasUtils::SymbolHelpers
+
+       attr_reader :key
+
+       def initialize(key, type)
+         @key = key
+         @type = type.to_s.upcase
+       end
+
+       def quote(quote)
+         quoted_key = if @key.respond_to?(:quote)
+                        @key.quote(quote)
+                      else
+                        "#{quote}#{@key}#{quote}"
+                      end
+         "CAST(#{quoted_key} AS #{@type})"
+       end
+
+       def new_key(key)
+         self.class.new(key, @type)
+       end
+
+     end
+   end
+ end
+
+ module ThomasUtils
+   class KeyChild
+     def cast_as(type)
+       CassandraModel::Spark::ColumnCast.new(self, type)
+     end
+     alias :* :cast_as
+   end
+ end
+
+ class Symbol
+   def cast_as(type)
+     CassandraModel::Spark::ColumnCast.new(self, type)
+   end
+   alias :* :cast_as
+ end
data/lib/cassandra_model_spark/connection_cache.rb
@@ -0,0 +1,9 @@
+ module CassandraModel
+   class ConnectionCache
+     def self.clear
+       @@cache.values.map(&:java_spark_context).map(&:stop)
+       @@cache.values.map(&:shutdown)
+       @@cache.clear
+     end
+   end
+ end
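
This reopens the existing CassandraModel::ConnectionCache so that clearing it stops each connection's Java Spark context before shutting the connection down, letting Spark release its JVM resources while the Cassandra sessions are still alive. A hedged sketch of the intended teardown, assuming connections were registered through CassandraModel elsewhere in the application:

    # Illustrative: stop all Spark contexts and close all sessions at process exit.
    at_exit { CassandraModel::ConnectionCache.clear }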
data/lib/cassandra_model_spark/data_frame.rb
@@ -0,0 +1,374 @@
+ module CassandraModel
+   module Spark
+     class DataFrame
+       include QueryHelper
+
+       SQL_TYPE_MAP = {
+           int: SqlIntegerType,
+           text: SqlStringType,
+           double: SqlDoubleType,
+           timestamp: SqlTimestampType,
+       }.freeze
+       #noinspection RubyStringKeysInHashInspection
+       SQL_RUBY_TYPE_FUNCTIONS = {
+           'IntegerType' => :getInt,
+           'LongType' => :getLong,
+           'StringType' => :getString,
+           'DoubleType' => :getDouble,
+           'TimestampType' => :getTimestamp,
+           'MapType(StringType,StringType,true)' => :getMap,
+       }
+
+       attr_reader :table_name, :record_klass
+
+       def initialize(record_klass, rdd, options = {})
+         @table_name = options.fetch(:alias) { record_klass.table_name }
+         @sql_context = options[:sql_context]
+         initialize_frame_from_existing(options)
+         @record_klass = record_klass
+
+         initialize_row_mapping(options)
+         initialize_rdd(rdd)
+       end
+
+       def derived?
+         !!@derived
+       end
+
+       def sql_context
+         @sql_context ||= create_sql_context
+       end
+
+       def union(rhs)
+         unless record_klass == rhs.record_klass
+           raise ArgumentError, 'Cannot union DataFrames with different Record types!'
+         end
+         DataFrame.new(record_klass, rdd.union(rhs.rdd))
+       end
+
+       def spark_data_frame
+         @frame ||= SparkSchemaBuilder.new.tap do |builder|
+           record_klass.cassandra_columns.each do |name, type|
+             select_name = record_klass.normalized_column(name)
+             mapped_type = row_type_mapping[select_name]
+             type = if mapped_type
+                      name = mapped_type[:name]
+                      mapped_type[:type]
+                    else
+                      SQL_TYPE_MAP.fetch(type) { SqlStringType }
+                    end
+             builder.add_column(name.to_s, type)
+           end
+         end.create_data_frame(sql_context, rdd).tap { |frame| frame.register_temp_table(table_name.to_s) }
+       end
+
+       def cache
+         spark_data_frame.cache
+       end
+
+       def uncache
+         spark_data_frame.unpersist
+       end
+
+       def cached(&block)
+         spark_data_frame.cache
+         instance_eval(&block)
+         spark_data_frame.unpersist
+       end
+
+       def normalized(alias_table_name = nil)
+         return self unless rdd
+
+         select_options = record_klass.columns.inject({}) do |memo, column|
+           row_mapped_column = row_type_mapping.fetch(column) { {name: column} }[:name]
+           memo.merge!(row_mapped_column => {as: row_mapped_column})
+         end
+         alias_name = alias_table_name || :"normalized_#{table_name}"
+         select(select_options).as_data_frame(alias: alias_name)
+       end
+
+       def request_async(*_)
+         ResultPaginator.new(first_async) {}
+       end
+
+       def first_async(*_)
+         Cassandra::Future.error(NotImplementedError.new)
+       end
+
+       def sql(query)
+         spark_data_frame
+         query = sql_context.sql(query)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+
+       end
+
+       def query(restriction, options)
+         spark_data_frame
+         select_clause = select_columns(options)
+         group_clause = group_clause(:group, 'GROUP BY', options)
+         order_clause = group_clause(:order_by, 'ORDER BY', options)
+         limit_clause = if options[:limit]
+                          " LIMIT #{options[:limit]}"
+                        end
+         where_clause = query_where_clause(restriction)
+         sql_context.sql("SELECT #{select_clause} FROM #{table_name}#{where_clause}#{group_clause}#{order_clause}#{limit_clause}")
+       end
+
+       def request(restriction = {}, options = {})
+         query = query(restriction, options)
+         query.collect.map do |row|
+           row_to_record(query.schema, row)
+         end
+       end
+
+       def first(restriction = {}, options = {})
+         query = query(restriction, options)
+         row = query.first
+         row_to_record(query.schema, row)
+       end
+
+       def ==(rhs)
+         rhs.is_a?(DataFrame) &&
+             record_klass == rhs.record_klass &&
+             ((rdd && rdd == rhs.rdd) || (!rdd && spark_data_frame == rhs.spark_data_frame))
+       end
+
+       protected
+
+       attr_reader :rdd
+
+       private
+
+       def initialize_frame_from_existing(options)
+         @frame = options[:spark_data_frame]
+         if @frame
+           raise ArgumentError, 'DataFrames created from Spark DataFrames require aliases!' unless options[:alias]
+           @frame.register_temp_table(options[:alias].to_s)
+           @sql_context = @frame.sql_context
+         end
+       end
+
+       def initialize_rdd(rdd)
+         if rdd
+           @rdd = if @row_mapping[:mapper]
+                    @row_mapping[:mapper].mappedRDD(rdd)
+                  else
+                    rdd
+                  end
+         else
+           @derived = true
+         end
+       end
+
+       def initialize_row_mapping(options)
+         @row_mapping = options.fetch(:row_mapping) do
+           @record_klass.rdd_row_mapping || {}
+         end
+       end
+
+       def row_type_mapping
+         @row_mapping[:type_map] ||= {}
+       end
+
+       def create_sql_context
+         CassandraSQLContext.new(record_klass.table.connection.spark_context).tap do |context|
+           context.setKeyspace(record_klass.table.connection.config[:keyspace])
+         end
+       end
+
+       def row_to_record(schema, row)
+         attributes = row_attributes(row, schema)
+
+         if valid_record?(attributes)
+           record_klass.new(attributes)
+         else
+           attributes
+         end
+       end
+
+       def row_attributes(row, schema)
+         attributes = {}
+         schema.fields.each_with_index do |field, index|
+           value = field_value(field, index, row)
+           column = field.name
+           attributes.merge!(column => value)
+         end
+         record_klass.normalized_attributes(attributes)
+       end
+
+       def valid_record?(attributes)
+         available_columns = record_klass.columns + record_klass.deferred_columns
+         attributes.keys.all? { |column| available_columns.include?(column) }
+       end
+
+       def field_value(field, index, row)
+         data_type = field.data_type
+         if column_is_struct?(data_type)
+           row_attributes(row.get(index), data_type)
+         else
+           decode_column_value(data_type, index, row)
+         end
+       end
+
+       def decode_column_value(data_type, index, row)
+         sql_type = data_type.to_string
+         converter = SQL_RUBY_TYPE_FUNCTIONS.fetch(sql_type) { :getString }
+         value = row.public_send(converter, index)
+
+         value = decode_hash(value) if column_is_string_map?(sql_type)
+         value
+       end
+
+       def decode_hash(value)
+         Hash[value.toSeq.array.to_a.map! { |pair| [pair._1.to_string, pair._2.to_string] }]
+       end
+
+       def column_is_string_map?(sql_type)
+         sql_type == 'MapType(StringType,StringType,true)'
+       end
+
+       def column_is_struct?(data_type)
+         data_type.getClass.getSimpleName == 'StructType'
+       end
+
+       def select_columns(options)
+         options[:select] ? clean_select_columns(options) * ', ' : '*'
+       end
+
+       def group_clause(type, prefix, options)
+         if options[type]
+           updated_clause = options[type].map do |column|
+             if column.is_a?(Hash)
+               column, direction = column.first
+               updated_column = quoted_column(column)
+               "#{updated_column} #{direction.upcase}"
+             else
+               quoted_column(column)
+             end
+           end * ', '
+           " #{prefix} #{updated_clause}"
+         end
+       end
+
+       def group_child_clause(child, updated_column)
+         child, direction = if child.is_a?(Hash)
+                              child.first
+                            else
+                              [child]
+                            end
+         direction_clause = (" #{direction.upcase}" if direction)
+         "#{updated_column}.`#{child}`#{direction_clause}"
+       end
+
+       def clean_select_columns(options)
+         options[:select].map do |column|
+           if column.is_a?(Hash)
+             updated_column(column)
+           else
+             quoted_column(column)
+           end
+         end
+       end
+
+       def updated_column(column)
+         column, options = column.first
+
+         if options.is_a?(Symbol)
+           updated_column = if column.is_a?(ThomasUtils::KeyChild)
+                              "#{column}".gsub(/\./, '_')
+                            else
+                              column
+                            end
+           options = {aggregate: options, as: :"#{updated_column}_#{options}"}
+         end
+
+         column = quoted_column(column)
+         column = aggregate_column(column, options) if options[:aggregate]
+         column = "#{column} AS #{options[:as]}" if options[:as]
+         column
+       end
+
+       def quoted_column(column)
+         return column.map { |child_column| quoted_column(child_column) } * ', ' if column.is_a?(Array)
+
+         if column == :*
+           '*'
+         elsif column.respond_to?(:quote)
+           column.quote('`')
+         else
+           "`#{select_column(column)}`"
+         end
+       end
+
+       def aggregate_column(column, options)
+         case options[:aggregate]
+           when :count_distinct
+             "COUNT(#{distinct_aggregate(column)})"
+           when :distinct
+             distinct_aggregate(column)
+           when :variance
+             variance_column(column)
+           when :stddev
+             "POW(#{variance_column(column)},0.5)"
+           else
+             if options[:aggregate] =~ /^cast_/
+               type = options[:aggregate].to_s.match(/^cast_(.+)$/)[1]
+               "CAST(#{column} AS #{type.upcase})"
+             else
+               "#{options[:aggregate].to_s.upcase}(#{column})"
+             end
+         end
+       end
+
+       def distinct_aggregate(column)
+         "DISTINCT #{column}"
+       end
+
+       def variance_column(column)
+         "AVG(POW(#{column},2)) - POW(AVG(#{column}),2)"
+       end
+
+       def query_where_clause(restriction)
+         if restriction.present?
+           restriction_clause = restriction.map do |key, value|
+             updated_key = if key.is_a?(ThomasUtils::KeyComparer)
+                             select_key = if key.key.respond_to?(:new_key)
+                                            select_key = select_column(key.key.key)
+                                            key.key.new_key(select_key)
+                                          else
+                                            select_column(key.key)
+                                          end
+                             key.new_key(select_key).quote('`')
+                           elsif key.is_a?(ThomasUtils::KeyChild)
+                             new_key = select_column(key.key)
+                             updated_key = key.new_key(new_key)
+                             quoted_restriction(updated_key)
+                           else
+                             select_key = select_column(key)
+                             quoted_restriction(select_key)
+                           end
+             value = "'#{value}'" if value.is_a?(String) || value.is_a?(Time)
+             "#{updated_key} #{value}"
+           end * ' AND '
+           " WHERE #{restriction_clause}"
+         end
+       end
+
+       def select_column(key)
+         new_key = record_klass.select_column(key)
+         available_columns.include?(new_key) ? new_key : key
+       end
+
+       def available_columns
+         @available_columns ||= spark_data_frame.schema.fields.map(&:name).map(&:to_sym)
+       end
+
+       def quoted_restriction(updated_key)
+         ThomasUtils::KeyComparer.new(updated_key, '=').quote('`')
+       end
+
+     end
+   end
+ end
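
DataFrame#query assembles a Spark SQL statement from a restriction hash plus an options hash, and request/first execute it and map each returned row back into records via row_to_record. A hedged sketch of the SQL this builds; the record class and columns are illustrative, and the exact WHERE quoting comes from ThomasUtils::KeyComparer:

    # Assumes a Purchases record class with store_id and price columns and
    # a table_name of 'purchases'.
    frame = CassandraModel::Spark::DataFrame.new(Purchases, rdd)

    # Roughly: SELECT AVG(`price`) AS price_avg FROM purchases
    #          WHERE `store_id` = 'NY' GROUP BY `store_id` LIMIT 10
    records = frame.request(
        {store_id: 'NY'},
        select: [price: :avg],
        group: [:store_id],
        limit: 10
    )

Note that calling spark_data_frame first registers the frame as a temp table under table_name, which is what lets sql_context.sql resolve the FROM clause.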
data/lib/cassandra_model_spark/java_bridge.rb
@@ -0,0 +1,91 @@
+ if RUBY_ENGINE == 'jruby'
+   class Hash
+     def to_java
+       JavaHashMap.new(self)
+     end
+   end
+
+   class Array
+     def to_java_argv
+       to_java(:string)
+     end
+   end
+
+ else
+   class Hash
+     def to_java
+       JavaHashMap.new.tap do |map|
+         each do |key, value|
+           map.put(key, value)
+         end
+       end
+     end
+   end
+
+   class Array
+     def to_java
+       self
+     end
+
+     def to_java_argv
+       self
+     end
+   end
+ end
+
+ module JavaBridge
+   if RUBY_ENGINE == 'jruby'
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       klass = "Java::#{path}"
+       Object.const_set(name, eval(klass))
+     end
+
+     def initialize_java_engine
+       # nothing to do here
+     end
+   else
+     def import_java_object(path, options = {})
+       name = options.fetch(:as) { path.split('.').last }.to_sym
+       Object.const_set(name, load_java_class(path))
+     end
+
+     def require(path)
+       # hack to make importing jars work like jruby
+       if path =~ /\.jar$/i
+         java_jar_list << path
+       else
+         super
+       end
+     end
+
+     def initialize_java_engine
+       # have to load everything in one go here
+       Rjb.load(java_jar_list * platform_path_separator)
+     end
+
+     private
+
+     def platform_path_separator
+       @platform_separator ||= RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ ? ';' : ':'
+     end
+
+     def java_jar_list
+       @java_jar_list ||= []
+     end
+
+     def load_java_class(path)
+       import_quiet { Rjb.import(path) }
+     end
+   end
+
+   def import_quiet
+     prev_verbox = $VERBOSE
+     $VERBOSE = nil
+     yield
+   ensure
+     $VERBOSE = prev_verbox
+   end
+ end
+
+ include JavaBridge
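
JavaBridge hides the difference between JRuby, where Java classes are reachable natively through the Java:: namespace, and other engines, where the Rjb gem embeds a JVM in-process. Because Rjb can only boot the JVM once, the require override buffers .jar paths until initialize_java_engine loads them all in a single Rjb.load call. A hedged usage sketch; the jar path and imported class are placeholders:

    require '/opt/spark/lib/spark-assembly.jar'   # buffered on MRI, added to the classpath on JRuby
    initialize_java_engine                        # no-op on JRuby; boots the JVM via Rjb elsewhere
    import_java_object 'java.util.HashMap', as: :JavaHashMap
    JavaHashMap.new                               # the same constant now works on either engine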