mobilize-hive 1.0.07

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,530 @@
+ module Mobilize
+   module Hive
+     def Hive.config
+       Base.config('hive')
+     end
+
+     def Hive.exec_path(cluster)
+       Hive.clusters[cluster]['exec_path']
+     end
+
+     def Hive.output_db(cluster)
+       Hive.clusters[cluster]['output_db']
+     end
+
+     def Hive.output_db_user(cluster)
+       output_db_node = Hadoop.gateway_node(cluster)
+       output_db_user = Ssh.host(output_db_node)['user']
+       output_db_user
+     end
+
+     def Hive.clusters
+       Hive.config['clusters']
+     end
+
+     def Hive.slot_ids(cluster)
+       (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+     end
+
+     def Hive.slot_worker_by_cluster_and_path(cluster,path)
+       working_slots = Mobilize::Resque.jobs('working').map{|j| j['hive_slot'] if (j and j['hive_slot'])}.compact
+       Hive.slot_ids(cluster).each do |slot_id|
+         unless working_slots.include?(slot_id)
+           Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+           return slot_id
+         end
+       end
+       #return false if none are available
+       return false
+     end
+
+     def Hive.unslot_worker_by_path(path)
+       begin
+         Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+         return true
+       rescue
+         return false
+       end
+     end
+
+     #get field names and partition datatypes and size of a hive table
+     def Hive.table_stats(db,table,cluster,user)
+       describe_sql = "use #{db};describe extended #{table}"
+       describe_output = Hive.run(describe_sql,cluster,user)
+       describe_output.split("location:").last.split(",").first
+       #get location, fields, partitions
+       result_hash = {}
+       result_hash['location'] = describe_output.split("location:").last.split(",").first
+       #get fields
+       field_defs = describe_output.split(" \nDetailed Table Information").first.split(
+         "\n").map{|f|
+           f.strip.split("\t").ie{|fa|
+             {"name"=>fa.first,"datatype"=>fa.second} if fa.first}}.compact
+       #check for partititons
+       if describe_output.index("partitionKeys:[FieldSchema")
+         part_field_string = describe_output.split("partitionKeys:[").last.split("]").first
+         #parse weird schema using yaml plus gsubs
+         yaml_fields = "---" + part_field_string.gsub("FieldSchema","\n").gsub(
+           ")","").gsub(
+           ",","\n ").gsub(
+           "(","- ").gsub(
+           "null","").gsub(
+           ":",": ")
+         #return partitions without the comment part
+         result_hash['partitions'] = YAML.load(yaml_fields).map{|ph| ph.delete('comment');ph}
+         #get rid of fields in fields section that are also partitions
+         result_hash['partitions'].map{|p| p['name']}.each{|n| field_defs.delete_if{|f| f['name']==n}}
+       end
+       #assign field defs after removing partitions
+       result_hash['field_defs'] = field_defs
+       #get size
+       result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",cluster,user).split("\t").last.strip.to_i
+       return result_hash
+     end
+
+     #run a generic hive command, with the option of passing a file hash to be locally available
+     def Hive.run(hql,cluster,user,file_hash=nil)
+       filename = hql.to_md5
+       file_hash||= {}
+       file_hash[filename] = hql
+       #silent mode so we don't have logs in stderr; clip output
+       #at hadoop read limit
+       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
+       gateway_node = Hadoop.gateway_node(cluster)
+       Ssh.run(gateway_node,command,user,file_hash)
+     end
+
+     def Hive.run_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       user = params['user']
+       cluster = params['cluster'] || Hive.clusters.keys.first
+       node = Hadoop.gateway_node(cluster)
+       node_user = Ssh.host(node)['user']
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       #slot Hive worker if available
+       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+       return false unless slot_id
+
+       #output table stores stage output
+       output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
+       output_path = [output_db,output_table].join(".")
+       out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
+
+       #get hql
+       if params['hql']
+         hql = params['hql']
+       else
+         #user has passed in a gsheet hql
+         gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+         #return blank response if there are no slots available
+         return nil unless gdrive_slot
+         source_dst = s.source_dsts(gdrive_slot).first
+         Gdrive.unslot_worker_by_path(stage_path)
+         hql = source_dst.read(user)
+       end
+
+       #check for select at end
+       hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
+       if hql_array.last.downcase.starts_with?("select")
+         #nil if no prior commands
+         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
+         select_hql = hql_array.last
+         output_table_hql = ["drop table if exists #{output_path}",
+                             "create table #{output_path} as #{select_hql};"].join(";")
+         full_hql = [prior_hql, output_table_hql].compact.join(";")
+         Hive.run(full_hql, cluster, user)
+         #make sure node user owns the stage result directory
+         output_table_stats = Hive.table_stats(output_db,output_table,cluster,node_user)
+         output_table_location = output_table_stats['location']
+         chown_command = "#{Hadoop.exec_path(cluster)} fs -chown -R #{node_user} '#{output_table_location}'"
+         Ssh.run(node,chown_command,node_user)
+         #already populated, make sure dataset exists
+         Dataset.find_or_create_by_url(out_url)
+       else
+         out_string = Hive.run(hql, cluster, user)
+         out_string = "result\n#{out_string}"
+         Dataset.write_by_url(out_url,out_string,node_user)
+       end
+       #unslot worker
+       Hive.unslot_worker_by_path(stage_path)
+       out_url
+     end
+
+     def Hive.schema_hash(schema_path,user,gdrive_slot)
+       if schema_path.index("/")
+         #slashes mean sheets
+         out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user)
+       else
+         u = User.where(:name=>user).first
+         #check sheets in runner
+         r = u.runner
+         runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
+         out_tsv = if runner_sheet
+                     runner_sheet.read(user)
+                   else
+                     #check for gfile. will fail if there isn't one.
+                     Gfile.find_by_path(schema_path).read(user)
+                   end
+         #use Gridfs to cache gdrive results
+         file_name = schema_path.split("/").last
+         out_url = "gridfs://#{schema_path}/#{file_name}"
+         Dataset.write_by_url(out_url,out_tsv,user)
+         schema_tsv = Dataset.find_by_url(out_url).read(user)
+         schema_hash = {}
+         schema_tsv.tsv_to_hash_array.each do |ha|
+           schema_hash[ha['name']] = ha['datatype']
+         end
+         schema_hash
+       end
+     end
+
+     def Hive.path_params(cluster, path, user)
+       db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
+       #get existing table stats if any
+       curr_stats = begin
+                      Hive.table_stats(db, table, cluster, user)
+                    rescue
+                      nil
+                    end
+       {"db"=>db,
+        "table"=>table,
+        "partitions"=>partitions,
+        "curr_stats"=>curr_stats}
+     end
+
+     def Hive.hql_to_table(cluster, source_hql, target_path, user, drop=false, schema_hash=nil)
+       target_params = Hive.path_params(cluster, target_path, user)
+       target_table_path = ['db','table'].map{|k| target_params[k]}.join(".")
+       target_partitions = target_params['partitions'].to_a
+       target_table_stats = target_params['curr_stats']
+
+       #create temporary table so we can identify fields etc.
+       temp_db = Hive.output_db(cluster)
+       temp_table_name = (source_hql+target_path).to_md5
+       temp_table_path = [temp_db,temp_table_name].join(".")
+       temp_drop_hql = "drop table if exists #{temp_table_path};"
+       temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{source_hql}"
+       Hive.run(temp_create_hql,cluster,user)
+
+       source_params = Hive.path_params(cluster, temp_table_path, user)
+       source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
+       source_table_stats = source_params['curr_stats']
+       source_fields = source_table_stats['field_defs']
+
+       if target_partitions.length == 0 and
+         target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+         #no partitions in either user params or the target table
+
+         target_headers = source_fields.map{|f| f['name']}
+
+         target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
+
+         field_defs = {}
+         target_headers.each do |name|
+           datatype = schema_hash[name] || "string"
+           field_defs[name]=datatype
+         end
+
+         field_def_stmt = "(#{field_defs.map do |name,datatype|
+           "`#{name}` #{datatype}"
+         end.join(",")})"
+
+         #always drop when no partititons
+         target_drop_hql = "drop table if exists #{target_table_path};"
+
+         target_create_hql = "create table if not exists #{target_table_path} #{field_def_stmt};"
+
+         target_insert_hql = "insert overwrite table #{target_table_path} select #{target_field_stmt} from #{source_table_path};"
+
+         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql,temp_drop_hql].join
+
+         Hive.run(target_full_hql, cluster, user)
+
+       elsif target_partitions.length > 0 and
+         target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+         #partitions and no target table or same partitions in both target table and user params
+
+         target_headers = source_fields.map{|f| f['name']}.reject{|h| target_partitions.include?(h)}
+
+         field_defs = {}
+         target_headers.each do |name|
+           datatype = schema_hash[name] || "string"
+           field_defs[name]=datatype
+         end
+
+         field_def_stmt = "(#{field_defs.map do |name,datatype|
+           "`#{name}` #{datatype}"
+         end.join(",")})"
+
+         part_defs = {}
+         target_partitions.each do |name|
+           datatype = schema_hash[name] || "string"
+           part_defs[name] = datatype
+         end
+
+         part_def_stmt = "(#{part_defs.map do |name,datatype|
+           "`#{name}` #{datatype}"
+         end.join(",")})"
+
+         target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
+
+         target_part_stmt = target_partitions.map{|h| "`#{h}`"}.join(",")
+
+         target_set_hql = ["set hive.exec.dynamic.partition.mode=nonstrict;",
+                           "set hive.exec.max.dynamic.partitions.pernode=1000;",
+                           "set hive.exec.dynamic.partition=true;",
+                           "set hive.exec.max.created.files = 200000;",
+                           "set hive.max.created.files = 200000;"].join
+
+         if drop or target_table_stats.nil?
+           target_drop_hql = "drop table if exists #{target_table_path};"
+           target_create_hql = target_drop_hql +
+                               "create table if not exists #{target_table_path} #{field_def_stmt} " +
+                               "partitioned by #{part_def_stmt};"
+
+         else
+           target_db,target_table = target_table_path.split(".")
+           #get all the permutations of possible partititons
+           part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
+           part_perm_tsv = Hive.run(part_perm_hql, cluster, user)
+           #having gotten the permutations, ensure they are dropped
+           part_hash_array = part_perm_tsv.tsv_to_hash_array
+           part_drop_hql = part_hash_array.map do |h|
+             part_drop_stmt = h.map do |name,value|
+               part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
+             end.join(",")
+             "use #{target_db};alter table #{target_table} drop if exists partition (#{part_drop_stmt});"
+           end.join
+           target_create_hql = part_drop_hql
+         end
+
+         target_insert_hql = "insert overwrite table #{target_table_path} " +
+                             "partition (#{target_part_stmt}) " +
+                             "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"
+
+         target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
+
+         Hive.run(target_full_hql, cluster, user)
+       else
+         error_msg = "Incompatible partition specs"
+         raise error_msg
+       end
+       return target_path
+     end
+
+     #turn a tsv into a hive table.
+     #Accepts options to drop existing target if any
+     #also schema with column datatype overrides
+     def Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop=false, schema_hash=nil)
+       source_headers = source_tsv.tsv_header_array
+
+       target_params = Hive.path_params(cluster, target_path, user)
+       target_db,target_table = ['db','table'].map{|k| target_params[k]}
+       target_table_path = [target_db,target_table].join(".")
+       target_partitions = target_params['partitions'].to_a
+       target_table_stats = target_params['curr_stats']
+
+       schema_hash ||= {}
+
+       if target_partitions.length == 0 and
+         target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+         #no partitions in either user params or the target table
+         #or drop and start fresh
+
+         #one file only, strip headers, replace tab with ctrl-a for hive
+         source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
+         source_tsv_filename = "000000_0"
+         file_hash = {source_tsv_filename=>source_rows}
+
+         field_defs = source_headers.map do |name|
+           datatype = schema_hash[name] || "string"
+           "`#{name}` #{datatype}"
+         end.ie{|fs| "(#{fs.join(",")})"}
+
+         #for single insert, use drop table and create table always
+         target_drop_hql = "drop table if exists #{target_table_path}"
+
+         target_create_hql = "create table #{target_table_path} #{field_defs}"
+
+         #load source data
+         target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{target_table_path};"
+
+         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
+
+         Hive.run(target_full_hql, cluster, user, file_hash)
+
+       elsif target_partitions.length > 0 and
+         target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+         #partitions and no target table
+         #or same partitions in both target table and user params
+         #or drop and start fresh
+
+         target_headers = source_headers.reject{|h| target_partitions.include?(h)}
+
+         field_defs = "(#{target_headers.map do |name|
+           datatype = schema_hash[name] || "string"
+           "`#{name}` #{datatype}"
+         end.join(",")})"
+
+         partition_defs = "(#{target_partitions.map do |name|
+           datatype = schema_hash[name] || "string"
+           "#{name} #{datatype}"
+         end.join(",")})"
+
+         target_drop_hql = drop ? "drop table if exists #{target_table_path};" : ""
+
+         target_create_hql = target_drop_hql +
+                             "create table if not exists #{target_table_path} #{field_defs} " +
+                             "partitioned by #{partition_defs}"
+
+         #create target table early if not here
+         Hive.run(target_create_hql, cluster, user)
+
+         target_table_stats = Hive.table_stats(target_db, target_table, cluster, user)
+
+         #create data hash from source hash array
+         data_hash = {}
+         source_hash_array = source_tsv.tsv_to_hash_array
+         source_hash_array.each do |ha|
+           tpmk = target_partitions.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+           tpmv = ha.reject{|k,v| target_partitions.include?(k)}.values.join("\001")
+           if data_hash[tpmk]
+             data_hash[tpmk] += "\n#{tpmv}"
+           else
+             data_hash[tpmk] = tpmv
+           end
+         end
+
+         #go through completed data hash and write each key value to the table in question
+         data_hash.each do |tpmk,tpmv|
+           base_filename = "000000_0"
+           part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
+           part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
+           part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
+           hdfs_dir = "#{target_table_stats['location']}/#{part_dir}"
+           hdfs_source_path = "/#{hdfs_dir.split("/")[3..-2].join("/")}/#{base_filename}"
+           hdfs_target_path = "/#{hdfs_dir.split("/")[3..-1].join("/")}/#{base_filename}"
+           #load partition into source path
+           puts "Writing to #{hdfs_source_path} for #{user} at #{Time.now.utc}"
+           Hdfs.write(hdfs_source_path,tpmv,user)
+           #let Hive know where the partition is
+           target_add_part_hql = "use #{target_db};alter table #{target_table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_path}'"
+           target_insert_part_hql = "load data inpath '#{hdfs_source_path}' overwrite into table #{target_table} partition (#{part_stmt});"
+           target_part_hql = [target_add_part_hql,target_insert_part_hql].join(";")
+           puts "Adding partition #{tpmk} to #{target_table_path} for #{user} at #{Time.now.utc}"
+           Hive.run(target_part_hql, cluster, user)
+         end
+       else
+         error_msg = "Incompatible partition specs: " +
+                     "target table:#{target_table_stats['partitions'].to_s}, " +
+                     "user_params:#{target_partitions.to_s}"
+         raise error_msg
+       end
+       return target_path
+     end
+
+     def Hive.write_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       user = params['user']
+       cluster = params['cluster'] || Hive.clusters.keys.first
+
+       #slot Hive worker if available
+       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+       return false unless slot_id
+
+       node = Hadoop.gateway_node(cluster)
+       node_user = Ssh.host(node)['user']
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       #determine path for target
+       target_path = params['target']
+
+       gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+       #return blank response if there are no slots available
+       return nil unless gdrive_slot
+       source_dst = s.source_dsts(gdrive_slot).first
+       schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+       Gdrive.unslot_worker_by_path(stage_path)
+
+       #drop target before create/insert?
+       drop = params['drop']
+
+       #determine source
+       source_tsv,source_hql = [nil]*2
+       if params['hql']
+         source_hql = params['hql']
+       elsif source_dst
+         if source_dst.handler == 'hive'
+           #source table
+           cluster,source_path = source_dst.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
+           source_hql = "select * from #{source_path};"
+         elsif ['gridfs','hdfs'].include?(source_dst.handler)
+           if source_dst.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+             source_hql = source_dst.read(user)
+           else
+             #tsv from sheet
+             source_tsv = source_dst.read(user)
+           end
+         end
+       end
+
+       out_string = if source_hql
+                      Hive.hql_to_table(cluster, source_hql, target_path, user, drop, schema_hash)
+                    elsif source_tsv
+                      Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop, schema_hash)
+                    else
+                      raise "Unable to determine source tsv or source hql"
+                    end
+
+
+
+       #unslot worker and write result
+       Hive.unslot_worker_by_path(stage_path)
+
+       #output table stores stage output
+       out_string = "result\n#{out_string}"
+       output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
+       out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
+       Dataset.write_by_url(out_url,out_string,node_user)
+       out_url
+     end
+
+     def Hive.read_by_dataset_path(dst_path,user)
+       cluster,source_path = dst_path.split("/").ie do |sp|
+         if sp.length == 2
+           [Hive.clusters.first.first,sp.join(".")]
+         else
+           [sp.first, sp[1..-1].join(".")]
+         end
+       end
+       hql = "set hive.cli.print.header=true;select * from #{source_path};"
+       Hive.run(hql,cluster,user)
+     end
+
+     def Hive.write_by_dataset_path(dst_path,source_tsv,user)
+       cluster,target_path = dst_path.split("/").ie do |sp|
+         if sp.length == 2
+           [Hive.clusters.first.first,sp.join(".")]
+         else
+           [sp.first, sp[1..-1].join(".")]
+         end
+       end
+       drop = true
+       Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop)
+     end
+   end
+
+ end
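
Every HiveQL statement in the handler above funnels through Hive.run, which keys the script file by its md5, ships it to the cluster's gateway node over SSH, runs the hive CLI in silent mode, and clips stdout at Hadoop.read_limit. A minimal sketch of driving the handler directly is below; the cluster name, stage path, and user are illustrative assumptions, not values shipped with the package:

    # assumes a configured 'dev_cluster' (see samples/hive.yml) and a user who can reach its gateway node
    path = 'Runner_mobilize(test)/jobs/hive_test_1/stage3'   # hypothetical stage path used only as a slot key
    if slot = Mobilize::Hive.slot_worker_by_cluster_and_path('dev_cluster', path)
      tsv = Mobilize::Hive.run('set hive.cli.print.header=true;select * from mobilize.hive_test_1;',
                               'dev_cluster', 'example_user')
      puts tsv
      Mobilize::Hive.unslot_worker_by_path(path)
    end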
@@ -0,0 +1,38 @@
+ namespace :mobilize_hive do
+   desc "Set up config and log folders and files"
+   task :setup do
+     sample_dir = File.dirname(__FILE__) + '/../samples/'
+     sample_files = Dir.entries(sample_dir)
+     config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
+     log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
+     full_config_dir = "#{ENV['PWD']}/#{config_dir}"
+     full_log_dir = "#{ENV['PWD']}/#{log_dir}"
+     unless File.exists?(full_config_dir)
+       puts "creating #{config_dir}"
+       `mkdir -p #{full_config_dir}`
+     end
+     unless File.exists?(full_log_dir)
+       puts "creating #{log_dir}"
+       `mkdir -p #{full_log_dir}`
+     end
+     sample_files.each do |fname|
+       unless File.exists?("#{full_config_dir}#{fname}")
+         puts "creating #{config_dir}#{fname}"
+         `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
+       end
+     end
+     #make sure that the jobtracker.yml is updated to include the
+     #mobilize-ssh library
+     jt_config_file = "#{config_dir}jobtracker.yml"
+     if File.exists?(jt_config_file)
+       yml_hash = YAML.load_file(jt_config_file)
+       yml_hash.keys.each do |k|
+         if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hive')
+           puts "adding mobilize-hive to jobtracker.yml/#{k}/extensions"
+           yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hive']
+         end
+       end
+       File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
+     end
+   end
+ end
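
The setup task mirrors the other mobilize extensions: it creates the config and log directories if missing, copies everything under samples/ into config/mobilize/, and appends 'mobilize-hive' to each extensions list in jobtracker.yml. A sketch of wiring it into an application Rakefile, assuming the tasks are exposed under the same require path convention as the rest of the stack (not verified from this diff):

    # Rakefile (assumption: the gem's rake tasks load via this path)
    require 'mobilize-hive/tasks'
    # then, from the shell:
    #   rake mobilize_hive:setup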
@@ -0,0 +1,5 @@
+ module Mobilize
+   module Hive
+     VERSION = "1.0.07"
+   end
+ end
@@ -0,0 +1,8 @@
+ require "mobilize-hive/version"
+ require "mobilize-hdfs"
+
+ module Mobilize
+   module Hive
+   end
+ end
+ require "mobilize-hive/handlers/hive"
@@ -0,0 +1,19 @@
+ ---
+ development:
+   clusters:
+     dev_cluster:
+       max_slots: 5
+       temp_table_db: mobilize
+       exec_path: /path/to/hive
+ test:
+   clusters:
+     test_cluster:
+       max_slots: 5
+       temp_table_db: mobilize
+       exec_path: /path/to/hive
+ production:
+   clusters:
+     prod_cluster:
+       max_slots: 5
+       temp_table_db: mobilize
+       exec_path: /path/to/hive
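
The handler reads this sample through Base.config('hive') for the current environment: clusters supplies the cluster names, max_slots bounds the per-cluster slot ids, and exec_path locates the hive binary on the gateway node. A sketch of the lookups against the development section above (return values are what the sample implies, not captured output):

    Mobilize::Hive.clusters.keys              # => ["dev_cluster"]
    Mobilize::Hive.exec_path('dev_cluster')   # => "/path/to/hive"
    Mobilize::Hive.slot_ids('dev_cluster')    # => ["dev_cluster_1", ..., "dev_cluster_5"]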
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'mobilize-hive/version'
+
+ Gem::Specification.new do |gem|
+   gem.name = "mobilize-hive"
+   gem.version = Mobilize::Hive::VERSION
+   gem.authors = ["Cassio Paes-Leme"]
+   gem.email = ["cpaesleme@ngmoco.com"]
+   gem.description = %q{Adds hive read, write, and run support to mobilize-hdfs}
+   gem.summary = %q{Adds hive read, write, and run support to mobilize-hdfs}
+   gem.homepage = "http://github.com/dena/mobilize-hive"
+
+   gem.files = `git ls-files`.split($/)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+   gem.add_runtime_dependency "mobilize-hdfs","1.0.07"
+ end
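
The gemspec pins mobilize-hdfs at the matching 1.0.07 release, so the layers of the mobilize stack move in lockstep. A Gemfile sketch for an application that wants this exact version (the Gemfile itself is not part of this package):

    gem "mobilize-hive", "1.0.07"   # pulls in mobilize-hdfs 1.0.07 and its dependencies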
@@ -0,0 +1,26 @@
+ ---
+ - name: hive_test_1
+   active: true
+   trigger: once
+   status: ""
+   stage1: hive.write target:"mobilize/hive_test_1/act_date", drop:true,
+     source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
+   stage2: hive.run source:"hive_test_1.hql"
+   stage3: hive.run hql:"show databases;"
+   stage4: gsheet.write source:"stage2", target:"hive_test_1_stage_2.out"
+   stage5: gsheet.write source:"stage3", target:"hive_test_1_stage_3.out"
+ - name: hive_test_2
+   active: true
+   trigger: after hive_test_1
+   status: ""
+   stage1: hive.write source:"hdfs://user/mobilize/test/test_hdfs_1.out", target:"mobilize.hive_test_2", drop:true
+   stage2: hive.run hql:"select * from mobilize.hive_test_2;"
+   stage3: gsheet.write source:"stage2", target:"hive_test_2.out"
+ - name: hive_test_3
+   active: true
+   trigger: after hive_test_2
+   status: ""
+   stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
+   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3/date/product", drop:true
+   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3/date/product", drop:false
+   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
@@ -0,0 +1 @@
+ select act_date,product, sum(value) as sum from mobilize.hive_test_1 group by act_date,product;