mobilize-hive 1.0.07

@@ -0,0 +1,530 @@
+ module Mobilize
+   module Hive
+     def Hive.config
+       Base.config('hive')
+     end
+
+     def Hive.exec_path(cluster)
+       Hive.clusters[cluster]['exec_path']
+     end
+
+     def Hive.output_db(cluster)
+       Hive.clusters[cluster]['output_db']
+     end
+
+     def Hive.output_db_user(cluster)
+       output_db_node = Hadoop.gateway_node(cluster)
+       output_db_user = Ssh.host(output_db_node)['user']
+       output_db_user
+     end
+
+     def Hive.clusters
+       Hive.config['clusters']
+     end
+
+     def Hive.slot_ids(cluster)
+       (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+     end
+
+     def Hive.slot_worker_by_cluster_and_path(cluster,path)
+       working_slots = Mobilize::Resque.jobs('working').map{|j| j['hive_slot'] if (j and j['hive_slot'])}.compact
+       Hive.slot_ids(cluster).each do |slot_id|
+         unless working_slots.include?(slot_id)
+           Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+           return slot_id
+         end
+       end
+       #return false if none are available
+       return false
+     end
+
+     def Hive.unslot_worker_by_path(path)
+       begin
+         Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+         return true
+       rescue
+         return false
+       end
+     end
+
+     #get field names and partition datatypes and size of a hive table
+     def Hive.table_stats(db,table,cluster,user)
+       describe_sql = "use #{db};describe extended #{table}"
+       describe_output = Hive.run(describe_sql,cluster,user)
+       #get location, fields, partitions
+       result_hash = {}
+       result_hash['location'] = describe_output.split("location:").last.split(",").first
+       #get fields
+       field_defs = describe_output.split(" \nDetailed Table Information").first.split(
+                    "\n").map{|f|
+                      f.strip.split("\t").ie{|fa|
+                        {"name"=>fa.first,"datatype"=>fa.second} if fa.first}}.compact
+       #check for partitions
+       if describe_output.index("partitionKeys:[FieldSchema")
+         part_field_string = describe_output.split("partitionKeys:[").last.split("]").first
+         #parse weird schema using yaml plus gsubs
+         yaml_fields = "---" + part_field_string.gsub("FieldSchema","\n").gsub(
+                               ")","").gsub(
+                               ",","\n ").gsub(
+                               "(","- ").gsub(
+                               "null","").gsub(
+                               ":",": ")
+         #return partitions without the comment part
+         result_hash['partitions'] = YAML.load(yaml_fields).map{|ph| ph.delete('comment');ph}
+         #get rid of fields in fields section that are also partitions
+         result_hash['partitions'].map{|p| p['name']}.each{|n| field_defs.delete_if{|f| f['name']==n}}
+       end
+       #assign field defs after removing partitions
+       result_hash['field_defs'] = field_defs
+       #get size
+       result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",cluster,user).split("\t").last.strip.to_i
+       return result_hash
+     end
+
+     #run a generic hive command, with the option of passing a file hash to be locally available
+     def Hive.run(hql,cluster,user,file_hash=nil)
+       filename = hql.to_md5
+       file_hash ||= {}
+       file_hash[filename] = hql
+       #silent mode so we don't have logs in stderr; clip output
+       #at hadoop read limit
+       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
+       gateway_node = Hadoop.gateway_node(cluster)
+       Ssh.run(gateway_node,command,user,file_hash)
+     end
+
+     def Hive.run_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       user = params['user']
+       cluster = params['cluster'] || Hive.clusters.keys.first
+       node = Hadoop.gateway_node(cluster)
+       node_user = Ssh.host(node)['user']
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       #slot Hive worker if available
+       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+       return false unless slot_id
+
+       #output table stores stage output
+       output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
+       output_path = [output_db,output_table].join(".")
+       out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
+
+       #get hql
+       if params['hql']
+         hql = params['hql']
+       else
+         #user has passed in a gsheet hql
+         gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+         #return blank response if there are no slots available
+         return nil unless gdrive_slot
+         source_dst = s.source_dsts(gdrive_slot).first
+         Gdrive.unslot_worker_by_path(stage_path)
+         hql = source_dst.read(user)
+       end
+
+       #check for select at end
+       hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
+       if hql_array.last.downcase.starts_with?("select")
+         #nil if no prior commands
+         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
+         select_hql = hql_array.last
+         output_table_hql = ["drop table if exists #{output_path}",
+                             "create table #{output_path} as #{select_hql};"].join(";")
+         full_hql = [prior_hql, output_table_hql].compact.join(";")
+         Hive.run(full_hql, cluster, user)
+         #make sure node user owns the stage result directory
+         output_table_stats = Hive.table_stats(output_db,output_table,cluster,node_user)
+         output_table_location = output_table_stats['location']
+         chown_command = "#{Hadoop.exec_path(cluster)} fs -chown -R #{node_user} '#{output_table_location}'"
+         Ssh.run(node,chown_command,node_user)
+         #already populated, make sure dataset exists
+         Dataset.find_or_create_by_url(out_url)
+       else
+         out_string = Hive.run(hql, cluster, user)
+         out_string = "result\n#{out_string}"
+         Dataset.write_by_url(out_url,out_string,node_user)
+       end
+       #unslot worker
+       Hive.unslot_worker_by_path(stage_path)
+       out_url
+     end
+
+     def Hive.schema_hash(schema_path,user,gdrive_slot)
+       if schema_path.index("/")
+         #slashes mean sheets
+         out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user)
+       else
+         u = User.where(:name=>user).first
+         #check sheets in runner
+         r = u.runner
+         runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
+         out_tsv = if runner_sheet
+                     runner_sheet.read(user)
+                   else
+                     #check for gfile. will fail if there isn't one.
+                     Gfile.find_by_path(schema_path).read(user)
+                   end
+       end
+       #use Gridfs to cache gdrive results
+       file_name = schema_path.split("/").last
+       out_url = "gridfs://#{schema_path}/#{file_name}"
+       Dataset.write_by_url(out_url,out_tsv,user)
+       schema_tsv = Dataset.find_by_url(out_url).read(user)
+       schema_hash = {}
+       schema_tsv.tsv_to_hash_array.each do |ha|
+         schema_hash[ha['name']] = ha['datatype']
+       end
+       schema_hash
+     end
+
+     def Hive.path_params(cluster, path, user)
+       db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
+       #get existing table stats if any
+       curr_stats = begin
+                      Hive.table_stats(db, table, cluster, user)
+                    rescue
+                      nil
+                    end
+       {"db"=>db,
+        "table"=>table,
+        "partitions"=>partitions,
+        "curr_stats"=>curr_stats}
+     end
+
+     def Hive.hql_to_table(cluster, source_hql, target_path, user, drop=false, schema_hash=nil)
+       target_params = Hive.path_params(cluster, target_path, user)
+       target_table_path = ['db','table'].map{|k| target_params[k]}.join(".")
+       target_partitions = target_params['partitions'].to_a
+       target_table_stats = target_params['curr_stats']
+
+       schema_hash ||= {}
+
+       #create temporary table so we can identify fields etc.
+       temp_db = Hive.output_db(cluster)
+       temp_table_name = (source_hql+target_path).to_md5
+       temp_table_path = [temp_db,temp_table_name].join(".")
+       temp_drop_hql = "drop table if exists #{temp_table_path};"
+       temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{source_hql}"
+       Hive.run(temp_create_hql,cluster,user)
+
+       source_params = Hive.path_params(cluster, temp_table_path, user)
+       source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
+       source_table_stats = source_params['curr_stats']
+       source_fields = source_table_stats['field_defs']
+
+       if target_partitions.length == 0 and
+          target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+         #no partitions in either user params or the target table
+
+         target_headers = source_fields.map{|f| f['name']}
+
+         target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
+
+         field_defs = {}
+         target_headers.each do |name|
+           datatype = schema_hash[name] || "string"
+           field_defs[name]=datatype
+         end
+
+         field_def_stmt = "(#{field_defs.map do |name,datatype|
+                              "`#{name}` #{datatype}"
+                            end.join(",")})"
+
+         #always drop when no partitions
+         target_drop_hql = "drop table if exists #{target_table_path};"
+
+         target_create_hql = "create table if not exists #{target_table_path} #{field_def_stmt};"
+
+         target_insert_hql = "insert overwrite table #{target_table_path} select #{target_field_stmt} from #{source_table_path};"
+
+         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql,temp_drop_hql].join
+
+         Hive.run(target_full_hql, cluster, user)
+
+       elsif target_partitions.length > 0 and
+             target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+         #partitions and no target table or same partitions in both target table and user params
+
+         target_headers = source_fields.map{|f| f['name']}.reject{|h| target_partitions.include?(h)}
+
+         field_defs = {}
+         target_headers.each do |name|
+           datatype = schema_hash[name] || "string"
+           field_defs[name]=datatype
+         end
+
+         field_def_stmt = "(#{field_defs.map do |name,datatype|
+                              "`#{name}` #{datatype}"
+                            end.join(",")})"
+
+         part_defs = {}
+         target_partitions.each do |name|
+           datatype = schema_hash[name] || "string"
+           part_defs[name] = datatype
+         end
+
+         part_def_stmt = "(#{part_defs.map do |name,datatype|
+                             "`#{name}` #{datatype}"
+                           end.join(",")})"
+
+         target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
+
+         target_part_stmt = target_partitions.map{|h| "`#{h}`"}.join(",")
+
+         target_set_hql = ["set hive.exec.dynamic.partition.mode=nonstrict;",
+                           "set hive.exec.max.dynamic.partitions.pernode=1000;",
+                           "set hive.exec.dynamic.partition=true;",
+                           "set hive.exec.max.created.files = 200000;",
+                           "set hive.max.created.files = 200000;"].join
+
+         if drop or target_table_stats.nil?
+           target_drop_hql = "drop table if exists #{target_table_path};"
+           target_create_hql = target_drop_hql +
+                               "create table if not exists #{target_table_path} #{field_def_stmt} " +
+                               "partitioned by #{part_def_stmt};"
+
+         else
+           target_db,target_table = target_table_path.split(".")
+           #get all the permutations of possible partitions
+           part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
+           part_perm_tsv = Hive.run(part_perm_hql, cluster, user)
+           #having gotten the permutations, ensure they are dropped
+           part_hash_array = part_perm_tsv.tsv_to_hash_array
+           part_drop_hql = part_hash_array.map do |h|
+             part_drop_stmt = h.map do |name,value|
+               part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
+             end.join(",")
+             "use #{target_db};alter table #{target_table} drop if exists partition (#{part_drop_stmt});"
+           end.join
+           target_create_hql = part_drop_hql
+         end
+
+         target_insert_hql = "insert overwrite table #{target_table_path} " +
+                             "partition (#{target_part_stmt}) " +
+                             "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"
+
+         target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
+
+         Hive.run(target_full_hql, cluster, user)
+       else
+         error_msg = "Incompatible partition specs"
+         raise error_msg
+       end
+       return target_path
+     end
+
+     #turn a tsv into a hive table.
+     #Accepts options to drop existing target if any
+     #also schema with column datatype overrides
+     def Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop=false, schema_hash=nil)
+       source_headers = source_tsv.tsv_header_array
+
+       target_params = Hive.path_params(cluster, target_path, user)
+       target_db,target_table = ['db','table'].map{|k| target_params[k]}
+       target_table_path = [target_db,target_table].join(".")
+       target_partitions = target_params['partitions'].to_a
+       target_table_stats = target_params['curr_stats']
+
+       schema_hash ||= {}
+
+       if target_partitions.length == 0 and
+          target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+         #no partitions in either user params or the target table
+         #or drop and start fresh
+
+         #one file only, strip headers, replace tab with ctrl-a for hive
+         source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
+         source_tsv_filename = "000000_0"
+         file_hash = {source_tsv_filename=>source_rows}
+
+         field_defs = source_headers.map do |name|
+           datatype = schema_hash[name] || "string"
+           "`#{name}` #{datatype}"
+         end.ie{|fs| "(#{fs.join(",")})"}
+
+         #for single insert, use drop table and create table always
+         target_drop_hql = "drop table if exists #{target_table_path}"
+
+         target_create_hql = "create table #{target_table_path} #{field_defs}"
+
+         #load source data
+         target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{target_table_path};"
+
+         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
+
+         Hive.run(target_full_hql, cluster, user, file_hash)
+
+       elsif target_partitions.length > 0 and
+             target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+         #partitions and no target table
+         #or same partitions in both target table and user params
+         #or drop and start fresh
+
+         target_headers = source_headers.reject{|h| target_partitions.include?(h)}
+
+         field_defs = "(#{target_headers.map do |name|
+                          datatype = schema_hash[name] || "string"
+                          "`#{name}` #{datatype}"
+                        end.join(",")})"
+
+         partition_defs = "(#{target_partitions.map do |name|
+                              datatype = schema_hash[name] || "string"
+                              "#{name} #{datatype}"
+                            end.join(",")})"
+
+         target_drop_hql = drop ? "drop table if exists #{target_table_path};" : ""
+
+         target_create_hql = target_drop_hql +
+                             "create table if not exists #{target_table_path} #{field_defs} " +
+                             "partitioned by #{partition_defs}"
+
+         #create target table early if not here
+         Hive.run(target_create_hql, cluster, user)
+
+         target_table_stats = Hive.table_stats(target_db, target_table, cluster, user)
+
+         #create data hash from source hash array
+         data_hash = {}
+         source_hash_array = source_tsv.tsv_to_hash_array
+         source_hash_array.each do |ha|
+           tpmk = target_partitions.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+           tpmv = ha.reject{|k,v| target_partitions.include?(k)}.values.join("\001")
+           if data_hash[tpmk]
+             data_hash[tpmk] += "\n#{tpmv}"
+           else
+             data_hash[tpmk] = tpmv
+           end
+         end
+
+         #go through completed data hash and write each key value to the table in question
+         data_hash.each do |tpmk,tpmv|
+           base_filename = "000000_0"
+           part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
+           part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
+           part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
+           hdfs_dir = "#{target_table_stats['location']}/#{part_dir}"
+           hdfs_source_path = "/#{hdfs_dir.split("/")[3..-2].join("/")}/#{base_filename}"
+           hdfs_target_path = "/#{hdfs_dir.split("/")[3..-1].join("/")}/#{base_filename}"
+           #load partition into source path
+           puts "Writing to #{hdfs_source_path} for #{user} at #{Time.now.utc}"
+           Hdfs.write(hdfs_source_path,tpmv,user)
+           #let Hive know where the partition is
+           target_add_part_hql = "use #{target_db};alter table #{target_table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_path}'"
+           target_insert_part_hql = "load data inpath '#{hdfs_source_path}' overwrite into table #{target_table} partition (#{part_stmt});"
+           target_part_hql = [target_add_part_hql,target_insert_part_hql].join(";")
+           puts "Adding partition #{tpmk} to #{target_table_path} for #{user} at #{Time.now.utc}"
+           Hive.run(target_part_hql, cluster, user)
+         end
+       else
+         error_msg = "Incompatible partition specs: " +
+                     "target table:#{target_table_stats['partitions'].to_s}, " +
+                     "user_params:#{target_partitions.to_s}"
+         raise error_msg
+       end
+       return target_path
+     end
+
+     def Hive.write_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       user = params['user']
+       cluster = params['cluster'] || Hive.clusters.keys.first
+
+       #slot Hive worker if available
+       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+       return false unless slot_id
+
+       node = Hadoop.gateway_node(cluster)
+       node_user = Ssh.host(node)['user']
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       #determine path for target
+       target_path = params['target']
+
+       gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+       #return blank response if there are no slots available
+       return nil unless gdrive_slot
+       source_dst = s.source_dsts(gdrive_slot).first
+       schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+       Gdrive.unslot_worker_by_path(stage_path)
+
+       #drop target before create/insert?
+       drop = params['drop']
+
+       #determine source
+       source_tsv,source_hql = [nil]*2
+       if params['hql']
+         source_hql = params['hql']
+       elsif source_dst
+         if source_dst.handler == 'hive'
+           #source table
+           cluster,source_path = source_dst.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
+           source_hql = "select * from #{source_path};"
+         elsif ['gridfs','hdfs'].include?(source_dst.handler)
+           if source_dst.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+             source_hql = source_dst.read(user)
+           else
+             #tsv from sheet
+             source_tsv = source_dst.read(user)
+           end
+         end
+       end
+
+       out_string = if source_hql
+                      Hive.hql_to_table(cluster, source_hql, target_path, user, drop, schema_hash)
+                    elsif source_tsv
+                      Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop, schema_hash)
+                    else
+                      raise "Unable to determine source tsv or source hql"
+                    end
+
+       #unslot worker and write result
+       Hive.unslot_worker_by_path(stage_path)
+
+       #output table stores stage output
+       out_string = "result\n#{out_string}"
+       output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
+       out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
+       Dataset.write_by_url(out_url,out_string,node_user)
+       out_url
+     end
+
+     def Hive.read_by_dataset_path(dst_path,user)
+       cluster,source_path = dst_path.split("/").ie do |sp|
+                               if sp.length == 2
+                                 [Hive.clusters.first.first,sp.join(".")]
+                               else
+                                 [sp.first, sp[1..-1].join(".")]
+                               end
+                             end
+       hql = "set hive.cli.print.header=true;select * from #{source_path};"
+       Hive.run(hql,cluster,user)
+     end
+
+     def Hive.write_by_dataset_path(dst_path,source_tsv,user)
+       cluster,target_path = dst_path.split("/").ie do |sp|
+                               if sp.length == 2
+                                 [Hive.clusters.first.first,sp.join(".")]
+                               else
+                                 [sp.first, sp[1..-1].join(".")]
+                               end
+                             end
+       drop = true
+       Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop)
+     end
+   end
+
+ end
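
For orientation, here is a minimal sketch of driving the handler above directly, outside of a Runner stage. The cluster name comes from the sample hive.yml further down in this diff; the user name is a hypothetical account assumed to have ssh access to the cluster's gateway node:

    # run an arbitrary statement through the cluster's hive CLI over ssh and capture the output
    databases = Mobilize::Hive.run("show databases;", 'dev_cluster', 'etl_user')

    # location, field_defs, partitions (if any) and size of an existing table
    stats = Mobilize::Hive.table_stats('mobilize', 'hive_test_1', 'dev_cluster', 'etl_user')
    puts stats['location']

In normal operation these calls are made by run_by_stage_path and write_by_stage_path, which resolve the cluster, user, and a hive slot from the stage params before doing any work.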
@@ -0,0 +1,38 @@
+ namespace :mobilize_hive do
+   desc "Set up config and log folders and files"
+   task :setup do
+     sample_dir = File.dirname(__FILE__) + '/../samples/'
+     sample_files = Dir.entries(sample_dir)
+     config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
+     log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
+     full_config_dir = "#{ENV['PWD']}/#{config_dir}"
+     full_log_dir = "#{ENV['PWD']}/#{log_dir}"
+     unless File.exists?(full_config_dir)
+       puts "creating #{config_dir}"
+       `mkdir -p #{full_config_dir}`
+     end
+     unless File.exists?(full_log_dir)
+       puts "creating #{log_dir}"
+       `mkdir -p #{full_log_dir}`
+     end
+     sample_files.each do |fname|
+       unless File.exists?("#{full_config_dir}#{fname}")
+         puts "creating #{config_dir}#{fname}"
+         `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
+       end
+     end
+     #make sure that the jobtracker.yml is updated to include the
+     #mobilize-hive extension
+     jt_config_file = "#{config_dir}jobtracker.yml"
+     if File.exists?(jt_config_file)
+       yml_hash = YAML.load_file(jt_config_file)
+       yml_hash.keys.each do |k|
+         if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hive')
+           puts "adding mobilize-hive to jobtracker.yml/#{k}/extensions"
+           yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hive']
+         end
+       end
+       File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
+     end
+   end
+ end
1
+ module Mobilize
2
+ module Hive
3
+ VERSION = "1.0.07"
4
+ end
5
+ end
@@ -0,0 +1,8 @@
+ require "mobilize-hive/version"
+ require "mobilize-hdfs"
+
+ module Mobilize
+   module Hive
+   end
+ end
+ require "mobilize-hive/handlers/hive"
@@ -0,0 +1,19 @@
+ ---
+ development:
+   clusters:
+     dev_cluster:
+       max_slots: 5
+       temp_table_db: mobilize
+       exec_path: /path/to/hive
+ test:
+   clusters:
+     test_cluster:
+       max_slots: 5
+       temp_table_db: mobilize
+       exec_path: /path/to/hive
+ production:
+   clusters:
+     prod_cluster:
+       max_slots: 5
+       temp_table_db: mobilize
+       exec_path: /path/to/hive
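
Given this sample, and assuming Base.config('hive') returns the hash for the current environment, the accessors at the top of the handler resolve as follows in development:

    Mobilize::Hive.clusters.keys             #=> ["dev_cluster"]
    Mobilize::Hive.exec_path('dev_cluster')  #=> "/path/to/hive"
    Mobilize::Hive.slot_ids('dev_cluster')   #=> ["dev_cluster_1", "dev_cluster_2", ..., "dev_cluster_5"]

max_slots therefore caps how many hive stages can run concurrently on a cluster: slot_worker_by_cluster_and_path returns false once every slot id is held by a working Resque job, and run_by_stage_path and write_by_stage_path bail out with false.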
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'mobilize-hive/version'
+
+ Gem::Specification.new do |gem|
+   gem.name          = "mobilize-hive"
+   gem.version       = Mobilize::Hive::VERSION
+   gem.authors       = ["Cassio Paes-Leme"]
+   gem.email         = ["cpaesleme@ngmoco.com"]
+   gem.description   = %q{Adds hive read, write, and run support to mobilize-hdfs}
+   gem.summary       = %q{Adds hive read, write, and run support to mobilize-hdfs}
+   gem.homepage      = "http://github.com/dena/mobilize-hive"
+
+   gem.files         = `git ls-files`.split($/)
+   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+   gem.add_runtime_dependency "mobilize-hdfs","1.0.07"
+ end
@@ -0,0 +1,26 @@
+ ---
+ - name: hive_test_1
+   active: true
+   trigger: once
+   status: ""
+   stage1: hive.write target:"mobilize/hive_test_1/act_date", drop:true,
+     source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
+   stage2: hive.run source:"hive_test_1.hql"
+   stage3: hive.run hql:"show databases;"
+   stage4: gsheet.write source:"stage2", target:"hive_test_1_stage_2.out"
+   stage5: gsheet.write source:"stage3", target:"hive_test_1_stage_3.out"
+ - name: hive_test_2
+   active: true
+   trigger: after hive_test_1
+   status: ""
+   stage1: hive.write source:"hdfs://user/mobilize/test/test_hdfs_1.out", target:"mobilize.hive_test_2", drop:true
+   stage2: hive.run hql:"select * from mobilize.hive_test_2;"
+   stage3: gsheet.write source:"stage2", target:"hive_test_2.out"
+ - name: hive_test_3
+   active: true
+   trigger: after hive_test_2
+   status: ""
+   stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
+   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3/date/product", drop:true
+   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3/date/product", drop:false
+   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
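
Each stage string above names a handler call plus its params: hive.run is served by Hive.run_by_stage_path and hive.write by Hive.write_by_stage_path (the dispatch itself lives in mobilize-base, not in this diff). Write targets follow the db/table/partition convention that Hive.path_params expects, which can be seen by replaying its split logic:

    # how a write target from hive_test_3 is decomposed: db, table, then partition columns
    "mobilize/hive_test_3/date/product".gsub(".","/").split("/")
    #=> ["mobilize", "hive_test_3", "date", "product"]

So hive_test_1 writes a table partitioned on act_date, and hive_test_3 targets one partitioned on date and product.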
@@ -0,0 +1 @@
+ select act_date,product, sum(value) as sum from mobilize.hive_test_1 group by act_date,product;