mobilize-hive 1.0.07
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +201 -0
- data/README.md +229 -0
- data/Rakefile +19 -0
- data/lib/mobilize-hive/handlers/hive.rb +530 -0
- data/lib/mobilize-hive/tasks.rb +38 -0
- data/lib/mobilize-hive/version.rb +5 -0
- data/lib/mobilize-hive.rb +8 -0
- data/lib/samples/hive.yml +19 -0
- data/mobilize-hive.gemspec +20 -0
- data/test/hive_job_rows.yml +26 -0
- data/test/hive_test_1.hql +1 -0
- data/test/hive_test_1_in.yml +41 -0
- data/test/hive_test_1_schema.yml +3 -0
- data/test/mobilize-hive_test.rb +71 -0
- data/test/redis-test.conf +540 -0
- data/test/test_helper.rb +10 -0
- metadata +87 -0
data/lib/mobilize-hive/handlers/hive.rb
@@ -0,0 +1,530 @@
module Mobilize
  module Hive
    def Hive.config
      Base.config('hive')
    end

    def Hive.exec_path(cluster)
      Hive.clusters[cluster]['exec_path']
    end

    def Hive.output_db(cluster)
      Hive.clusters[cluster]['output_db']
    end

    def Hive.output_db_user(cluster)
      output_db_node = Hadoop.gateway_node(cluster)
      output_db_user = Ssh.host(output_db_node)['user']
      output_db_user
    end

    def Hive.clusters
      Hive.config['clusters']
    end

    def Hive.slot_ids(cluster)
      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
    end

    def Hive.slot_worker_by_cluster_and_path(cluster,path)
      working_slots = Mobilize::Resque.jobs('working').map{|j| j['hive_slot'] if (j and j['hive_slot'])}.compact
      Hive.slot_ids(cluster).each do |slot_id|
        unless working_slots.include?(slot_id)
          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
          return slot_id
        end
      end
      #return false if none are available
      return false
    end

    def Hive.unslot_worker_by_path(path)
      begin
        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
        return true
      rescue
        return false
      end
    end

    #get field names and partition datatypes and size of a hive table
    def Hive.table_stats(db,table,cluster,user)
      describe_sql = "use #{db};describe extended #{table}"
      describe_output = Hive.run(describe_sql,cluster,user)
      describe_output.split("location:").last.split(",").first
      #get location, fields, partitions
      result_hash = {}
      result_hash['location'] = describe_output.split("location:").last.split(",").first
      #get fields
      field_defs = describe_output.split(" \nDetailed Table Information").first.split(
                   "\n").map{|f|
                   f.strip.split("\t").ie{|fa|
                   {"name"=>fa.first,"datatype"=>fa.second} if fa.first}}.compact
      #check for partitions
      if describe_output.index("partitionKeys:[FieldSchema")
        part_field_string = describe_output.split("partitionKeys:[").last.split("]").first
        #parse weird schema using yaml plus gsubs
        yaml_fields = "---" + part_field_string.gsub("FieldSchema","\n").gsub(
                              ")","").gsub(
                              ",","\n ").gsub(
                              "(","- ").gsub(
                              "null","").gsub(
                              ":",": ")
        #return partitions without the comment part
        result_hash['partitions'] = YAML.load(yaml_fields).map{|ph| ph.delete('comment');ph}
        #get rid of fields in fields section that are also partitions
        result_hash['partitions'].map{|p| p['name']}.each{|n| field_defs.delete_if{|f| f['name']==n}}
      end
      #assign field defs after removing partitions
      result_hash['field_defs'] = field_defs
      #get size
      result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",cluster,user).split("\t").last.strip.to_i
      return result_hash
    end

    #run a generic hive command, with the option of passing a file hash to be locally available
    def Hive.run(hql,cluster,user,file_hash=nil)
      filename = hql.to_md5
      file_hash ||= {}
      file_hash[filename] = hql
      #silent mode so we don't have logs in stderr; clip output
      #at hadoop read limit
      command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
      gateway_node = Hadoop.gateway_node(cluster)
      Ssh.run(gateway_node,command,user,file_hash)
    end

    def Hive.run_by_stage_path(stage_path)
      s = Stage.where(:path=>stage_path).first
      u = s.job.runner.user
      params = s.params
      user = params['user']
      cluster = params['cluster'] || Hive.clusters.keys.first
      node = Hadoop.gateway_node(cluster)
      node_user = Ssh.host(node)['user']
      if user and !Ssh.sudoers(node).include?(u.name)
        raise "#{u.name} does not have su permissions for #{node}"
      elsif user.nil? and Ssh.su_all_users(node)
        user = u.name
      end

      #slot Hive worker if available
      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
      return false unless slot_id

      #output table stores stage output
      output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
      output_path = [output_db,output_table].join(".")
      out_url = "hive://#{cluster}/#{output_db}/#{output_table}"

      #get hql
      if params['hql']
        hql = params['hql']
      else
        #user has passed in a gsheet hql
        gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
        #return blank response if there are no slots available
        return nil unless gdrive_slot
        source_dst = s.source_dsts(gdrive_slot).first
        Gdrive.unslot_worker_by_path(stage_path)
        hql = source_dst.read(user)
      end

      #check for select at end
      hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
      if hql_array.last.downcase.starts_with?("select")
        #nil if no prior commands
        prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
        select_hql = hql_array.last
        output_table_hql = ["drop table if exists #{output_path}",
                            "create table #{output_path} as #{select_hql};"].join(";")
        full_hql = [prior_hql, output_table_hql].compact.join(";")
        Hive.run(full_hql, cluster, user)
        #make sure node user owns the stage result directory
        output_table_stats = Hive.table_stats(output_db,output_table,cluster,node_user)
        output_table_location = output_table_stats['location']
        chown_command = "#{Hadoop.exec_path(cluster)} fs -chown -R #{node_user} '#{output_table_location}'"
        Ssh.run(node,chown_command,node_user)
        #already populated, make sure dataset exists
        Dataset.find_or_create_by_url(out_url)
      else
        out_string = Hive.run(hql, cluster, user)
        out_string = "result\n#{out_string}"
        Dataset.write_by_url(out_url,out_string,node_user)
      end
      #unslot worker
      Hive.unslot_worker_by_path(stage_path)
      out_url
    end

    def Hive.schema_hash(schema_path,user,gdrive_slot)
      if schema_path.index("/")
        #slashes mean sheets
        out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user)
      else
        u = User.where(:name=>user).first
        #check sheets in runner
        r = u.runner
        runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
        out_tsv = if runner_sheet
                    runner_sheet.read(user)
                  else
                    #check for gfile. will fail if there isn't one.
                    Gfile.find_by_path(schema_path).read(user)
                  end
        #use Gridfs to cache gdrive results
        file_name = schema_path.split("/").last
        out_url = "gridfs://#{schema_path}/#{file_name}"
        Dataset.write_by_url(out_url,out_tsv,user)
        schema_tsv = Dataset.find_by_url(out_url).read(user)
        schema_hash = {}
        schema_tsv.tsv_to_hash_array.each do |ha|
          schema_hash[ha['name']] = ha['datatype']
        end
        schema_hash
      end
    end

    def Hive.path_params(cluster, path, user)
      db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
      #get existing table stats if any
      curr_stats = begin
                     Hive.table_stats(db, table, cluster, user)
                   rescue
                     nil
                   end
      {"db"=>db,
       "table"=>table,
       "partitions"=>partitions,
       "curr_stats"=>curr_stats}
    end

    def Hive.hql_to_table(cluster, source_hql, target_path, user, drop=false, schema_hash=nil)
      target_params = Hive.path_params(cluster, target_path, user)
      target_table_path = ['db','table'].map{|k| target_params[k]}.join(".")
      target_partitions = target_params['partitions'].to_a
      target_table_stats = target_params['curr_stats']

      #create temporary table so we can identify fields etc.
      temp_db = Hive.output_db(cluster)
      temp_table_name = (source_hql+target_path).to_md5
      temp_table_path = [temp_db,temp_table_name].join(".")
      temp_drop_hql = "drop table if exists #{temp_table_path};"
      temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{source_hql}"
      Hive.run(temp_create_hql,cluster,user)

      source_params = Hive.path_params(cluster, temp_table_path, user)
      source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
      source_table_stats = source_params['curr_stats']
      source_fields = source_table_stats['field_defs']

      if target_partitions.length == 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
        #no partitions in either user params or the target table

        target_headers = source_fields.map{|f| f['name']}

        target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")

        field_defs = {}
        target_headers.each do |name|
          datatype = schema_hash[name] || "string"
          field_defs[name]=datatype
        end

        field_def_stmt = "(#{field_defs.map do |name,datatype|
                              "`#{name}` #{datatype}"
                            end.join(",")})"

        #always drop when no partitions
        target_drop_hql = "drop table if exists #{target_table_path};"

        target_create_hql = "create table if not exists #{target_table_path} #{field_def_stmt};"

        target_insert_hql = "insert overwrite table #{target_table_path} select #{target_field_stmt} from #{source_table_path};"

        target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql,temp_drop_hql].join

        Hive.run(target_full_hql, cluster, user)

      elsif target_partitions.length > 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
        #partitions and no target table or same partitions in both target table and user params

        target_headers = source_fields.map{|f| f['name']}.reject{|h| target_partitions.include?(h)}

        field_defs = {}
        target_headers.each do |name|
          datatype = schema_hash[name] || "string"
          field_defs[name]=datatype
        end

        field_def_stmt = "(#{field_defs.map do |name,datatype|
                              "`#{name}` #{datatype}"
                            end.join(",")})"

        part_defs = {}
        target_partitions.each do |name|
          datatype = schema_hash[name] || "string"
          part_defs[name] = datatype
        end

        part_def_stmt = "(#{part_defs.map do |name,datatype|
                             "`#{name}` #{datatype}"
                           end.join(",")})"

        target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")

        target_part_stmt = target_partitions.map{|h| "`#{h}`"}.join(",")

        target_set_hql = ["set hive.exec.dynamic.partition.mode=nonstrict;",
                          "set hive.exec.max.dynamic.partitions.pernode=1000;",
                          "set hive.exec.dynamic.partition=true;",
                          "set hive.exec.max.created.files = 200000;",
                          "set hive.max.created.files = 200000;"].join

        if drop or target_table_stats.nil?
          target_drop_hql = "drop table if exists #{target_table_path};"
          target_create_hql = target_drop_hql +
                              "create table if not exists #{target_table_path} #{field_def_stmt} " +
                              "partitioned by #{part_def_stmt};"

        else
          target_db,target_table = target_table_path.split(".")
          #get all the permutations of possible partitions
          part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
          part_perm_tsv = Hive.run(part_perm_hql, cluster, user)
          #having gotten the permutations, ensure they are dropped
          part_hash_array = part_perm_tsv.tsv_to_hash_array
          part_drop_hql = part_hash_array.map do |h|
            part_drop_stmt = h.map do |name,value|
              part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
            end.join(",")
            "use #{target_db};alter table #{target_table} drop if exists partition (#{part_drop_stmt});"
          end.join
          target_create_hql = part_drop_hql
        end

        target_insert_hql = "insert overwrite table #{target_table_path} " +
                            "partition (#{target_part_stmt}) " +
                            "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

        target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

        Hive.run(target_full_hql, cluster, user)
      else
        error_msg = "Incompatible partition specs"
        raise error_msg
      end
      return target_path
    end

    #turn a tsv into a hive table.
    #Accepts options to drop existing target if any
    #also schema with column datatype overrides
    def Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop=false, schema_hash=nil)
      source_headers = source_tsv.tsv_header_array

      target_params = Hive.path_params(cluster, target_path, user)
      target_db,target_table = ['db','table'].map{|k| target_params[k]}
      target_table_path = [target_db,target_table].join(".")
      target_partitions = target_params['partitions'].to_a
      target_table_stats = target_params['curr_stats']

      schema_hash ||= {}

      if target_partitions.length == 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
        #no partitions in either user params or the target table
        #or drop and start fresh

        #one file only, strip headers, replace tab with ctrl-a for hive
        source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
        source_tsv_filename = "000000_0"
        file_hash = {source_tsv_filename=>source_rows}

        field_defs = source_headers.map do |name|
          datatype = schema_hash[name] || "string"
          "`#{name}` #{datatype}"
        end.ie{|fs| "(#{fs.join(",")})"}

        #for single insert, use drop table and create table always
        target_drop_hql = "drop table if exists #{target_table_path}"

        target_create_hql = "create table #{target_table_path} #{field_defs}"

        #load source data
        target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{target_table_path};"

        target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

        Hive.run(target_full_hql, cluster, user, file_hash)

      elsif target_partitions.length > 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
        #partitions and no target table
        #or same partitions in both target table and user params
        #or drop and start fresh

        target_headers = source_headers.reject{|h| target_partitions.include?(h)}

        field_defs = "(#{target_headers.map do |name|
                          datatype = schema_hash[name] || "string"
                          "`#{name}` #{datatype}"
                        end.join(",")})"

        partition_defs = "(#{target_partitions.map do |name|
                              datatype = schema_hash[name] || "string"
                              "#{name} #{datatype}"
                            end.join(",")})"

        target_drop_hql = drop ? "drop table if exists #{target_table_path};" : ""

        target_create_hql = target_drop_hql +
                            "create table if not exists #{target_table_path} #{field_defs} " +
                            "partitioned by #{partition_defs}"

        #create target table early if not here
        Hive.run(target_create_hql, cluster, user)

        target_table_stats = Hive.table_stats(target_db, target_table, cluster, user)

        #create data hash from source hash array
        data_hash = {}
        source_hash_array = source_tsv.tsv_to_hash_array
        source_hash_array.each do |ha|
          tpmk = target_partitions.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
          tpmv = ha.reject{|k,v| target_partitions.include?(k)}.values.join("\001")
          if data_hash[tpmk]
            data_hash[tpmk] += "\n#{tpmv}"
          else
            data_hash[tpmk] = tpmv
          end
        end

        #go through completed data hash and write each key value to the table in question
        data_hash.each do |tpmk,tpmv|
          base_filename = "000000_0"
          part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
          part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
          part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
          hdfs_dir = "#{target_table_stats['location']}/#{part_dir}"
          hdfs_source_path = "/#{hdfs_dir.split("/")[3..-2].join("/")}/#{base_filename}"
          hdfs_target_path = "/#{hdfs_dir.split("/")[3..-1].join("/")}/#{base_filename}"
          #load partition into source path
          puts "Writing to #{hdfs_source_path} for #{user} at #{Time.now.utc}"
          Hdfs.write(hdfs_source_path,tpmv,user)
          #let Hive know where the partition is
          target_add_part_hql = "use #{target_db};alter table #{target_table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_path}'"
          target_insert_part_hql = "load data inpath '#{hdfs_source_path}' overwrite into table #{target_table} partition (#{part_stmt});"
          target_part_hql = [target_add_part_hql,target_insert_part_hql].join(";")
          puts "Adding partition #{tpmk} to #{target_table_path} for #{user} at #{Time.now.utc}"
          Hive.run(target_part_hql, cluster, user)
        end
      else
        error_msg = "Incompatible partition specs: " +
                    "target table:#{target_table_stats['partitions'].to_s}, " +
                    "user_params:#{target_partitions.to_s}"
        raise error_msg
      end
      return target_path
    end

    def Hive.write_by_stage_path(stage_path)
      s = Stage.where(:path=>stage_path).first
      u = s.job.runner.user
      params = s.params
      user = params['user']
      cluster = params['cluster'] || Hive.clusters.keys.first

      #slot Hive worker if available
      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
      return false unless slot_id

      node = Hadoop.gateway_node(cluster)
      node_user = Ssh.host(node)['user']
      if user and !Ssh.sudoers(node).include?(u.name)
        raise "#{u.name} does not have su permissions for #{node}"
      elsif user.nil? and Ssh.su_all_users(node)
        user = u.name
      end

      #determine path for target
      target_path = params['target']

      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
      #return blank response if there are no slots available
      return nil unless gdrive_slot
      source_dst = s.source_dsts(gdrive_slot).first
      schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
      Gdrive.unslot_worker_by_path(stage_path)

      #drop target before create/insert?
      drop = params['drop']

      #determine source
      source_tsv,source_hql = [nil]*2
      if params['hql']
        source_hql = params['hql']
      elsif source_dst
        if source_dst.handler == 'hive'
          #source table
          cluster,source_path = source_dst.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
          source_hql = "select * from #{source_path};"
        elsif ['gridfs','hdfs'].include?(source_dst.handler)
          if source_dst.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
            source_hql = source_dst.read(user)
          else
            #tsv from sheet
            source_tsv = source_dst.read(user)
          end
        end
      end

      out_string = if source_hql
                     Hive.hql_to_table(cluster, source_hql, target_path, user, drop, schema_hash)
                   elsif source_tsv
                     Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop, schema_hash)
                   else
                     raise "Unable to determine source tsv or source hql"
                   end

      #unslot worker and write result
      Hive.unslot_worker_by_path(stage_path)

      #output table stores stage output
      out_string = "result\n#{out_string}"
      output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
      out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
      Dataset.write_by_url(out_url,out_string,node_user)
      out_url
    end

    def Hive.read_by_dataset_path(dst_path,user)
      cluster,source_path = dst_path.split("/").ie do |sp|
                              if sp.length == 2
                                [Hive.clusters.first.first,sp.join(".")]
                              else
                                [sp.first, sp[1..-1].join(".")]
                              end
                            end
      hql = "set hive.cli.print.header=true;select * from #{source_path};"
      Hive.run(hql,cluster,user)
    end

    def Hive.write_by_dataset_path(dst_path,source_tsv,user)
      cluster,target_path = dst_path.split("/").ie do |sp|
                              if sp.length == 2
                                [Hive.clusters.first.first,sp.join(".")]
                              else
                                [sp.first, sp[1..-1].join(".")]
                              end
                            end
      drop = true
      Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop)
    end
  end

end
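
For orientation, here is a minimal sketch of how the handler methods above can be called directly. The cluster name comes from the sample config further down; the user name and table are hypothetical, and the caller must have ssh access to the cluster's gateway node:

    # illustrative only -- 'mobilize_user' and mobilize.hive_test_1 are assumed names
    Mobilize::Hive.run("show databases;", 'test_cluster', 'mobilize_user')
    # => query output as a string, clipped at Hadoop.read_limit

    Mobilize::Hive.table_stats('mobilize', 'hive_test_1', 'test_cluster', 'mobilize_user')
    # => {"location"=>..., "field_defs"=>[...], "partitions"=>[...], "size"=>...}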
data/lib/mobilize-hive/tasks.rb
@@ -0,0 +1,38 @@
namespace :mobilize_hive do
  desc "Set up config and log folders and files"
  task :setup do
    sample_dir = File.dirname(__FILE__) + '/../samples/'
    sample_files = Dir.entries(sample_dir)
    config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
    log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
    full_config_dir = "#{ENV['PWD']}/#{config_dir}"
    full_log_dir = "#{ENV['PWD']}/#{log_dir}"
    unless File.exists?(full_config_dir)
      puts "creating #{config_dir}"
      `mkdir -p #{full_config_dir}`
    end
    unless File.exists?(full_log_dir)
      puts "creating #{log_dir}"
      `mkdir -p #{full_log_dir}`
    end
    sample_files.each do |fname|
      unless File.exists?("#{full_config_dir}#{fname}")
        puts "creating #{config_dir}#{fname}"
        `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
      end
    end
    #make sure that the jobtracker.yml is updated to include the
    #mobilize-hive library
    jt_config_file = "#{config_dir}jobtracker.yml"
    if File.exists?(jt_config_file)
      yml_hash = YAML.load_file(jt_config_file)
      yml_hash.keys.each do |k|
        if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hive')
          puts "adding mobilize-hive to jobtracker.yml/#{k}/extensions"
          yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hive']
        end
      end
      File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
    end
  end
end
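
To illustrate what the setup task above does (assuming the host app's Rakefile loads this file so it can be invoked as `rake mobilize_hive:setup`): each environment key in an existing jobtracker.yml gains a mobilize-hive entry in its extensions list. The contents below are hypothetical:

    # before: an assumed existing extensions list for the 'development' key
    yml_hash = {'development' => {'extensions' => ['mobilize-ssh', 'mobilize-hdfs']}}
    # what the task effectively does for each key
    yml_hash['development']['extensions'] += ['mobilize-hive']
    # => ["mobilize-ssh", "mobilize-hdfs", "mobilize-hive"]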
data/lib/samples/hive.yml
@@ -0,0 +1,19 @@
---
development:
  clusters:
    dev_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
test:
  clusters:
    test_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
production:
  clusters:
    prod_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
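
Given the test entry above, the accessors defined in hive.rb resolve configuration like this (a sketch, assuming Base.config('hive') resolves to the test section):

    Mobilize::Hive.clusters['test_cluster']['max_slots']  # => 5
    Mobilize::Hive.clusters['test_cluster']['exec_path']  # => "/path/to/hive"
    Mobilize::Hive.slot_ids('test_cluster')               # => ["test_cluster_1", ..., "test_cluster_5"]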
data/mobilize-hive.gemspec
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'mobilize-hive/version'

Gem::Specification.new do |gem|
  gem.name          = "mobilize-hive"
  gem.version       = Mobilize::Hive::VERSION
  gem.authors       = ["Cassio Paes-Leme"]
  gem.email         = ["cpaesleme@ngmoco.com"]
  gem.description   = %q{Adds hive read, write, and run support to mobilize-hdfs}
  gem.summary       = %q{Adds hive read, write, and run support to mobilize-hdfs}
  gem.homepage      = "http://github.com/dena/mobilize-hive"

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  gem.add_runtime_dependency "mobilize-hdfs","1.0.07"
end
data/test/hive_job_rows.yml
@@ -0,0 +1,26 @@
---
- name: hive_test_1
  active: true
  trigger: once
  status: ""
  stage1: hive.write target:"mobilize/hive_test_1/act_date", drop:true,
    source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
  stage2: hive.run source:"hive_test_1.hql"
  stage3: hive.run hql:"show databases;"
  stage4: gsheet.write source:"stage2", target:"hive_test_1_stage_2.out"
  stage5: gsheet.write source:"stage3", target:"hive_test_1_stage_3.out"
- name: hive_test_2
  active: true
  trigger: after hive_test_1
  status: ""
  stage1: hive.write source:"hdfs://user/mobilize/test/test_hdfs_1.out", target:"mobilize.hive_test_2", drop:true
  stage2: hive.run hql:"select * from mobilize.hive_test_2;"
  stage3: gsheet.write source:"stage2", target:"hive_test_2.out"
- name: hive_test_3
  active: true
  trigger: after hive_test_2
  status: ""
  stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3/date/product", drop:true
  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3/date/product", drop:false
  stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
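
The hive.write targets above use the db/table/partition path convention parsed by Hive.path_params in hive.rb; for example:

    # "mobilize/hive_test_3/date/product" resolves to
    #   db         => "mobilize"
    #   table      => "hive_test_3"
    #   partitions => ["date", "product"]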
data/test/hive_test_1.hql
@@ -0,0 +1 @@
select act_date,product, sum(value) as sum from mobilize.hive_test_1 group by act_date,product;